I study CUDA architecture.
I made some of parallel processing code in environment like below.
GPU : GTX580 (CC is 2.0)
Threads Per Block : 16x16 = 256
Registers Per Thread : 16
Shared Memory Per Block : 48 bytes
I know the number of Registers and Shared Memory size by the compile option: --ptxas-options=-v In addition, grid size is 32x32 = 1024 and there is not extra shared memory.
So, I tried to use CUDA_Occupancy_Calculator by NVIDIA. then, It said,
3.) GPU Occupancy Data is displayed here and in the graphs: Active Threads per Multiprocessor 1536 Active Warps per Multiprocessor 48 Active Thread Blocks per Multiprocessor 6 Occupancy of each Multiprocessor 100%
So, I run the application. But, the result showed that the block size is 8x8 faster than 16x16.
8x8 means the block size, and the gird size is 64x64. 16x16 means the block size, and the grid size is 32x32. So, the total amount of threads is same. It's unchanged.
I don't know the why. Please help me.
Following code is a part of my Program.
void LOAD_VERTEX(){
MEM[0] = 60; //y0
MEM[1] = 50; //x0
MEM[2] = 128; //r0
MEM[3] = 0; //g0
MEM[4] = 70; //b0
MEM[5] = 260;
MEM[6] = 50;
MEM[7] = 135;
MEM[8] = 70;
MEM[9] = 0;
MEM[10] = 260;
MEM[11] = 250;
MEM[12] = 0;
MEM[13] = 200;
MEM[14] = 55;
MEM[15] = 60;
MEM[16] = 250;
MEM[17] = 55;
MEM[18] = 182;
MEM[19] = 100;
MEM[20] = 30;
MEM[21] = 330;
MEM[22] = 72;
MEM[23] = 12;
MEM[24] = 25;
MEM[25] = 30;
MEM[26] = 130;
MEM[27] = 80;
MEM[28] = 255;
MEM[29] = 15;
MEM[30] = 230;
MEM[31] = 330;
MEM[32] = 56;
MEM[33] = 186;
MEM[34] = 201;
}
__global__ void PRINT_POLYGON( unsigned char *IMAGEin, int *MEMin, int dev_ID, int a, int b, int c)
{
int i = blockIdx.x*TILE_WIDTH + threadIdx.x;
int j = blockIdx.y*TILE_HEIGHT + threadIdx.y;
float result_a, result_b;
int temp[15];
int k;
for(k = 0; k < 5; k++){
temp[k] = a*5+k;
temp[k+5] = b*5+k;
temp[k+10] = c*5+k;
}
int result_a_up = ((MEMin[temp[11]]-MEMin[temp[1]])*(i-MEMin[temp[0]]))-((MEMin[temp[10]]-MEMin[temp[0]])*(j-MEMin[temp[1]]));
int result_a_down = ((MEMin[temp[11]]-MEMin[temp[1]])*(MEMin[temp[5]]-MEMin[temp[0]]))-((MEMin[temp[6]]-MEMin[temp[1]])*(MEMin[temp[10]]-MEMin[temp[0]]));
int result_b_up = ((MEMin[temp[6]] -MEMin[temp[1]])*(MEMin[temp[0]]-i))-((MEMin[temp[5]] -MEMin[temp[0]])*(MEMin[temp[1]]-j));
int result_b_down = ((MEMin[temp[11]]-MEMin[temp[1]])*(MEMin[temp[5]]-MEMin[temp[0]]))-((MEMin[temp[6]]-MEMin[temp[1]])*(MEMin[temp[10]]-MEMin[temp[0]]));
result_a = float(result_a_up) / float(result_a_down);
result_b = float(result_b_up) / float(result_b_down);
int isIn = (0 <= result_a && result_a <=1) && ((0 <= result_b && result_b <= 1)) && ((0 <= (result_a+result_b) && (result_a+result_b) <= 1));
IMAGEin[(i*HEIGHTs+j)*CHANNELS] += (int)(float(MEMin[temp[2]]) + (float(MEMin[temp[7]])-float(MEMin[temp[2]]))*result_a + (float(MEMin[temp[12]])-float(MEMin[temp[2]]))*result_b) * isIn; //Red Channel
IMAGEin[(i*HEIGHTs+j)*CHANNELS+1] += (int)(float(MEMin[temp[3]]) + (float(MEMin[temp[8]])-float(MEMin[temp[3]]))*result_a + (float(MEMin[temp[13]])-float(MEMin[temp[3]]))*result_b) * isIn; //Green Channel
IMAGEin[(i*HEIGHTs+j)*CHANNELS+2] += (int)(float(MEMin[temp[4]]) + (float(MEMin[temp[9]])-float(MEMin[temp[4]]))*result_a + (float(MEMin[temp[14]])-float(MEMin[temp[4]]))*result_b) * isIn; //Blue Channel
}
//The information each device
struct DataStruct {
int deviceID;
unsigned char IMAGE_SEG[WIDTH*HEIGHTs*CHANNELS];
};
void* routine( void *pvoidData ) {
DataStruct *data = (DataStruct*)pvoidData;
unsigned char *dev_IMAGE;
int *dev_MEM;
unsigned char *IMAGE_SEG = data->IMAGE_SEG;
HANDLE_ERROR(cudaSetDevice(data->deviceID));
//initialize array
memset(IMAGE_SEG, 0, WIDTH*HEIGHTs*CHANNELS);
printf("Device %d Starting..\n", data->deviceID);
//Evaluate Time
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
HANDLE_ERROR( cudaMalloc( (void **)&dev_MEM, sizeof(int)*35) ); //Creating int array each Block
HANDLE_ERROR( cudaMalloc( (void **)&dev_IMAGE, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS) ); //output array
cudaMemcpy(dev_MEM, MEM, sizeof(int)*256, cudaMemcpyHostToDevice);
cudaMemset(dev_IMAGE, 0, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS);
dim3 grid(WIDTH/TILE_WIDTH, HEIGHTs/TILE_HEIGHT); //blocks in a grid
dim3 block(TILE_WIDTH, TILE_HEIGHT); //threads in a block
cudaEventRecord(start, 0);
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 1, 2); //Start the Kernel
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 2, 3); //Start the Kernel
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 3, 4); //Start the Kernel
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 4, 5); //Start the Kernel
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 3, 2, 4); //Start the Kernel
PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 2, 6, 4); //Start the Kernel
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
HANDLE_ERROR( cudaMemcpy( IMAGE_SEG, dev_IMAGE, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS, cudaMemcpyDeviceToHost ) );
HANDLE_ERROR( cudaFree( dev_MEM ) );
HANDLE_ERROR( cudaFree( dev_IMAGE ) );
cudaEventElapsedTime( &elapsed_time_ms[data->deviceID], start, stop ); //Calculate elapsed time
cudaEventDestroy(start);
cudaEventDestroy(stop);
printf("Algorithm Elapsed Time : %f ms(Device %d)\n", elapsed_time_ms[data->deviceID], data->deviceID);
printf("Device %d Complete!\n", data->deviceID);
return 0;
}
int main( void )
{
int i;
CUTThread thread[7];
printf("Program Start.\n");
LOAD_VERTEX();
DataStruct data[DEVICENUM]; //define device info
for(i = 0; i < DEVICENUM; i++){
data[i].deviceID = i;
thread[i] = start_thread(routine, &(data[i]));
}
for(i = 0; i < DEVICENUM; i++){
end_thread(thread[i]);
}
cudaFreeHost(MEM);
return 0;
}