
I am studying the CUDA architecture.

I wrote some parallel processing code in the environment described below.

GPU: GTX 580 (compute capability 2.0)

Threads per block: 16x16 = 256

Registers per thread: 16

Shared memory per block: 48 bytes

I got the register count and shared memory size from the compile option --ptxas-options=-v. In addition, the grid size is 32x32 = 1024 blocks, and there is no extra shared memory.
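
For reference, a typical compile command that prints those per-kernel resource numbers might look like this (polygon.cu is a hypothetical file name; -arch=sm_20 matches the GTX 580's compute capability 2.0):

        nvcc -arch=sm_20 --ptxas-options=-v -o polygon polygon.cu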

So I tried NVIDIA's CUDA_Occupancy_Calculator. It said:

3.) GPU Occupancy Data is displayed here and in the graphs:

Active Threads per Multiprocessor: 1536
Active Warps per Multiprocessor: 48
Active Thread Blocks per Multiprocessor: 6
Occupancy of each Multiprocessor: 100%
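
(Those numbers are consistent: a compute capability 2.0 SM supports at most 1536 resident threads, and 6 blocks × 256 threads/block = 1536 threads = 48 warps, i.e. 100% occupancy.)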

So I ran the application. But the result showed that the 8x8 block size is faster than 16x16.

8x8 means the block size, and the grid size is 64x64; 16x16 means the block size, and the grid size is 32x32. So the total number of threads is the same: 64×64 blocks × 64 threads = 32×32 blocks × 256 threads = 262,144 threads.

I don't know why. Please help me.

The following code is part of my program.

// Seven vertices, five attributes each (y, x, r, g, b), packed into MEM[35].
void LOAD_VERTEX(){
        MEM[0] = 60;    //y0 
        MEM[1] = 50;    //x0
        MEM[2] = 128;   //r0
        MEM[3] = 0;     //g0
        MEM[4] = 70;    //b0
        MEM[5] = 260;
        MEM[6] = 50;
        MEM[7] = 135;
        MEM[8] = 70;
        MEM[9] = 0;
        MEM[10] = 260;
        MEM[11] = 250;
        MEM[12] = 0;
        MEM[13] = 200;
        MEM[14] = 55;
        MEM[15] = 60;
        MEM[16] = 250;
        MEM[17] = 55;
        MEM[18] = 182;
        MEM[19] = 100;
        MEM[20] = 30;
        MEM[21] = 330;
        MEM[22] = 72;
        MEM[23] = 12;
        MEM[24] = 25;
        MEM[25] = 30;
        MEM[26] = 130;
        MEM[27] = 80;
        MEM[28] = 255;
        MEM[29] = 15;
        MEM[30] = 230; 
        MEM[31] = 330;
        MEM[32] = 56;   
        MEM[33] = 186;  
        MEM[34] = 201;
}

__global__ void PRINT_POLYGON( unsigned char *IMAGEin, int *MEMin, int dev_ID, int a, int b, int c)
{
        int i = blockIdx.x*TILE_WIDTH + threadIdx.x;
        int j = blockIdx.y*TILE_HEIGHT + threadIdx.y;

        float result_a, result_b;
        int temp[15];
        int k;

        // Collect the attribute indices of the triangle's three vertices a, b, c
        // (five attributes per vertex, stored at vertex*5 + attribute).
        for(k = 0; k < 5; k++){
                temp[k] = a*5+k;
                temp[k+5] = b*5+k;
                temp[k+10] = c*5+k;
        }

        // Barycentric-style coordinates of pixel (i, j) with respect to the
        // triangle: numerators and denominators of the two coordinates.
        int result_a_up = ((MEMin[temp[11]]-MEMin[temp[1]])*(i-MEMin[temp[0]]))-((MEMin[temp[10]]-MEMin[temp[0]])*(j-MEMin[temp[1]]));
        int result_a_down = ((MEMin[temp[11]]-MEMin[temp[1]])*(MEMin[temp[5]]-MEMin[temp[0]]))-((MEMin[temp[6]]-MEMin[temp[1]])*(MEMin[temp[10]]-MEMin[temp[0]]));

        int result_b_up = ((MEMin[temp[6]] -MEMin[temp[1]])*(MEMin[temp[0]]-i))-((MEMin[temp[5]] -MEMin[temp[0]])*(MEMin[temp[1]]-j));
        int result_b_down = ((MEMin[temp[11]]-MEMin[temp[1]])*(MEMin[temp[5]]-MEMin[temp[0]]))-((MEMin[temp[6]]-MEMin[temp[1]])*(MEMin[temp[10]]-MEMin[temp[0]]));      //note: recomputes the same value as result_a_down

        result_a = float(result_a_up) / float(result_a_down);
        result_b = float(result_b_up) / float(result_b_down);

        // 1 if the pixel lies inside the triangle (both coordinates and their sum in [0, 1]), 0 otherwise.
        int isIn = (0 <= result_a && result_a <=1) && ((0 <= result_b && result_b <= 1)) && ((0 <= (result_a+result_b) && (result_a+result_b) <= 1));

        // Interpolate the three vertex colors across the triangle; isIn zeroes
        // the contribution for pixels outside it.
        IMAGEin[(i*HEIGHTs+j)*CHANNELS] += (int)(float(MEMin[temp[2]]) + (float(MEMin[temp[7]])-float(MEMin[temp[2]]))*result_a + (float(MEMin[temp[12]])-float(MEMin[temp[2]]))*result_b) * isIn;      //Red Channel
        IMAGEin[(i*HEIGHTs+j)*CHANNELS+1] += (int)(float(MEMin[temp[3]]) + (float(MEMin[temp[8]])-float(MEMin[temp[3]]))*result_a + (float(MEMin[temp[13]])-float(MEMin[temp[3]]))*result_b) * isIn;    //Green Channel
        IMAGEin[(i*HEIGHTs+j)*CHANNELS+2] += (int)(float(MEMin[temp[4]]) + (float(MEMin[temp[9]])-float(MEMin[temp[4]]))*result_a + (float(MEMin[temp[14]])-float(MEMin[temp[4]]))*result_b) * isIn;    //Blue Channel

}

//Per-device information
struct DataStruct {
    int                 deviceID;
    unsigned char       IMAGE_SEG[WIDTH*HEIGHTs*CHANNELS];
};

void* routine( void *pvoidData ) {
        DataStruct  *data = (DataStruct*)pvoidData;
        unsigned char *dev_IMAGE;
        int *dev_MEM;
        unsigned char *IMAGE_SEG = data->IMAGE_SEG;

        HANDLE_ERROR(cudaSetDevice(data->deviceID));

        //initialize array
        memset(IMAGE_SEG, 0, WIDTH*HEIGHTs*CHANNELS);

        printf("Device %d Starting..\n", data->deviceID);

        //Evaluate Time
        cudaEvent_t start, stop;
        cudaEventCreate( &start );
        cudaEventCreate( &stop );

        HANDLE_ERROR( cudaMalloc( (void **)&dev_MEM, sizeof(int)*35) );                                //vertex data (35 ints)
        HANDLE_ERROR( cudaMalloc( (void **)&dev_IMAGE, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS) ); //output array

        cudaMemcpy(dev_MEM, MEM, sizeof(int)*35, cudaMemcpyHostToDevice);      //copy only the 35 ints that were allocated
        cudaMemset(dev_IMAGE, 0, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS);

        dim3    grid(WIDTH/TILE_WIDTH, HEIGHTs/TILE_HEIGHT);            //blocks in a grid
        dim3    block(TILE_WIDTH, TILE_HEIGHT);                         //threads in a block

        cudaEventRecord(start, 0);

        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 1, 2);                    //Start the Kernel
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 2, 3);                    //Start the Kernel
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 3, 4);                    //Start the Kernel
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 0, 4, 5);                    //Start the Kernel
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 3, 2, 4);                    //Start the Kernel
        PRINT_POLYGON<<<grid,block>>>( dev_IMAGE, dev_MEM, data->deviceID, 2, 6, 4);                    //Start the Kernel

        cudaEventRecord(stop, 0);
        cudaEventSynchronize(stop);

        HANDLE_ERROR( cudaMemcpy( IMAGE_SEG, dev_IMAGE, sizeof(unsigned char)*WIDTH*HEIGHTs*CHANNELS, cudaMemcpyDeviceToHost ) );
        HANDLE_ERROR( cudaFree( dev_MEM ) );
        HANDLE_ERROR( cudaFree( dev_IMAGE ) );

        cudaEventElapsedTime( &elapsed_time_ms[data->deviceID], start, stop );          //Calculate elapsed time
        cudaEventDestroy(start);
        cudaEventDestroy(stop);

        printf("Algorithm Elapsed Time : %f ms(Device %d)\n", elapsed_time_ms[data->deviceID], data->deviceID);
        printf("Device %d Complete!\n", data->deviceID);

        return 0;
}

int main( void )
{       
        int i;
        CUTThread thread[DEVICENUM];

        printf("Program Start.\n");                     
        LOAD_VERTEX();

        DataStruct data[DEVICENUM];                     //define device info

        for(i = 0; i < DEVICENUM; i++){
                data[i].deviceID = i;
                thread[i] = start_thread(routine, &(data[i]));
        }

        for(i = 0; i < DEVICENUM; i++){
                end_thread(thread[i]);
        }

        cudaFreeHost(MEM);

    return 0;
}
Is your question really "why does the 16x16 case run slower than the 8x8 case, when the 16x16 case achieves 100% occupancy?" (talonmies)

Totally right, talonmies! (strawnut)

1 Answer


Since you copied over your question from the Nvidia forum, I'll copy my answer as well:

For your kernel, the reduced performance you observed at higher occupancy is easily explained by the cache overflowing at the higher occupancy.

The local array temp[] at full occupancy requires 1536×15×4 = 92160 bytes of cache per SM, while at 33% occupancy (with the smaller 8×8 block size) only 512×15×4 = 30720 bytes per SM are required. With the larger 48 kB cache/SM setting the latter can be cached almost entirely, eliminating off-chip memory accesses for temp[]; even with the default 16 kB cache/SM setting the cache hit probability is substantially higher.
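
For reference, on Fermi the 48 kB L1 setting can be requested per kernel with cudaFuncSetCacheConfig. A minimal sketch, assuming the PRINT_POLYGON kernel and HANDLE_ERROR macro from the question:

        // Hint: prefer the 48 kB L1 / 16 kB shared split for this kernel (Fermi).
        // The runtime treats this as a preference, not a guarantee.
        HANDLE_ERROR( cudaFuncSetCacheConfig(PRINT_POLYGON, cudaFuncCachePreferL1) );
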

As the temp[] array is not needed anyway, the fastest option (at either occupancy) would be to completely eliminate it. The compiler might already be able to achieve this if you just insert a #pragma unroll before the initialization loop. Otherwise replace all uses of temp[] with a little macro or inline function, or even just substitute the result into the code (which in this case I would even find more readable).