I have two versions of the same algorithm. It was originally the convolution, but I modified it, reducing it to the code below, to check where my bottleneck is (note that there is a single access to global memory per loop iteration):
__global__
void convolve (unsigned char * Md, float * Kd, unsigned char * Rd, int width, int height, int kernel_size, int tile_width, int channels){
    // Global-memory convolution over a padded input image.
    //   Md : padded input, (width+kernel_size-1) x (height+kernel_size-1) per channel
    //   Kd : kernel_size x kernel_size filter (multiply currently disabled for the
    //        bandwidth test, so Kd is unused here)
    //   Rd : unpadded output, width x height per channel
    // Expected launch: 2-D grid of tile_width x tile_width blocks covering the image.
    const int row = blockIdx.y * tile_width + threadIdx.y;
    const int col = blockIdx.x * tile_width + threadIdx.x;

    // Bug fix: the bounds check was re-evaluated inside the channel loop even
    // though it is loop-invariant. Hoist it and retire out-of-image threads early.
    if (row >= height || col >= width)
        return;

    const int half      = kernel_size / 2;
    const int md_width  = width  + kernel_size - 1;   // padded input width
    const int md_height = height + kernel_size - 1;   // padded input height

    for (int color = 0; color < channels; color++) {
        // Index of the window center for (row, col) inside the padded channel plane.
        const int center = color * md_width * md_height
                         + (row + half) * md_width + (col + half);
        int sum = 0;
        for (int x = -half; x <= half; x++)
            for (int y = -half; y <= half; y++)
                // unsigned char -> int directly; the original's round-trip
                // through float was a no-op for 0..255 values.
                sum += (int)Md[center + x + y * md_width];
                // * Kd[(x + half) + (y + half) * kernel_size]  (filter disabled)
        // NOTE(review): sum can exceed 255 with the filter disabled; the store
        // into unsigned char truncates, same as the original behavior.
        Rd[color * width * height + row * width + col] = (unsigned char)sum;
    }
}
and this is the shared-memory version (a single access to shared memory per loop iteration):
__global__
void convolve (unsigned char * Md, float * Kd, unsigned char * Rd, int width, int height, int kernel_size, int tile_width, int channels){
    // Shared-memory (tiled) convolution over a padded input image.
    //   Md : padded input, (width+kernel_size-1) x (height+kernel_size-1) per channel
    //   Kd : kernel_size x kernel_size filter (multiply disabled for the test)
    //   Rd : unpadded output, width x height per channel
    // Static tile sized for tile_width <= 16 and kernel_size <= 3:
    // (16+3-1)^2 = 18*18 = 324 bytes, same as the original 256 + 16*4 + 4.
    __shared__ unsigned char Mds[(16 + 3 - 1) * (16 + 3 - 1)];

    const int half      = kernel_size / 2;
    const int row       = blockIdx.y * tile_width + threadIdx.y;
    const int col       = blockIdx.x * tile_width + threadIdx.x;
    const int mds_width = tile_width + kernel_size - 1;  // tile + halo, per side
    const int md_width  = width  + kernel_size - 1;      // padded input width
    const int md_height = height + kernel_size - 1;      // padded input height
    const bool inside   = (row < height) && (col < width);

    for (int color = 0; color < channels; color++) {
        const int plane = color * md_width * md_height;  // start of this channel

        // Bug fix #1: the original loaded only the central pixel per thread, so
        // the halo cells of Mds were never written and the convolution read
        // uninitialized shared memory. Threads now stride cooperatively over
        // the whole (tile+halo)^2 region, filling the borders too.
        for (int ly = threadIdx.y; ly < mds_width; ly += tile_width) {
            const int gy = blockIdx.y * tile_width + ly;       // padded-image row
            for (int lx = threadIdx.x; lx < mds_width; lx += tile_width) {
                const int gx = blockIdx.x * tile_width + lx;   // padded-image col
                if (gy < md_height && gx < md_width)
                    Mds[ly * mds_width + lx] = Md[plane + gy * md_width + gx];
            }
        }

        // Bug fix #2: in the original both __syncthreads() sat inside the
        // divergent `if (row < height && col < width)` branch — undefined
        // behavior (deadlock) for boundary blocks where only some threads
        // pass the guard. Barriers are now reached by every thread.
        __syncthreads();

        if (inside) {
            int sum = 0;
            // Window center for this thread inside the shared tile.
            const int center = (threadIdx.y + half) * mds_width + (threadIdx.x + half);
            for (int x = -half; x <= half; x++)
                for (int y = -half; y <= half; y++)
                    sum += (int)Mds[center + x + y * mds_width];
                    // * Kd[(x + half) + (y + half) * kernel_size]  (filter disabled)
            Rd[color * width * height + row * width + col] = (unsigned char)sum;
        }

        // Keep the tile stable until every reader is done before the next
        // channel's load overwrites it.
        __syncthreads();
    }
}
the executions parameters are
convolve<<<dimGrid,dimBlock>>>(Md,Kd,Rd,width,new_height,kernel_size,block_size,colors);
dimGrid = (1376,768)
dimBlock = (16,16)
Md is the read only image
Kd is the filter (3x3)
width = 22016
height = 12288
kernel_size = 3
block_size=16
colors=3
I obtain 1249.59 ms with the first algorithm and 1178.2 ms with the second one, which I find ridiculous. I think that the number of registers should not be a problem. Compiling with ptxas I get:
ptxas info: 560 bytes gmem, 52 bytes cmem[14]
ptxas info: Compiling entry function '_Z8convolvePhPfS_iiiii' for 'sm_10'
ptxas info: Used 16 registers, 384 bytes smem, 4 bytes cmem[1]
while the info of my device is:
Name: GeForce GTX 660 Ti
Minor Compute Capability: 0
Major Compute Capability: 3
Warp Size: 32
Max Threads per Block: 1024
Max Threads Dimension: (1024,1024,64)
Max Grid Size: (2147483647,65535,65535)
Number of SM: 7
Max Threads Per SM: 2048
Regs per Block (SM): 65536
Total global Memory: 2146762752
Shared Memory per Block: 49152
Does anyone have any hint as to why the improvement is so small? I don't know anybody else to ask.
EDIT: I'm using today a different nvidia card since I cannot access the lab. It also has compute capability 3.0. I put both if statements out of the loop. I'm compiling with -arch compute_30 -code sm_30 I remove all the castings. The global matrix is now declared as const unsigned char * restrict Md I used this time a 9x9 filter which makes each pixel be reused 81 times after be brought in shared memory.
I get 3138.41 ms (global version) and 3120.96 ms (shared version) from the terminal. In the visual profiler it takes longer. This is what I get (screenshots) http://cl.ly/image/1X372l242S2u
as lost as I was..
Please find here this algorithm easy to compile and execute:
./convolution 8000 4000 159 9 edge_detection_9.txt 0 for the global memory version ./convolution 8000 4000 159 9 edge_detection_9.txt 1 for the shared memory version