I have a sequential smoothing algorithm
void triangularSmooth(unsigned char *grayImage, unsigned char *smoothImage, const int width, const int height, const float *filter, NSTimer &timer, dim3 grid_size, dim3 block_size) {
for ( int y = 0; y < height; y++ ) {
for ( int x = 0; x < width; x++ ) {
unsigned int filterItem = 0;
float filterSum = 0.0f;
float smoothPix = 0.0f;
for ( int fy = y - 2; fy < y + 3; fy++ ) {
for ( int fx = x - 2; fx < x + 3; fx++ ) {
if ( ((fy < 0) || (fy >= height)) || ((fx < 0) || (fx >= width)) ) {
filterItem++;
continue;
}
smoothPix += grayImage[(fy * width) + fx] * filter[filterItem];
filterSum += filter[filterItem];
filterItem++;
}
}
smoothPix /= filterSum;
smoothImage[(y * width) + x] = static_cast< unsigned char >(smoothPix);
}
}
}
I am implementing in CUDA and wish to use a shared variable to hold the pixels in grayImage. However before that, I'm trying to run it as it is. To this end I have kernel code:
__global__ void smooth(unsigned char *grayImage, unsigned char *smoothImage, const int width, const int height, const float *filter)
{
int x = blockIdx.x*blockDim.x + threadIdx.x;
int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int filterItem = 0;
float filterSum = 0.0f;
float smoothPix = 0.0f;
for ( int fy = y - 2; fy < y + 3; fy++ ) {
for ( int fx = x - 2; fx < x + 3; fx++ ) {
if ( ((fy < 0) || (fy >= height)) || ((fx < 0) || (fx >= width)) ) {
filterItem++;
continue;
}
smoothPix += grayImage[(fy * width) + fx] * filter[filterItem];
filterSum += filter[filterItem];
filterItem++;
}
}
smoothPix /= filterSum;
smoothImage[(y * width) + x] = static_cast< unsigned char >(smoothPix);
}
And calling with:
const float filter[] = {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 2.0f, 3.0f, 2.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
dim3 gridSize((width*height)/1024,(width*height)/1024,1);
dim3 blockSize(256,256,1);
smooth <<< gridSize, blockSize >>> (grayImage, smoothImage, width, height, filter);
cudaDeviceSynchronize();
The problem is that, the resulting smooth image looking like the pixels are all in the wrong other (mixed up). Is this from the dimensions of the grid and block? I've tried a LOT of other possible dimensions. What would be the right way?
I'm using a GTX480, version - 2.x, Maximum dimensionality of grid of thread blocks - 3, Maximum x-, y-, or z-dimension of a grid of thread blocks - 65535, Maximum Number of Threads per Block - 1024