I ran into some very interesting behaviour with CUDA 4.2 and driver 295.41 on Linux. The code does nothing more than scan a random matrix and mark every position holding the maximum value (MAX) with 1, and every other position with 0.
#include <stdio.h>
#include <stdlib.h>

const int MAX = 8;

static __global__ void position(int* d, int len) {
    int idx = threadIdx.x + blockIdx.x*blockDim.x;
    if (idx < len)
        d[idx] = (d[idx] == MAX) ? 1 : 0;
}

int main(int argc, const char** argv) {
    int colNum = 16*512, rowNum = 1024;
    int len = rowNum * colNum;
    int* h = (int*)malloc(len*sizeof(int));
    int* d = NULL;
    cudaMalloc((void**)&d, len*sizeof(int));

    // fill the matrix with random values in [0, MAX]
    for (int i = 0; i < len; i++) {
        h[i] = rand()%(MAX+1);
    }

    // copy to the device and launch the kernel
    int threads = 128;
    cudaMemcpy(d, h, len*sizeof(int), cudaMemcpyHostToDevice);
    position<<<(len-1)/threads+1, threads>>>(d, len);
    cudaMemcpy(h, d, len*sizeof(int), cudaMemcpyDeviceToHost);

    cudaFree(d);
    free(h);
    return 0;
}
With rowNum = 1024, the code does not work at all; the output is as if the kernel had never been launched. With rowNum = 1023, everything works fine.
The threshold also depends on the block size (128 in this example): with a block size of 512, the same behaviour appears between rowNum = 4095 and rowNum = 4096.
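Working out the grid size (len-1)/threads+1 for each case makes the pattern clearer: with threads = 128, rowNum = 1024 gives len = 8192*1024 and a grid of 65,536 blocks, while rowNum = 1023 gives 65,472 blocks. With threads = 512, rowNum = 4096 gives 65,536 blocks and rowNum = 4095 gives 65,520 blocks. So both failing cases launch exactly 65,536 blocks, and both working cases stay just below that.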
I'm not quite sure whether this is a bug in CUDA or the driver, or whether I have missed something. I also realise that none of the API calls or the kernel launch are checked for errors, so a failure would go unnoticed.
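For diagnosis, a minimal error-checking sketch around the launch would presumably surface whatever is going wrong. This only uses the standard CUDA runtime calls cudaGetLastError, cudaGetErrorString, and cudaDeviceSynchronize, and reuses the variables from the code above (stdio.h is already included):

position<<<(len-1)/threads+1, threads>>>(d, len);
// an invalid launch configuration is reported here, before the kernel runs
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
    fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(err));
// errors raised while the kernel executes are reported here
err = cudaDeviceSynchronize();
if (err != cudaSuccess)
    fprintf(stderr, "kernel failed: %s\n", cudaGetErrorString(err));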