I have a CUDA kernel that appears to have race condition and am trying to pinpoint where this race condition is coming from. I am aware of the 'racecheck' tool of cuda-memcheck, however racecheck tells me that there is no hazard when using small inputs which is actually consistent with my own investigations too. For large inputs though racecheck seems to take forever (literally) and so I can't use it.
Briefly explaining, a 1D vector d_mat_3d defined as a __device__ variable is filled with 0 and loaded in global memory. Two large arrays which are the inputs for the kernel (d_A and d_v) are also defined in main and passed to the kernal. A segment of array d_mat_3d, called mat_2d is cut, loaded in shared memory and some processing will be done on it. Then, mat_2d will be written back to d_mat_3d on global memory.
As shown here, atomic operations are used as without the use of atomic operations mat_2d would encounter a race condition b/w different threads.
The reason I guess I still have some sort of race condition going on is that the results of mat_3d is different every time.
Any thought as to where this race condition may come from? Any steps I can take to clear that out (other than the tool racecheck)? If you think, there is no evidence for race condition, can you explain why different values are assigned to d_mat_3d every time I execute the kernel?
CUDA 9.0 / NVidia Titan Black / Ubuntu 16.04
#include <cstdlib>
#include <sstream>
#include <cstdio>
#include <cuda.h>
#include <cuda_runtime_api.h>
#define W 7 // fix limit for loops in kernel
#define SIZE 100 // defining matrix dimension
#define N_ELEM 10000 // no of elements in each vector
#define NTPB 1024 // no of threads per block
using namespace std;
__device__ float d_mat_3d[SIZE*SIZE*SIZE];
__global__ void cuda_kernel(float *d_A, float *d_v){
__shared__ float mat_2d[SIZE*SIZE]; // a 2D slice of 3D matrix d_mat_3d
unsigned int n = blockDim.x*blockIdx.x+threadIdx.x;
if(n >= N_ELEM)
return;
int x, y, z, i;
float r;
float A = d_A[n];
float v = d_v[n];
#pragma unroll
for(x=0; x<SIZE; x++){
// load mat_2d (on shared memory) using d_mat_3d (on global memory)
for(i=0; i<SIZE*SIZE; i++){
mat_2d[i] = d_mat_3d[i+x*SIZE*SIZE];
}
// sync threads as mat_2d is on shared memory
__syncthreads();
for(y=SIZE/2; y<SIZE/2+W; y++){
for(z=SIZE/2; z<SIZE/2+W; z++){
r = sqrt( pow(A,2) / v ); // no need to be in these loops. I know, but for my real case, it must be.
atomicAdd(&mat_2d[z+y*SIZE], r); // atomically add r
}
}
__syncthreads();
// write mat_2d (shared memory) back to mat_3d (global memory)
for(i=0; i<SIZE*SIZE; i++){
d_mat_3d[i+x*SIZE*SIZE] = mat_2d[i];
}
}
}
// this function writes h_mat_3d to disk.
void write_image(float *h_mat_3d){
ostringstream o_addToFile;
o_addToFile << "mat3d.bin";
FILE *pFile;
pFile = fopen(o_addToFile.str().c_str(), "wb");
for(int i=0; i<SIZE*SIZE*SIZE; i++){
fwrite(&h_mat_3d[i], sizeof(float), 1, pFile);
}
fclose (pFile);
}
int main(){
int i;
float *h_A = new float[N_ELEM]; // some large vector
float *h_v = new float[N_ELEM]; // some other large vector
float h_mat_3d[SIZE*SIZE*SIZE]; // will be filled w/ 0
float *d_A; // device variables
float *d_v;
for(i=0; i<N_ELEM; i++){
h_A[i] = 0.2f+(float)i/N_ELEM; // fill out with some calculations
h_v[i] = 0.5f+2.f*i/N_ELEM;
}
for(i=0; i<SIZE*SIZE*SIZE; i++){
h_mat_3d[i] = 0.f; // fill h_mat_3d with 0
}
cudaMalloc((void **)&d_A, sizeof(float)*N_ELEM); // allocate variables on device
cudaMalloc((void **)&d_v, sizeof(float)*N_ELEM);
cudaMemcpy(d_A, h_A, sizeof(float)*N_ELEM, cudaMemcpyHostToDevice); // copy from host to device
cudaMemcpy(d_v, h_v, sizeof(float)*N_ELEM, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(d_mat_3d, &h_mat_3d, sizeof(float)*SIZE*SIZE*SIZE); // copy h_mat_3d to device
cuda_kernel<<<(N_ELEM+NTPB-1)/NTPB,NTPB>>>(d_A, d_v); // execute kernel
cudaMemcpyFromSymbol(h_mat_3d, d_mat_3d, sizeof(float)*SIZE*SIZE*SIZE); // write it back to h_mat_3d
write_image(h_mat_3d); // write h_mat_3d to disk for checking
cudaFree(d_A); // free memory
cudaFree(d_v);
delete [] h_A;
delete [] h_v;
return 0;
}