I am trying to implement the dot product in CUDA and compare the result with what MATLAB returns. My CUDA code (based on this tutorial) is the following:
#include <stdio.h>
#define N (2048 * 8)
#define THREADS_PER_BLOCK 512
#define num_t float
// The kernel - DOT PRODUCT
__global__ void dot(num_t *a, num_t *b, num_t *c)
{
__shared__ num_t temp[THREADS_PER_BLOCK];
int index = threadIdx.x + blockIdx.x * blockDim.x;
temp[threadIdx.x] = a[index] * b[index];
__syncthreads(); //Synchronize!
*c = 0.00;
// Does it need to be tid==0 that
// undertakes this task?
if (0 == threadIdx.x) {
num_t sum = 0.00;
int i;
for (i=0; i<THREADS_PER_BLOCK; i++)
sum += temp[i];
atomicAdd(c, sum);
//WRONG: *c += sum; This read-write operation must be atomic!
}
}
// Initialize the vectors:
void init_vector(num_t *x)
{
int i;
for (i=0 ; i<N ; i++){
x[i] = 0.001 * i;
}
}
// MAIN
int main(void)
{
num_t *a, *b, *c;
num_t *dev_a, *dev_b, *dev_c;
size_t size = N * sizeof(num_t);
cudaMalloc((void**)&dev_a, size);
cudaMalloc((void**)&dev_b, size);
cudaMalloc((void**)&dev_c, size);
a = (num_t*)malloc(size);
b = (num_t*)malloc(size);
c = (num_t*)malloc(size);
init_vector(a);
init_vector(b);
cudaMemcpy(dev_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(dev_b, b, size, cudaMemcpyHostToDevice);
dot<<<N/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(dev_a, dev_b, dev_c);
cudaMemcpy(c, dev_c, sizeof(num_t), cudaMemcpyDeviceToHost);
printf("a = [\n");
int i;
for (i=0;i<10;i++){
printf("%g\n",a[i]);
}
printf("...\n");
for (i=N-10;i<N;i++){
printf("%g\n",a[i]);
}
printf("]\n\n");
printf("a*b = %g.\n", *c);
free(a); free(b); free(c);
cudaFree(dev_a);
cudaFree(dev_b);
cudaFree(dev_c);
}
and I compile it with:
/usr/local/cuda-5.0/bin/nvcc -m64 -I/usr/local/cuda-5.0/include -gencode arch=compute_20,code=sm_20 -o multi_dot_product.o -c multi_dot_product.cu
g++ -m64 -o multi_dot_product multi_dot_product.o -L/usr/local/cuda-5.0/lib64 -lcudart
Information about my NVIDIA cards can be found at http://pastebin.com/8yTzXUuK. I tried to verify the result in MATLAB using the following simple code:
N = 2048 * 8;
a = zeros(N,1);
for i=1:N
a(i) = 0.001*(i-1);
end
dot_product = a'*a;
But as N increases, I'm getting significantly different results (For instance, for N=2048*32 CUDA reutrns 6.73066e+07 while MATLAB returns 9.3823e+07. For N=2048*64 CUDA gives 3.28033e+08 while MATLAB gives 7.5059e+08). I incline to believe that the discrepancy stems from the use of float in my C code, but if I replace it with double the compiler complains that atomicAdd does not support double parameters. How should I fix this problem?
Update: Also, for high values of N (e.g. 2048*64), I noticed that the result returned by CUDA changes at every run. This does not happen if N is low (e.g. 2048*8).
At the same time I have a more fundamental question: The variable temp is an array of size THREADS_PER_BLOCK and is shared between threads in the same block. Is it also shared between blocks or every block operates on a different copy of this variable? Should I think of the method dot as instructions to every block? Can someone elaborate on how exactly the jobs are split and how the variables are shared in this example
__shared__variable, including yourtemp. You can dodoubleatomicAdd using the method outlined here. - Robert Crovella*c = 0.00;unprotected by any synchronization in your kernel code is messing you up. Initializecto zero before you call your kernel, and eliminate that line from your kernel. Every thread of every block, whenever they happen to execute, is zeroing outcas you have written it. - Robert Crovella