I've created a simple but complete program basing on this tutorial: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#device-memory
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
//Kernel definition
__global__ void VecAdd(float* A, float* B, float* C,int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if(i < N)
C[i] = A[i] + B[i];
}
//Host code
int main()
{
int N = 1000;
int i;
FILE *f;
size_t size = N * sizeof(float);
//allocate input vectors h_A and h_B in host memory
float *h_A = (float*)malloc(size);
float *h_B = (float*)malloc(size);
float *h_C = (float*)malloc(size);
//Initialize input vectors
f = fopen("A.txt","r");
for(i=0;i<N;i++)
fscanf(f,"%f ",&h_A[i]);
fclose(f);
f = fopen("B.txt","r");
for(i=0;i<N;i++)
fscanf(f,"%f ",&h_B[i]);
fclose(f);
//Allocate vactors in device memory
float *d_A;
gpuErrchk(cudaMalloc(&d_A,size));
float *d_B;
cudaMalloc(&d_B,size);
float *d_C;
cudaMalloc(&d_C,size);
gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
//invoke kernel
int threadsPerBlock = 256;
int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
f = fopen("C.txt","w");
printf("%f \n",h_C[i]);
for(i=0;i<1000;i++)
fprintf(f,"%f ",h_C[i]);
fclose(f);
printf("Zakonczono obliczenia\n");
// Free device memory
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
//Free host memory
free(h_A);
free(h_B);
return 0;
}
It should read two vectors from files, add them on device and then print the output into 'C.txt' file. However, it prints one thousand of zeros.
After a little debugging did I find the culprit- the cudaMalloc function.
(cuda-gdb) n 42 cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice); (cuda-gdb) n 43 cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice); (cuda-gdb) print d_A[0] $1 = 0 (cuda-gdb) print h_A[0] $2 = 3.66192293
I wonder why it doesn't work, this part of code had been raw copied from the tutorial.
cudaMemcpyHostToDevice
? A callback? – Fiddling Bitscuda-memcheck
. Your thread index variablei
in the kernel will also not let you scale to more than one threadblock's worth of data. You don't seem to have copied that from the "tutorial" correctly. – Robert Crovellaprintf("%f \n",h_C[i]);
I think it should be something like this:printf("%f \n",h_C[0]);
But I createdA.txt
andB.txt
files composed of 1000 lines of0.2
and the resultantC.txt
file had a single line of 1000 iterations of0.400000
. Andcuda-memcheck
reports no errors (in my case). So if yourC.txt
has garbage then I suspect a machine configuration issue (CUDA not functioning). If you add the proper error checking I linked to, it will probably shed some light on that. – Robert Crovella