I am currently having exceptional difficulty with CUDA programming--more specifically, in copying and reading an array which the device sends back to the host. When I attempt to read the data which I am supposed to have returned to me, all I get is junk data. Could anyone take a look at my code snippets and tell me what I'm doing wrong? Thank you very much!
struct intss {
u_int32_t one;
u_int32_t two;
};
int main()
{
int block_size = 3;
int grid_size = 1;
intss *device_fb = 0;
intss *host_fb = 0;
int num_bytes_fb = (block_size*grid_size)*sizeof(intss);
host_fb = (intss*)malloc(num_bytes_fb);
cudaMalloc((void **)&device_fb, num_bytes_fb);
....
render2<<<block_size,grid_size>>>(device_fb, device_pixelspercore, samples, obj_list_flat_dev, numOpsPerCore, lnumdev, camdev, lightsdev, uranddev, iranddev);
....
cudaMemcpy(host_fb, device_fb, num_bytes_fb, cudaMemcpyDeviceToHost);
printf("output %d ", host_fb[0].one);
printf("output %d ", host_fb[1].one);
printf("output %d ", host_fb[2].one);
//Note that I'm only looking at elements the 3 elements 0-2 from host_fb. I am doing this because block_size*grid_size = 3. Is this wrong?
cudaFree(device_fb);
free(host_fb);
}
__global__ void render2(intss *device_fb, struct parallelPixels *pixelsPerCore, int samples, double *obj_list_flat_dev, int numOpsPerCore, int lnumdev, struct camera camdev, struct vec3 *lightsdev, struct vec3 *uranddev, int *iranddev) //SPECIFY ARGUMENTS!!!
{
int index = blockIdx.x * blockDim.x + threadIdx.x; //DETERMINING INDEX BASED ON WHICH THREAD IS CURRENTLY RUNNING
....
//computing data...
device_fb[index].one = (((u_int32_t)(MIN(r, 1.0) * 255.0) & 0xff) << RSHIFT |
((u_int32_t)(MIN(g, 1.0) * 255.0) & 0xff) << GSHIFT |
((u_int32_t)(MIN(b, 1.0) * 255.0) & 0xff) << BSHIFT);
}
EDIT:
Thanks to a suggestion, I have implemented the CudaErrorCheck function in my program, and there seems to be a pattern in which functions are giving me errors.
In my program, I have a bunch of global host arrays(obj_list, lights, urand, irand). Whenever I attempt to use cudaMemCpy to copy these host arrays to device arrays, I receive the following error: "Cuda error in file 'cudatrace.cu' in line x : invalid argument."
obj_list and lights are filled in the following function, load_scene():
void load_scene(FILE *fp) { char line[256], *ptr, type;
obj_list = (sphere *)malloc(sizeof(struct sphere));
obj_list->next = 0;
objCounter = 0;
while((ptr = fgets(line, 256, fp))) {
int i;
struct vec3 pos, col;
double rad, spow, refl;
while(*ptr == ' ' || *ptr == '\t') ptr++;
if(*ptr == '#' || *ptr == '\n') continue;
if(!(ptr = strtok(line, DELIM))) continue;
type = *ptr;
for(i=0; i<3; i++) {
if(!(ptr = strtok(0, DELIM))) break;
*((double*)&pos.x + i) = atof(ptr);
}
if(type == 'l') {
lights[lnum++] = pos;
continue;
}
if(!(ptr = strtok(0, DELIM))) continue;
rad = atof(ptr);
for(i=0; i<3; i++) {
if(!(ptr = strtok(0, DELIM))) break;
*((double*)&col.x + i) = atof(ptr);
}
if(type == 'c') {
cam.pos = pos;
cam.targ = col;
cam.fov = rad;
continue;
}
if(!(ptr = strtok(0, DELIM))) continue;
spow = atof(ptr);
if(!(ptr = strtok(0, DELIM))) continue;
refl = atof(ptr);
if(type == 's') {
objCounter++;
struct sphere *sph = (sphere *)malloc(sizeof(*sph));
sph->next = obj_list->next;
obj_list->next = sph;
sph->pos = pos;
sph->rad = rad;
sph->mat.col = col;
sph->mat.spow = spow;
sph->mat.refl = refl;
} else {
fprintf(stderr, "unknown type: %c\n", type);
}
}
}
urand and irand are filled in main as follows:
/* initialize the random number tables for the jitter */
for(i=0; i<NRAN; i++) urand[i].x = (double)rand() / RAND_MAX - 0.5;
for(i=0; i<NRAN; i++) urand[i].y = (double)rand() / RAND_MAX - 0.5;
for(i=0; i<NRAN; i++) irand[i] = (int)(NRAN * ((double)rand() / RAND_MAX));
I don't think the invalid argument could be caused by the device array, since the cudaMalloc call creating the device array before the cudaMemcpy call did not have a CudaError message. For example, in the following lines of code:
cudaErrorCheck(cudaMalloc((void **)&lightsdev, MAX_LIGHTS*sizeof(struct vec3)) );
cudaErrorCheck( cudaMemcpy(&lightsdev, &lights, sizeof(struct vec3) * MAX_LIGHTS, cudaMemcpyHostToDevice) );
cudaMalloc did not produce an error, but cudaMemcpy did.
If I have not provided enough information on my code, I have pasted the entire code to: http://pastebin.com/UgzABPgH
(Note that in the pastebin version, I took out the CudaErrorCheck functions on the CudaMemcpy's which were producing the errors.)
Thank you very much!
EDIT: Actually, I just tried to see what would happen if urand and irand were not global, and if they were initialized alongside the device arrays uranddev and iranddev. I'm still getting the same "invalid argument" error, so the whether or not a variable is global must not relate to the problem.