CUDA programming guide states that "Memory allocated via malloc()
can be copied using the runtime (i.e., by calling any of the copy memory functions from Device Memory)", but somehow I'm having trouble to reproduce this functionality. Code:
#include <cstdio>
__device__ int* p;
__global__ void allocate_p() {
p = (int*) malloc(10);
printf("p = %p (seen by GPU)\n", p);
}
int main() {
cudaError_t err;
int* localp = (int*) malloc(10);
allocate_p<<<1,1>>>();
cudaDeviceSynchronize();
//Getting pointer to device-allocated memory
int* tmpp = NULL;
cudaMemcpyFromSymbol(&tmpp, p, 4);
printf("p = %p (seen by CPU)\n", tmpp);
//cudaMalloc((void**)&tmpp, 40);
err = cudaMemcpy(tmpp, localp, 40, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
printf(" err:%i %s", (int)err, cudaGetErrorString(err));
delete localp;
return 0;
}
crashes with output:
p = 0x601f920 (seen by GPU)
p = 0x601f920 (seen by CPU)
err:11 invalid argument
I gather, that the host sees the appropriate address on device, but somehow does not like it coming from malloc()
.
If I allocate earlier by cudaMalloc((void**)&np, 40);
and then pass the pointer np
as argument to kernel allocate_p
, where it will be assigned to p
(instead of malloc()
), then the code runs fine.
What am I doing wrong / how do we use malloc()
allocated device-memory in host-side functions?