How to copy the memory allocated in device function back to main memory

Question

I have a CUDA program containing a host function and a kernel, Execute(). In the host function, I allocate a global-memory buffer, output, which is passed to the kernel and used to store the address of global memory that is allocated inside the kernel. I want to access that in-kernel allocated memory from the host function. The following is the code:

#include <stdio.h>
#include <stdlib.h>
// Result record written by the Execute kernel, one per thread.
typedef struct Structure_A
{
    int *p;    // address the kernel stores here (result of its in-kernel malloc)
    int  num;  // set by the kernel to threadIdx.x + 1
} Structure_A;

// Forward declaration of the kernel defined after main(); `output` is indexed
// by threadIdx.x, so it must hold one Structure_A per thread of the block.
__global__ void Execute(Structure_A *output);

// Host driver: allocates one Structure_A in device memory, runs Execute with a
// single thread, then copies the struct and the int it points to back to the
// host and prints them. Exits with -1 on the first CUDA or allocation error.
int main(){

    Structure_A *output = NULL;      // device-side struct written by the kernel
    Structure_A *output_cpu = NULL;  // host copy of *output
    int *p_cpu = NULL;               // host copy of the int behind output->p
    cudaError_t err;

    // cudaMalloc expects a void**; the previous (void***) cast had the wrong type.
    err=cudaMalloc((void**)&output,sizeof(Structure_A)*1);
    if( err != cudaSuccess)
    {
        printf("CUDA error malloc: %s\n", cudaGetErrorString(err));
        exit(-1);
    }

    dim3 dimBlockExecute(1,1);
    dim3 dimGridExecute(1,1);
    Execute<<<dimGridExecute,dimBlockExecute>>>(output);

    // A launch returns no status itself: cudaGetLastError reports launch
    // failures, cudaDeviceSynchronize reports faults raised while the kernel ran.
    err=cudaGetLastError();
    if( err == cudaSuccess)
        err=cudaDeviceSynchronize();
    if( err != cudaSuccess)
    {
        printf("CUDA error kernel: %s\n", cudaGetErrorString(err));
        exit(-1);
    }

    output_cpu= (Structure_A*)malloc(sizeof(Structure_A));
    p_cpu=(int *)malloc(sizeof(int));
    if( output_cpu == NULL || p_cpu == NULL)
    {
        printf("host malloc failed\n");
        exit(-1);
    }

    err=cudaMemcpy(output_cpu,output,sizeof(Structure_A),cudaMemcpyDeviceToHost);
    if( err != cudaSuccess)
    {
        printf("CUDA error a: %s\n", cudaGetErrorString(err));
        exit(-1);
    }

    // output_cpu[0].p holds an address returned by malloc() inside the kernel.
    // This is the copy that reports "invalid argument" when the program is run.
    err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost);
    if( err != cudaSuccess)
    {
        printf("CUDA error b: %s\n", cudaGetErrorString(err));
        exit(-1);
    }
    printf("output=(%d,%d)\n",output_cpu[0].num,p_cpu[0]);

    // Release what this function allocated. The buffer allocated inside the
    // kernel is not freed here; it was not obtained through cudaMalloc.
    free(p_cpu);
    free(output_cpu);
    cudaFree(output);
    return 0;
}

// Kernel: each thread allocates an int buffer with in-kernel malloc and
// records it in its own element of `output`.
//
// Launch layout: `output` is indexed by threadIdx.x only, so it must hold at
// least blockDim.x elements, and threads of different blocks would write the
// same elements.
// For thread t: output[t].num = t + 1 and output[t].p = a buffer of t + 1 ints
// (NULL if the allocation fails), with p[0] set to 5 when the buffer exists.
__global__ void Execute(Structure_A *output){

    int thid=threadIdx.x;
    int count=thid+1;

    // malloc takes a size in bytes; the previous malloc(thid+1) requested only
    // thid+1 bytes, too small for the int stored through p[0] below.
    int *buf=(int*)malloc(count*sizeof(int));

    output[thid].p=buf;
    output[thid].num=count;

    // In-kernel malloc returns NULL when it cannot satisfy the request.
    if (buf != NULL)
        buf[0]=5;
}

The program compiles, but when I run it I get an error reporting an invalid argument in the following memory copy call:

err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost);

CUDA version: 4.2. GPU: Tesla C2075. OS: x86_64 GNU/Linux.

Edit: I modified the code to allocate the proper amount of memory for output_cpu and p_cpu.

harrism harrism · Accepted Answer · 2012-10-24T04:56:33

There are many things wrong with this code. For example, you are allocating only 1 byte in each of these two lines, which is not enough to hold a single instance of Structure_A or a single int.

output_cpu= (Structure_A*)malloc(1);
p_cpu=(int *)malloc(1);

But the immediate cause of your error is that you are doing a memcpy from a device runtime heap allocated pointer (i.e., one allocated with malloc or new inside your device code) to a host pointer.

err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost);

Unfortunately the host runtime API for cudaMalloc, cudaFree, and cudaMemcpy is not currently compatible with memory allocated on the device runtime heap.

How to copy the memory allocated in device function back to main memory

1 Answer