1
votes

I have a CUDA program containing a host function and a device function Execute(). In the host function, I allocate a global memory output which will then be passed to the device function and used to store the address of the global memory allocated within the device function. I want to access the in-kernel allocated memory in the host function. The following is the code:

#include <stdio.h>
typedef struct                      
{
  int             * p;            
  int              num;            
} Structure_A;

__global__ void Execute(Structure_A *output);

int main(){

    Structure_A *output;
    cudaMalloc((void***)&output,sizeof(Structure_A)*1);
    dim3 dimBlockExecute(1,1);
    dim3 dimGridExecute(1,1);
    Execute<<<dimGridExecute,dimBlockExecute>>>(output);
    Structure_A * output_cpu;
    int * p_cpu;
    cudaError_t err;

    output_cpu= (Structure_A*)malloc(sizeof(Structure_A));
    err=cudaMemcpy(output_cpu,output,sizeof(Structure_A),cudaMemcpyDeviceToHost);    
    if( err != cudaSuccess)
    {
        printf("CUDA error a: %s\n", cudaGetErrorString(err));
        exit(-1);
    }
    p_cpu=(int *)malloc(sizeof(int));
    err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost);    
    if( err != cudaSuccess)
    {
        printf("CUDA error b: %s\n", cudaGetErrorString(err));
        exit(-1);
    }   
    printf("output=(%d,%d)\n",output_cpu[0].num,p_cpu[0]);
    return 0;
}

__global__ void Execute(Structure_A *output){

    int thid=threadIdx.x;

    output[thid].p= (int*)malloc(thid+1);

    output[thid].num=(thid+1);

    output[thid].p[0]=5;
} 

I can compile the program. But when I run it, I got a error showing that there is a invalid argument in the following memory copy function:

err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost); 

CUDA version is 4.2. CUDA card: Tesla C2075 OS: x86_64 GNU/Linux

Edit: modified the code and allocate proper size of memory for output_cpu and p_cpu.

1

1 Answers

4
votes

There are many things wrong with this code. For example, you are only allocating 1 byte in both of these two lines, not enough to hold a single instance of Structure_A.

output_cpu= (Structure_A*)malloc(1);
p_cpu=(int *)malloc(1);

But the immediate cause of your error is that you are doing a memcpy from a device runtime heap allocated pointer (i.e allocated with malloc or new inside your device code) to a host pointer.

err=cudaMemcpy(p_cpu,output_cpu[0].p,sizeof(int),cudaMemcpyDeviceToHost);   

Unfortunately the host runtime API for cudaMalloc, cudaFree, and cudaMemcpy is not currently compatible with memory allocated on the device runtime heap.