CUDA: copying data that was dynamically allocated with malloc inside a kernel from device memory to the host

Question

I ran into a problem using cudaMemcpy with cudaMemcpyDeviceToHost.

There is a struct that has a pointer member, int* a, which is allocated with malloc inside the kernel function. Afterwards I need to copy the data that int* a points to into host memory.

My question is: why does copying it with cudaMemcpy not work?

Here is my code:

#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// One record per GPU thread, filled in by xthread.
typedef struct {
    int n;    // set by xthread to the thread's global index
    int m;    // number of ints xthread requests for `a` (index + 1)
    int *a;   // buffer obtained from malloc called inside the kernel
} myst;

// Fills st[idx] for the calling thread, where idx is its global 1-D index.
//
// Writes n = idx and m = idx + 1, allocates m ints with in-kernel malloc and
// stores idx in the first one. If that allocation fails, `a` stays NULL and m
// is reset to 0 so the record never advertises elements that do not exist.
//
// There is no bounds check: st must hold at least one record per launched
// thread (gridDim.x * blockDim.x).
__global__ void xthread(myst *st)
{
    unsigned int idx = blockIdx.x*blockDim.x + threadIdx.x;
    myst *mst = &st[idx];
    mst->n = idx;
    mst->m = idx+1;
    mst->a = (int *)malloc((mst->m)*sizeof(int));
    if (mst->a == NULL) {
        // In-kernel malloc returns NULL when the request cannot be satisfied;
        // writing through it would fault the whole kernel.
        mst->m = 0;
        return;
    }
    mst->a[0] = idx;
}


// Evaluates a CUDA runtime call; on failure prints the error string together
// with the source location and terminates the program.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
#endif

// Runs xthread on two threads, copies the two records back to the host, then
// attempts to copy each record's `a` buffer to the host and print its first
// element.
//
// Every CUDA call is checked, so the program stops at the first failing call
// instead of printing the contents of buffers that were never filled.
int main(int argc,char **argv)
{
    dim3 dimGrid(1);
    dim3 dimBlock(2);
    // xthread writes one record per launched thread.
    const unsigned int count = dimGrid.x * dimBlock.x;

    myst *mst = NULL;
    myst *hst = (myst *)malloc(count * sizeof(myst));
    if (hst == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return EXIT_FAILURE;
    }
    CUDA_CHECK(cudaMalloc(&mst, count * sizeof(myst)));

    xthread<<<dimGrid, dimBlock>>>(mst);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // The records themselves live in cudaMalloc memory, so this copy is fine.
    CUDA_CHECK(cudaMemcpy(hst, mst, count * sizeof(myst), cudaMemcpyDeviceToHost));

    int *pInt1 = (int *)malloc((hst[0].m)*sizeof(int));
    int *pInt2 = (int *)malloc((hst[1].m)*sizeof(int));
    if (pInt1 == NULL || pInt2 == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return EXIT_FAILURE;
    }

    // hst[i].a was returned by malloc inside xthread, i.e. it points into the
    // device heap. Host-side copies from that memory are not supported, so
    // these are the calls that fail (reported as cudaMemcpy error 0xb).
    CUDA_CHECK(cudaMemcpy(pInt1, hst[0].a, (hst[0].m)*sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(pInt2, hst[1].a, (hst[1].m)*sizeof(int), cudaMemcpyDeviceToHost));

    printf("%d\t%d\t%d\n",hst[0].n,hst[0].m, pInt1[0]);
    printf("%d\t%d\t%d\n",hst[1].n,hst[1].m, pInt2[0]);

    free(pInt1);
    free(pInt2);
    free(hst);
    CUDA_CHECK(cudaFree(mst));

    return 0;
}

Running the code produces the warning "Cuda API error detected: cudaMemcpy returned (0xb)".

I saw a similar question, "copy data which is allocated in device from device to host", but it does not seem to solve my problem.

Thx.

You can't do that. Host access to device heap is not supported. — talonmies

Devin zhang Devin zhang · Accepted Answer · 2018-09-13T07:21:50

Alright, I worked it out in a stupid way (-.-!!).

After returning from the kernel function, I count how much space I need to allocate on the host and on the device, and then cudaMalloc one big buffer. Next, in another kernel function named ythread, I copy the data that lives in the device heap into that big buffer.

// One record per GPU thread, filled in by xthread and read by ythread.
typedef struct {
    int n;    // set by xthread to the thread's global index
    int m;    // number of ints in `a` (index + 1)
    int *a;   // buffer obtained from malloc called inside the kernel
} myst;
// Fills st[idx] for the calling thread, where idx is its global 1-D index.
//
// Writes n = idx and m = idx + 1, allocates m ints with in-kernel malloc and
// sets a[i] = idx + 900 + i * 10. If the allocation fails, `a` stays NULL and
// m is reset to 0, so code that walks a[0..m-1] or sums the m fields never
// counts elements that do not exist.
//
// There is no bounds check: st must hold at least one record per launched
// thread (gridDim.x * blockDim.x).
__global__ void xthread(myst *st) {
    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
    myst *mst = &st[idx];
    mst->n = idx;
    mst->m = idx + 1;
    mst->a = (int *) malloc((mst->m) * sizeof(int));
    if (mst->a == NULL) {
        // In-kernel malloc returns NULL when the request cannot be satisfied;
        // writing through it would fault the whole kernel.
        mst->m = 0;
        return;
    }
    for (int i = 0; i < mst->m; i++) {
        mst->a[i] = idx + 900 + i * 10;
    }
}
// Packs every record's `a` buffer into the flat array total_a.
//
// The calling thread handles the record at its global 1-D index: it sums the m
// fields of all lower-indexed records to find where its slice starts, then
// copies its own m ints into total_a beginning at that position.
__global__ void ythread(myst *st, int *total_a) {
    const unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;

    // Slice start = combined element count of records 0 .. tid-1.
    int start = 0;
    unsigned int prev = 0;
    while (prev < tid) {
        start += st[prev].m;
        ++prev;
    }

    const myst *self = st + tid;
    int *dst = total_a + start;
    for (int k = 0; k < self->m; ++k) {
        dst[k] = self->a[k];
    }
}
// Evaluates a CUDA runtime call; on failure prints the error string together
// with the source location and terminates the program.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(err_));                            \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)
#endif

// Two-pass retrieval of buffers that were allocated inside a kernel:
//   1. xthread fills one record per thread, each with its own `a` buffer.
//   2. The records are copied to the host and their m fields summed.
//   3. A single cudaMalloc buffer of that total size is filled by ythread,
//      which copies every `a` buffer into it.
//   4. That buffer is copied to the host and printed.
//
// The `a` buffers that xthread allocated are not released by this program.
int main(int argc,char **argv) {
    dim3 dimGrid(1);
    dim3 dimBlock(2);
    // Both kernels index the record array by global thread id.
    const unsigned int count = dimGrid.x * dimBlock.x;

    myst *mst = NULL;
    CUDA_CHECK(cudaMalloc((void**)&mst, count * sizeof(myst)));

    xthread<<<dimGrid, dimBlock>>>(mst);
    CUDA_CHECK(cudaGetLastError());        // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());   // errors raised while the kernel ran

    myst *hst = (myst *)malloc(count * sizeof(myst));
    if (hst == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return EXIT_FAILURE;
    }
    CUDA_CHECK(cudaMemcpy(hst, mst, count*sizeof(myst),cudaMemcpyDeviceToHost));

    // Only the sizes are used from the host copy; the `a` pointers in hst are
    // never dereferenced or passed to cudaMemcpy.
    int t_size = 0;
    for(unsigned int i=0; i<count; i++) {
        t_size += hst[i].m;
    }
    printf("t_size:%d\n", t_size);

    int * t_a_h = (int *)malloc(t_size*sizeof(int));
    if (t_a_h == NULL && t_size > 0) {
        fprintf(stderr, "host malloc failed\n");
        return EXIT_FAILURE;
    }
    int * t_a_d = NULL;
    CUDA_CHECK(cudaMalloc((void**)&t_a_d, t_size*sizeof(int)));

    ythread<<<dimGrid, dimBlock>>>(mst, t_a_d);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
    CUDA_CHECK(cudaMemcpy(t_a_h, t_a_d, t_size*sizeof(int),cudaMemcpyDeviceToHost));

    for(int i=0; i<t_size; i++) {
        printf("t_a_h[%d]:%d\n", i, t_a_h[i]);
    }

    free(t_a_h);
    free(hst);
    CUDA_CHECK(cudaFree(mst));
    CUDA_CHECK(cudaFree(t_a_d));

    return 0;
}

Emmmmmm, it works, but I think there is a better way to solve this problem.

CUDA: copying data that was dynamically allocated with malloc inside a kernel from device memory to the host

1 Answer