0
votes

Having been playing around with this grand CUDA experiment for a few months now, I find myself experimenting more and trying to pull away from the tutorial examples.

My question is this: if I want to use arrays on the GPU purely as temporary storage, without copying them back to the host for display/output, can I just create a device array with __device__ double array[numpoints];? Then, for anything I want to take back from the GPU, I need to do the whole cudaMalloc/cudaMemcpy spiel, right? Additionally, is there any difference between one method and the other? I thought they both create arrays in global memory.

2

2 Answers

1
votes

See this description of the __device__ qualifier. If you declare the array with __device__, you cannot access it from the host through a plain cudaMemcpy, but there are other ways to do so, mentioned in the link (e.g. cudaMemcpyToSymbol and cudaMemcpyFromSymbol).

Instead, you can declare a global pointer (i.e., without __device__) in host code and allocate it using cudaMalloc. You can then use the same pointer to copy the result back to the host using cudaMemcpy.

1
votes

You can create, fill and use global memory arrays without needing cudaMemcpy to copy data from the host for initialization, if this is what you are asking. In the following simple example, I create a global memory array that is initialized directly on the device, and then release it when it is no longer needed.

#include<stdio.h>
#include<stdlib.h>

// Fills a device buffer with the constant 3.0f, one element per thread.
//
// temp_data: device pointer; the thread with index threadIdx.x writes
//            temp_data[threadIdx.x].
//
// Only threadIdx.x is used for indexing (blockIdx is ignored) and there is no
// bounds check, so the buffer must hold at least blockDim.x floats.
__global__ void init_temp_data(float* temp_data) {
    const unsigned int tid = threadIdx.x;
    float* const slot = temp_data + tid;
    *slot = 3.0f;
}

// Copies one float per thread from temp_data to d_data.
//
// temp_data: device pointer read at index threadIdx.x.
// d_data:    device pointer written at index threadIdx.x.
//
// Only threadIdx.x is used for indexing (blockIdx is ignored) and there is no
// bounds check, so both buffers must hold at least blockDim.x floats.
__global__ void copy_global_data(float* temp_data, float* d_data) {
    const unsigned int tid = threadIdx.x;
    const float value = temp_data[tid];
    d_data[tid] = value;
}

// Wraps a CUDA runtime call (or any expression yielding cudaError_t) and
// reports a failure together with the source location of the call site.
// The do { } while (0) wrapper makes the macro behave as a single statement,
// so it can be used safely in an unbraced if/else.
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Checks a CUDA status code.
//
// code:  status returned by a CUDA runtime call.
// file:  source file name of the call site (typically __FILE__); taken as
//        const char* because string literals are not convertible to char*
//        in standard C++.
// line:  source line of the call site (typically __LINE__).
// abort: when true (the default), the process exits with `code` as its exit
//        status after the message is printed; when false, the function
//        returns after printing.
//
// Does nothing when code == cudaSuccess. Otherwise prints
// "GPUassert: <error string> <file> <line>" to stderr.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code == cudaSuccess) return;

    fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
    if (abort) exit(code);
}

// Demonstrates a device-only scratch buffer:
//   1. temp_data is allocated with cudaMalloc and filled by a kernel, with no
//      host-to-device copy.
//   2. A second kernel copies temp_data into d_data.
//   3. temp_data is freed, d_data is copied back to the host and printed.
// Returns 0 on success, 1 if the host buffer cannot be allocated. CUDA errors
// are reported by gpuErrchk.
int main() {

    // Single source of truth for the element count; it is also the block size
    // of both launches, because the kernels index by threadIdx.x only.
    const int numElements = 16;
    const size_t numBytes = numElements * sizeof(float);

    float* data = (float*)malloc(numBytes);
    if (data == NULL) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    float* d_data = NULL;
    float* temp_data = NULL;
    gpuErrchk(cudaMalloc((void**)&d_data, numBytes));
    gpuErrchk(cudaMalloc((void**)&temp_data, numBytes));

    init_temp_data<<<1, numElements>>>(temp_data);
    gpuErrchk(cudaPeekAtLastError());   // launch-configuration errors
    gpuErrchk(cudaDeviceSynchronize()); // errors raised while the kernel ran

    copy_global_data<<<1, numElements>>>(temp_data, d_data);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    // The scratch buffer is no longer needed once its contents are in d_data.
    gpuErrchk(cudaFree(temp_data));
    gpuErrchk(cudaMemcpy(data, d_data, numBytes, cudaMemcpyDeviceToHost));
    gpuErrchk(cudaFree(d_data));

    for (int i = 0; i < numElements; i++) printf("Element number %i is equal to %f\n", i, data[i]);

    free(data);

    // Blocks until a character is read from stdin (presumably to keep a
    // console window open after the output is printed).
    getchar();

    return 0;
}