You can create, fill and use globl memory arrays without the need of using cudaMemcpy
to copy data from the host for initialization, if this is what are you asking. In the following simple example, I'm creating a global memory array which is initialized directly on the device and then I'm releasing it when not needed anymore.
#include<stdio.h>
__global__ void init_temp_data(float* temp_data) {
temp_data[threadIdx.x] = 3.f;
}
__global__ void copy_global_data(float* temp_data, float* d_data) {
d_data[threadIdx.x] = temp_data[threadIdx.x];
}
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
int main() {
float* data = (float*)malloc(16*sizeof(float));
float* d_data; gpuErrchk(cudaMalloc((void**)&d_data,16*sizeof(float)));
float* temp_data; gpuErrchk(cudaMalloc((void**)&temp_data,16*sizeof(float)));
init_temp_data<<<1,16>>>(temp_data);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
copy_global_data<<<1,16>>>(temp_data,d_data);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaFree(temp_data));
gpuErrchk(cudaMemcpy(data,d_data,16*sizeof(float),cudaMemcpyDeviceToHost));
for (int i=0; i<16; i++) printf("Element number %i is equal to %f\n",i,data[i]);
getchar();
return 0;
}