This is allocating a new pointer to host memory:
test[i].array = (char*)malloc(size * sizeof(char));
This is copying data to that region in host memory:
memcpy(test[i].array, temp, size * sizeof(char));
This is overwriting the previously allocated pointer to host memory (from step 1 above) with a new pointer to device memory:
cudaMalloc((void**)&(test[i].array), size * sizeof(char));
After step 3, the data you set up in step 2 is entirely lost, and no longer accessible in any fashion. Referring to steps 3 and 4 in the question/answer you linked:
3.Create a separate int pointer on the host, let's call it myhostptr
4.cudaMalloc int storage on the device for myhostptr
You haven't done this. You did not create a separate pointer. You reused (erased, overwrote) an existing pointer, which was pointing to data you cared about on the host. This question/answer, also linked from the answer you linked, gives almost exactly the steps you need to follow, in code.
Here's a modified version of your code, which properly implements the missing steps 3 and 4 (and 5) that you didn't implement correctly according to the question/answer you linked: (refer to comments delineating steps 3,4,5)
$ cat t755.cu
#include <stdio.h>
#include <stdlib.h>
struct Test {
char *array;
};
__global__ void kernel(Test *dev_test) {
for(int i=0; i < 5; i++) {
printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
}
}
int main(void) {
int n = 4, size = 5;
Test *dev_test, *test;
test = (Test*)malloc(sizeof(Test)*n);
for(int i = 0; i < n; i++)
test[i].array = (char*)malloc(size * sizeof(char));
for(int i=0; i < n; i++) {
char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
memcpy(test[i].array, temp, size * sizeof(char));
}
cudaMalloc((void**)&dev_test, n * sizeof(Test));
cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);
// Step 3:
char *temp_data[n];
// Step 4:
for (int i=0; i < n; i++)
cudaMalloc(&(temp_data[i]), size*sizeof(char));
// Step 5:
for (int i=0; i < n; i++)
cudaMemcpy(&(dev_test[i].array), &(temp_data[i]), sizeof(char *), cudaMemcpyHostToDevice);
// now copy the embedded data:
for (int i=0; i < n; i++)
cudaMemcpy(temp_data[i], test[i].array, size*sizeof(char), cudaMemcpyHostToDevice);
kernel<<<1, 1>>>(dev_test);
cudaDeviceSynchronize();
// memory free
return 0;
}
$ nvcc -o t755 t755.cu
$ cuda-memcheck ./t755
========= CUDA-MEMCHECK
Kernel[0][i]: a
Kernel[0][i]: b
Kernel[0][i]: c
Kernel[0][i]: d
Kernel[0][i]: e
========= ERROR SUMMARY: 0 errors
$
Since the above methodology can be challenging for beginners, the usual advice is not to do it, but instead flatten your data structures. Flatten generally means to rearrange the data storage so as to remove the embedded pointers that have to be separately allocated.
A trivial example of flattening this data structure would be to use this instead:
struct Test {
char array[5];
};
It's recognized of course that this particular approach would not serve every purpose, but it should illustrate the general idea/intent. With that modification, as an example, the code becomes much simpler:
$ cat t755.cu
#include <stdio.h>
#include <stdlib.h>
struct Test {
char array[5];
};
__global__ void kernel(Test *dev_test) {
for(int i=0; i < 5; i++) {
printf("Kernel[0][i]: %c \n", dev_test[0].array[i]);
}
}
int main(void) {
int n = 4, size = 5;
Test *dev_test, *test;
test = (Test*)malloc(sizeof(Test)*n);
for(int i=0; i < n; i++) {
char temp[] = { 'a', 'b', 'c', 'd' , 'e' };
memcpy(test[i].array, temp, size * sizeof(char));
}
cudaMalloc((void**)&dev_test, n * sizeof(Test));
cudaMemcpy(dev_test, test, n * sizeof(Test), cudaMemcpyHostToDevice);
kernel<<<1, 1>>>(dev_test);
cudaDeviceSynchronize();
// memory free
return 0;
}
$ nvcc -o t755 t755.cu
$ cuda-memcheck ./t755
========= CUDA-MEMCHECK
Kernel[0][i]: a
Kernel[0][i]: b
Kernel[0][i]: c
Kernel[0][i]: d
Kernel[0][i]: e
========= ERROR SUMMARY: 0 errors
$
cudaMalloc((void**)&(test[i].array), size * sizeof(char));
and notcudaMalloc((void**)&(dev_test[i].array), size * sizeof(char));
? Also, it should becudaMemcpy(dev_test[i].array, test[i].array, size * sizeof(char), cudaMemcpyHostToDevice);
. – francisdev_test[i].array
, not fortest[i].array
which is already allocated on the CPU bytest[i].array = (char*)malloc(size * sizeof(char));
. – francistest[i].array
is already allocated but only on CPU, no on GPU. We can't allocate memory fordev_test[i].array
, because this memory is only visible from device. At least I understand it so. – Bakus123