0 votes

I am writing a C++ CUDA program. I have a very simple struct:

struct A
{
    int size;
    float* tab;
};

and a kernel:

__global__ void Kernel(A* res, int n, Args* args) // Args stands in for the per-element argument type
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        res[i] = AGenerator::Generate(args[i]);
    }
}

Where AGenerator::Generate creates the A object and fills the tab array. What happens here is that when the results are sent to the host, the tab pointer is invalid. To prevent this I would need to apply the Rule of Three to this class. Since there would be many classes like this, I would like to avoid writing too much additional code.

I did some research and found the Thrust library, which has device_vector and host_vector and would probably help with my problem. The thing is, I want struct A and similar structs to be usable from both host and device code, so device_vector and host_vector are not a good fit for this purpose. Is there any struct I can use to approach this?

EDIT I found that passing the struct by value would help, but since performance is quite important that doesn't seem like a good solution.
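For context (this is not code from the question; Num, d_res and h_res are placeholder names), copying such results back requires a per-element deep copy roughly like the sketch below, and it only works at all if every tab points into memory that was allocated from the host with cudaMalloc, since pointers returned by in-kernel malloc cannot be used with cudaMemcpy:

A* h_res = new A[Num];
cudaMemcpy(h_res, d_res, Num * sizeof(A), cudaMemcpyDeviceToHost);
for (int i = 0; i < Num; ++i)
{
    float* h_tab = new float[h_res[i].size];
    // h_res[i].tab still holds a device address at this point
    cudaMemcpy(h_tab, h_res[i].tab, h_res[i].size * sizeof(float), cudaMemcpyDeviceToHost);
    h_res[i].tab = h_tab; // the struct is now fully usable on the host
}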

How do you intend to send the results to the host? That is a pretty important factor in answering your question. I don't think this is a rule of three issue so much as structures with embedded pointers are inherently more difficult to deal with in CUDA when sharing data between device and host, due to the separation of the memory spaces. Certainly your struct definition is usable both in device and host code. But objects created using that definition in device code will require several steps to transfer to the host. You will need some host-allocated space using e.g. cudaMalloc. - Robert Crovella
Right now I have the simplest possible solution: cudaMalloc((void**)&d_Res, Num * sizeof(CashflowsStruct)); Kernel(d_Res, args); cudaMemcpy(h_Res, d_Res, Num * sizeof(CashflowsStruct), cudaMemcpyDeviceToHost); but I am open to any suggestions. About the Rule of Three: you are most probably right. I have tried to implement the assignment operator and it still doesn't work, so this is more complicated than I thought - rank1
@RobertCrovella: how to do it using host-allocated space? - rank1
An object cannot be created by device code and also have host-accessible data areas unless you use a custom allocator that works out of a pre-allocated pool of device memory allocated by the host e.g. with cudaMalloc. That is pretty complicated. Is that what you want? Stated another way, you should start by defining all the basic interactions you want to support with A: Create objects of A in device code? In host code? Copy objects between host and device? If you answer "yes" to all those, then the only way I can think of is with a custom allocator. Others may have other ideas. - Robert Crovella
That's probably yes for all of those. By custom allocator do you mean something like this: stackoverflow.com/questions/299761/… ? - rank1

2 Answers

2 votes

Here is a rough outline of what I had in mind for a custom allocator and pool that would hide some of the mechanics of using a class both on the host and the device.

I don't consider it a paragon of programming excellence; it is merely a rough outline of the steps that I think would be involved. I'm sure there are many bugs. I didn't include it, but I think you would also want a public method that gets the size (a sketch of such a getter follows the code).

#include <iostream>
#include <cstdio>   // printf / fprintf
#include <cstdlib>  // exit
#include <assert.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef float mytype;

// Device-side pool state: one slab of device memory (allocated from the host with
// cudaMalloc in init_pool) that device code carves sub-allocations out of.
__device__ unsigned int pool_allocated = 0;
__device__ unsigned int pool_size = 0;
__device__ mytype *pool = 0;

// Atomically reserve 'size' elements from the pool and return the starting offset.
__device__ unsigned int pool_reserve(size_t size){
  assert((pool_allocated+size) < pool_size);
  unsigned int offset = atomicAdd(&pool_allocated, size);
  assert((offset+size) <= pool_size);  // the reserved region must fit entirely in the pool
  return offset;
}

// Host-side setup: cudaMalloc the pool slab and publish its pointer and size to the device symbols.
__host__ void init_pool(size_t psize){
  mytype *temp;
  unsigned int my_size = psize;
  cudaMalloc((void **)&temp, psize*sizeof(mytype));
  cudaCheckErrors("init pool cudaMalloc fail");
  cudaMemcpyToSymbol(pool, &temp, sizeof(mytype *));
  cudaCheckErrors("init pool cudaMemcpyToSymbol 1 fail");
  cudaMemcpyToSymbol(pool_size, &my_size, sizeof(unsigned int));
  cudaCheckErrors("init pool cudaMemcpyToSymbol 2 fail");
}


class A{
  public:
  mytype *data;
  // Copy 'data' into device-accessible storage: from the pool when called in device
  // code, via cudaMalloc/cudaMemcpy when called in host code.
  __host__ __device__ void pool_allocate_and_copy() {
  assert(d_data == 0);
  assert(size != 0);
#ifdef __CUDA_ARCH__
  unsigned int offset = pool_reserve(size);
  d_data = pool + offset;
  memcpy(d_data, data, size*sizeof(mytype));
#else
  cudaMalloc((void **)&d_data, size*sizeof(mytype));
  cudaCheckErrors("pool_allocate_and_copy cudaMalloc fail");
  cudaMemcpy(d_data, data, size*sizeof(mytype), cudaMemcpyHostToDevice);
  cudaCheckErrors("pool_allocate_and_copy cudaMemcpy fail");
#endif /* __CUDA_ARCH__ */

  }
  // Point 'data' at the copy that is valid in the current memory space (device or host).
  __host__ __device__ void update(){
#ifdef __CUDA_ARCH__
  assert(data != 0);
  data = d_data;
  assert(data != 0);
#else
  if (h_data == 0) h_data = (mytype *)malloc(size*sizeof(mytype));
  data = h_data;
  assert(data != 0);
  cudaMemcpy(data, d_data, size*sizeof(mytype), cudaMemcpyDeviceToHost);
  cudaCheckErrors("update cudaMempcy fail");
#endif
  }
  // Allocate 'asize' elements on the local heap (host or device) and record the size.
  __host__ __device__ void allocate(size_t asize) {
    assert(data == 0);
    data = (mytype *)malloc(asize*sizeof(mytype));
    assert(data != 0);
#ifndef __CUDA_ARCH__
    h_data = data;
#endif
    size = asize;
  }
  // Shallow-copy another object's fields, then fix up 'data' for this memory space.
  __host__ __device__ void copyobj(A *obj){
    assert(obj != 0);
#ifdef __CUDA_ARCH__
    memcpy(this, obj, sizeof(A));
#else
    cudaMemcpy(this, obj, sizeof(A), cudaMemcpyDefault);
    cudaCheckErrors("copy cudaMempcy fail");
#endif
    this->update();
  }
  __host__ __device__ A();
    private:
    unsigned int size;
    mytype *d_data;
    mytype *h_data;
};

__host__ __device__ A::A(){
  data = 0;
  d_data = 0;
  h_data = 0;
  size = 0;
}

// Demo kernel: receives an object by value, copies it locally, builds a second
// object through the pool, and hands it back via 'res'.
__global__ void mykernel(A obj, A *res){
  A mylocal;
  mylocal.copyobj(&obj);
  A mylocal2;
  mylocal2.allocate(24);
  mylocal2.data[0]=45;
  mylocal2.pool_allocate_and_copy();
  res->copyobj(&mylocal2);
  printf("kernel data %f\n", mylocal.data[0]);
}




int main(){
  A my_obj;
  A *d_result, h_result;
  my_obj.allocate(32);
  my_obj.data[0] = 12;
  init_pool(1048576);
  my_obj.pool_allocate_and_copy();
  cudaMalloc((void **)&d_result, sizeof(A));
  cudaCheckErrors("main cudaMalloc fail");
  mykernel<<<1,1>>>(my_obj, d_result);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");
  h_result.copyobj(d_result);
  printf("host data %f\n", h_result.data[0]);

  return 0;
}
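As mentioned above, a public size getter would be a useful addition; a minimal sketch (not part of the original code, only illustrative) would be a member like this inside class A:

  // Expose the element count without handing out any of the internal pointers.
  __host__ __device__ unsigned int get_size() const { return size; }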
-1 votes

I am pretty sure that the direction of the question and the related comments is ill-fated. Device memory and host memory are totally different things, both conceptually and physically. Pointers just don't carry over!

Please go back to step 1 and learn about copying values between host and device by reading the reference manual and the programming guide for more details.

To get a more precise answer to your question, please show how those A structs are allocated on the device, including the allocation of the tab floats. Also please show how AGenerator::Generate manipulates those tabs in a meaningful way. My best bet is that you are working with unallocated device memory here, and that you should probably use a preallocated array of floats plus indices into that array instead of device pointers. Those indices would then carry over to the host gracefully (see the sketch below).
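To illustrate that last suggestion, here is a minimal, hypothetical sketch (IndexedA, fill and the static per-object partitioning are made up for the example, not taken from the question or the answers) of storing indices into one preallocated pool instead of raw pointers:

#include <cstdio>
#include <cuda_runtime.h>

// Each object stores an index into one shared, preallocated float pool instead of
// a raw device pointer, so the value copies back to the host unchanged.
struct IndexedA
{
    int size;
    int offset;   // first element of this object's data inside the pool
};

__global__ void fill(IndexedA* res, float* pool, int n, int per_obj)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n)
    {
        res[i].size = per_obj;
        res[i].offset = i * per_obj;                 // simple static partitioning
        for (int j = 0; j < per_obj; ++j)
            pool[res[i].offset + j] = i + 0.5f * j;  // arbitrary demo values
    }
}

int main()
{
    const int n = 4, per_obj = 8, cap = n * per_obj;
    IndexedA *d_res, h_res[n];
    float *d_pool, h_pool[cap];
    cudaMalloc((void **)&d_res, n * sizeof(IndexedA));
    cudaMalloc((void **)&d_pool, cap * sizeof(float));
    fill<<<1, n>>>(d_res, d_pool, n, per_obj);
    cudaMemcpy(h_res, d_res, n * sizeof(IndexedA), cudaMemcpyDeviceToHost);
    cudaMemcpy(h_pool, d_pool, cap * sizeof(float), cudaMemcpyDeviceToHost);
    // The indices are still meaningful on the host: h_pool[h_res[i].offset + j]
    printf("first element of object 2: %f\n", h_pool[h_res[2].offset]);
    return 0;
}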