2
votes

To be more clear, what I want is passing the pointers and all the data they pointed to device. To test how I can achieve this goal, I wrote a simple class:

class vecarray{
    public:
        int * vecptr[N];                //array of pointers pointing to array
        int dim[N];                     //store length of each array pointed to
        __device__ __host__ vecarray(); //constructor
        __device__ __host__ int sum();  //sum up all the elements in the array being              
                                       //pointed to
}

vecarray::vecarray(){
    for(int i = 0; i<N; i++)
    {
        vecptr[i] = NULL;
        dim[i] = 0;
    }
}

int vecarray::sum(){
    int i=0, j=0, s=0;
    for (i=0; i<N; i++)
        for(j=0; j < dim[i]; j++)
            s += vecptr[i][j];
    return s;
}

Then I use this class in the following code:

#define N 2
__global__ void addvecarray( vecarray * v, int *s){
    *s = v->sum();
}

int main(){                                 //copy *V to device, do sum() and pass back 
    vecarray *v, *dev_v;                    //the result by dev_v
    v = new vecarray;
    dev_v = new vecarray;
    int a[3] = {1,2,3};                     //initialize v manually
    int b[4] = {4,5,6,7};
    int result = 0;
    int * dev_result;
    v->vecptr[0] = a;
    v->vecptr[1] = b;
    v->dim[0] = 3; v->dim[1] = 4;


    cudaMalloc((void**)&dev_v, sizeof(vecarray));      

    cudaMemcpy(dev_v, v, sizeof(vecarray),cudaMemcpyHostToDevice); //copy class object 

    for(int i = 0; i < N; i++){
        cudaMalloc((void**)&(dev_v->vecptr[i]), v->dim[i]*sizeof(int));
    }

    for(int i = 0; i<N; i++ ){                   //copy arrays
    cudaMemcpy(dev_v->vecptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice));
    }
    addvecarray<<<1,1>>>(dev_v, dev_result);

    cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    printf("the result is %d\n", result);
}

The code passed nvcc compiler, but failed with segmentation fault when running. I've checked the problem lies in the two cudaMalloc and cudaMemcpy opertation in the for-loop. So my question is how should I pass this object to CUDA? Thanks in advance.

1
I believe this question is a duplicate of this one. In the line of code where you have a for loop that is performing a cudaMalloc operation, you are passing as a pointer to cudaMalloc a pointer that already lives in device memory. Instead you need to create a separate set of int pointers on the host, cudaMalloc these, then cudaMemcpy them to the device in the appropriate locations in your vecarray object instantiated at dev_v.Robert Crovella

1 Answers

3
votes

Your code had several errors in it. As I mentioned in the comments, one of the key errors is in how you are allocating memory for the data regions referenced by pointers within the class. The key mistake there is that you are passing a pointer to cudaMalloc that already lives in device memory. We can fix that by creating an extra set of pointers that we will use to allocate the needed device storage for the arrays that are pointed to within the class. In addition there were a few other errors, such as the fact that you had no properly allocated device storage for dev_result. The following code fixes all the errors I could find and I believe gives the correct result. I've also added a reference form of cuda error checking that you may find useful to use in your projects:

#include <stdio.h>

#define N 2
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

using namespace std;

class vecarray{
    public:
        int *vecptr[N];                //array of pointers pointing to array
        int dim[N];                     //store length of each array pointed to

        __device__ __host__ vecarray(); //constructor
        __device__ __host__ int sum();  //sum up all the elements in the array being
                                       //pointed to
};

vecarray::vecarray(){
    for(int i = 0; i<N; i++)
    {
        vecptr[i] = NULL;
        dim[i] = 0;
    }
}

__device__ __host__ int vecarray::sum(){
    int i=0, j=0, s=0;
    for (i=0; i<N; i++)
        for(j=0; j < dim[i]; j++)
            s += vecptr[i][j];
    return s;
}

__global__ void addvecarray( vecarray * v, int *s){
    *s = v->sum();
}

int main(){                                 //copy *V to device, do sum() and pass back
    vecarray *v, *dev_v;                    //the result by dev_v
    v = new vecarray;
    int a[3] = {1,2,3};                     //initialize v manually
    int b[4] = {4,5,6,7};
    int result = 0;
    int *dev_result;
    v->vecptr[0] = a;
    v->vecptr[1] = b;
    v->dim[0] = 3; v->dim[1] = 4;
    int *vptr[N];

    cudaMalloc((void**)&dev_v, sizeof(vecarray));
    cudaCheckErrors("cudaMalloc1 fail");
    cudaMemcpy(dev_v, v, sizeof(vecarray),cudaMemcpyHostToDevice); //copy class object
    cudaCheckErrors("cudaMemcpy1 fail");

    for(int i = 0; i < N; i++){
        cudaMalloc((void**)&(vptr[i]), v->dim[i]*sizeof(int));
        cudaCheckErrors("cudaMalloc2 fail");
        cudaMemcpy(&(dev_v->vecptr[i]), &vptr[i], sizeof(int*), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy2 fail");
    }

    for(int i = 0; i<N; i++ ){                   //copy arrays
        cudaMemcpy(vptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy3 fail");
    }
    cudaMalloc((void **)&dev_result, sizeof(int));
    cudaCheckErrors("cudaMalloc3 fail");
    addvecarray<<<1,1>>>(dev_v, dev_result);

    cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy4 fail");
    printf("the result is %d\n", result);
    return 0;
}