0
votes

I am using CUDA. I have the following class on host:

class Particle{
     public:
     float x;
     float v;
     // several other variables
}

Then I have a vector of particles

vector <Particle> p_all(512);

On the GPU, I want to operate on an array of all x's (taken from all the Particles), and want to copy the data from the Particles array into a float array on device. I have a hunch that cudaMemcpy can be used, and I tried the following code, but it gives invalid pitch error.

cudaMalloc( (void**) &pos_dev, sizeof(float)*512);
cudaMemcpy2D( (void*) &pos_dev, sizeof(float), (void*)&p_all[0].x, sizeof(Particle), sizeof(Particle), 512*sizeof(float), cudaMemcpyHostToDevice);

Is it at all possible to do so? Of course, the backup solution is to create an array of x's using a for loop and then copy it to the device. But I am looking for a more efficient solution.

Thanks.

FULL CODE BELOW.

#include <cuda_runtime.h>
#include <iostream>
#include <vector>
using namespace std;

// This will output the proper error string when calling cudaGetLastError
void getLastCudaError(string s=""){
    string errMessage = s;
    cudaError_t err = cudaGetLastError();
    if( err != cudaSuccess){
        cerr << __FILE__ << "(" << __LINE__ << ") : Last Cuda Error - " << errMessage 
             << " (" << int(err) << "): " << cudaGetErrorString(err) << ".\n";
        exit(-1);
    }
}

class Particle{
    public:
    float x;
    float v;
    int a;
    char c;
    short b;

    Particle(){
        a=1988; c='a'; v=5.56; x=1810; b=1.66;
    }
};

template <class T>
void printVec(vector <T> &v, string name = "v"){
    cout << name << " = ";
    for (int i=0; i<v.size(); ++i) cout << v[i] << " " ;
    cout << '\n';
}

int main(){

    const int N = 512;
    vector <float> pos(N,5);

    vector <Particle> p_all(N);

    float * pos_dev;
    float * vel_dev;

    cudaMalloc( (void**) &pos_dev, sizeof(float)*N);

    printVec(pos, "pos");

    cudaMemcpy2D( (void*) &pos_dev, sizeof(float), (void*)&(p_all[0].x), sizeof(Particle), sizeof(float), N, cudaMemcpyHostToDevice);
    getLastCudaError("HtoD");

    cudaMemcpy( (void*) &pos[0], (void*)&pos_dev, N*sizeof(float), cudaMemcpyDeviceToHost);
    getLastCudaError("DtoH");

    printVec(pos, "pos_new");

    return 0;

}
2

2 Answers

0
votes

Your cudaMemcpy2D call is set up incorrectly. Check the documentation.

try this instead:

cudaMemcpy2D( (void*) pos_dev, sizeof(float), (void*)&(p_all[0].x), sizeof(Particle), sizeof(float), 512, cudaMemcpyHostToDevice);

There were multiple parameters that needed to be modified, but the invalid pitch error came about because the requested width of transfer in bytes (you had sizeof(Particle)) was wider than the destination pitch (sizeof(float), which is correct)

EDIT: in addition, although you didn't ask about it, the final cudaMemcpy operation in the code you have now posted is also incorrect. The following changes should help:

cudaMemcpy( (void*) &(pos[0]), (void*)pos_dev, N*sizeof(float), cudaMemcpyDeviceToHost);
0
votes

You are allocating your data as "array of structures", like

class Particle{
    public:
        float x;
        float v;
}

Particle foo[N];

which will lead to coalescing issues due to the data interleaving and for this reason you are trying to use cudaMemcpy2D. A more convenient solution in terms of bandwidth exploitation is allocating the data as "structures of arrays" as

class Particle{
    public:
        float x[N];
        float v[N];
}

Particle foo;

In this way, you will be able to avoid the use of cudaMemcpy2D and copy the data from host to device by a simple cudaMemcpy.