Your attempt to use a struct with a pointer to an array of struct, each of which has an embedded pointer, will make for a very complex realization with cudaMalloc
. It may be a bit simpler if you use cudaMallocManaged
, but still unnecessarily complex. The complexities arise because cudaMalloc
allocates space on a particular device, and that data is not (by default) accessible to any other device, and also due to the fact that your embedded pointers create the necessity for various "deep copies". Here's a worked example:
$ cat t1492.cu
#include <iostream>
#include <stdio.h>
typedef struct ComplexArray
{
double *real;
} ComplexArray;
typedef struct ComplexArrayArray
{
ComplexArray* Arr;
} ComplexArrayArray;
__global__ void kernel(ComplexArrayArray stateVector, int dev, int ds)
{
// Calculate necessary device
int device_number = dev;
int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index < ds){
double val = stateVector.Arr[device_number].real[index] + dev;
stateVector.Arr[device_number].real[index] = val;
}
}
const int nTPB = 256;
int main(){
int numberOfGPU;
cudaGetDeviceCount(&numberOfGPU);
std::cout << "GPU count: " << numberOfGPU << std::endl;
ComplexArrayArray *stateVector = new ComplexArrayArray[numberOfGPU];
const int ds = 32;
double *hdata = new double[ds]();
ComplexArray *ddata = new ComplexArray[numberOfGPU];
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMalloc(&(stateVector[i].Arr), sizeof(ComplexArray) * numberOfGPU);
cudaMalloc(&(ddata[i].real), (ds/numberOfGPU)*sizeof(double));
cudaMemcpy(ddata[i].real, hdata + i*(ds/numberOfGPU), (ds/numberOfGPU)*sizeof(double), cudaMemcpyHostToDevice);}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMemcpy(stateVector[i].Arr, ddata, sizeof(ComplexArray)*numberOfGPU, cudaMemcpyHostToDevice);}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
kernel<<<((ds/numberOfGPU)+nTPB-1)/nTPB,nTPB>>>(stateVector[i], i, (ds/numberOfGPU));}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMemcpy(hdata + i*(ds/numberOfGPU), ddata[i].real, (ds/numberOfGPU)*sizeof(double), cudaMemcpyDeviceToHost);}
for (int i = 0; i < ds; i++)
std::cout << hdata[i] << " ";
std::cout << std::endl;
}
$ nvcc -o t1492 t1492.cu
$ cuda-memcheck ./t1492
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$
However, if you want to take a host array and partition into one chunk per GPU, you don't need that level of complexity. Here is a simpler example:
$ cat t1493.cu
#include <iostream>
#include <stdio.h>
typedef struct ComplexArray
{
double *real;
} ComplexArray;
typedef struct ComplexArrayArray
{
ComplexArray* Arr;
} ComplexArrayArray;
__global__ void kernel(ComplexArray stateVector, int dev, int ds)
{
int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index < ds){
double val = stateVector.real[index] + dev;
stateVector.real[index] = val;
}
}
const int nTPB = 256;
int main(){
int numberOfGPU;
cudaGetDeviceCount(&numberOfGPU);
std::cout << "GPU count: " << numberOfGPU << std::endl;
ComplexArray *stateVector = new ComplexArray[numberOfGPU];
const int ds = 32;
double *hdata = new double[ds]();
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMalloc(&(stateVector[i].real), (ds/numberOfGPU)*sizeof(double));
cudaMemcpy(stateVector[i].real, hdata + i*(ds/numberOfGPU), (ds/numberOfGPU)*sizeof(double), cudaMemcpyHostToDevice);}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
kernel<<<((ds/numberOfGPU)+nTPB-1)/nTPB,nTPB>>>(stateVector[i], i, (ds/numberOfGPU));}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMemcpy(hdata + i*(ds/numberOfGPU), stateVector[i].real, (ds/numberOfGPU)*sizeof(double), cudaMemcpyDeviceToHost);}
for (int i = 0; i < ds; i++)
std::cout << hdata[i] << " ";
std::cout << std::endl;
}
$ nvcc -o t1493 t1493.cu
$ cuda-memcheck ./t1493
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$
Note that your question appears to make reference to the idea that you will break the data up into chunks, and each kernel will potentially have access to all the chunks. That will require either managed memory usage or knowledge that the system can support P2P access between the GPUs. That adds more complexity and is beyond the scope of what I have answered here, which is focused on your question about the kernel not being able to access "its own" data.
Since we should be able to upper-bound the number of GPUs that can participate (lets set it to a maximum of 8) we can avoid the deep copy of the first approach while still allowing all GPUs to have all pointers. Here is a modified example:
$ cat t1495.cu
#include <iostream>
#include <stdio.h>
const int maxGPU=8;
typedef struct ComplexArray
{
double *real[maxGPU];
} ComplexArray;
__global__ void kernel(ComplexArray stateVector, int dev, int ds)
{
int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index < ds){
double val = stateVector.real[dev][index] + dev;
stateVector.real[dev][index] = val;
}
}
const int nTPB = 256;
int main(){
int numberOfGPU;
cudaGetDeviceCount(&numberOfGPU);
std::cout << "GPU count: " << numberOfGPU << std::endl;
ComplexArray stateVector;
const int ds = 32;
double *hdata = new double[ds]();
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMalloc(&(stateVector.real[i]), (ds/numberOfGPU)*sizeof(double));
cudaMemcpy(stateVector.real[i], hdata + i*(ds/numberOfGPU), (ds/numberOfGPU)*sizeof(double), cudaMemcpyHostToDevice);}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
kernel<<<((ds/numberOfGPU)+nTPB-1)/nTPB,nTPB>>>(stateVector, i, (ds/numberOfGPU));}
for (int i = 0; i < numberOfGPU; i++){
cudaSetDevice(i);
cudaMemcpy(hdata + i*(ds/numberOfGPU), stateVector.real[i], (ds/numberOfGPU)*sizeof(double), cudaMemcpyDeviceToHost);}
for (int i = 0; i < ds; i++)
std::cout << hdata[i] << " ";
std::cout << std::endl;
}
$ nvcc -o t1495 t1495.cu
$ cuda-memcheck ./t1495
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$
stateVector
definitionComplexArrayArray stateVector ...
is not a pointer. How could it make sense to assign a pointer value to it? Does that even compile? In fact there are probably 3 separate issues with that one line of code. – Robert Crovella