Pass array of pointers to multiple devices to Cuda C Kernel

Question

I have a one-dimensional array that I need to process, but it is too large for a single GPU. Therefore, I'm passing the array to multiple GPUs to store in memory, the number of which will change depending on the problem size. If I pass an array of pointers to the arrays in the different GPUs, I cannot access the other arrays from my Cuda C Kernel.

I've tried passing a simple array of device pointers to each device with a kernel call, but the code seems to break when I try to access the arrays. Even the device that is running the Kernel cannot access the array in its own memory.

Data structures:

typedef struct ComplexArray
{
   double *real;
} ComplexArray;

typedef struct ComplexArrayArray
{
   ComplexArray* Arr;
} ComplexArrayArray;

Malloc:

ComplexArrayArray stateVector;
stateVector.Arr = (ComplexArray*)malloc(sizeof(ComplexArray*) * numberOfGPU));

for (int dev = 0; dev < numberOfGPI; dev++)
{
    ...
    cudaMalloc(&(stateVector.Arr[dev].real), numberOfElements * sizeof(*(stateVector.Arr[dev].real)) / numberOfGPU);
    ...
}

Kernel:

__global__ void kernel(..., ComplexArrayArray stateVector, ...)
{
   // Calculate necessary device
   int device_number = ...;
   int index = ...;

   double val = stateVector.Arr[device_number].real[index];
   ...
}

When I try to access the arrays with this manner, the Kernel seems to "break". There is no error message, but its obvious that the data has not been read. Furthermore, I don't reach any printf statements after the data access.

Any idea on the best way to pass an array of pointers to device memory to a Cuda C Kernel?

Your stateVector definition ComplexArrayArray stateVector ... is not a pointer. How could it make sense to assign a pointer value to it? Does that even compile? In fact there are probably 3 separate issues with that one line of code. — Robert Crovella
You're right, typo. I tried simplifying down the code that this comes from and wrote that incorrectly. I'm using malloc to allocate for the array of pointers within the struct. — Jason Fitch

Robert Crovella Robert Crovella · Accepted Answer · 2019-08-27T21:44:53

Your attempt to use a struct with a pointer to an array of struct, each of which has an embedded pointer, will make for a very complex realization with cudaMalloc. It may be a bit simpler if you use cudaMallocManaged, but still unnecessarily complex. The complexities arise because cudaMalloc allocates space on a particular device, and that data is not (by default) accessible to any other device, and also due to the fact that your embedded pointers create the necessity for various "deep copies". Here's a worked example:

$ cat t1492.cu
#include <iostream>
#include <stdio.h>
typedef struct ComplexArray
{
   double *real;
} ComplexArray;

typedef struct ComplexArrayArray
{
   ComplexArray* Arr;
} ComplexArrayArray;

__global__ void kernel(ComplexArrayArray stateVector, int dev, int ds)
{
   // Calculate necessary device
   int device_number = dev;
   int index = blockIdx.x*blockDim.x+threadIdx.x;
   if (index < ds){
     double val = stateVector.Arr[device_number].real[index] + dev;
     stateVector.Arr[device_number].real[index] = val;
   }
}
const int nTPB = 256;
int main(){
  int numberOfGPU;
  cudaGetDeviceCount(&numberOfGPU);
  std::cout << "GPU count: " << numberOfGPU << std::endl;
  ComplexArrayArray *stateVector = new ComplexArrayArray[numberOfGPU];
  const int ds = 32;
  double *hdata = new double[ds]();
  ComplexArray *ddata = new ComplexArray[numberOfGPU];
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMalloc(&(stateVector[i].Arr), sizeof(ComplexArray) * numberOfGPU);
    cudaMalloc(&(ddata[i].real), (ds/numberOfGPU)*sizeof(double));
    cudaMemcpy(ddata[i].real, hdata + i*(ds/numberOfGPU), (ds/numberOfGPU)*sizeof(double), cudaMemcpyHostToDevice);}
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMemcpy(stateVector[i].Arr, ddata, sizeof(ComplexArray)*numberOfGPU, cudaMemcpyHostToDevice);}
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    kernel<<<((ds/numberOfGPU)+nTPB-1)/nTPB,nTPB>>>(stateVector[i], i, (ds/numberOfGPU));}
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMemcpy(hdata + i*(ds/numberOfGPU), ddata[i].real,  (ds/numberOfGPU)*sizeof(double), cudaMemcpyDeviceToHost);}
  for (int i = 0; i < ds; i++)
    std::cout << hdata[i] << " ";
  std::cout << std::endl;
}
$ nvcc -o t1492 t1492.cu
$ cuda-memcheck ./t1492
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$

However, if you want to take a host array and partition into one chunk per GPU, you don't need that level of complexity. Here is a simpler example:

$ cat t1493.cu
#include <iostream>
#include <stdio.h>
typedef struct ComplexArray
{
   double *real;
} ComplexArray;

typedef struct ComplexArrayArray
{
   ComplexArray* Arr;
} ComplexArrayArray;

__global__ void kernel(ComplexArray stateVector, int dev, int ds)
{
   int index = blockIdx.x*blockDim.x+threadIdx.x;
   if (index < ds){
     double val = stateVector.real[index] + dev;
     stateVector.real[index] = val;
   }
}
const int nTPB = 256;
int main(){
  int numberOfGPU;
  cudaGetDeviceCount(&numberOfGPU);
  std::cout << "GPU count: " << numberOfGPU << std::endl;
  ComplexArray *stateVector = new ComplexArray[numberOfGPU];
  const int ds = 32;
  double *hdata = new double[ds]();
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMalloc(&(stateVector[i].real), (ds/numberOfGPU)*sizeof(double));
    cudaMemcpy(stateVector[i].real, hdata + i*(ds/numberOfGPU), (ds/numberOfGPU)*sizeof(double), cudaMemcpyHostToDevice);}
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    kernel<<<((ds/numberOfGPU)+nTPB-1)/nTPB,nTPB>>>(stateVector[i], i, (ds/numberOfGPU));}
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMemcpy(hdata + i*(ds/numberOfGPU), stateVector[i].real,  (ds/numberOfGPU)*sizeof(double), cudaMemcpyDeviceToHost);}
  for (int i = 0; i < ds; i++)
    std::cout << hdata[i] << " ";
  std::cout << std::endl;
}
$ nvcc -o t1493 t1493.cu
$ cuda-memcheck ./t1493
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$

Note that your question appears to make reference to the idea that you will break the data up into chunks, and each kernel will potentially have access to all the chunks. That will require either managed memory usage or knowledge that the system can support P2P access between the GPUs. That adds more complexity and is beyond the scope of what I have answered here, which is focused on your question about the kernel not being able to access "its own" data.

Since we should be able to upper-bound the number of GPUs that can participate (lets set it to a maximum of 8) we can avoid the deep copy of the first approach while still allowing all GPUs to have all pointers. Here is a modified example:

$ cat t1495.cu
#include <iostream>
#include <stdio.h>
const int maxGPU=8;
typedef struct ComplexArray
{
   double *real[maxGPU];
} ComplexArray;

__global__ void kernel(ComplexArray stateVector, int dev, int ds)
{
   int index = blockIdx.x*blockDim.x+threadIdx.x;
   if (index < ds){
     double val = stateVector.real[dev][index] + dev;
     stateVector.real[dev][index] = val;
   }
}
const int nTPB = 256;
int main(){
  int numberOfGPU;
  cudaGetDeviceCount(&numberOfGPU);
  std::cout << "GPU count: " << numberOfGPU << std::endl;
  ComplexArray stateVector;
  const int ds = 32;
  double *hdata = new double[ds]();
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMalloc(&(stateVector.real[i]), (ds/numberOfGPU)*sizeof(double));
    cudaMemcpy(stateVector.real[i], hdata + i*(ds/numberOfGPU), (ds/numberOfGPU)*sizeof(double), cudaMemcpyHostToDevice);}
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    kernel<<<((ds/numberOfGPU)+nTPB-1)/nTPB,nTPB>>>(stateVector, i, (ds/numberOfGPU));}
  for (int i = 0; i < numberOfGPU; i++){
    cudaSetDevice(i);
    cudaMemcpy(hdata + i*(ds/numberOfGPU), stateVector.real[i],  (ds/numberOfGPU)*sizeof(double), cudaMemcpyDeviceToHost);}
  for (int i = 0; i < ds; i++)
    std::cout << hdata[i] << " ";
  std::cout << std::endl;
}
$ nvcc -o t1495 t1495.cu
$ cuda-memcheck ./t1495
========= CUDA-MEMCHECK
GPU count: 4
0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3
========= ERROR SUMMARY: 0 errors
$

Pass array of pointers to multiple devices to Cuda C Kernel

1 Answers