Array of cudaArray for multi-GPU texture code

Question

I have some code that I am trying to get to work for a general multi-GPU case, for n amount of equal devices, where n is unknown at compile time.

For this code, I need to bind to texture memory some array, and I need exactly the same data to be bound to the different GPUs.

My single GPU memory code for 3D texture binding looks like:

cudaArray *d_imagedata = 0;
const cudaExtent extent = make_cudaExtent(geo.nVoxelX, geo.nVoxelY, geo.nVoxelZ);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaMalloc3DArray(&d_imagedata, &channelDesc, extent);
cudaCheckErrors("cudaMalloc3D error 3D tex");

cudaMemcpy3DParms copyParams = { 0 };
copyParams.srcPtr = make_cudaPitchedPtr((void*)img, extent.width*sizeof(float), extent.width, extent.height);
copyParams.dstArray = d_imagedata;
copyParams.extent = extent;
copyParams.kind = cudaMemcpyHostToDevice;
cudaMemcpy3D(&copyParams);

cudaCheckErrors("cudaMemcpy3D fail");

// Configure texture options
tex.normalized = false;
tex.filterMode = cudaFilterModePoint; 
tex.addressMode[0] = cudaAddressModeBorder;
tex.addressMode[1] = cudaAddressModeBorder;
tex.addressMode[2] = cudaAddressModeBorder;

cudaBindTextureToArray(tex, d_imagedata, channelDesc);

Its the standard copy to a cudaArray and then binding and setting process, nothing new here.

To transform this code into multi GPU I am aware that I do not need to change the tex global texture reference, as CUDA will know that different GPUs have different tex, however I do need n cudaArray *d_imagedata instances, one for each GPU.

How do I make (and allocate) an array of cudaArrays?

If it was global memory pointers it would be easier, just a CPU malloc on a double pointer and then cudaMalloc on each of them would work, but as a cudaArray is not a standard type, I haven't figured out how to create a flexible array out of it.

Robert Crovella Robert Crovella · Accepted Answer · 2018-11-28T18:04:24

I would recommend using texture objects, rather than texture references.

Using texture objects, a trivial modification to the code presented here seems to work correctly for me:

$ cat t341.cu
#include <helper_cuda.h>
#include <curand.h>
#define NUM_TEX 4

const int SizeNoiseTest = 32;
const int cubeSizeNoiseTest = SizeNoiseTest*SizeNoiseTest*SizeNoiseTest;
static cudaTextureObject_t texNoise[NUM_TEX];

__global__ void AccesTexture(cudaTextureObject_t my_tex)
{
        float test = tex3D<float>(my_tex,(float)threadIdx.x,(float)threadIdx.y,(float)threadIdx.z);//by using this the error occurs
        printf("thread: %d,%d,%d, value: %f\n", threadIdx.x, threadIdx.y, threadIdx.z, test);
}

void CreateTexture()
{

    for (int i = 0; i < NUM_TEX; i++){
        cudaSetDevice(i);
        float *d_NoiseTest;//Device Array with random floats
        cudaMalloc((void **)&d_NoiseTest, cubeSizeNoiseTest*sizeof(float));//Allocation of device Array
        //curand Random Generator (needs compiler link -lcurand)
        curandGenerator_t gen;
        curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(gen,1235ULL+i);
        curandGenerateUniform(gen, d_NoiseTest, cubeSizeNoiseTest);//writing data to d_NoiseTest
        curandDestroyGenerator(gen);

        //cudaArray Descriptor
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        cudaArray *d_cuArr;
        checkCudaErrors(cudaMalloc3DArray(&d_cuArr, &channelDesc, make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest), 0));
        cudaMemcpy3DParms copyParams = {0};


        //Array creation
        copyParams.srcPtr   = make_cudaPitchedPtr(d_NoiseTest, SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest);
        copyParams.dstArray = d_cuArr;
        copyParams.extent   = make_cudaExtent(SizeNoiseTest,SizeNoiseTest,SizeNoiseTest);
        copyParams.kind     = cudaMemcpyDeviceToDevice;
        checkCudaErrors(cudaMemcpy3D(&copyParams));
        //Array creation End

        cudaResourceDesc    texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array  = d_cuArr;
        cudaTextureDesc     texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModeLinear;
        texDescr.addressMode[0] = cudaAddressModeClamp;   // clamp
        texDescr.addressMode[1] = cudaAddressModeClamp;
        texDescr.addressMode[2] = cudaAddressModeClamp;
        texDescr.readMode = cudaReadModeElementType;
        checkCudaErrors(cudaCreateTextureObject(&texNoise[i], &texRes, &texDescr, NULL));}
}

int main(int argc, char **argv)
{
        CreateTexture();
        cudaSetDevice(0);
        AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[0]);
        cudaSetDevice(1);
        AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[1]);
        cudaSetDevice(2);
        AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[2]);
        checkCudaErrors(cudaPeekAtLastError());
        cudaSetDevice(0);
        checkCudaErrors(cudaDeviceSynchronize());
        cudaSetDevice(1);
        checkCudaErrors(cudaDeviceSynchronize());
        cudaSetDevice(2);
        checkCudaErrors(cudaDeviceSynchronize());
        return 0;
}
$ nvcc -arch=sm_30 -I/usr/local/cuda/samples/common/inc -lcurand -o t341 t341.cu
$ cuda-memcheck ./t341
========= CUDA-MEMCHECK
thread: 0,0,0, value: 0.310691
thread: 1,0,0, value: 0.627906
thread: 0,1,0, value: 0.638900
thread: 1,1,0, value: 0.665186
thread: 0,0,1, value: 0.167465
thread: 1,0,1, value: 0.565227
thread: 0,1,1, value: 0.397606
thread: 1,1,1, value: 0.503013
thread: 0,0,0, value: 0.809163
thread: 1,0,0, value: 0.795669
thread: 0,1,0, value: 0.808565
thread: 1,1,0, value: 0.847564
thread: 0,0,1, value: 0.853998
thread: 1,0,1, value: 0.688446
thread: 0,1,1, value: 0.733255
thread: 1,1,1, value: 0.649379
thread: 0,0,0, value: 0.040824
thread: 1,0,0, value: 0.087417
thread: 0,1,0, value: 0.301392
thread: 1,1,0, value: 0.298669
thread: 0,0,1, value: 0.161962
thread: 1,0,1, value: 0.316443
thread: 0,1,1, value: 0.452077
thread: 1,1,1, value: 0.477722
========= ERROR SUMMARY: 0 errors
$

Note that for simplicity of presentation, this CreateTexture() function overwrites previously allocated device pointers such as d_NoiseTest and d_cuArr, during the processing of the loop. This isn't illegal or a functional issue, but it raises the possibility of memory leaks. (But see below for an example of how to avoid this.)

EDIT: Based on a question in the comments, none of this should be compile-time dependent. Here's a modification of the above code demonstrating this:

$ cat t342.cu
#include <helper_cuda.h>
#include <curand.h>

const int SizeNoiseTest = 32;
const int cubeSizeNoiseTest = SizeNoiseTest*SizeNoiseTest*SizeNoiseTest;

__global__ void AccesTexture(cudaTextureObject_t my_tex)
{
        float test = tex3D<float>(my_tex,(float)threadIdx.x,(float)threadIdx.y,(float)threadIdx.z);//by using this the error occurs
        printf("thread: %d,%d,%d, value: %f\n", threadIdx.x, threadIdx.y, threadIdx.z, test);
}

void CreateTexture(int num, cudaTextureObject_t *texNoise, cudaArray **d_cuArr, float **d_NoiseTest)
{

    for (int i = 0; i < num; i++){
        cudaSetDevice(i);
        cudaMalloc((void **)&d_NoiseTest[i], cubeSizeNoiseTest*sizeof(float));//Allocation of device Array
        //curand Random Generator (needs compiler link -lcurand)
        curandGenerator_t gen;
        curandCreateGenerator(&gen,CURAND_RNG_PSEUDO_DEFAULT);
        curandSetPseudoRandomGeneratorSeed(gen,1235ULL+i);
        curandGenerateUniform(gen, d_NoiseTest[i], cubeSizeNoiseTest);//writing data to d_NoiseTest
        curandDestroyGenerator(gen);

        //cudaArray Descriptor
        cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
        //cuda Array
        checkCudaErrors(cudaMalloc3DArray(&d_cuArr[i], &channelDesc, make_cudaExtent(SizeNoiseTest*sizeof(float),SizeNoiseTest,SizeNoiseTest), 0));
        cudaMemcpy3DParms copyParams = {0};


        //Array creation
        copyParams.srcPtr   = make_cudaPitchedPtr(d_NoiseTest[i], SizeNoiseTest*sizeof(float), SizeNoiseTest, SizeNoiseTest);
        copyParams.dstArray = d_cuArr[i];
        copyParams.extent   = make_cudaExtent(SizeNoiseTest,SizeNoiseTest,SizeNoiseTest);
        copyParams.kind     = cudaMemcpyDeviceToDevice;
        checkCudaErrors(cudaMemcpy3D(&copyParams));
        //Array creation End

        cudaResourceDesc    texRes;
        memset(&texRes, 0, sizeof(cudaResourceDesc));
        texRes.resType = cudaResourceTypeArray;
        texRes.res.array.array  = d_cuArr[i];
        cudaTextureDesc     texDescr;
        memset(&texDescr, 0, sizeof(cudaTextureDesc));
        texDescr.normalizedCoords = false;
        texDescr.filterMode = cudaFilterModeLinear;
        texDescr.addressMode[0] = cudaAddressModeClamp;   // clamp
        texDescr.addressMode[1] = cudaAddressModeClamp;
        texDescr.addressMode[2] = cudaAddressModeClamp;
        texDescr.readMode = cudaReadModeElementType;
        checkCudaErrors(cudaCreateTextureObject(&texNoise[i], &texRes, &texDescr, NULL));}
}
void FreeTexture(int num, cudaTextureObject_t *texNoise, cudaArray **d_cuArr, float **d_NoiseTest)
{
   for (int i = 0; i < num; i++){
     cudaFree(d_NoiseTest[i]);
     cudaDestroyTextureObject(texNoise[i]);
     cudaFreeArray(d_cuArr[i]);}
}

int main(int argc, char **argv)
{
        int num_dev = 1;
        if (argc > 1) num_dev = atoi(argv[1]);
        cudaTextureObject_t *texNoise = new cudaTextureObject_t[num_dev];
        cudaArray **d_cuArr = new cudaArray*[num_dev];
        float **d_NoiseTest = new float*[num_dev];
        CreateTexture(num_dev, texNoise, d_cuArr, d_NoiseTest);
        for (int i = 0; i < num_dev; i++){
          cudaSetDevice(i);
          AccesTexture<<<1,dim3(2,2,2)>>>(texNoise[i]);}
        checkCudaErrors(cudaPeekAtLastError());
        for (int i = 0; i < num_dev; i++){
          cudaSetDevice(i);
          checkCudaErrors(cudaDeviceSynchronize());}
        FreeTexture(num_dev, texNoise, d_cuArr, d_NoiseTest);
        delete[] d_cuArr;
        delete[] d_NoiseTest;
        delete[] texNoise;
        return 0;
}
$ nvcc -I/usr/local/cuda/samples/common/inc -lcurand -o t342 t342.cu
$ cuda-memcheck ./t342
========= CUDA-MEMCHECK
thread: 0,0,0, value: 0.310691
thread: 1,0,0, value: 0.627906
thread: 0,1,0, value: 0.638900
thread: 1,1,0, value: 0.665186
thread: 0,0,1, value: 0.167465
thread: 1,0,1, value: 0.565227
thread: 0,1,1, value: 0.397606
thread: 1,1,1, value: 0.503013
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./t342 2
========= CUDA-MEMCHECK
thread: 0,0,0, value: 0.310691
thread: 1,0,0, value: 0.627906
thread: 0,1,0, value: 0.638900
thread: 1,1,0, value: 0.665186
thread: 0,0,1, value: 0.167465
thread: 1,0,1, value: 0.565227
thread: 0,1,1, value: 0.397606
thread: 1,1,1, value: 0.503013
thread: 0,0,0, value: 0.809163
thread: 1,0,0, value: 0.795669
thread: 0,1,0, value: 0.808565
thread: 1,1,0, value: 0.847564
thread: 0,0,1, value: 0.853998
thread: 1,0,1, value: 0.688446
thread: 0,1,1, value: 0.733255
thread: 1,1,1, value: 0.649379
========= ERROR SUMMARY: 0 errors
$ cuda-memcheck ./t342 3
========= CUDA-MEMCHECK
thread: 0,0,0, value: 0.310691
thread: 1,0,0, value: 0.627906
thread: 0,1,0, value: 0.638900
thread: 1,1,0, value: 0.665186
thread: 0,0,1, value: 0.167465
thread: 1,0,1, value: 0.565227
thread: 0,1,1, value: 0.397606
thread: 1,1,1, value: 0.503013
thread: 0,0,0, value: 0.809163
thread: 1,0,0, value: 0.795669
thread: 0,1,0, value: 0.808565
thread: 1,1,0, value: 0.847564
thread: 0,0,1, value: 0.853998
thread: 1,0,1, value: 0.688446
thread: 0,1,1, value: 0.733255
thread: 1,1,1, value: 0.649379
thread: 0,0,0, value: 0.040824
thread: 1,0,0, value: 0.087417
thread: 0,1,0, value: 0.301392
thread: 1,1,0, value: 0.298669
thread: 0,0,1, value: 0.161962
thread: 1,0,1, value: 0.316443
thread: 0,1,1, value: 0.452077
thread: 1,1,1, value: 0.477722
========= ERROR SUMMARY: 0 errors
$

This code was run on a system that has (at least) 3 GPUs. I've also updated the above example so it demonstrates how to create an array of pointers to cudaArray type, and also demonstrating how one could avoid memory leaks.

Array of cudaArray for multi-GPU texture code

1 Answers