I'm trying to perform two tasks (separated into 2 kernels) on the GPU using Cuda and C++. As input I take a NxM matrix (stored in memory on the host as a float array). I will then use a kernel that performs some operations on this matrix to make it a NxMxD matrix. I then have a second kernel which performs some operations on this 3D matrix (I just read the values, I don't have to write values to it).
Operating in texture memory seems to be much faster for my task so my question is if it is possible to copy my data from global memory on the device after kernel 1 and transfer it directly to texture memory for kernel 2 without bringing it back to the host?
UPDATE
I've added some code to illustrate my problem better.
Here are the two kernels. The first is just a place holder for now and replicates the 2D matrix into 3D.
__global__ void computeFeatureVector(float* imData3D_dev, int imX, int imY, int imZ) {
//calculate each thread global index
int xindex=blockIdx.x*blockDim.x+threadIdx.x;
int yindex=blockIdx.y*blockDim.y+threadIdx.y;
#pragma unroll
for (int z=0; z<imZ; z++) {
imData3D_dev[xindex+yindex*imX + z*imX*imY] = tex2D(texImIp,xindex,yindex);
}
}
The second will take this 3D matrix, now represented as a texture and perform some operations on it. Blank for now.
__global__ void kernel2(float* resData_dev, int imX) {
//calculate each thread global index
int xindex=blockIdx.x*blockDim.x+threadIdx.x;
int yindex=blockIdx.y*blockDim.y+threadIdx.y;
resData_dev[xindex+yindex*imX] = tex3D(texImIp3D,xindex,yindex, 0);
return;
}
Then the main body of the code is as follows:
// declare textures
texture<float,2,cudaReadModeElementType> texImIp;
texture<float,3,cudaReadModeElementType> texImIp3D;
void main_fun() {
// constants
int imX = 1024;
int imY = 768;
int imZ = 16;
// input data
float* imData2D = new float[sizeof(float)*imX*imY];
for(int x=0; x<imX*imY; x++)
imData2D[x] = (float) rand()/RAND_MAX;
//create channel to describe data type
cudaArray* carrayImIp;
cudaChannelFormatDesc channel;
channel=cudaCreateChannelDesc<float>();
//allocate device memory for cuda array
cudaMallocArray(&carrayImIp,&channel,imX,imY);
//copy matrix from host to device memory
cudaMemcpyToArray(carrayImIp,0,0,imData2D,sizeof(float)*imX*imY,cudaMemcpyHostToDevice);
// Set texture properties
texImIp.filterMode=cudaFilterModePoint;
texImIp.addressMode[0]=cudaAddressModeClamp;
texImIp.addressMode[1]=cudaAddressModeClamp;
// bind texture reference with cuda array
cudaBindTextureToArray(texImIp,carrayImIp);
// kernel params
dim3 blocknum;
dim3 blocksize;
blocksize.x=16; blocksize.y=16; blocksize.z=1;
blocknum.x=(int)ceil((float)imX/16);
blocknum.y=(int)ceil((float)imY/16);
// store output here
float* imData3D_dev;
cudaMalloc((void**)&imData3D_dev,sizeof(float)*imX*imY*imZ);
// execute kernel
computeFeatureVector<<<blocknum,blocksize>>>(imData3D_dev, imX, imY, imZ);
//unbind texture reference to free resource
cudaUnbindTexture(texImIp);
// check copied ok
float* imData3D = new float[sizeof(float)*imX*imY*imZ];
cudaMemcpy(imData3D,imData3D_dev,sizeof(float)*imX*imY*imZ,cudaMemcpyDeviceToHost);
cout << " kernel 1" << endl;
for (int x=0; x<10;x++)
cout << imData3D[x] << " ";
cout << endl;
delete [] imData3D;
//
// kernel 2
//
// copy data on device to 3d array
cudaArray* carrayImIp3D;
cudaExtent volumesize;
volumesize = make_cudaExtent(imX, imY, imZ);
cudaMalloc3DArray(&carrayImIp3D,&channel,volumesize);
cudaMemcpyToArray(carrayImIp3D,0,0,imData3D_dev,sizeof(float)*imX*imY*imZ,cudaMemcpyDeviceToDevice);
// texture params and bind
texImIp3D.filterMode=cudaFilterModePoint;
texImIp3D.addressMode[0]=cudaAddressModeClamp;
texImIp3D.addressMode[1]=cudaAddressModeClamp;
texImIp3D.addressMode[2]=cudaAddressModeClamp;
cudaBindTextureToArray(texImIp3D,carrayImIp3D,channel);
// store output here
float* resData_dev;
cudaMalloc((void**)&resData_dev,sizeof(float)*imX*imY);
// kernel 2
kernel2<<<blocknum,blocksize>>>(resData_dev, imX);
cudaUnbindTexture(texImIp3D);
//copy result matrix from device to host memory
float* resData = new float[sizeof(float)*imX*imY];
cudaMemcpy(resData,resData_dev,sizeof(float)*imX*imY,cudaMemcpyDeviceToHost);
// check copied ok
cout << " kernel 2" << endl;
for (int x=0; x<10;x++)
cout << resData[x] << " ";
cout << endl;
delete [] imData2D;
delete [] resData;
cudaFree(imData3D_dev);
cudaFree(resData_dev);
cudaFreeArray(carrayImIp);
cudaFreeArray(carrayImIp3D);
}
Im happy that the first kernel is working correctly but the 3D matrix imData3D_dev does not seem to be bound to the texture texImIp3D correctly.
ANSWER
I solved my problem using cudaMemcpy3D. Here is revised code for the second part of the main function. imData3D_dev contains the 3D matrix in global memory from the first kernel.
cudaArray* carrayImIp3D;
cudaExtent volumesize;
volumesize = make_cudaExtent(imX, imY, imZ);
cudaMalloc3DArray(&carrayImIp3D,&channel,volumesize);
cudaMemcpy3DParms copyparms={0};
copyparms.extent = volumesize;
copyparms.dstArray = carrayImIp3D;
copyparms.kind = cudaMemcpyDeviceToDevice;
copyparms.srcPtr = make_cudaPitchedPtr((void*)imData3D_dev, sizeof(float)*imX,imX,imY);
cudaMemcpy3D(©parms);
// texture params and bind
texImIp3D.filterMode=cudaFilterModePoint;
texImIp3D.addressMode[0]=cudaAddressModeClamp;
texImIp3D.addressMode[1]=cudaAddressModeClamp;
texImIp3D.addressMode[2]=cudaAddressModeClamp;
cudaBindTextureToArray(texImIp3D,carrayImIp3D,channel);
// store output here
float* resData_dev;
cudaMalloc((void**)&resData_dev,sizeof(float)*imX*imY);
kernel2<<<blocknum,blocksize>>>(resData_dev, imX);
// ... clean up