I'm writing a CUDA program for image processing. Same kernel "processOneChannel" will be launched for RGB channels.
Below I try to specify streams for the three kernel launches so they can be processed concurrently. But nvprof says they are still launched one after another...
There are two other kernels before and after these three, and I don't want them to run concurrently.
Basically I want the following: seperateChannels --> processOneChannel(x3) --> recombineChannels
Please advice what I did wrong..
void kernelLauncher(const ushort4 * const h_inputImageRGBA, ushort4 * const d_inputImageRGBA,
ushort4* const d_outputImageRGBA, const size_t numRows, const size_t numCols,
unsigned short *d_redProcessed,
unsigned short *d_greenProcessed,
unsigned short *d_blueProcessed,
unsigned short *d_prand)
{
int MAXTHREADSx = 512;
int MAXTHREADSy = 1;
int nBlockX = numCols / MAXTHREADSx + 1;
int nBlockY = numRows / MAXTHREADSy + 1;
const dim3 blockSize(MAXTHREADSx,MAXTHREADSy,1);
const dim3 gridSize(nBlockX,nBlockY,1);
// cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
int nstreams = 5;
cudaStream_t *streams = (cudaStream_t *) malloc(nstreams * sizeof(cudaStream_t));
for (int i = 0; i < nstreams; i++)
{
checkCudaErrors(cudaStreamCreateWithFlags(&(streams[i]),cudaStreamNonBlocking));
}
separateChannels<<<gridSize,blockSize>>>(d_inputImageRGBA,
(int)numRows,
(int)numCols,
d_red,
d_green,
d_blue);
cudaDeviceSynchronize();
checkCudaErrors(cudaGetLastError());
processOneChannel<<<gridSize,blockSize,0,streams[0]>>>(d_red,
d_redProcessed,
(int)numRows,(int)numCols,
d_filter,d_prand);
processOneChannel<<<gridSize,blockSize,0,streams[1]>>>(d_green,
d_greenProcessed,
(int)numRows,(int)numCols,
d_filter,d_prand);
processOneChannel<<<gridSize,blockSize,0,streams[2]>>>(d_blue,
d_blueProcessed,
(int)numRows,(int)numCols,
d_filter,d_prand);
cudaDeviceSynchronize();
checkCudaErrors(cudaGetLastError());
recombineChannels<<<gridSize, blockSize>>>(d_redProcessed,
d_greenProcessed,
d_blueProcessed,
d_outputImageRGBA,
numRows,
numCols);
for (int i = 0; i < nstreams; i++)
{
cudaStreamDestroy(streams[i]);
}
free(streams);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
Here's nvprof gpu trace output. Note the memcpy before the kernel launches are to pass filter data for the processing, so they cannot run in concurrency with kernel launches.
==10001== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
1.02428s 2.2400us - - - - - 28.125MB 1e+04GB/s GeForce GT 750M 1 13 [CUDA memset]
1.02855s 18.501ms - - - - - 28.125MB 1.4846GB/s GeForce GT 750M 1 13 [CUDA memcpy HtoD]
1.21959s 1.1371ms - - - - - 1.7580MB 1.5098GB/s GeForce GT 750M 1 13 [CUDA memcpy HtoD]
1.22083s 1.3440us - - - - - 7.0313MB 5e+03GB/s GeForce GT 750M 1 13 [CUDA memset]
1.22164s 1.3440us - - - - - 7.0313MB 5e+03GB/s GeForce GT 750M 1 13 [CUDA memset]
1.22243s 3.6480us - - - - - 7.0313MB 2e+03GB/s GeForce GT 750M 1 13 [CUDA memset]
1.22349s 10.240us - - - - - 8.0000KB 762.94MB/s GeForce GT 750M 1 13 [CUDA memcpy HtoD]
1.22351s 6.6021ms (6 1441 1) (512 1 1) 12 0B 0B - - GeForce GT 750M 1 13 separateChannels(...) [123]
1.23019s 10.661ms (6 1441 1) (512 1 1) 36 192B 0B - - GeForce GT 750M 1 14 processOneChannel(...) [133]
1.24085s 10.518ms (6 1441 1) (512 1 1) 36 192B 0B - - GeForce GT 750M 1 15 processOneChannel(...) [141]
1.25137s 10.779ms (6 1441 1) (512 1 1) 36 192B 0B - - GeForce GT 750M 1 16 processOneChannel(...) [149]
1.26372s 5.7810ms (6 1441 1) (512 1 1) 15 0B 0B - - GeForce GT 750M 1 13 recombineChannels(...) [159]
1.26970s 19.859ms - - - - - 28.125MB 1.3831GB/s GeForce GT 750M 1 13 [CUDA memcpy DtoH]
Here's CMakeList.txt where I passed -default-stream per-thread to nvcc
cmake_minimum_required(VERSION 2.6 FATAL_ERROR)
find_package(OpenCV REQUIRED)
find_package(CUDA REQUIRED)
set(
CUDA_NVCC_FLAGS
${CUDA_NVCC_FLAGS};
-default-stream per-thread
)
file( GLOB hdr *.hpp *.h )
file( GLOB cu *.cu)
SET (My_files main.cpp)
# Project Executable
CUDA_ADD_EXECUTABLE(My ${My_files} ${hdr} ${cu})
target_link_libraries(My ${OpenCV_LIBS})