
I have loaded an image (8-bit, unsigned char) of size 1080 x 1920. For testing purposes, I process the same image 4 times in a for loop and then generate a timeline profile.

Strategy: I divide the image into 3 parts and create three streams, one to process each part of the image.

I am providing a minimal working example below. I am sorry that it needs an image loaded with OpenCV, but I don't know how to mimic the same situation without one.
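(For reference, the commented-out line in the code below shows a possible stand-in: a synthetic Mat of the same size, so no image file is needed:

    Mat image(1080, 1920, CV_8UC3, Scalar(0, 0, 255)); //solid red test image
)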

Problem: The timeline profile shows that the first stream has finished transferring its data, yet the kernel assigned to it still has not started. The first stream's kernel runs in parallel with the third stream's data transfer. So my question is: why did the first stream's kernel not start in parallel with the second stream's data transfer?

GPU: NVIDIA Quadro K2000, compute capability 3.0

Timeline profile: Each stream has been assigned a different color.

[timeline profile screenshot]

My code:

#include <iostream>
#include <opencv2/opencv.hpp>
#include <cuda_runtime.h>

using namespace cv;

__global__ void multiStream_ColorTransformation_kernel(int numChannels, int iw, int ih, unsigned char *ptr_source, unsigned char *ptr_dst)
{
    // Calculate our pixel's location
    int x = (blockIdx.x * blockDim.x) + threadIdx.x;
    int y = (blockIdx.y * blockDim.y) + threadIdx.y;

    // Operate only if we are in the correct boundaries
    if (x >= 0 && x < iw && y >= 0 && y < ih / 3)
    {
        ptr_dst[numChannels * (iw*y + x) + 0] = ptr_source[numChannels * (iw*y + x) + 0];
        ptr_dst[numChannels * (iw*y + x) + 1] = ptr_source[numChannels * (iw*y + x) + 1];
        ptr_dst[numChannels * (iw*y + x) + 2] = ptr_source[numChannels * (iw*y + x) + 2];
    }
}

void callMultiStreamingCudaKernel(unsigned char *dev_src, unsigned char *dev_dst, int numChannels, int iw, int ih, cudaStream_t *ptr_stream)
{

    dim3 numOfBlocks(iw / 20, ih / 20); //grid spans the full image; the kernel's y-guard restricts work to one third
    dim3 numOfThreadsPerBlocks(20, 20);
    multiStream_ColorTransformation_kernel<<<numOfBlocks, numOfThreadsPerBlocks, 0, *ptr_stream>>>(numChannels, iw, ih, dev_src, dev_dst);

    return;
}

int main()
{

    cudaStream_t stream_one;
    cudaStream_t stream_two;
    cudaStream_t stream_three;

    cudaStreamCreate(&stream_one);
    cudaStreamCreate(&stream_two);
    cudaStreamCreate(&stream_three);

    Mat image = imread("DijSDK_test_image.jpg", 1);
    //Mat image(1080, 1920, CV_8UC3, Scalar(0,0,255));
    int numChannels = 3;
    size_t numBytes = image.rows * image.cols * numChannels;

    int iw = image.cols;    //image width
    int ih = image.rows;    //image height
    size_t totalMemSize = numBytes * sizeof(unsigned char);
    size_t oneThirdMemSize = totalMemSize / 3;

    unsigned char *dev_src_1, *dev_src_2, *dev_src_3, *dev_dst_1, *dev_dst_2, *dev_dst_3, *h_src, *h_dst;


    //Allocate device memory for SOURCE and DESTINATION and get the pointers
    cudaMalloc((void**)&dev_src_1, (totalMemSize) / 3);
    cudaMalloc((void**)&dev_src_2, (totalMemSize) / 3);
    cudaMalloc((void**)&dev_src_3, (totalMemSize) / 3);
    cudaMalloc((void**)&dev_dst_1, (totalMemSize) / 3);
    cudaMalloc((void**)&dev_dst_2, (totalMemSize) / 3);
    cudaMalloc((void**)&dev_dst_3, (totalMemSize) / 3);

    //Get the processed image 
    Mat org_dijSDK_img(image.rows, image.cols, CV_8UC3, Scalar(0, 0, 255));
    h_dst = org_dijSDK_img.data;

    //while (1)
    for (int i = 0; i < 3; i++)
    {
        std::cout << "\nLoop: " << i;

        //point the host pointer at the image data (the same image each iteration)
        h_src = image.data;

        //Copy the source image to the device i.e. GPU
        cudaMemcpyAsync(dev_src_1, h_src, (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_one);
        //KERNEL--stream-1
        callMultiStreamingCudaKernel(dev_src_1, dev_dst_1, numChannels, iw, ih, &stream_one);


        //Copy the source image to the device i.e. GPU
        cudaMemcpyAsync(dev_src_2, h_src + oneThirdMemSize, (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_two);
        //KERNEL--stream-2
        callMultiStreamingCudaKernel(dev_src_2, dev_dst_2, numChannels, iw, ih, &stream_two);

        //Copy the source image to the device i.e. GPU
        cudaMemcpyAsync(dev_src_3, h_src + (2 * oneThirdMemSize), (totalMemSize) / 3, cudaMemcpyHostToDevice, stream_three);
        //KERNEL--stream-3
        callMultiStreamingCudaKernel(dev_src_3, dev_dst_3, numChannels, iw, ih, &stream_three);


        //RESULT copy: GPU to CPU
        cudaMemcpyAsync(h_dst, dev_dst_1, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_one);
        cudaMemcpyAsync(h_dst + oneThirdMemSize, dev_dst_2, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_two);
        cudaMemcpyAsync(h_dst + (2 * oneThirdMemSize), dev_dst_3, (totalMemSize) / 3, cudaMemcpyDeviceToHost, stream_three);

        // wait for results 
        cudaStreamSynchronize(stream_one);
        cudaStreamSynchronize(stream_two);
        cudaStreamSynchronize(stream_three);

        //Assign the processed data to the display image.
        org_dijSDK_img.data = h_dst;
        //DISPLAY PROCESSED IMAGE           
        imshow("Processed dijSDK image", org_dijSDK_img);
        waitKey(33);
    }

    cudaDeviceReset();
    return 0;
}

UPDATE-1: If I remove the kernel call on the first stream, then the second stream's kernel and the third stream's H2D copy overlap somewhat (not completely), as shown below.

[timeline screenshot for UPDATE-1]

UPDATE-2: I even tried using 10 streams and things remain the same. The first stream's kernel began only after the H2D copy of the tenth stream's data.

[timeline screenshot for UPDATE-2]

But you have copy/execution overlap occurring in every stream in that profile data you are showing. – talonmies
Overlap occurs between the first stream's kernel and the third stream's H2D copy. My question is why the first stream's kernel did not start immediately after the H2D copy of the first stream's data. – skm
@talonmies: there is no overlap between the first stream's kernel and the second stream's H2D data copy. That is my question; the rest is fine. – skm
If you remove the first stream, does the second one start processing immediately after copying data 2, or does it wait until after data 3? – Micka
Is the host memory pinned? The documentation states that the host memory must be page-locked for overlap to happen. – Jez

1 Answer


As the commenters already pointed out, host memory must be page-locked (pinned) for copy/compute overlap to occur.

There is no need to allocate separate host memory through cudaHostAlloc; you can page-lock the existing OpenCV image buffer with cudaHostRegister:

cudaHostRegister(image.data, totalMemSize, cudaHostRegisterPortable);
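For the device-to-host copies to overlap as well, the destination buffer presumably needs the same treatment, and registered memory should be unregistered before the Mats are destroyed. A minimal sketch under those assumptions, using the buffers from the question (untested):

//Pin the existing OpenCV buffers once, before the processing loop
cudaHostRegister(image.data, totalMemSize, cudaHostRegisterPortable);          //H2D source
cudaHostRegister(org_dijSDK_img.data, totalMemSize, cudaHostRegisterPortable); //D2H destination

//... the streamed cudaMemcpyAsync / kernel calls from the question ...

//Unpin when done, before the Mats go out of scope
cudaHostUnregister(image.data);
cudaHostUnregister(org_dijSDK_img.data);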