cuda cpu function - gpu kernel overlap

Question

I am having problems with concurrency in my CUDA application that I am trying to develop in order to practice CUDA. I want to share the work between GPU and CPU by using asynchronous behaviors of cudaMemecpyAsync and CUDA kernels but I cannot successfully overlap CPU execution and GPU execution.

It overlaps with Host to Device data transfer but kernel execution does not overlap. It basically waits CPU to finish and call the synchronization function then kernel starts to execute on device. I couldn't understand this behavior, aren't kernels always asynchronous to CPU thread?

My GPU is Nvidia Geforce GT 550m (Fermi Architecture with 1 Copy Engine and 1 Compute Engine).

I use CUDA 6.0 and Nsight 4.0.

Here is the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdlib.h>
#include <stdio.h>

#include <iostream>
#include <thread>
#include <chrono>
using namespace std;

struct point4D 
{
    float x;
    float y;
    float z;
    float w;
};

void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC);

bool correct_output(point4D * data, unsigned int size);
void flush_buffer(point4D * data, unsigned int size);
void initialize_input(point4D *& data, unsigned int size);
void cudaCheckError(cudaError_t cudaStatus, char* err);

// Implements cross product for 4D point on the GPU-side.
__global__ void gpu_kernel(point4D * d_ptrData, point4D * d_out, point4D pB, point4D pC)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    point4D pA = d_ptrData[index];
    point4D out; out.x = 0; out.y = 0; out.z = 0; out.w = 0;

    out.x +=  pA.y*(pB.z*pC.w - pC.z*pB.w) - pA.z*(pB.y*pC.w - pC.y*pB.w) + pA.w*(pB.y*pC.z - pC.y*pB.z);
    out.y += -pA.x*(pB.z*pC.w - pC.z*pB.w) + pA.z*(pB.x*pC.w - pC.x*pB.w) - pA.w*(pB.x*pC.z - pC.x*pB.z);
    out.z +=  pA.x*(pB.y*pC.w - pC.y*pB.w) - pA.y*(pB.x*pC.w - pC.x*pB.w) + pA.w*(pB.x*pC.y - pC.x*pB.y);
    out.w += -pA.x*(pB.y*pC.z - pC.y*pB.z) + pA.y*(pB.x*pC.z - pC.x*pB.z) - pA.z*(pB.x*pC.y - pC.x*pB.y);

   d_out[index] = out;
}

// Implements cross product for 4D point on the CPU-size.
void cpu_function(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    for(unsigned int index = 0; index < h_dataSize; index++)
    {
        h_out[index].x = 0; h_out[index].y = 0; h_out[index].z = 0; h_out[index].w = 0;

        point4D pA = h_ptrData[index];

        h_out[index].x +=  pA.y*(pB.z*pC.w - pC.z*pB.w) - pA.z*(pB.y*pC.w - pC.y*pB.w) + pA.w*(pB.y*pC.z - pC.y*pB.z);
        h_out[index].y += -pA.x*(pB.z*pC.w - pC.z*pB.w) + pA.z*(pB.x*pC.w - pC.x*pB.w) - pA.w*(pB.x*pC.z - pC.x*pB.z);
        h_out[index].z +=  pA.x*(pB.y*pC.w - pC.y*pB.w) - pA.y*(pB.x*pC.w - pC.x*pB.w) + pA.w*(pB.x*pC.y - pC.x*pB.y);
        h_out[index].w += -pA.x*(pB.y*pC.z - pC.y*pB.z) + pA.y*(pB.x*pC.z - pC.x*pB.z) - pA.z*(pB.x*pC.y - pC.x*pB.y);
    }   
}


int main(int argc, char *argv[])
{
    int devID;
    cudaDeviceProp deviceProps;

    printf("[%s] - Starting...\n", argv[0]);

    int device_count;
    cudaCheckError(cudaGetDeviceCount(&device_count), "Couldn't get device count!");

    if (device_count == 0)
    {
        fprintf(stderr, "gpuDeviceInit() CUDA error: no devices supporting CUDA.\n");
        exit(EXIT_FAILURE);
    }

    devID = 0;
    cudaCheckError(cudaSetDevice(devID), "Couldn't set device!");
    cudaCheckError(cudaGetDeviceProperties(&deviceProps, devID), "Couldn't get Device Properties");
    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, deviceProps.name, deviceProps.major, deviceProps.minor);

    cudaDeviceReset();

    const unsigned int DATA_SIZE = 30000000;
    bool bFinalResults = true;

    // Input Data Initialization
    point4D pointB;
    pointB.x = 1; pointB.y = 1; pointB.z = 0; pointB.w = 0;

    point4D pointC;
    pointC.x = 1; pointC.y = 1; pointC.z = 1; pointC.w = 0;

    point4D * data = (point4D*) malloc(DATA_SIZE * sizeof(point4D));
    point4D * out_points = (point4D*) malloc(DATA_SIZE * sizeof(point4D));
    initialize_input(data, DATA_SIZE);
    //

    flush_buffer(out_points, DATA_SIZE);
    cout << endl << endl;

    // 1+way
    heterogenous_1way_plus(data, DATA_SIZE, out_points, pointB, pointC);
    bFinalResults &= correct_output(out_points, DATA_SIZE); // checking correctness

    free(out_points);
    free(data);

    exit(bFinalResults ? EXIT_SUCCESS : EXIT_FAILURE);
    return 0;
}

void heterogenous_1way_plus(point4D * h_ptrData, unsigned int h_dataSize, point4D * h_out, point4D pB, point4D pC)
{
    cout << "1-way_plus: STARTS!!!" << endl;

    // Run the %25 of the data from CPU, rest will be executed on GPU
    unsigned int ratioPercentCPUtoGPU = 25;
    unsigned int d_dataSize = (h_dataSize * (100 - ratioPercentCPUtoGPU))/100;
    h_dataSize = (h_dataSize * ratioPercentCPUtoGPU)/100;
    size_t memorySize = d_dataSize * sizeof(point4D);

    cout << "Data Ratio Between CPU and GPU:" << (float)ratioPercentCPUtoGPU/100 << endl;
    cout << "CPU will process " << h_dataSize << " data." << endl;
    cout << "GPU will process " << d_dataSize << " data." << endl;

    // registers host memory as page-locked (required for asynch cudaMemcpyAsync)
    cudaCheckError(cudaHostRegister(h_ptrData, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");
    cudaCheckError(cudaHostRegister(h_out, memorySize, cudaHostRegisterPortable), "cudaHostRegister failed!");

    // allocate device memory
    point4D * d_in = 0; point4D * d_out = 0;
    cudaCheckError(cudaMalloc( (void **)&d_in, memorySize), "cudaMalloc failed!");
    cudaCheckError(cudaMalloc( (void **)&d_out, memorySize), "cudaMalloc failed!");

    // set kernel launch configuration
    dim3 nThreads = dim3(1000,1);
    dim3 nBlocks = dim3(d_dataSize / nThreads.x,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t stream;
    cudaCheckError(cudaStreamCreate(&stream), "cudaStreamCreate failed!");

    // create cuda event handles
    cudaEvent_t start, stop;
    cudaCheckError(cudaEventCreate(&start), "cudaEventCreate failed!");
    cudaCheckError(cudaEventCreate(&stop), "cudaEventCreate failed!");

    // main thread waits for device
    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");
    float gpu_time = 0.0f;
    cudaEventRecord(start, stream);

    cudaMemcpyAsync(d_in, h_ptrData, memorySize, cudaMemcpyHostToDevice, stream);       
    gpu_kernel<<<nBlocks, nThreads, 0, stream>>>(d_in, d_out, pB, pC);
    cudaMemcpyAsync(h_out, d_out, memorySize, cudaMemcpyDeviceToHost, stream);

    cudaEventRecord(stop, stream);

    // The memory layout of CPU processing starts after GPU's.
    cpu_function(h_ptrData + d_dataSize, h_dataSize, h_out + d_dataSize, pB, pC);       

    cudaCheckError(cudaStreamSynchronize(stream), "cudaStreamSynchronize failed!");

    cudaCheckError(cudaEventElapsedTime(&gpu_time, start, stop), "cudaEventElapsedTime failed!");

    cudaCheckError(cudaDeviceSynchronize(), "cudaDeviceSynchronize failed!");

    // release resources
    cudaCheckError(cudaEventDestroy(start), "cudaEventDestroy failed!");
    cudaCheckError(cudaEventDestroy(stop), "cudaEventDestroy failed!");
    cudaCheckError(cudaHostUnregister(h_ptrData), "cudaHostUnregister failed!");
    cudaCheckError(cudaHostUnregister(h_out), "cudaHostUnregister failed!");
    cudaCheckError(cudaFree(d_in), "cudaFree failed!");
    cudaCheckError(cudaFree(d_out), "cudaFree failed!");
    cudaCheckError(cudaStreamDestroy(stream), "cudaStreamDestroy failed!");

    cudaDeviceReset();    

    cout << "Execution of GPU: " << gpu_time << "ms" << endl;
    cout << "1-way_plus: ENDS!!!" << endl;        
}

// Checks correctness of outputs
bool correct_output(point4D * data, unsigned int size)
{ 
    const static float x = 0, y = 0, z = 0, w = -1;

    for (unsigned int i = 0; i < size; i++)
    {
        if (data[i].x != x || data[i].y != y ||
            data[i].z != y || data[i].w != w)
        {
            printf("Error! data[%d] = [%f, %f, %f, %f], ref = [%f, %f, %f, %f]\n",
            i, data[i].x, data[i].y, data[i].z, data[i].w, x, y, z, w);

            return 0;
        }
    }
    return 1;
}

// Refresh the output buffer
void flush_buffer(point4D * data, unsigned int size)
{
    for(unsigned int i = 0; i < size; i++)
    {
        data[i].x = 0; data[i].y = 0; data[i].z = 0; data[i].w = 0;
    }
}

// Initialize the input data to feed the system for simulation
void initialize_input(point4D *& data, unsigned int size)
{
    for(unsigned int idx = 0; idx < size; idx++)
    {
        point4D* d = &data[idx];
        d->x = 1;
        d->y = 0;
        d->z = 0;
        d->w = 0;
    }
}

void cudaCheckError(cudaError_t cudaStatus, char* err)
{
    if(cudaStatus != cudaSuccess)
    {
        fprintf(stderr, err);
        cudaDeviceReset();
       exit(EXIT_FAILURE);
    }
}

And here is the Nsight screenshot :

I couldn't add Nsight screenshot due to my insufficient points on stackoverflow :/ — Vemulo
I suggest you provide a complete example that someone else could copy, paste, compile, and run. What happens if you remove the cudaEventRecord() operations ? — Robert Crovella
@RobertCrovella, thank you for your suggestion I added the complete code. And removing cudaEventRecord() did not solve anything, unfortunately. — Vemulo

Robert Crovella Robert Crovella · Accepted Answer · 2014-07-03T02:20:07

You're getting proper overlap, from what I can see on your profiler image. I ran your code and see something similar.

In general, the critical sequence in your code is like this:

cudaMemcpyAsyncH2D
kernel call
cudaMemcpyAsyncD2H
cpu function
cudaStreamSynchronize

The CPU thread processes those steps in that order. Steps 1-3 are asynchronous, meaning control is returned to the CPU thread immediately, without waiting for the underlying CUDA operation to complete. And you desire that step 4 overlaps as much as possible with steps 1,2, and 3.

What we see is that the cudaStreamSynchronize() call shows up in the timeline approximately coincident with the start of the kernel execution. This means that all CPU thread activity preceding the cudaStreamSynchronize() call has completed at that point (i.e. approximately at the point of the beginning of the actual kernel execution.) Therefore the cpu function (step 4) that we are desiring to overlap with steps 1-3 is actually completed by the start of step 2 (in terms of actual CUDA execution). Therefore you are getting full overlap of your cpu function with the first host->device memcpy operation.

So it's working as expected. Because the cudaStreamSynchronize() call blocks the CPU thread until all stream activity is complete, it occupies the timeline from when it is encountered until the point at which the stream activity is complete.

The fact that the cudaStreamSynchronize() call is curiously coincident with the start of kernel execution, and that there is a gap in between the end of the H2D memcpy and the start of the kernel, is likely due to WDDM batching of commands. When I profile your code under linux, I don't see the gap and exact coincidence, but otherwise the general flow is the same. Here is what I see using the visual profiler under linux:

Linux visual profiler

Note that in the above image, the cudaStreamSynchronize() is actually encountered during the H2D memcpy operation before the kernel begins.

Responding to a question in the comments, I modified the app so the split percentage was 50 instead of 25:

unsigned int ratioPercentCPUtoGPU = 50;

here is what the new profiler output looks like:

profiler output with modified split percentage

We see that the CPU is taking more time relative to the GPU kernel call, and so the cudaStreamSynchronize() call is not encountered by the CPU thread until during the D2H memcpy operation. We continue to see under linux that there is no fixed relationship between this point and the start of the kernel execution. Now the CPU execution is fully overlapping the H2D memcpy, the kernel execution, and a small portion of the D2H memcpy.

cuda cpu function - gpu kernel overlap

1 Answers