
I have a program running on the GPU using CUDA with a lot of small kernels, meaning that launching a kernel from my CPU takes about as long as executing that kernel on my GPU.

I would like to add a CPU function to my program loop that takes about as long as one iteration of all my kernels. I know that after a kernel launch the CPU can work asynchronously to the GPU, but because my last kernel launch is not much ahead of the GPU work being done, this is not an option here.

So my idea was to use multiple threads: one thread to launch my GPU kernels and another one (or several others) to execute the CPU function, and to run the two in parallel.

I created a small example to test this idea:

#include <unistd.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

#define THREADS_PER_BLOCK 64

__global__ void k_dummykernel1(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {
        float ai = a[id];
        float bi = b[id];

        c[id] = powf(expf(bi*sinf(ai)),1.0/bi);
    }
}

__global__ void k_dummykernel2(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {
        float bi = b[id];

        c[id] = powf(c[id],bi);
    }
}

__global__ void k_dummykernel3(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {
        float bi = b[id];

        c[id] = logf(c[id])/bi;
    }
}

__global__ void k_dummykernel4(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {

        c[id] = asinf(c[id]);
    }
}

int main()
{
    int N = 10000;
    int N2 = N/5;

    float *a = new float[N];
    float *b = new float[N];
    float *c = new float[N];

    float *d_a,*d_b,*d_c;

    for(int i = 0; i < N; i++)
    {
        a[i] = (10*(1+i))/(float)N;
        b[i] = (i+1)/50.0;
    }



    cudaMalloc((void**)&d_a,N*sizeof(float));
    cudaMalloc((void**)&d_b,N*sizeof(float));
    cudaMalloc((void**)&d_c,N*sizeof(float));

    cudaMemcpy(d_a, a ,N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b ,N*sizeof(float), cudaMemcpyHostToDevice);


    cudaProfilerStart();


    for(int k = 0; k < 100; k++)
    {
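        // sequential version: 2x4 kernel launches followed by the CPU computation, all from one thread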

        k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);

        k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);

        for(int i = 0; i < N2; i++)
        {
            c[i] = pow(a[i],b[i]);
        }

    }

    cudaDeviceSynchronize();
    usleep(40000);

    for(int k = 0; k <= 100; k++)
    {

#pragma omp parallel sections num_threads(2)
        {
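        // parallel version: one OpenMP section issues the kernel launches, the other runs the CPU loop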
#pragma omp section
            {
                k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);

                k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            }

#pragma omp section
            {
                for(int i = 0; i < N2; i++)
                {
                    c[i] = pow(a[i],b[i]);
                }
            }
        }
    }

    cudaDeviceSynchronize();

    cudaProfilerStop();

    delete[] a;
    delete[] b;
    delete[] c;

    cudaFree((void*)d_a);
    cudaFree((void*)d_b);
    cudaFree((void*)d_c);
}

I compile using: nvcc main.cu -O3 -Xcompiler -fopenmp

First I run the 2x4 kernels and the CPU computation sequentially, and after that I try to do them in parallel using OpenMP sections.

This is the result in the profiler: Complete Profiler Timeline

The parallel version is much slower than the sequential one...

If I zoom into the sequential part, it looks like this: Sequential Timeline

One can see that between each group of 8 kernel launches there is a gap where the CPU computations are done (this is the gap I would like to close by overlapping it with the kernel launches).

If I zoom into the parallel part (same zoom level!), it looks like this: Parallel Timeline

There are no gaps anymore, but the kernel launches now take about 15 microseconds (vs. 5 microseconds before).

I also tried bigger array sizes and std::thread instead of OpenMP but the problem is always the same as before.
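
A sketch of what the std::thread variant could look like (illustrative, not necessarily the exact code I ran): one thread issues the kernel launches of an iteration while a second thread runs the CPU loop, and both are joined before the next iteration. This needs C++11 or later for std::thread and lambdas.

#include <thread>

for(int k = 0; k < 100; k++)
{
    // one thread issues all kernel launches of this iteration
    std::thread gpu_thread([&]() {
        for(int rep = 0; rep < 2; rep++)
        {
            k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        }
    });

    // a second thread does the CPU computation at the same time
    std::thread cpu_thread([&]() {
        for(int i = 0; i < N2; i++)
        {
            c[i] = pow(a[i],b[i]);
        }
    });

    gpu_thread.join();
    cpu_thread.join();
}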

Can someone tell me whether this can be made to work at all, and if so, what I am doing wrong?

Thanks in advance

Cat

I've run your code and looked at it in the visual profiler. I see the opposite of what you are seeing: the clump of kernel launches on the left-hand side is more spread out and takes longer in the timeline, and the clump of kernel launches on the right-hand side is more tightly packed and takes less time in the timeline. This was on CUDA 9.2. – Robert Crovella

@RobertCrovella I am also using CUDA 9.2. Did you use any other compiler switches, or did you modify some settings in the profiler? I run this code on Ubuntu 18.04, GPU: GTX 1080 Ti, CPU: Intel Xeon W-2133. – Catrexis

1 Answer


I'm not getting results as extreme as yours, so I'm not sure this will actually help you. I do see slower API calls from the second thread, so making sure that only one thread handles all the CUDA API calls improves the results somewhat. This is generally a good idea, but as you can see it is not guaranteed with sections, since either thread may pick up the kernel-launching section in any given iteration. A simple approach would be this:

#pragma omp parallel num_threads(2)
{
    for(int k = 0; k <= KMAX; k++)
    {
        if (omp_get_thread_num() == 0)
        {
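            // thread 0 is the only thread that ever issues CUDA API calls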
            k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);

            k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        }
        else
        {
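            // the other thread does the CPU work concurrently with the kernel launches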
            for(int i = 0; i < N2; i++)
            {
                c[i] = pow(a[i],b[i]);
            }
        }
        // this makes sure that the behavior is consistent
        #pragma omp barrier
    }
}

Note that I also moved the parallel region outside of the loop so that there is less thread-management overhead.

Another aspect to consider is that performance monitoring itself influences your performance, especially for these very short API calls. I added timing, increased the k-loop to 1000, and from the console I get the following results:

Serial time:                   0.036724
Parallel time (pinned thread): 0.021165
Parallel time (sections):      0.027331

With nvprof I get:

Serial time:                   0.058805
Parallel time (pinned thread): 0.054116
Parallel time (sections):      0.053535

So basically you have to take the results from within the visual profiler with a huge grain of salt. The insight from detailed tracing is often very useful, but in this case you should rely on end-to-end measurements.
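
For end-to-end timing of this kind, something simple like the following is enough (a minimal sketch using omp_get_wtime(); not necessarily the exact harness I used):

#include <omp.h>
#include <cstdio>

cudaDeviceSynchronize();                  // drain any previous GPU work before starting the clock
double t0 = omp_get_wtime();

// ... run the serial or the parallel variant of the k-loop here ...

cudaDeviceSynchronize();                  // make sure all GPU work is included in the measurement
double t1 = omp_get_wtime();
printf("time: %f\n", t1 - t0);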