I have a program running on the GPU using CUDA with a lot of small kernels, which means that a kernel launch from my CPU takes about as long as the kernel's execution on the GPU.
I would like to add a CPU function to my program loop that takes about as long as one iteration of all my kernels. I know that after a kernel launch the CPU can work asynchronously to the GPU, but because my last kernel launch is not far ahead of the work actually executing on the GPU, this is not an option in this case.
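To illustrate what I mean, the usual single-threaded overlap pattern looks roughly like the sketch below (placeholder kernel and function names, not my actual code); it only helps if the launches run far enough ahead of the GPU:

// Minimal sketch of the usual overlap pattern (placeholder names).
// The launch returns immediately, so cpu_work() overlaps with the kernel --
// but only if the GPU still has enough queued work when the CPU part starts.
__global__ void k_gpu_work(float* data, int n) { /* ... */ }

void cpu_work(float* data, int n) { /* ... */ }

void loop_iteration(float* d_data, float* h_data, int n)
{
    k_gpu_work<<<(n + 63)/64, 64>>>(d_data, n);  // asynchronous launch
    cpu_work(h_data, n);                         // CPU work overlaps with kernel execution
    cudaDeviceSynchronize();                     // wait for the GPU before the next iteration
}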
So my idea was to use multiple threads: one thread to launch the GPU kernels and another one (or several others) to execute the CPU function, and to run those two in parallel.
I created a small example to test this idea:
#include <unistd.h>
#include <cmath>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>
#define THREADS_PER_BLOCK 64
__global__ void k_dummykernel1(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {
        float ai = a[id];
        float bi = b[id];
        c[id] = powf(expf(bi*sinf(ai)),1.0/bi);
    }
}

__global__ void k_dummykernel2(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {
        float bi = b[id];
        c[id] = powf(c[id],bi);
    }
}

__global__ void k_dummykernel3(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {
        float bi = b[id];
        c[id] = logf(c[id])/bi;
    }
}

__global__ void k_dummykernel4(const float* a, const float* b, float* c, const int N)
{
    const int id = blockIdx.x * blockDim.x + threadIdx.x;
    if(id < N)
    {
        c[id] = asinf(c[id]);
    }
}
int main()
{
    int N = 10000;
    int N2 = N/5;

    float *a = new float[N];
    float *b = new float[N];
    float *c = new float[N];
    float *d_a, *d_b, *d_c;

    for(int i = 0; i < N; i++)
    {
        a[i] = (10*(1+i))/(float)N;
        b[i] = (i+1)/50.0;
    }

    cudaMalloc((void**)&d_a, N*sizeof(float));
    cudaMalloc((void**)&d_b, N*sizeof(float));
    cudaMalloc((void**)&d_c, N*sizeof(float));

    cudaMemcpy(d_a, a, N*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, N*sizeof(float), cudaMemcpyHostToDevice);

    cudaProfilerStart();

    // Sequential version: launch the eight kernels, then do the CPU work.
    for(int k = 0; k < 100; k++)
    {
        k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
        k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);

        for(int i = 0; i < N2; i++)
        {
            c[i] = pow(a[i],b[i]);
        }
    }
    cudaDeviceSynchronize();

    // Pause between the two runs.
    usleep(40000);

    // Parallel version: one OpenMP section launches the kernels,
    // the other section does the CPU work at the same time.
    for(int k = 0; k <= 100; k++)
    {
        #pragma omp parallel sections num_threads(2)
        {
            #pragma omp section
            {
                k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
                k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
            }
            #pragma omp section
            {
                for(int i = 0; i < N2; i++)
                {
                    c[i] = pow(a[i],b[i]);
                }
            }
        }
    }

    cudaDeviceSynchronize();
    cudaProfilerStop();

    delete[] a;
    delete[] b;
    delete[] c;

    cudaFree((void*)d_a);
    cudaFree((void*)d_b);
    cudaFree((void*)d_c);
}
I compile using: nvcc main.cu -O3 -Xcompiler -fopenmp
First, I run the 2x4 kernels and the CPU computation sequentially; after that, I try to do the same in parallel using OpenMP sections.
This is the result in the profiler:
The parallel version is much slower than the sequential one...
If I zoom into the sequential part, it looks like this:
One can see that between each group of 8 kernel launches there is a gap where the CPU computations are done (this is the gap I would like to close by overlapping it with the kernel launches).
If I zoom into the parallel part (same zoom level!), it looks like this:
There are no gaps anymore, but each kernel launch now takes about 15 microseconds (vs. 5 microseconds before).
I also tried bigger array sizes and std::thread instead of OpenMP, but the problem is always the same.
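For reference, the std::thread variant I tried was essentially the following split inside the loop body, replacing the OpenMP sections (a rough sketch; it needs #include <thread> and uses the same kernels and arrays as above):

// Rough sketch of one loop iteration of the std::thread variant.
// One thread launches the kernel batch, the other does the CPU computation.
std::thread gpuThread([&]() {
    k_dummykernel1<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
    k_dummykernel2<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
    k_dummykernel3<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
    k_dummykernel4<<<(N + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(d_a,d_b,d_c,N);
    // ... second group of four launches as in the OpenMP version ...
});
std::thread cpuThread([&]() {
    for(int i = 0; i < N2; i++)
        c[i] = pow(a[i],b[i]);
});
gpuThread.join();
cpuThread.join();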
Can someone tell me whether this can be made to work at all, and if so, what I am doing wrong?
Thanks in advance
Cat