Originally I asked why my kernels refused to run concurrently when I specified different streams. That has now been solved, but their concurrent behaviour is still not clear to me.
I know my system can run multiple streams, as the concurrentKernels CUDA sample runs fine. I can also extend this sample so that it mimics my code, and it still runs concurrently. Apologies in advance for the amount of code. I wanted to post it all because there is probably one small thing blocking my kernels from running concurrently, or I'm thinking it might be something to do with having structures or lots of separate files. Furthermore, I am sure it is useful to anyone attempting to help me! I wrote the following simplified programme which replicates my problem:
testMain.c
#include <stdlib.h>
#include <signal.h>
#include "test.h"
#define Nsim 900000
#define Ncomp 20
Vector* test1;
Vector* test2;
Vector* test3;
cudaStream_t stream1;
cudaStream_t stream2;
cudaStream_t stream3;
int
main (int argc, char **argv)
{
test1 = Get_Vector(Nsim);
test2 = Get_Vector(Nsim);
test3 = Get_Vector(Nsim);
checkGPU( cudaStreamCreate(&stream1) );
checkGPU( cudaStreamCreate(&stream2) );
checkGPU( cudaStreamCreate(&stream3) );
int x = 0;
for (x = 0; x < Ncomp; x++)
{
computeGPU(test1, test2, test3, x);
checkGPU( cudaDeviceSynchronize() );
}
checkGPU( cudaDeviceSynchronize() );
checkGPU( cudaStreamDestroy(stream1) );
checkGPU( cudaStreamDestroy(stream2) );
checkGPU( cudaStreamDestroy(stream3) );
Free_Vector(test1);
Free_Vector(test2);
Free_Vector(test3);
checkGPU( cudaDeviceReset() );
exit(EXIT_SUCCESS);
}
basics.c
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include "basics.h"
/* no "inline" here: a C99 inline definition in a .c file without "extern"
   provides no external definition, which breaks linking against basics.h */
void gpuAssert(cudaError_t code, const char *file, int line)
{
if (code != cudaSuccess)
{
fprintf(stderr,"CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
exit(EXIT_FAILURE);
}
}
basics.h
#ifndef _BASICS_H
#define _BASICS_H
#include <cuda_runtime.h>
#define checkGPU(ans) { gpuAssert((ans), __FILE__, __LINE__); }
void gpuAssert(cudaError_t code, const char *file, int line);
#endif // _BASICS_H
test.cu
extern "C"
{
#include "test.h"
}
__global__ void compute(int* in, int x)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
/* note: with the rounded-up grid some threads get i >= N here; a real kernel
   should guard with something like if (i < N), omitted in this repro */
in[i] = (int) (x * 1.05 / 0.4);
}
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
int threadsPerBlock = 256;
int blocksPerGrid = (in1->N + threadsPerBlock - 1) / threadsPerBlock;
compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream2>>>(in2->d_data, x);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream3>>>(in3->d_data, x);
}
test.h
#ifndef _TEST_H
#define _TEST_H
#include "vector.h"
#include "basics.h"
#include <cuda_runtime.h>
extern cudaStream_t stream1;
extern cudaStream_t stream2;
extern cudaStream_t stream3;
extern void computeGPU(Vector* in1, Vector* in2, Vector* in3, int x);
#endif // _TEST_H
vector.c
#include <stdlib.h>
#include "vector.h"
#include "basics.h"
Vector*
Get_Vector(int N)
{
Vector* v = (Vector*) calloc(1, sizeof(Vector));
v->N = N;
checkGPU( cudaMalloc((void**) &v->d_data, N * sizeof(int)) );
return v;
}
void
Free_Vector(Vector* in)
{
checkGPU( cudaFree(in->d_data) );
free(in);
}
vector.h
#ifndef _VECTOR_H
#define _VECTOR_H
typedef struct
{
int N;
int* d_data;
} Vector;
extern Vector* Get_Vector(int N);
extern void Free_Vector(Vector* in);
#endif // _VECTOR_H
I compile with:
nvcc -gencode arch=compute_20,code=sm_20 -O3 -use_fast_math -lineinfo -o test testMain.c test.cu basics.c vector.c; time ./test
And nvvp shows the kernels running one after another, not concurrently:

With Robert's help I solved this problem by reducing Nsim.
- If Nsim is big (900000), as in my question, the GPU is full of blocks and therefore cannot run my kernels concurrently even if they are launched in separate streams. The profile results are as above.
- If Nsim is small (900), the kernels can in theory run concurrently, but my kernel is so simple that each finishes quicker than the overhead of launching the next, so the whole simulation is just Launch Compute(int*,int,int) in the RuntimeAPI row. The profile results look like this:

If I change my kernel and code so that the kernel takes longer to run (and set Nsim to something reasonable, 3000; the exact value is not important now):
test.cu
__global__ void compute(int* in, int x, int y)
{
int i = blockIdx.x*blockDim.x + threadIdx.x;
in[i] = (int) (x * 1.05 / 0.4);
int clock_count = 5000000 * y;
clock_t start_clock = clock();
clock_t clock_offset = 0;
while (clock_offset < clock_count)
{
clock_offset = clock() - start_clock;
}
}
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
int threadsPerBlock = 256;
int blocksPerGrid = (in1->N + threadsPerBlock - 1) / threadsPerBlock;
compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream2>>>(in2->d_data, x, 2);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream3>>>(in3->d_data, x, 3);
}
My kernels now run concurrently, with each trio finishing before the next three launch, because I synchronise within my loop:

- However, if I launch my kernels with the following changes, I would expect that because I launch all my kernels in the loop and only then synchronise, the kernels should all run back to back: the quickest ones finishing a third of the way through the run, the next two thirds of the way through, and the last at the end. Instead they intersperse. What is happening here? Is CUDA doing some magic, realising that it has to wait for the long kernels to finish anyway, and somehow being more optimised by interspersing the others? The kernels are all launched up front and the runtime is just waiting on the one synchronise (this can be seen in the RuntimeAPI row).
testMain.c
int x = 0;
for (x = 0; x < Ncomp; x++)
{
computeGPU(test1, test2, test3, x);
//checkGPU( cudaDeviceSynchronize() );
}
checkGPU( cudaDeviceSynchronize() );

- Furthermore, launching kernels as follows is very confusing and not as expected. Surely the streams can synchronise up better than this, given that stream1 and stream3 take the same total time to run (three kernels of duration 1 versus one kernel of duration 3) and the stream2 kernel just needs to fit in somewhere alongside them.
test.cu
extern "C" void
computeGPU(Vector* in1, Vector* in2, Vector* in3, int x)
{
int threadsPerBlock = 256;
int blocksPerGrid = (in1->N + threadsPerBlock - 1) / threadsPerBlock;
compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream1>>>(in1->d_data, x, 1);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream2>>>(in2->d_data, x, 2);
compute<<<blocksPerGrid, threadsPerBlock, 0, stream3>>>(in3->d_data, x, 3);
}

Regarding calloc: shouldn't this: v->h_data = (int*) calloc(1, N * sizeof(Vector)); use sizeof(int) instead of sizeof(Vector)? Anyway, it's not clear why you think these kernels should overlap or run concurrently. A kernel which fully occupies a GPU doesn't leave any room for the blocks of the next kernel to begin executing until it is complete, or nearly complete. With Nsim of 900000 you are launching over 3500 blocks per kernel. These blocks will tend to fill the GPU and prevent any blocks from other kernels from running. - Robert Crovella