I have come across a serialization issue in CUDA kernels where concurrent execution is expected. I an using cudaEvents as markers to track kernel executions.
In my experiments on concurrent kernels with multiple streams, we observed that using events on their respective streams causes concurrent kernels to get serialized.
The code below demonstrates this issue. I tested this on two different devices which have concurrent kernel execution capabilities listed below:
- Tesla C2070, Driver version 4.10, Runtime version 4.10, CUDA capability 2.0
- Tesla M2090, Driver version 4.10, Runtime version 4.10, CUDA capability 2.0
You can run the program with and w/o events by changing USE_EVENTS macro and you will observe the difference due to concurrent execution vs. serial execution.
#include<cuda.h>
#include<pthread.h>
#include<stdio.h>
#include<stdlib.h>
#include<stdint.h>
#define CUDA_SAFE_CALL( call) do { \
cudaError_t err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in call at file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(-1); \
} } while (0)
// Device code
__global__ void VecAdd(uint64_t len)
{
volatile int a;
for(uint64_t n = 0 ; n < len ; n ++)
a++;
return ;
}
#define USE_EVENTS
int
main(int argc, char *argv[])
{
cudaStream_t stream[2];
for(int i = 0 ; i < 2 ; i++)
CUDA_SAFE_CALL(cudaStreamCreate(&stream[i]));
#ifdef USE_EVENTS
cudaEvent_t e[4];
CUDA_SAFE_CALL(cudaEventCreate(&e[0]));
CUDA_SAFE_CALL(cudaEventCreate(&e[1]));
CUDA_SAFE_CALL(cudaEventRecord(e[0],stream[0]));
#endif
VecAdd<<<1, 32, 0, stream[0]>>>(0xfffffff);
#ifdef USE_EVENTS
CUDA_SAFE_CALL(cudaEventRecord(e[1],stream[0]));
#endif
#ifdef USE_EVENTS
CUDA_SAFE_CALL(cudaEventCreate(&e[2]));
CUDA_SAFE_CALL(cudaEventCreate(&e[3]));
CUDA_SAFE_CALL(cudaEventRecord(e[2],stream[1]));
#endif
VecAdd<<<1, 32, 0, stream[1]>>>(0xfffffff);
#ifdef USE_EVENTS
CUDA_SAFE_CALL(cudaEventRecord(e[3],stream[1]));
#endif
CUDA_SAFE_CALL(cudaDeviceSynchronize());
for(int i = 0 ; i < 2 ; i++)
CUDA_SAFE_CALL(cudaStreamDestroy(stream[i]));
return 0;
}
Any suggestions in why this might be happening and how to circumvent this serialization will be useful.