0
votes

Fellow Cuda Programmers,

I'm trying to implement a CPU–GPU callback mechanism using polling. I have 2 arrays of length 1 (a and cpuflag, corresponding to dev_a and gpuflag on the device side) — basically 2 variables.

First the CPU clears a and waits for an update of gpuflag. The GPU sees this clearing of a and then updates gpuflag. The CPU asynchronously keeps transferring gpuflag to cpuflag and waits for an update in the flag. Once the CPU sees the update, it again resets a and asynchronously sends it to the GPU. Again the GPU sees this clearing of a and updates gpuflag, and the ping-pong process continues. I want this process to repeat 100 times.

The whole code is below. You can compile it simply with `nvcc -o output filename.cu`. I'm not able to understand why the code is not exhibiting the ping-pong behavior. Any kind of help is very much appreciated. Thanks in advance.

#include <stdio.h>

#define LEN 1
#define MAX 100

// Device side of the ping-pong protocol (question's original, broken version).
// Intended contract: spin until the host clears dev_a[tid] to 0, then set it
// back to 1 and bump gpuflag[tid]; repeat until MAX round trips have occurred.
// NOTE(review): dev_a and gpuflag are plain (non-volatile) pointers, so the
// compiler is free to cache their values in registers; the spin loops below
// may never observe the host's writes. The answer below identifies missing
// `volatile` as the main problem, and commenters note this host/device
// polling is undefined behaviour under the CUDA memory model regardless.
__global__ void myKernel(int len, int *dev_a, int *gpuflag) {
        int tid = threadIdx.x;
        gpuflag[tid] = 0;

        while(true){
        //Check if cpu has completed work
                if(dev_a[tid] == 0){
            //Do gpu work and increment flag
                        dev_a[tid] = 1;
                        gpuflag[tid]++;

            //Wait till cpu detects the flag increment and resets
            // NOTE(review): without volatile, dev_a[tid] may be loaded once
            // and this busy-wait can spin forever on a stale register value.
                        while(true){
                                if(dev_a[tid] == 0){
                                        break;
                                }
                        }
                }
        //Max 100 ping pongs
        if(gpuflag[tid]==MAX){
            break;
        }
        }
}

// Host side of the ping-pong protocol (question's original, broken version).
// Launches the polling kernel on stream0, then loops: copy gpuflag back,
// check for an increment, reset a on the device, repeat up to MAX times.
int main( void ) {
        int index, *cpuflag, *gpuflag, value;

        int *a;             // pinned host copy of the work flag
        int *dev_a;         // device work flag the kernel polls

        cudaStream_t stream0, stream1;  // stream0: kernel, stream1: copies

        cudaStreamCreate( &stream0 );
        cudaStreamCreate( &stream1 );

        // NOTE(review): no CUDA API call in this program is error-checked.
        cudaMalloc ( (void**)&gpuflag, LEN*sizeof(int) );
        cudaMemset ( gpuflag, 0, LEN*sizeof(int) );
        cudaHostAlloc( (void**)&cpuflag, LEN*sizeof(int), cudaHostAllocDefault );

        cudaMalloc ( (void**)&dev_a, LEN*sizeof(int) );
        cudaMemset ( dev_a, 0, LEN*sizeof(int) );
        cudaHostAlloc( (void**)&a, LEN*sizeof(int), cudaHostAllocDefault );

    //Reset everything
        for(int i=0; i<LEN; i++)
                a[i] = 0;

    //Auxiliary variables: index selects the (single) flag slot; value is the
    //gpuflag count we expect to see after the next GPU increment.
        index = 0;
    value = 1;

    //call kernel (runs concurrently with the polling loop below)
        myKernel<<<1,1,0,stream0>>>(LEN, dev_a, gpuflag);

        while(true){
        //Asynchronously copy gpu flag
                cudaMemcpyAsync(cpuflag, gpuflag, LEN*sizeof(int), cudaMemcpyDeviceToHost, stream1);
        //Check if increment has happened or not
        // NOTE(review): cpuflag is read immediately after cudaMemcpyAsync with
        // no cudaStreamSynchronize(stream1), so this tests a value the copy
        // may not have delivered yet — a race even before the device-side
        // volatile problem flagged in the answer.
                if(cpuflag[index] == value){
            //if yes, reset 
                for(int i=0; i<LEN; i++)
                        a[i] = 0;
            //transfer asynchronously
                    cudaMemcpyAsync(dev_a, a, LEN*sizeof(int), cudaMemcpyHostToDevice, stream1);
            //increment pattern
            value++;
                        printf("GPU updated once. Value is a[%d] = %d, cpuflag = %d\n", index, a[index], cpuflag[index]);
                } else {
                        printf("------------GPU didn't updated. Value is a[%d] = %d, cpuflag = %d\n", index, a[index], cpuflag[index]);
        }

        //Max 100 ping-pongs
        if(value == MAX){
            break;
        }
        }

    cudaFreeHost(a);
    cudaFreeHost(cpuflag);

    cudaFree(dev_a);
    cudaFree(gpuflag);

    cudaStreamDestroy( stream0 );
    cudaStreamDestroy( stream1 );

        return 0;
}
1
This sort of memory coherence isn't guaranteed by the CUDA memory model. You might be able to get somewhat further by making the kernel execute a system-wide threadfence if your GPU supports it, but this is really not defined behaviour. — talonmies
I agree, this gets you undefined behaviour. But in CUDA 5.0 you can use the host-hooks mechanism (stream callbacks) to enqueue a host call after a specific kernel finishes execution. It's fully asynchronous in the CUDA 5.0 developer preview. — geek

1 Answer

2
votes

Probably the main thing missing is appropriate use of volatile.

Here's a simplified, fully worked example:

$ cat t763.cu
#include <stdio.h>

#define LEN 1
#define MAX 100
#define DLEN 1000
#define nTPB 256

#ifdef CDP_WORKER
// Child kernel launched from the supervisory kernel via CUDA Dynamic
// Parallelism: bumps each element of data by one. One thread per element,
// with a guard for the grid tail.
__global__ void cdp_worker(int len, float *data){

  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < len)
    data[idx] += 1.0f; // simple increment
}
#endif

// only call this kernel with 1 thread
__global__ void myKernel(int len, int dlen, volatile int *dev_a, int *gpuflag, float *data) {
        int tid = threadIdx.x+blockDim.x*blockIdx.x;

        while(gpuflag[tid] < MAX){
        //Check if cpu has completed work
                if(dev_a[tid] == 0){
            //Do gpu work and increment flag
#ifdef CDP_WORKER
                        cdp_worker<<<(dlen+nTPB-1)/nTPB, nTPB>>>(dlen, data);
                        cudaDeviceSynchronize();
#endif
                        dev_a[tid] = 1;
                        gpuflag[tid]++;

                                }
        }
}

// Host-side work step for one ping-pong round (only active when the CDP
// worker is compiled in): pull the data back, verify every element equals
// value+1, then push it back to the device unchanged.
void issue_work(int value, float *h_data, float *d_data, int len, cudaStream_t mystream){
#ifdef CDP_WORKER
  const float expected = (float)(value+1);
  cudaMemcpyAsync(h_data, d_data, len*sizeof(float), cudaMemcpyDeviceToHost, mystream);
  cudaStreamSynchronize(mystream);
  for (int i = 0; i < len; i++){
    if (h_data[i] != expected){
      printf("fault - was %f, should be %f\n", h_data[i], expected);
      break;
    }
  }
  cudaMemcpyAsync(d_data, h_data, len*sizeof(float), cudaMemcpyHostToDevice, mystream); // technically not really necessary
  cudaStreamSynchronize(mystream);
#endif
  return;
}
// Host side of the fixed ping-pong protocol: the flag z_a lives in mapped
// (zero-copy) pinned host memory, so both host and the volatile-qualified
// kernel pointer observe the same storage without explicit copies.
int main( void ) {
        int *gpuflag, value;
        float *h_data, *d_data;
        cudaHostAlloc(&h_data, DLEN*sizeof(float), cudaHostAllocDefault);
        cudaMalloc(&d_data, DLEN*sizeof(float));
        volatile int *z_a;   // mapped host flag, polled by both sides

        cudaStream_t stream0, stream1;  // stream0: kernel, stream1: data copies

        cudaStreamCreate( &stream0 );
        cudaStreamCreate( &stream1 );

        cudaMalloc ( (void**)&gpuflag, LEN*sizeof(int) );
        cudaMemset ( gpuflag, 0, LEN*sizeof(int) );
        cudaMemset ( d_data, 0, DLEN*sizeof(float));
        // Mapped allocation: the host pointer is passed straight to the kernel
        // below (assumes unified virtual addressing — TODO confirm for the
        // target platform).
        cudaHostAlloc( (void**)&z_a, LEN*sizeof(int), cudaHostAllocMapped );
        // Chained assignment continued across two lines: z_a[i] = value = 0;
        // zeroes both the flag and the round counter.
        for (int i = 0; i < LEN; i++) z_a[i] =
        value = 0;
    //call kernel (runs concurrently with the polling loop below)
        myKernel<<<1,1,0,stream0>>>(LEN, DLEN, z_a, gpuflag, d_data);

        // Poll the mapped flag: each time the kernel raises it, do the host
        // work for that round, lower the flag, and count the round.
        while(value<MAX){
          if (z_a[0] == 1) {
             issue_work(value, h_data, d_data, DLEN, stream1);
             z_a[0] = 0;
             printf("%d", value%10);
             value++;}
        }
        printf("\n");
        // NOTE(review): demo exits without cudaDeviceSynchronize, cudaFree,
        // cudaFreeHost, or cudaStreamDestroy; fine for a minimal example but
        // add cleanup in real code.
        return 0;
}
$ nvcc -o t763 t763.cu
$ cuda-memcheck ./t763
========= CUDA-MEMCHECK
0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
========= ERROR SUMMARY: 0 errors
$ nvcc -DCDP_WORKER -arch=sm_35 -rdc=true t763.cu -o t763 -lcudadevrt
$ cuda-memcheck ./t763
========= CUDA-MEMCHECK
0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
========= ERROR SUMMARY: 0 errors
$

Extending this to work on multiple threads in the same warp is not a trivial matter.

However, I've extended the basic example to demonstrate, on a cc3.5+ device, that the parent kernel can be the supervisory kernel, and it can launch work via child kernels. This is accomplished by compiling with the CDP_WORKER switch and the additional switches needed for CUDA Dynamic Parallelism, and by running on a cc3.5+ device.