1 vote

Consider the following program for enqueueing some work on a non-blocking GPU stream:

#include <iostream>
#include <memory>

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) { 
    *(reinterpret_cast<bool *>(ptr)) = true; 
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main() {
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 1e7;
    bool callback_executed = false;
    cudaStream_t stream;
    auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
    cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

The buffer size and the kernel's sleep length in cycles are large enough that, since they execute in parallel with the CPU thread, the enqueueing should finish well before the work itself has concluded (roughly 8 ms + 8 ms for the two copies and 20 ms for the kernel).

And yet, judging by the trace below, the two cudaMemcpyAsync() calls appear to be effectively synchronous, i.e. they block until the (non-blocking) stream has actually finished the copy. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does it make sense?


Trace (numbered lines, times in microseconds):

      1 "Start"        "Duration"    "Grid X"                             "Grid Y"  "Grid Z"    "Block X"   "Block Y"                       "Block Z"  
    104 14102.830000   59264.347000  "cudaMalloc"
    105 73368.351000   19.886000     "cudaStreamCreateWithFlags"
    106 73388.and 20 ms for the kernel).

And yet, looking at the trace below, it seems the two cudaMemcpyAsync()'s are actually synchronous, i.e. they block until the (non-blocking) stream has actually concluded the copying. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does it make sense?

850000   8330.257000   "cudaMemcpyAsync"
        107 73565.702000   8334.265000   47.683716                            5.587311  "Pageable"  "Device"    "GeForce GTX 650 Ti BOOST (0)"  "1"        
        108 81721.124000   2.394000      "cudaConfigureCall"
        109 81723.865000   3.585000      "cudaSetupArgument"
        110 81729.332000   30.742000     "cudaLaunch (dummy(__int64) [107])"
        111 81760.604000   39589.422000  "cudaMemcpyAsync"
        112 81906.303000   20157.648000  128                                  1         1           128         1                               1          
        113 102073.103000  18736.208000  47.683716                            2.485355  "Device"    "Pageable"  "GeForce GTX 650 Ti BOOST (0)"  "1"        
        114 121351.936000  5.560000      "cudaStreamSynchronize"
If I understand well, I think you may have missed this: docs.nvidia.com/cuda/cuda-runtime-api/… – Robin Thoni
@RobinThoni: So basically you're saying that if the host-side memory were pinned, I would get asynchronous behavior? – einpoklum
Yes, that would be the expected behavior. – Robin Thoni
@RobinThoni: But what about the host-to-device transfer? That seems to meet the criteria at the link for being asynchronous, and yet it seems to be done synchronously. – einpoklum
@RobinThoni: You're saying I should understand "from" and "to" as having their actual meaning, but an incompatible meaning :-( ... Anyway, thanks, and perhaps I could interest you in my follow-up question. – einpoklum
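
As an aside to this exchange: instead of switching the allocation to cudaMallocHost, the pageable buffer from the question can also be page-locked after the fact with cudaHostRegister, which should make both copies asynchronous as well. Below is a minimal sketch of that variant (my illustration, not code from the post; error checking omitted):

#include <cuda_runtime.h>
#include <memory>

int main() {
    const size_t buffer_size = 1e7;
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);

    // Same pageable allocation as in the question...
    auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
    // ...but page-locked after the fact, so the async copies below no longer
    // take the blocking/staging path used for pageable memory.
    cudaHostRegister(host_ptr.get(), buffer_size, cudaHostRegisterDefault);

    cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size,
                    cudaMemcpyDefault, stream);            // host-to-device
    cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size,
                    cudaMemcpyDefault, stream);            // device-to-host

    cudaStreamSynchronize(stream);       // wait before unpinning and freeing
    cudaHostUnregister(host_ptr.get());
    cudaFree(device_ptr);
    cudaStreamDestroy(stream);
}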

2 Answers

2 votes

This seemed weird, so I contacted someone from the CUDA driver team, who confirmed the documentation is correct. I was also able to confirm it:

#include <iostream>
#include <memory>
#include <stdlib.h>
#include <time.h>

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) { 
    *(reinterpret_cast<bool *>(ptr)) = true; 
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main(int argc, char* argv[]) {
    cudaFree(0);
    struct timespec start, stop;
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;
    bool callback_executed = false;
    cudaStream_t stream;
    void* host_ptr;
    if (argc == 1) {
        host_ptr = malloc(buffer_size);
    } else {
        cudaMallocHost(&host_ptr, buffer_size, 0);
    }
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    double result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000<< std::endl;

    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(host_ptr, device_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;

    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

This is basically your code, with a few modifications:

  • Time measurement (the repeated clock_gettime bookkeeping is factored into a helper in the sketch after this list)
  • A command-line switch to allocate either pageable or pinned host memory
  • A buffer size of 2 GiB to ensure a measurable copy time
  • cudaFree(0) to force CUDA's lazy initialisation to happen up front, so it does not skew the first measurement.
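
For reference, the repeated clock_gettime bookkeeping in the program above can be factored into a small helper. This is only a sketch of mine, not part of the original code, and the helper name is made up:

#include <time.h>

// Hypothetical helper: turn two timespec samples (taken with
// clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...)) into elapsed seconds.
static double elapsed_seconds(const timespec& start, const timespec& stop) {
    return (stop.tv_sec - start.tv_sec)
         + (stop.tv_nsec - start.tv_nsec) / 1e9;
}

// Usage, mirroring the measurements above:
//   timespec start, stop;
//   clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
//   cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
//   clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
//   std::cout << "Elapsed: " << elapsed_seconds(start, stop) << " s" << std::endl;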

Here are the results:

$ nvcc -std=c++11 main.cu -lrt

$ ./a.out # using pageable memory
Elapsed: 0.360828 # (memcpy HtoD, pageable -> device, fully async)
Elapsed: 5.20288 # (memcpy DtoH, device -> pageable, sync)

$ ./a.out 1 # using pinned memory
Elapsed: 4.412e-06 # (memcpy HtoD, pinned -> device, fully async)
Elapsed: 7.127e-06 # (memcpy DtoH, device -> pinned, fully async)

It is slower when copying from pageable to device, but it is really async.
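
Another way to see the same thing without relying on wall-clock timing is to ask the stream whether it is still busy right after the call returns. Here is a small self-contained sketch of that check (again, just an illustration I added, not part of the code above):

#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>

int main(int argc, char* argv[]) {
    const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;   // 2 GiB, as above
    void* host_ptr;
    if (argc == 1) { host_ptr = std::malloc(buffer_size); }      // pageable
    else           { cudaMallocHost(&host_ptr, buffer_size); }   // pinned

    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);

    // If the call was (at least partially) asynchronous, the copy should still
    // be in flight at this point and the query should return cudaErrorNotReady.
    cudaError_t status = cudaStreamQuery(stream);
    std::cout << (status == cudaErrorNotReady
                      ? "stream still busy right after the call returned"
                      : "stream already idle right after the call returned")
              << std::endl;

    cudaStreamSynchronize(stream);
}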

I'm sorry for my mistake. I deleted my previous comments to avoid confusing people.

1 vote

It so happens that CUDA memory copies are only asynchronous under strict conditions, as @RobinThoni has kindly indicated. For the code in question, the issue is mostly the use of unpinned (that is, pageable) host memory.

To quote from a separate section of the Runtime API documentation (emphasis mine):

2. API synchronization behavior

The API provides memcpy/memset functions in both synchronous and asynchronous forms, the latter having an "Async" suffix. This is a misnomer as each function may exhibit synchronous or asynchronous behavior depending on the arguments passed to the function.

...

Asynchronous

  • For transfers from device memory to pageable host memory, the function will return only once the copy has completed.

and that's just the half of it! It's actually true that

  • For transfers from pageable host memory to device memory, the data will first be staged in pinned host memory, then copied to the device; and the function will return only after the staging has occurred.
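
In other words, for a pageable source buffer the driver effectively does something like the following behind the scenes: stage the data into a pinned bounce buffer, and only then issue the actual asynchronous device copy. This is only an illustration of the idea (the function name is mine, and the real driver stages in chunks using its own buffers), not the actual implementation:

#include <cuda_runtime.h>
#include <cstring>

// Rough illustration of what an "async" pageable host-to-device copy amounts to.
void memcpy_async_pageable_htod(void* dst_device, const void* src_pageable,
                                size_t size, cudaStream_t stream) {
    void* staging;                             // pinned bounce buffer
    cudaMallocHost(&staging, size);

    std::memcpy(staging, src_pageable, size);  // blocking: this is "the staging"

    // Only this part is asynchronous with respect to the host thread:
    cudaMemcpyAsync(dst_device, staging, size, cudaMemcpyHostToDevice, stream);

    // The call can return here, i.e. only after the staging has occurred,
    // while the device-side copy may still be in flight. (A real implementation
    // would also arrange for the staging buffer to be released once the copy completes.)
}

This is why, with a large pageable buffer, the first cudaMemcpyAsync() in the answer above returns only after a few hundred milliseconds: that is the time spent staging, not the duration of the device copy itself.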