1 vote

Consider the following program for enqueueing some work on a non-blocking GPU stream:

#include <iostream>
#include <memory>

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) { 
    *(reinterpret_cast<bool *>(ptr)) = true; 
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main() {
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 1e7;
    bool callback_executed = false;
    cudaStream_t stream;
    auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
    cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

The buffer size and the kernel's sleep length in cycles are large enough that, since they execute in parallel with the CPU thread, the enqueueing should finish well before the work itself has concluded (roughly 8 ms + 8 ms for the two copies and 20 ms for the kernel).

And yet, judging by the trace below, the two cudaMemcpyAsync() calls appear to be effectively synchronous, i.e. they block until the (non-blocking) stream has actually finished the copy. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does it make sense?


Trace (numbered lines, times in microseconds):

      1 "Start"        "Duration"    "Grid X"                             "Grid Y"  "Grid Z"    "Block X"   "Block Y"                       "Block Z"  
    104 14102.830000   59264.347000  "cudaMalloc"
    105 73368.351000   19.886000     "cudaStreamCreateWithFlags"
    106 73388.and 20 ms for the kernel).

And yet, looking at the trace below, it seems the two cudaMemcpyAsync()'s are actually synchronous, i.e. they block until the (non-blocking) stream has actually concluded the copying. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does it make sense?

850000   8330.257000   "cudaMemcpyAsync"
        107 73565.702000   8334.265000   47.683716                            5.587311  "Pageable"  "Device"    "GeForce GTX 650 Ti BOOST (0)"  "1"        
        108 81721.124000   2.394000      "cudaConfigureCall"
        109 81723.865000   3.585000      "cudaSetupArgument"
        110 81729.332000   30.742000     "cudaLaunch (dummy(__int64) [107])"
        111 81760.604000   39589.422000  "cudaMemcpyAsync"
        112 81906.303000   20157.648000  128                                  1         1           128         1                               1          
        113 102073.103000  18736.208000  47.683716                            2.485355  "Device"    "Pageable"  "GeForce GTX 650 Ti BOOST (0)"  "1"        
        114 121351.936000  5.560000      "cudaStreamSynchronize"
If I understand well, I think you may have missed this: docs.nvidia.com/cuda/cuda-runtime-api/… – Robin Thoni
@RobinThoni: So basically you're saying that if the host-side memory were pinned, I would get asynchronous behavior? – einpoklum
Yes, that would be the expected behavior. – Robin Thoni
@RobinThoni: But what about the host-to-device transfer? That seems to meet the criteria at the link for being asynchronous, and yet it seems to be done synchronously. – einpoklum
@RobinThoni: You're saying I should understand "from" and "to" as having their actual meaning, but an incompatible meaning :-( ... Anyway, thanks, and perhaps I could interest you in my follow-up question. – einpoklum
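
As an aside to this exchange: instead of switching the allocation to cudaMallocHost, the pageable buffer from the question can also be page-locked after the fact with cudaHostRegister, which should make both copies asynchronous as well. Below is a minimal sketch of that variant (my illustration, not code from the post; error checking omitted):

#include <cuda_runtime.h>
#include <memory>

int main() {
    const size_t buffer_size = 1e7;
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);

    // Same pageable allocation as in the question...
    auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
    // ...but page-locked after the fact, so the async copies below no longer
    // take the blocking/staging path used for pageable memory.
    cudaHostRegister(host_ptr.get(), buffer_size, cudaHostRegisterDefault);

    cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size,
                    cudaMemcpyDefault, stream);            // host-to-device
    cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size,
                    cudaMemcpyDefault, stream);            // device-to-host

    cudaStreamSynchronize(stream);       // wait before unpinning and freeing
    cudaHostUnregister(host_ptr.get());
    cudaFree(device_ptr);
    cudaStreamDestroy(stream);
}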

2 Answers

2 votes

This seemed weird, so I contacted someone from the CUDA driver team, who confirmed the documentation is correct. I was also able to confirm it:

#include <iostream>
#include <memory>
#include <stdlib.h>
#include <time.h>

using clock_value_t = long long;

__device__ void gpu_sleep(clock_value_t sleep_cycles) {
    clock_value_t start = clock64();
    clock_value_t cycles_elapsed;
    do { cycles_elapsed = clock64() - start; }
    while (cycles_elapsed < sleep_cycles);
}

void callback(cudaStream_t, cudaError_t, void *ptr) { 
    *(reinterpret_cast<bool *>(ptr)) = true; 
}

__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }

int main(int argc, char* argv[]) {
    cudaFree(0);
    struct timespec start, stop;
    const clock_value_t duration_in_clocks = 1e6;
    const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;
    bool callback_executed = false;
    cudaStream_t stream;
    void* host_ptr;
    if (argc == 1) {
        host_ptr = malloc(buffer_size);
    } else {
        cudaMallocHost(&host_ptr, buffer_size, 0);
    }
    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    double result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000<< std::endl;

    dummy<<<128, 128, 0, stream>>>(duration_in_clocks);

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
    cudaMemcpyAsync(host_ptr, device_ptr, buffer_size, cudaMemcpyDefault, stream);
    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
    result = (stop.tv_sec - start.tv_sec) * 1e6 + (stop.tv_nsec - start.tv_nsec) / 1e3;
    std::cout << "Elapsed: " << result / 1000 / 1000 << std::endl;

    cudaStreamAddCallback(
        stream, callback, &callback_executed, 0 /* fixed and meaningless */);
    auto snapshot = callback_executed;
    std::cout << "Right after we finished enqueuing work, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
    cudaStreamSynchronize(stream);
    snapshot = callback_executed;
    std::cout << "After cudaStreamSynchronize, the stream has "
        << (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}

This is basically your code, with a few modifications:

  • Time measurement (the repeated clock_gettime bookkeeping is factored into a helper in the sketch after this list)
  • A command-line switch to allocate either pageable or pinned host memory
  • A buffer size of 2 GiB to ensure a measurable copy time
  • cudaFree(0) to force CUDA's lazy initialisation to happen up front, so it does not skew the first measurement.
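
For reference, the repeated clock_gettime bookkeeping in the program above can be factored into a small helper. This is only a sketch of mine, not part of the original code, and the helper name is made up:

#include <time.h>

// Hypothetical helper: turn two timespec samples (taken with
// clock_gettime(CLOCK_PROCESS_CPUTIME_ID, ...)) into elapsed seconds.
static double elapsed_seconds(const timespec& start, const timespec& stop) {
    return (stop.tv_sec - start.tv_sec)
         + (stop.tv_nsec - start.tv_nsec) / 1e9;
}

// Usage, mirroring the measurements above:
//   timespec start, stop;
//   clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
//   cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);
//   clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &stop);
//   std::cout << "Elapsed: " << elapsed_seconds(start, stop) << " s" << std::endl;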

Here are the results:

$ nvcc -std=c++11 main.cu -lrt

$ ./a.out # using pageable memory
Elapsed: 0.360828 # (memcpy HtoD, pageable -> device, fully async)
Elapsed: 5.20288 # (memcpy DtoH, device -> pageable, sync)

$ ./a.out 1 # using pinned memory
Elapsed: 4.412e-06 # (memcpy HtoD, pinned -> device, fully async)
Elapsed: 7.127e-06 # (memcpy DtoH, device -> pinned, fully async)

It is slower when copying from pageable to device, but it is really async.
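
Another way to see the same thing without relying on wall-clock timing is to ask the stream whether it is still busy right after the call returns. Here is a small self-contained sketch of that check (again, just an illustration I added, not part of the code above):

#include <cuda_runtime.h>
#include <cstdlib>
#include <iostream>

int main(int argc, char* argv[]) {
    const size_t buffer_size = 2 * 1024 * 1024 * (size_t)1024;   // 2 GiB, as above
    void* host_ptr;
    if (argc == 1) { host_ptr = std::malloc(buffer_size); }      // pageable
    else           { cudaMallocHost(&host_ptr, buffer_size); }   // pinned

    char* device_ptr;
    cudaMalloc(&device_ptr, buffer_size);
    cudaStream_t stream;
    cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);

    cudaMemcpyAsync(device_ptr, host_ptr, buffer_size, cudaMemcpyDefault, stream);

    // If the call was (at least partially) asynchronous, the copy should still
    // be in flight at this point and the query should return cudaErrorNotReady.
    cudaError_t status = cudaStreamQuery(stream);
    std::cout << (status == cudaErrorNotReady
                      ? "stream still busy right after the call returned"
                      : "stream already idle right after the call returned")
              << std::endl;

    cudaStreamSynchronize(stream);
}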

I'm sorry for my mistake. I deleted my previous comments to avoid confusing people.

1 vote

It so happens that CUDA memory copies are only asynchronous under strict conditions, as @RobinThoni has kindly indicated. For the code in question, the issue is mostly the use of unpinned (that is, pageable) host memory.

To quote from a separate section of the Runtime API documentation (emphasis mine):

2. API synchronization behavior

The API provides memcpy/memset functions in both synchronous and asynchronous forms, the latter having an "Async" suffix. This is a misnomer as each function may exhibit synchronous or asynchronous behavior depending on the arguments passed to the function.

...

Asynchronous

  • For transfers from device memory to pageable host memory, the function will return only once the copy has completed.

and that's just the half of it! It's actually true that

  • For transfers from pageable host memory to device memory, the data will first be staged in pinned host memory, then copied to the device; and the function will return only after the staging has occurred.
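
In other words, for a pageable source buffer the driver effectively does something like the following behind the scenes: stage the data into a pinned bounce buffer, and only then issue the actual asynchronous device copy. This is only an illustration of the idea (the function name is mine, and the real driver stages in chunks using its own buffers), not the actual implementation:

#include <cuda_runtime.h>
#include <cstring>

// Rough illustration of what an "async" pageable host-to-device copy amounts to.
void memcpy_async_pageable_htod(void* dst_device, const void* src_pageable,
                                size_t size, cudaStream_t stream) {
    void* staging;                             // pinned bounce buffer
    cudaMallocHost(&staging, size);

    std::memcpy(staging, src_pageable, size);  // blocking: this is "the staging"

    // Only this part is asynchronous with respect to the host thread:
    cudaMemcpyAsync(dst_device, staging, size, cudaMemcpyHostToDevice, stream);

    // The call can return here, i.e. only after the staging has occurred,
    // while the device-side copy may still be in flight. (A real implementation
    // would also arrange for the staging buffer to be released once the copy completes.)
}

This is why, with a large pageable buffer, the first cudaMemcpyAsync() in the answer above returns only after a few hundred milliseconds: that is the time spent staging, not the duration of the device copy itself.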