Consider the following program for enqueueing some work on a non-blocking GPU stream:
#include <iostream>
using clock_value_t = long long;
__device__ void gpu_sleep(clock_value_t sleep_cycles) {
clock_value_t start = clock64();
clock_value_t cycles_elapsed;
do { cycles_elapsed = clock64() - start; }
while (cycles_elapsed < sleep_cycles);
}
void callback(cudaStream_t, cudaError_t, void *ptr) {
*(reinterpret_cast<bool *>(ptr)) = true;
}
__global__ void dummy(clock_value_t sleep_cycles) { gpu_sleep(sleep_cycles); }
int main() {
const clock_value_t duration_in_clocks = 1e6;
const size_t buffer_size = 1e7;
bool callback_executed = false;
cudaStream_t stream;
auto host_ptr = std::unique_ptr<char[]>(new char[buffer_size]);
char* device_ptr;
cudaMalloc(&device_ptr, buffer_size);
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
cudaMemcpyAsync(device_ptr, host_ptr.get(), buffer_size, cudaMemcpyDefault, stream);
dummy<<<128, 128, 0, stream>>>(duration_in_clocks);
cudaMemcpyAsync(host_ptr.get(), device_ptr, buffer_size, cudaMemcpyDefault, stream);
cudaStreamAddCallback(
stream, callback, &callback_executed, 0 /* fixed and meaningless */);
snapshot = callback_executed;
std::cout << "Right after we finished enqueuing work, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
cudaStreamSynchronize(stream);
snapshot = callback_executed;
std::cout << "After cudaStreamSynchronize, the stream has "
<< (snapshot ? "" : "not ") << "concluded execution." << std::endl;
}
The size of the buffers and the length of the kernel sleep in cycles are high enough, that as they execute in parallel with the CPU thread, it should finish the enqueueing well before they've concluded (8ms+8ms for copying and 20 ms for the kernel).
And yet, looking at the trace below, it seems the two cudaMemcpyAsync()
are actually synchronous, i.e. they block until the (non-blocking) stream has actually concluded the copying. Is this intended behavior? It seems to contract the relevant section of the CUDA Runtime API documentation. How does it make sense?
Trace: (numbered lines, time in useconds):
1 "Start" "Duration" "Grid X" "Grid Y" "Grid Z" "Block X" "Block Y" "Block Z"
104 14102.830000 59264.347000 "cudaMalloc"
105 73368.351000 19.886000 "cudaStreamCreateWithFlags"
106 73388.and 20 ms for the kernel).
And yet, looking at the trace below, it seems the two cudaMemcpyAsync()
's are actually synchronous, i.e. they block until the (non-blocking) stream has actually concluded the copying. Is this intended behavior? It seems to contradict the relevant section of the CUDA Runtime API documentation. How does it make sense?
850000 8330.257000 "cudaMemcpyAsync"
107 73565.702000 8334.265000 47.683716 5.587311 "Pageable" "Device" "GeForce GTX 650 Ti BOOST (0)" "1"
108 81721.124000 2.394000 "cudaConfigureCall"
109 81723.865000 3.585000 "cudaSetupArgument"
110 81729.332000 30.742000 "cudaLaunch (dummy(__int64) [107])"
111 81760.604000 39589.422000 "cudaMemcpyAsync"
112 81906.303000 20157.648000 128 1 1 128 1 1
113 102073.103000 18736.208000 47.683716 2.485355 "Device" "Pageable" "GeForce GTX 650 Ti BOOST (0)" "1"
114 121351.936000 5.560000 "cudaStreamSynchronize"