Running the code below to write 1 GB in global memory in the NVIDIA Visual Profiler, I get:
- 100% storage efficiency
- 69.4% (128.6 GB/s) DRAM utilization
- 18.3% total replay overhead
- 18.3% global memory replay overhead.
The memory writes are supposed to be coalesced and there is no divergence in the kernel, so the question is where is the global memory replay overhead coming from? I am running this on Ubuntu 13.04, with nvidia-cuda-toolkit version 5.0.35-4ubuntu1.
#include <cuda.h>
#include <unistd.h>
#include <getopt.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <stdint.h>
#include <ctype.h>
#include <sched.h>
#include <assert.h>
static void
HandleError( cudaError_t err, const char *file, int line )
{
if (err != cudaSuccess) {
printf( "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
exit( EXIT_FAILURE );
}
}
#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
// Global memory writes
__global__ void
kernel_write(uint32_t *start, uint32_t entries)
{
uint32_t tid = threadIdx.x + blockIdx.x*blockDim.x;
while (tid < entries) {
start[tid] = tid;
tid += blockDim.x*gridDim.x;
}
}
int main(int argc, char *argv[])
{
uint32_t *gpu_mem; // Memory pointer
uint32_t n_blocks = 256; // Blocks per grid
uint32_t n_threads = 192; // Threads per block
uint32_t n_bytes = 1073741824; // Transfer size (1 GB)
float elapsedTime; // Elapsed write time
// Allocate 1 GB of memory on the device
HANDLE_ERROR( cudaMalloc((void **)&gpu_mem, n_bytes) );
// Create events
cudaEvent_t start, stop;
HANDLE_ERROR( cudaEventCreate(&start) );
HANDLE_ERROR( cudaEventCreate(&stop) );
// Write to global memory
HANDLE_ERROR( cudaEventRecord(start, 0) );
kernel_write<<<n_blocks, n_threads>>>(gpu_mem, n_bytes/4);
HANDLE_ERROR( cudaGetLastError() );
HANDLE_ERROR( cudaEventRecord(stop, 0) );
HANDLE_ERROR( cudaEventSynchronize(stop) );
HANDLE_ERROR( cudaEventElapsedTime(&elapsedTime, start, stop) );
// Report exchange time
printf("#Delay(ms) BW(GB/s)\n");
printf("%10.6f %10.6f\n", elapsedTime, 1e-6*n_bytes/elapsedTime);
// Destroy events
HANDLE_ERROR( cudaEventDestroy(start) );
HANDLE_ERROR( cudaEventDestroy(stop) );
// Free memory
HANDLE_ERROR( cudaFree(gpu_mem) );
return 0;
}