CUDA atomic operation performance in different scenarios

Question

When I came across this question on SO, I was curious to know the answer. so I wrote below piece of code to test atomic operation performance in different scenarios. The OS is Ubuntu 12.04 with CUDA 5.5 and the device is GeForce GTX780 (Kepler architecture). I compiled the code with -O3 flag and for CC=3.5.

#include <stdio.h>

static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        printf( "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

#define BLOCK_SIZE 256
#define RESTRICTION_SIZE 32

__global__ void CoalescedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+i, 6);  //arbitrary number to add
    }
}

__global__ void AddressRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+(i&(RESTRICTION_SIZE-1)), 6);   //arbitrary number to add
    }
}

__global__ void WarpRestrictedAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data+(i>>5), 6); //arbitrary number to add
    }
}

__global__ void SameAddressAtomicOnGlobalMem(int* data, int nElem)
{
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( data, 6);    //arbitrary number to add
    }
}

__global__ void CoalescedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+threadIdx.x, data[i]);
    }
}

__global__ void AddressRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+(threadIdx.x&(RESTRICTION_SIZE-1)), data[i&(RESTRICTION_SIZE-1)]);
    }
}

__global__ void WarpRestrictedAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data+(threadIdx.x>>5), data[i>>5]);

    }
}

__global__ void SameAddressAtomicOnSharedMem(int* data, int nElem)
{
    __shared__ int smem_data[BLOCK_SIZE];
    unsigned int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
    for ( unsigned int i = tid; i < nElem; i += blockDim.x*gridDim.x){
        atomicAdd( smem_data, data[0]);
    }
}

int main(void)
{

    const int n = 2 << 24;
    int* data = new int[n];

    int i;
    for(i=0; i<n; i++) {
        data[i] = i%1024+1;
    }

    int* dev_data;
    HANDLE_ERROR( cudaMalloc((void **)&dev_data, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemset(dev_data, 0, sizeof(int) * size_t(n)) );
    HANDLE_ERROR( cudaMemcpy( dev_data, data, n * sizeof(int), cudaMemcpyHostToDevice) );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        CoalescedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        AddressRestrictedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        WarpRestrictedAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        SameAddressAtomicOnGlobalMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        CoalescedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        AddressRestrictedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        WarpRestrictedAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    for(int i=0; i<50; i++)
    {
        dim3 blocksize(BLOCK_SIZE);
        dim3 griddize((12*2048)/BLOCK_SIZE); //12 SMX ON GTX780 each can have 2048 threads
        SameAddressAtomicOnSharedMem<<<griddize, blocksize>>>( dev_data, n);
        HANDLE_ERROR( cudaPeekAtLastError() );
    }
    HANDLE_ERROR( cudaDeviceSynchronize() );

    HANDLE_ERROR( cudaDeviceReset() );
    printf("Program finished without error.\n");
    return 0;
}

Basically in above code there are 8 kernels in which all threads do atomicAdd on all data.

Coalesced atomic addition on global memory.
Atomic addition on a restricted address space in global memory. (32 in the code)
Atomic addition for warp lanes on the same address in global memory.
Atomic addition of all threads on the same address in global memory.

Items 5 to 8 can be found by replacing global with shared in above items. Chosen block size is 256.

I used nvprof to profile the program. The output is:

Time(%)      Time     Calls       Avg       Min       Max  Name
44.33%  2.35113s        50  47.023ms  46.987ms  47.062ms  SameAddressAtomicOnSharedMem(int*, int)
31.89%  1.69104s        50  33.821ms  33.818ms  33.826ms  SameAddressAtomicOnGlobalMem(int*, int)
10.10%  535.88ms        50  10.718ms  10.707ms  10.738ms  WarpRestrictedAtomicOnSharedMem(int*, int)
3.96%  209.95ms        50  4.1990ms  4.1895ms  4.2103ms  AddressRestrictedAtomicOnSharedMem(int*, int)
3.95%  209.47ms        50  4.1895ms  4.1893ms  4.1900ms  AddressRestrictedAtomicOnGlobalMem(int*, int)
3.33%  176.48ms        50  3.5296ms  3.5050ms  3.5498ms  WarpRestrictedAtomicOnGlobalMem(int*, int)
1.08%  57.428ms        50  1.1486ms  1.1460ms  1.1510ms  CoalescedAtomicOnGlobalMem(int*, int)
0.84%  44.784ms        50  895.68us  888.65us  905.77us  CoalescedAtomicOnSharedMem(int*, int)
0.51%  26.805ms         1  26.805ms  26.805ms  26.805ms  [CUDA memcpy HtoD]
0.01%  543.61us         1  543.61us  543.61us  543.61us  [CUDA memset]

Obviously coalesced conflict-free atomic operations had the best performance, and same-address had the the worst. One thing I couldn't explain was that why same address atomic on shared memory (inside a block) is slower comparing to on global memory (common between all threads).
When all the warps lanes access the same place in shared memory the performance is very bad but it's (surprisingly) not the case when they perform it onto global memory. I cannot explain why. Another confusion case is address restricted atomic on global is performing worse than when all the threads inside the warp perform it on the same address while it seems memory contentions in the first case is lower.

Anyways I would be happy if anyone could explain above profiling results.

Why are you adding data[0] in SameAddressAtomicOnSharedMem, instead of an immediate value as in SameAddressAtomicOnGlobalMem? It causes one extra global read. It's existence in cache is not guaranteed. I think that is the case for all your kernels' shared versions vs. global versions. I don't think I understanding the reasoning behind it. — Engin Kayraklioglu
I wanted to be fair as much as I could in comparisons between shared and global memories in all cases. While atomicAdd in global memory involves a protected read-modify-write, I wanted for shared memory versions to have that read. Even if we replace global reads with immediate literals, results stay almost the same. For example SameAddressAtomicOnSharedMem average decreased only 2.5 ms. — Farzad
Fair enough. A follow up Q: How do we know that immediate additions are not optimized? You might say; "even if both are immediate adds global still performs better". But then is it too far-fetched to assume that there might be some more aggressive optimizations for global adds than shared adds? Just brainstorming.. — Engin Kayraklioglu

Robert Crovella Robert Crovella · Accepted Answer · 2014-03-13T01:45:04

As a forward-looking statement, to some extent my comments here may be architecture-specific. But for the architectures at hand (up to cc 3.5, AFAIK) shared memory atomics get implemented via a code sequence (created by the assembler). This code sequence, operating on shared memory, is subject to serialization if multiple threads are contending for access to the same bank/location.

The R-M-W operation itself is atomic in the sense that no other thread can disrupt the operation (i.e. create incorrect results), but when the threads are contending to do an atomic operation on a single shared memory location, the contention gives rise to serialization, exacerbating the delay associated with atomics.

To quote Nick from the CUDA Handbook:

Unlike global memory, which implements atomics using single instructions (either GATOM or GRED, depending on whether the return value is used), shared memory atomics are implemented with explicit lock/unlock semantics, and the compiler emits code that causes each thread to loop over these lock operations until the thread has performed its atomic operation.

and:

Take care to avoid contention, or the loop in Listing 8-2 may iterate up to 32 times.

I suggest you read the full section 8.1.5, at least.

Starting in the Maxwell architecture, shared memory atomics are no longer implemented via a code sequence, but there are native atomic instructions for shared memory. This can result in shared memory atomics running considerably faster on Maxwell and newer architectures.

CUDA atomic operation performance in different scenarios

1 Answers