
As described in Table 2 of the CUDA C Programming Guide (http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions), the number of operations per clock cycle per multiprocessor is 128 for 32-bit floating-point add but only 4 for 64-bit floating-point add; that is, 64-bit add should be 32 times slower.

However, when I used the following code to test the speed difference, the double version turned out to be at most 2 times slower than the float version (and this does not change much even with the compilation flag --device-debug). Does anyone know the reason?

#include <stdio.h>
#include <cuda_runtime.h>

#define N 100000000
typedef double Real;

// Device code
__global__ void VecAdd(Real* A, Real* B, Real* C)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        C[i] = A[i] + B[i];
    }
}

// Host code
int main()
{
    size_t size = N * sizeof(Real);
    // Allocate input vectors h_A and h_B in host memory
    Real* h_A = (Real*)malloc(size);
    Real* h_B = (Real*)malloc(size);
    Real* h_C = (Real*)malloc(size);
    // Initialize input vectors
    for (int i = 0; i < N; i++)
    {
        h_A[i] = 1.0f + i * 0.1f;
        h_B[i] = 100.0f + i * 0.1f;
    }
    // Allocate vectors in device memory
    Real* d_A;
    cudaMalloc(&d_A, size);
    Real* d_B;
    cudaMalloc(&d_B, size);
    Real* d_C;
    cudaMalloc(&d_C, size);
    // Copy vectors from host memory to device memory
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid =
        (N + threadsPerBlock - 1) / threadsPerBlock;

    // Time measurement starts
    cudaEvent_t     start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    cudaEventSynchronize(start);

    for (int i = 0; i < 10000; i++)
    {
        VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    }

    // Time measurement ends
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float   elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    printf("Time to generate:  %3.8f ms\n", elapsedTime);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    // Free device memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    // Free host memory
    free(h_A);
    free(h_B);
    free(h_C);
}

I use Visual Studio 2013 with CUDA Toolkit 8.0, and my system is 64-bit Windows 10 with a GeForce GTX 1080, driver version 372.90.
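(I build inside Visual Studio, but an equivalent command line would be roughly the following; the file name vecadd.cu is just a placeholder. -arch=sm_61 targets the GTX 1080's native architecture, whereas the PTX shown further down was generated for the CUDA 8.0 default target, sm_20:)

nvcc -O2 -arch=sm_61 -o vecadd vecadd.cu
nvcc -ptx -arch=sm_61 vecadd.cu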

Edit: After reading the answer from @talonmies, I changed N to 1000 and the kernel function to:

__global__ void VecAdd(Real* A, Real* B, Real* C)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        Real a = A[i];
        Real b = B[i];
        Real c = 0.0f;
        for (int j = 0; j < 100000; j++)
        {
            c += (a + b);
        }
        C[i] = c;
    }
}

Now the float version (3700 ms) is about 10 times faster than the double version (38570 ms); however, this is still far from the theoretical factor of 32. Could anybody explain that? (This is without the flag --device-debug; with it, the float version is much slower and, again, only at most two times faster than the double version.)
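One guess of my own (not confirmed by the answer below): every c += (a + b) in this loop depends on the result of the previous iteration, so each thread issues one long chain of serially dependent adds, and the measurement may be dominated by instruction latency rather than peak throughput; with N = 1000 there are also only 4 blocks of 256 threads in flight. A variant with several independent accumulators would give the scheduler independent instructions to issue; the four-way split below is my own sketch, assuming the loop count stays divisible by 4:

__global__ void VecAddILP(Real* A, Real* B, Real* C)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < N) {
        Real a = A[i];
        Real b = B[i];
        // Four independent dependency chains instead of one long one,
        // so the warp scheduler always has an add whose inputs are ready.
        Real c0 = 0.0, c1 = 0.0, c2 = 0.0, c3 = 0.0;
        for (int j = 0; j < 100000; j += 4)
        {
            c0 += (a + b);
            c1 += (a + b);
            c2 += (a + b);
            c3 += (a + b);
        }
        C[i] = (c0 + c1) + (c2 + c3);
    }
}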

Edit: @einpoklum, here is the PTX file. I am not sure how to read the PTX in detail, but I think the loop in the kernel is not optimized away by nvcc (it appears below unrolled by a factor of 50, with each add.f32 consuming the result of the previous one), since if I set the kernel loop count to 10000 for the float version, the time becomes 390 ms, about one tenth of the time for a loop count of 100000.

//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-20732876
// Cuda compilation tools, release 8.0, V8.0.26
// Based on LLVM 3.4svn
//

.version 5.0
.target sm_20
.address_size 32

    // .globl   _Z6VecAddPfS_S_

.visible .entry _Z6VecAddPfS_S_(
    .param .u32 _Z6VecAddPfS_S__param_0,
    .param .u32 _Z6VecAddPfS_S__param_1,
    .param .u32 _Z6VecAddPfS_S__param_2
)
{
    .reg .pred  %p<3>;
    .reg .f32   %f<57>;
    .reg .b32   %r<20>;


    ld.param.u32    %r5, [_Z6VecAddPfS_S__param_0];
    ld.param.u32    %r6, [_Z6VecAddPfS_S__param_1];
    ld.param.u32    %r7, [_Z6VecAddPfS_S__param_2];
    mov.u32     %r8, %ctaid.x;
    mov.u32     %r9, %ntid.x;
    mov.u32     %r10, %tid.x;
    mad.lo.s32  %r1, %r8, %r9, %r10;
    setp.gt.s32 %p1, %r1, 999;
    @%p1 bra    BB0_4;

    cvta.to.global.u32  %r2, %r7;
    cvta.to.global.u32  %r12, %r5;
    shl.b32     %r13, %r1, 2;
    add.s32     %r14, %r12, %r13;
    cvta.to.global.u32  %r15, %r6;
    add.s32     %r16, %r15, %r13;
    ld.global.f32   %f5, [%r16];
    ld.global.f32   %f6, [%r14];
    add.f32     %f1, %f6, %f5;
    mov.f32     %f56, 0f00000000;
    mov.u32     %r19, 100000;

BB0_2:
    add.f32     %f7, %f1, %f56;
    add.f32     %f8, %f1, %f7;
    add.f32     %f9, %f1, %f8;
    add.f32     %f10, %f1, %f9;
    add.f32     %f11, %f1, %f10;
    add.f32     %f12, %f1, %f11;
    add.f32     %f13, %f1, %f12;
    add.f32     %f14, %f1, %f13;
    add.f32     %f15, %f1, %f14;
    add.f32     %f16, %f1, %f15;
    add.f32     %f17, %f1, %f16;
    add.f32     %f18, %f1, %f17;
    add.f32     %f19, %f1, %f18;
    add.f32     %f20, %f1, %f19;
    add.f32     %f21, %f1, %f20;
    add.f32     %f22, %f1, %f21;
    add.f32     %f23, %f1, %f22;
    add.f32     %f24, %f1, %f23;
    add.f32     %f25, %f1, %f24;
    add.f32     %f26, %f1, %f25;
    add.f32     %f27, %f1, %f26;
    add.f32     %f28, %f1, %f27;
    add.f32     %f29, %f1, %f28;
    add.f32     %f30, %f1, %f29;
    add.f32     %f31, %f1, %f30;
    add.f32     %f32, %f1, %f31;
    add.f32     %f33, %f1, %f32;
    add.f32     %f34, %f1, %f33;
    add.f32     %f35, %f1, %f34;
    add.f32     %f36, %f1, %f35;
    add.f32     %f37, %f1, %f36;
    add.f32     %f38, %f1, %f37;
    add.f32     %f39, %f1, %f38;
    add.f32     %f40, %f1, %f39;
    add.f32     %f41, %f1, %f40;
    add.f32     %f42, %f1, %f41;
    add.f32     %f43, %f1, %f42;
    add.f32     %f44, %f1, %f43;
    add.f32     %f45, %f1, %f44;
    add.f32     %f46, %f1, %f45;
    add.f32     %f47, %f1, %f46;
    add.f32     %f48, %f1, %f47;
    add.f32     %f49, %f1, %f48;
    add.f32     %f50, %f1, %f49;
    add.f32     %f51, %f1, %f50;
    add.f32     %f52, %f1, %f51;
    add.f32     %f53, %f1, %f52;
    add.f32     %f54, %f1, %f53;
    add.f32     %f55, %f1, %f54;
    add.f32     %f56, %f1, %f55;
    add.s32     %r19, %r19, -50;
    setp.ne.s32 %p2, %r19, 0;
    @%p2 bra    BB0_2;

    add.s32     %r18, %r2, %r13;
    st.global.f32   [%r18], %f56;

BB0_4:
    ret;
}
I wonder what the compiler does with that loop. Maybe it optimizes some of it away? Can you produce the PTX? (nvcc has a switch for that). - einpoklum
@einpoklum, thank you very much for your reply; I have added the PTX file. - Feng

1 Answer


The kernel code you have shown could never be limited by arithmetic instruction throughput, and therefore could never expose the arithmetic instruction throughput differences between single and double precision on your Pascal GPU.

It is much more likely that memory bandwidth is the performance limit of that code (it requires two loads and one store per FLOP). The ratio of around two that you see between single and double precision is then most likely just the ratio of the sizes of the two types (a double moves twice as many bytes as a float), and nothing more.
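A back-of-the-envelope check supports this: each launch moves 3 * N * sizeof(Real) bytes, i.e. 1.2 GB for float and 2.4 GB for double, so at equal memory bandwidth (the GTX 1080's nominal peak is around 320 GB/s) the double version should take roughly twice as long no matter how fast the FP64 units are. The snippet below is only a sketch of how to verify that, reusing the elapsedTime and N already present in your host code:

// Effective bandwidth over the 10000 timed launches
// (two loads and one store per element per launch).
double seconds   = elapsedTime / 1000.0;   // cudaEventElapsedTime reports ms
double gigabytes = 3.0 * N * sizeof(Real) * 10000 / 1e9;
printf("Effective bandwidth: %.1f GB/s\n", gigabytes / seconds);

If that number comes out near the card's peak bandwidth for both types, the kernel is memory bound and the arithmetic throughput difference is simply invisible.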