As described in Table 2 of the CUDA C Programming Guide http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#arithmetic-instructions, the number of operations per clock cycle per multiprocessor is 128 for 32-bit floating-point add but only 4 for 64-bit floating-point add — that is, 64-bit floating-point add should be 32 times slower.
However, when I used the following code to test the speed difference, the double version was at most 2 times slower than the float version (this does not change much even with the compilation flag --device-debug). Does anyone know the reason?
#define N 100000000
typedef double Real;
// Device code
// Element-wise vector addition: C[i] = A[i] + B[i] for every i in [0, N).
// Expects a 1-D launch supplying at least N threads in total; surplus
// threads take the early-return guard and do nothing.
__global__ void VecAdd(Real* A, Real* B, Real* C)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
        return;
    C[idx] = A[idx] + B[idx];
}
// Host code
// Aborts with a diagnostic message if a CUDA runtime call failed.
// `what` names the call site so the failing operation is identifiable.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver: allocates N-element vectors, copies the inputs to the
// device, launches VecAdd 10000 times back-to-back, and prints the total
// GPU time measured with CUDA events. Returns 0 on success.
int main()
{
    size_t size = N * sizeof(Real);

    // Allocate input vectors h_A, h_B and result vector h_C in host memory.
    Real* h_A = (Real*)malloc(size);
    Real* h_B = (Real*)malloc(size);
    Real* h_C = (Real*)malloc(size);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        // N is 1e8, so each buffer is hundreds of MB — allocation can fail.
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Initialize input vectors.
    for (int i = 0; i < N; i++)
    {
        h_A[i] = 1.0f + i * 0.1f;
        h_B[i] = 100.0f + i * 0.1f;
    }

    // Allocate vectors in device memory.
    Real* d_A;
    checkCuda(cudaMalloc(&d_A, size), "cudaMalloc d_A");
    Real* d_B;
    checkCuda(cudaMalloc(&d_B, size), "cudaMalloc d_B");
    Real* d_C;
    checkCuda(cudaMalloc(&d_C, size), "cudaMalloc d_C");

    // Copy input vectors from host memory to device memory.
    checkCuda(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), "memcpy H2D d_A");
    checkCuda(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), "memcpy H2D d_B");

    // Launch configuration: ceil(N / threadsPerBlock) blocks.
    int threadsPerBlock = 256;
    int blocksPerGrid =
        (N + threadsPerBlock - 1) / threadsPerBlock;

    // Time measurement starts.
    cudaEvent_t start, stop;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate start");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate stop");
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord start");
    checkCuda(cudaEventSynchronize(start), "cudaEventSynchronize start");
    // 10000 asynchronous launches queue on the default stream; the two
    // events bracket the whole batch on the GPU timeline.
    for (int i = 0; i < 10000; i++)
    {
        VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C);
    }
    // Kernel launches return no status directly: launch-configuration errors
    // surface here, execution errors at the next synchronizing call below.
    checkCuda(cudaGetLastError(), "VecAdd launch");

    // Time measurement ends.
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord stop");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize stop");
    float elapsedTime;
    checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    printf("Time to generate: %3.8f ms\n", elapsedTime);
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy start");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy stop");

    // Copy result from device memory to host memory.
    // h_C contains the result in host memory.
    checkCuda(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), "memcpy D2H h_C");

    // Free device memory.
    checkCuda(cudaFree(d_A), "cudaFree d_A");
    checkCuda(cudaFree(d_B), "cudaFree d_B");
    checkCuda(cudaFree(d_C), "cudaFree d_C");

    // Free host memory.
    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
I use Visual Studio 2013 with CUDA toolkit 8.0 and my system is 64bit windows 10 with GeForce 1080, driver version 372.90.
Edit: After reading the answer from @talonmies, I changed N to 1000 and the kernel function as
// Modified kernel: each in-range thread repeats the same addition 100000
// times so that arithmetic, rather than memory traffic, dominates run time.
__global__ void VecAdd(Real* A, Real* B, Real* C)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < N) {
Real a = A[i];
Real b = B[i];
// 0.0f is converted to Real (double) at compile time; value is exact.
Real c = 0.0f;
// NOTE(review): each iteration reads the c produced by the previous one,
// so the additions form a serial (loop-carried) dependency chain. Such a
// chain is bound by add LATENCY, not peak add THROUGHPUT, which may be
// why the measured float:double ratio (~10x) falls short of the 32x
// throughput ratio from the programming guide — worth confirming with a
// version that keeps several independent accumulators in flight.
for (int j = 0; j < 100000; j++)
{
c += (a + b);
}
C[i] = c;
}
}
Now the float version (3700 ms) is about 10 times faster than the double version (38570 ms); however, that is still far from the theoretical value of 32 — could anybody explain it? P.S. This is without the flag --device-debug; with it, the float version is much slower and, again, only at most two times faster than the double version.
Edit @einpoklum, here is the PTX file. I am not sure how to interpret the PTX, but I believe the loop in the kernel is not optimized away by nvcc: if I set the kernel loop count to 10000 for the float version, the delay becomes 390 ms — about one tenth of the delay for a loop count of 100000.
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-20732876
// Cuda compilation tools, release 8.0, V8.0.26
// Based on LLVM 3.4svn
//
.version 5.0
// NOTE(review): target is sm_20 although the post says the GPU is a
// GeForce 1080 (Pascal) — presumably the driver JIT-recompiles this PTX
// for the real architecture at load time; confirm the -arch/-gencode flags.
.target sm_20
.address_size 32
// .globl _Z6VecAddPfS_S_
// Mangled name decodes to VecAdd(float*, float*, float*), i.e. this dump
// is the float (Real = float) build of the kernel.
.visible .entry _Z6VecAddPfS_S_(
.param .u32 _Z6VecAddPfS_S__param_0,
.param .u32 _Z6VecAddPfS_S__param_1,
.param .u32 _Z6VecAddPfS_S__param_2
)
{
.reg .pred %p<3>;
.reg .f32 %f<57>;
.reg .b32 %r<20>;
ld.param.u32 %r5, [_Z6VecAddPfS_S__param_0];
ld.param.u32 %r6, [_Z6VecAddPfS_S__param_1];
ld.param.u32 %r7, [_Z6VecAddPfS_S__param_2];
// i = ctaid.x * ntid.x + tid.x
mov.u32 %r8, %ctaid.x;
mov.u32 %r9, %ntid.x;
mov.u32 %r10, %tid.x;
mad.lo.s32 %r1, %r8, %r9, %r10;
// bounds guard: skip everything when i > 999 (N == 1000)
setp.gt.s32 %p1, %r1, 999;
@%p1 bra BB0_4;
cvta.to.global.u32 %r2, %r7;
cvta.to.global.u32 %r12, %r5;
shl.b32 %r13, %r1, 2;
add.s32 %r14, %r12, %r13;
cvta.to.global.u32 %r15, %r6;
add.s32 %r16, %r15, %r13;
ld.global.f32 %f5, [%r16];
ld.global.f32 %f6, [%r14];
// a + b is loop-invariant and hoisted into %f1
add.f32 %f1, %f6, %f5;
mov.f32 %f56, 0f00000000;
mov.u32 %r19, 100000;
// Loop body, unrolled 50x by the compiler. The loop is NOT optimized
// away: each add.f32 consumes the result of the previous one
// (%f7 <- %f56, %f8 <- %f7, ...), i.e. a loop-carried dependency chain,
// so the additions issue serially rather than at peak throughput.
BB0_2:
add.f32 %f7, %f1, %f56;
add.f32 %f8, %f1, %f7;
add.f32 %f9, %f1, %f8;
add.f32 %f10, %f1, %f9;
add.f32 %f11, %f1, %f10;
add.f32 %f12, %f1, %f11;
add.f32 %f13, %f1, %f12;
add.f32 %f14, %f1, %f13;
add.f32 %f15, %f1, %f14;
add.f32 %f16, %f1, %f15;
add.f32 %f17, %f1, %f16;
add.f32 %f18, %f1, %f17;
add.f32 %f19, %f1, %f18;
add.f32 %f20, %f1, %f19;
add.f32 %f21, %f1, %f20;
add.f32 %f22, %f1, %f21;
add.f32 %f23, %f1, %f22;
add.f32 %f24, %f1, %f23;
add.f32 %f25, %f1, %f24;
add.f32 %f26, %f1, %f25;
add.f32 %f27, %f1, %f26;
add.f32 %f28, %f1, %f27;
add.f32 %f29, %f1, %f28;
add.f32 %f30, %f1, %f29;
add.f32 %f31, %f1, %f30;
add.f32 %f32, %f1, %f31;
add.f32 %f33, %f1, %f32;
add.f32 %f34, %f1, %f33;
add.f32 %f35, %f1, %f34;
add.f32 %f36, %f1, %f35;
add.f32 %f37, %f1, %f36;
add.f32 %f38, %f1, %f37;
add.f32 %f39, %f1, %f38;
add.f32 %f40, %f1, %f39;
add.f32 %f41, %f1, %f40;
add.f32 %f42, %f1, %f41;
add.f32 %f43, %f1, %f42;
add.f32 %f44, %f1, %f43;
add.f32 %f45, %f1, %f44;
add.f32 %f46, %f1, %f45;
add.f32 %f47, %f1, %f46;
add.f32 %f48, %f1, %f47;
add.f32 %f49, %f1, %f48;
add.f32 %f50, %f1, %f49;
add.f32 %f51, %f1, %f50;
add.f32 %f52, %f1, %f51;
add.f32 %f53, %f1, %f52;
add.f32 %f54, %f1, %f53;
add.f32 %f55, %f1, %f54;
add.f32 %f56, %f1, %f55;
// trip counter decremented by the unroll factor (50 adds per pass)
add.s32 %r19, %r19, -50;
setp.ne.s32 %p2, %r19, 0;
@%p2 bra BB0_2;
// store the accumulated sum to C[i]
add.s32 %r18, %r2, %r13;
st.global.f32 [%r18], %f56;
BB0_4:
ret;
}