Weird cuda performance on two same empty kernels.

Question

I have two "empty" kernels, each with one if statement which will never be touched.

#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>

inline double wtime(){
  double time[2];
  struct timeval time1;
  gettimeofday(&time1, NULL);

  time[0]=time1.tv_sec;
  time[1]=time1.tv_usec;

  return time[0]+time[1]*1.0e-6;
}

__global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
__global__ void __empty2(bool flag){if(flag){ ; }}

int main(){

  cudaDeviceSynchronize();
  double s = wtime();
  __empty1<<<256,256>>>(false);
  cudaDeviceSynchronize();
  printf("empty1: %.3f\n", 1000*(wtime()-s));

  cudaDeviceSynchronize();
  s = wtime();
  __empty2<<<256,256>>>(false);
  cudaDeviceSynchronize();
  printf("empty2: %.3f\n", 1000*(wtime()-s));
  return 0;
}

I compiled the code with cuda-7.5 and -O3, then run it on K40m.

nvcc -O3 -arch=sm_35 ./main.cu

The first empty kernel takes 1ms, while the second kernel takes 0.02ms.

empty1: 1.075
empty2: 0.019

That's weird since neither of these two kernels enters the branch. The runtime of these two kernels is expected to be the same.

What makes you assume the run time is the same? There may be a lot of unconditionally included and executed support code for printf. Unless you have disassembled and inspected the actual SASS code the GPU runs in both cases, there is no way you can make that assumption — talonmies

Robert Crovella Robert Crovella · Accepted Answer · 2018-05-11T14:33:28

The codes are not the same. As pointed out in the comments, this is discoverable by looking at the SASS.

Example:

$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>

inline double wtime(){
  double time[2];
  struct timeval time1;
  gettimeofday(&time1, NULL);

  time[0]=time1.tv_sec;
  time[1]=time1.tv_usec;

  return time[0]+time[1]*1.0e-6;
}

__global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
__global__ void __empty2(bool flag){if(flag){ ; }}

int main(){

  cudaDeviceSynchronize();
  double s = wtime();
  __empty1<<<256,256>>>(false);
  cudaDeviceSynchronize();
  printf("empty1: %.3f\n", 1000*(wtime()-s));

  cudaDeviceSynchronize();
  s = wtime();
  __empty2<<<256,256>>>(false);
  cudaDeviceSynchronize();
  printf("empty2: %.3f\n", 1000*(wtime()-s));
  return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ cuobjdump -sass t1353

Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit

        code for sm_35

Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit

        code for sm_35
                Function : _Z8__empty2b
        .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                          /* 0x0800000000b81000 */
        /*0008*/                   MOV R1, c[0x0][0x44];  /* 0x64c03c00089c0006 */
        /*0010*/                   MOV RZ, RZ;            /* 0xe4c03c007f9c03fe */
        /*0018*/                   EXIT;                  /* 0x18000000001c003c */
        /*0020*/                   BRA 0x20;              /* 0x12007ffffc1c003c */
        /*0028*/                   NOP;                   /* 0x85800000001c3c02 */
        /*0030*/                   NOP;                   /* 0x85800000001c3c02 */
        /*0038*/                   NOP;                   /* 0x85800000001c3c02 */
                .............................


                Function : _Z8__empty1b
        .headerflags    @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
                                                                     /* 0x08b8b0a0a0a0a000 */
        /*0008*/                   MOV R1, c[0x0][0x44];             /* 0x64c03c00089c0006 */
        /*0010*/                   LDC.S8 R0, c[0x0][0x140];         /* 0x7c880000a01ffc02 */
        /*0018*/                   I2I.U16.S8 R0, R0;                /* 0xe6000000001c8402 */
        /*0020*/                   LOP.AND R0, R0, 0xff;             /* 0xc20000007f9c0001 */
        /*0028*/                   I2I.S32.S16 R0, R0;               /* 0xe6000000001cd802 */
        /*0030*/                   ISETP.EQ.AND P0, PT, R0, RZ, PT;  /* 0xdb281c007f9c001e */
        /*0038*/               @P0 EXIT;                             /* 0x180000000000003c */
                                                                     /* 0x08b810b800108010 */
        /*0048*/                   MOV32I R4, 0x0;                   /* 0x74000000001fc012 */
        /*0050*/                   MOV32I R5, 0x0;                   /* 0x74000000001fc016 */
        /*0058*/                   MOV R7, RZ;                       /* 0xe4c03c007f9c001e */
        /*0060*/                   MOV R6, RZ;                       /* 0xe4c03c007f9c001a */
        /*0068*/                   JCAL 0x0;                         /* 0x1100000000000100 */
        /*0070*/                   MOV RZ, RZ;                       /* 0xe4c03c007f9c03fe */
        /*0078*/                   EXIT;                             /* 0x18000000001c003c */
        /*0080*/                   BRA 0x80;                         /* 0x12007ffffc1c003c */
        /*0088*/                   NOP;                              /* 0x85800000001c3c02 */
        /*0090*/                   NOP;                              /* 0x85800000001c3c02 */
        /*0098*/                   NOP;                              /* 0x85800000001c3c02 */
        /*00a0*/                   NOP;                              /* 0x85800000001c3c02 */
        /*00a8*/                   NOP;                              /* 0x85800000001c3c02 */
        /*00b0*/                   NOP;                              /* 0x85800000001c3c02 */
        /*00b8*/                   NOP;                              /* 0x85800000001c3c02 */
                .............................



Fatbin ptx code:
================
arch = sm_35
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$

So in the empty2 case the machine code looks like this:

        /*0008*/                   MOV R1, c[0x0][0x44];  /* 0x64c03c00089c0006 */
        /*0010*/                   MOV RZ, RZ;            /* 0xe4c03c007f9c03fe */
        /*0018*/                   EXIT;                  /*

In the empty1 case it is longer:

        /*0008*/                   MOV R1, c[0x0][0x44];             /* 0x64c03c00089c0006 */
        /*0010*/                   LDC.S8 R0, c[0x0][0x140];         /* 0x7c880000a01ffc02 */
        /*0018*/                   I2I.U16.S8 R0, R0;                /* 0xe6000000001c8402 */
        /*0020*/                   LOP.AND R0, R0, 0xff;             /* 0xc20000007f9c0001 */
        /*0028*/                   I2I.S32.S16 R0, R0;               /* 0xe6000000001cd802 */
        /*0030*/                   ISETP.EQ.AND P0, PT, R0, RZ, PT;  /* 0xdb281c007f9c001e */
        /*0038*/               @P0 EXIT;                            

...
        /*0078*/                   EXIT;                             /* 0x18000000001c003c */

The bigger issue here is possibly one of timing rigor/correctness. CUDA has lazy initialization. This means that the first set of calls in your CUDA code may incur more than the usual amount of timing overhead. According to my testing this is affecting the conclusion here. If I run a "warm-up" call to empty1 before actually timing it, the measured time between the two cases is nearly the same. It could be plausibly explained by the difference in code length.

Example:

$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>

inline double wtime(){
  double time[2];
  struct timeval time1;
  gettimeofday(&time1, NULL);

  time[0]=time1.tv_sec;
  time[1]=time1.tv_usec;

  return time[0]+time[1]*1.0e-6;
}

__global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
__global__ void __empty2(bool flag){if(flag){ ; }}

int main(){

  __empty1<<<256,256>>>(false);
  cudaDeviceSynchronize();
  double s = wtime();
  __empty1<<<256,256>>>(false);
  cudaDeviceSynchronize();
  printf("empty1: %.3f\n", 1000*(wtime()-s));

  __empty2<<<256,256>>>(false);
  cudaDeviceSynchronize();
  s = wtime();
  __empty2<<<256,256>>>(false);
  cudaDeviceSynchronize();
  printf("empty2: %.3f\n", 1000*(wtime()-s));
  return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ ./t1353
empty1: 0.023
empty2: 0.015
$

Weird cuda performance on two same empty kernels.

1 Answers