- The codes are not the same. As pointed out in the comments, this is discoverable by looking at the SASS.
Example:
$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
inline double wtime(){
double time[2];
struct timeval time1;
gettimeofday(&time1, NULL);
time[0]=time1.tv_sec;
time[1]=time1.tv_usec;
return time[0]+time[1]*1.0e-6;
}
__global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
__global__ void __empty2(bool flag){if(flag){ ; }}
int main(){
cudaDeviceSynchronize();
double s = wtime();
__empty1<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty1: %.3f\n", 1000*(wtime()-s));
cudaDeviceSynchronize();
s = wtime();
__empty2<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty2: %.3f\n", 1000*(wtime()-s));
return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ cuobjdump -sass t1353
Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_35
Fatbin elf code:
================
arch = sm_35
code version = [1,7]
producer = cuda
host = linux
compile_size = 64bit
code for sm_35
Function : _Z8__empty2b
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x0800000000b81000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0018*/ EXIT; /* 0x18000000001c003c */
/*0020*/ BRA 0x20; /* 0x12007ffffc1c003c */
/*0028*/ NOP; /* 0x85800000001c3c02 */
/*0030*/ NOP; /* 0x85800000001c3c02 */
/*0038*/ NOP; /* 0x85800000001c3c02 */
.............................
Function : _Z8__empty1b
.headerflags @"EF_CUDA_SM35 EF_CUDA_PTX_SM(EF_CUDA_SM35)"
/* 0x08b8b0a0a0a0a000 */
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ LDC.S8 R0, c[0x0][0x140]; /* 0x7c880000a01ffc02 */
/*0018*/ I2I.U16.S8 R0, R0; /* 0xe6000000001c8402 */
/*0020*/ LOP.AND R0, R0, 0xff; /* 0xc20000007f9c0001 */
/*0028*/ I2I.S32.S16 R0, R0; /* 0xe6000000001cd802 */
/*0030*/ ISETP.EQ.AND P0, PT, R0, RZ, PT; /* 0xdb281c007f9c001e */
/*0038*/ @P0 EXIT; /* 0x180000000000003c */
/* 0x08b810b800108010 */
/*0048*/ MOV32I R4, 0x0; /* 0x74000000001fc012 */
/*0050*/ MOV32I R5, 0x0; /* 0x74000000001fc016 */
/*0058*/ MOV R7, RZ; /* 0xe4c03c007f9c001e */
/*0060*/ MOV R6, RZ; /* 0xe4c03c007f9c001a */
/*0068*/ JCAL 0x0; /* 0x1100000000000100 */
/*0070*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0078*/ EXIT; /* 0x18000000001c003c */
/*0080*/ BRA 0x80; /* 0x12007ffffc1c003c */
/*0088*/ NOP; /* 0x85800000001c3c02 */
/*0090*/ NOP; /* 0x85800000001c3c02 */
/*0098*/ NOP; /* 0x85800000001c3c02 */
/*00a0*/ NOP; /* 0x85800000001c3c02 */
/*00a8*/ NOP; /* 0x85800000001c3c02 */
/*00b0*/ NOP; /* 0x85800000001c3c02 */
/*00b8*/ NOP; /* 0x85800000001c3c02 */
.............................
Fatbin ptx code:
================
arch = sm_35
code version = [5,0]
producer = cuda
host = linux
compile_size = 64bit
compressed
$
So in the empty2
case the machine code looks like this:
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ MOV RZ, RZ; /* 0xe4c03c007f9c03fe */
/*0018*/ EXIT; /*
In the empty1
case it is longer:
/*0008*/ MOV R1, c[0x0][0x44]; /* 0x64c03c00089c0006 */
/*0010*/ LDC.S8 R0, c[0x0][0x140]; /* 0x7c880000a01ffc02 */
/*0018*/ I2I.U16.S8 R0, R0; /* 0xe6000000001c8402 */
/*0020*/ LOP.AND R0, R0, 0xff; /* 0xc20000007f9c0001 */
/*0028*/ I2I.S32.S16 R0, R0; /* 0xe6000000001cd802 */
/*0030*/ ISETP.EQ.AND P0, PT, R0, RZ, PT; /* 0xdb281c007f9c001e */
/*0038*/ @P0 EXIT;
...
/*0078*/ EXIT; /* 0x18000000001c003c */
- The bigger issue here is possibly one of timing rigor/correctness. CUDA has lazy initialization. This means that the first set of calls in your CUDA code may incur more than the usual amount of timing overhead. According to my testing this is affecting the conclusion here. If I run a "warm-up" call to
empty1
before actually timing it, the measured time between the two cases is nearly the same. It could be plausibly explained by the difference in code length.
Example:
$ cat t1353.cu
#include <cstdio>
#include <time.h>
#include <sys/time.h>
#include <cuda.h>
inline double wtime(){
double time[2];
struct timeval time1;
gettimeofday(&time1, NULL);
time[0]=time1.tv_sec;
time[1]=time1.tv_usec;
return time[0]+time[1]*1.0e-6;
}
__global__ void __empty1(bool flag){if(flag){ printf("hh\n");}}
__global__ void __empty2(bool flag){if(flag){ ; }}
int main(){
__empty1<<<256,256>>>(false);
cudaDeviceSynchronize();
double s = wtime();
__empty1<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty1: %.3f\n", 1000*(wtime()-s));
__empty2<<<256,256>>>(false);
cudaDeviceSynchronize();
s = wtime();
__empty2<<<256,256>>>(false);
cudaDeviceSynchronize();
printf("empty2: %.3f\n", 1000*(wtime()-s));
return 0;
}
$ nvcc -arch=sm_35 -o t1353 t1353.cu
$ ./t1353
empty1: 0.023
empty2: 0.015
$