The only important thing I can see here is to make sure your arch settings match. Here is what I see:
$ cat t39.cu
__global__ void testPtx(int *devBuff,float *devDummy,unsigned int *timeBuff){
unsigned int temp=0;
unsigned int start,end;
volatile unsigned int *tempPtr;
tempPtr = (volatile unsigned int *)&devBuff[0];
start = clock64();
temp=*tempPtr;
__threadfence();
end = clock64();
*devDummy=(float)(1.0/(float)(temp));
*timeBuff = (unsigned int)(end-start);
}
$ nvcc -c t39.cu
$ cuobjdump -sass t39.o
Fatbin elf code:
================
arch = sm_52
code version = [1,7]
producer = <unknown>
host = linux
compile_size = 64bit
code for sm_52
Function : _Z7testPtxPiPfPj
.headerflags @"EF_CUDA_SM52 EF_CUDA_PTX_SM(EF_CUDA_SM52)"
/* 0x001fc400fec007f6 */
/*0008*/ MOV R1, c[0x0][0x20] ; /* 0x4c98078000870001 */
/*0010*/ CS2R R0, SR_CLOCKLO ; /* 0x50c8000005070000 */
/*0018*/ MOV R2, c[0x0][0x140] ; /* 0x4c98078005070002 */
/* 0x001ffc00162007f2 */
/*0028*/ MOV R3, c[0x0][0x144] ; /* 0x4c98078005170003 */
/*0030*/ LDG.E.CV R3, [R2] ; /* 0xeed4e00000070203 */
/*0038*/ MEMBAR.GL.IVALLD ; /* 0xef98000000070101 */
/* 0x003fd820e3a00ff6 */
/*0048*/ CS2R R2, SR_CLOCKLO ; /* 0x50c8000005070002 */
/*0050*/ I2F.F32.U32 R3, R3 ; /* 0x5cb8000000370a03 */
/*0058*/ IADD32I R4, R3, 0x1800000 ; /* 0x1c00180000070304 */
/* 0x001ff400fda007f6 */
/*0068*/ LOP32I.AND R4, R4, 0x7f800000 ; /* 0x0407f80000070404 */
/*0070*/ ISETP.GT.U32.AND P0, PT, R4, c[0x2][0x0], PT ; /* 0x4b68038800070407 */
/*0078*/ @P0 BRA 0xa0 ; /* 0xe24000000200000f */
/* 0x001ff400fe0007fd */
/*0088*/ CAL 0x120 ; /* 0xe260000009000040 */
/*0090*/ { MOV R7, R4 ; /* 0x5c98078000470007 */
/*0098*/ BRA 0xd0 }
/* 0xe24000000307000f */
/* 0x001fd801fec0071d */
/*00a8*/ MUFU.RCP R4, R3 ; /* 0x5080000000470304 */
/*00b0*/ FFMA R5, R3, R4, c[0x2][0x4] ; /* 0x5180020800170305 */
/*00b8*/ FADD.FTZ R5, -R5, -RZ ; /* 0x5c5930000ff70505 */
/* 0x001fc400fe2007f6 */
/*00c8*/ FFMA R7, R4, R5, R4 ; /* 0x5980020000570407 */
/*00d0*/ IADD R6, -R0, R2 ; /* 0x5c12000000270006 */
/*00d8*/ MOV R2, c[0x0][0x148] ; /* 0x4c98078005270002 */
/* 0x001fc400fe0007f2 */
/*00e8*/ MOV R3, c[0x0][0x14c] ; /* 0x4c98078005370003 */
/*00f0*/ { MOV R4, c[0x0][0x150] ; /* 0x4c98078005470004 */
/*00f8*/ STG.E [R2], R7 }
/* 0xeedc200000070207 */
/* 0x001ffc00fe2007f2 */
/*0108*/ MOV R5, c[0x0][0x154] ; /* 0x4c98078005570005 */
/*0110*/ STG.E [R4], R6 ; /* 0xeedc200000070406 */
/*0118*/ EXIT ; /* 0xe30000000007000f */
/* 0x001fb400fec007f6 */
/*0128*/ SHL R4, R3, 0x1 ; /* 0x3848000000170304 */
/*0130*/ SHR.U32 R4, R4, 0x18 ; /* 0x3828000001870404 */
/*0138*/ ISETP.NE.U32.AND P0, PT, R4, RZ, PT ; /* 0x5b6a03800ff70407 */
/* 0x001fb400fec007fd */
/*0148*/ @P0 BRA 0x1c0 ; /* 0xe24000000700000f */
/*0150*/ SHL R4, R3, 0x1 ; /* 0x3848000000170304 */
/*0158*/ ISETP.NE.AND P0, PT, R4, RZ, PT ; /* 0x5b6b03800ff70407 */
/* 0x001fc801ffa00712 */
/*0168*/ @!P0 MUFU.RCP R4, R3 ; /* 0x5080000000480304 */
/*0170*/ @!P0 RET ; /* 0xe32000000008000f */
/*0178*/ FFMA R3, R3, 1.84467440737095516160e+19, RZ ; /* 0x32807fdf80070303 */
/* 0x001fd801fec0071d */
/*0188*/ MUFU.RCP R4, R3 ; /* 0x5080000000470304 */
/*0190*/ FFMA R5, R3, R4, c[0x2][0x4] ; /* 0x5180020800170305 */
/*0198*/ FADD.FTZ R5, -R5, -RZ ; /* 0x5c5930000ff70505 */
/* 0x001ffc00fe0007f6 */
/*01a8*/ FFMA R4, R4, R5, R4 ; /* 0x5980020000570404 */
/*01b0*/ { FFMA R4, R4, 1.84467440737095516160e+19, RZ ; /* 0x32807fdf80070404 */
/*01b8*/ RET }
/* 0xe32000000007000f */
/* 0x001ff400fda007f6 */
/*01c8*/ IADD32I R5, R4, -0xfd ; /* 0x1c0ffffff0370405 */
/*01d0*/ ISETP.GT.U32.AND P0, PT, R5, 0x1, PT ; /* 0x3668038000170507 */
/*01d8*/ @P0 BRA 0x300 ; /* 0xe24000001200000f */
/* 0x001fd000fe2007f1 */
/*01e8*/ LOP32I.AND R11, R3, 0x7fffff ; /* 0x040007fffff7030b */
/*01f0*/ MOV32I R10, 0x3 ; /* 0x010000000037f00a */
/*01f8*/ IADD32I R4, R4, -0xfc ; /* 0x1c0ffffff0470404 */
/* 0x001c7400fe0007f2 */
/*0208*/ LOP32I.OR R6, R11, 0x3f800000 ; /* 0x0423f80000070b06 */
/*0210*/ { SHL R10, R10, R5 ; /* 0x5c48000000570a0a */
/*0218*/ MUFU.RCP R7, R6 }
/* 0x5080000000470607 */
/* 0x381fc400fcc00ff6 */
/*0228*/ FFMA R8, R6, R7, c[0x2][0x4] ; /* 0x5180038800170608 */
/*0230*/ FADD.FTZ R8, -R8, -RZ ; /* 0x5c5930000ff70808 */
/*0238*/ FFMA.RM R9, R7.reuse, R8.reuse, R7.reuse ; /* 0x5988038000870709 */
/* 0x001fd440fe2007f5 */
/*0248*/ FFMA.RP R8, R7, R8, R7 ; /* 0x5990038000870708 */
/*0250*/ LOP32I.AND R7, R9.reuse, 0x7fffff ; /* 0x040007fffff70907 */
/*0258*/ FSET.NEU.FTZ.AND R8, R9, R8, PT ; /* 0x588d038000870908 */
/* 0x001fc400fca007f1 */
/*0268*/ LOP32I.OR R7, R7, 0x800000 ; /* 0x0420080000070707 */
/*0270*/ IADD R8, -R8, RZ ; /* 0x5c1200000ff70808 */
/*0278*/ LOP.AND R10, R10, R7 ; /* 0x5c47000000770a0a */
/* 0x001fd800fe8407f1 */
/*0288*/ LOP3.LUT.NZ P1, RZ, R8, R5.reuse, R7, 0xf8 ; /* 0x5be103bf805708ff */
/*0290*/ SHR.U32 R4, R7, R4 ; /* 0x5c28000000470704 */
/*0298*/ SHR.U32 R10, R10, R5 ; /* 0x5c28000000570a0a */
/* 0x001f8400fd8207f1 */
/*02a8*/ LOP.AND.NZ P2, RZ, R10.reuse, 0x2 ; /* 0x3842300000270aff */
/*02b0*/ LOP.AND.NZ P0, RZ, R10, 0x1 ; /* 0x3840300000170aff */
/*02b8*/ PSETP.OR.AND P1, PT, P1, P2, PT ; /* 0x509003804107100f */
/* 0x001ff400fda007ec */
/*02c8*/ ISETP.EQ.U32.AND P2, PT, R11, RZ, PT ; /* 0x5b6403800ff70b17 */
/*02d0*/ PSETP.AND.AND P0, PT, P0, P1, PT ; /* 0x5090038020070007 */
/*02d8*/ @P0 IADD32I R4, R4, 0x1 ; /* 0x1c00000000100404 */
/* 0x001ffc00fe0007f6 */
/*02e8*/ @P2 SHL R4, R4, 0x1 ; /* 0x3848000000120404 */
/*02f0*/ { LOP3.LUT R4, R4, c[0x2][0x8], R3, 0xf8 ; /* 0x02f8018800270404 */
/*02f8*/ RET }
/* 0xe32000000007000f */
/* 0x001ffc01ffe0071d */
/*0308*/ MUFU.RCP R4, R3 ; /* 0x5080000000470304 */
/*0310*/ RET ; /* 0xe32000000007000f */
/*0318*/ BRA 0x318 ; /* 0xe2400fffff87000f */
/* 0x001f8000fc0007e0 */
/*0328*/ NOP; /* 0x50b0000000070f00 */
/*0330*/ NOP; /* 0x50b0000000070f00 */
/*0338*/ NOP; /* 0x50b0000000070f00 */
..........
Fatbin ptx code:
================
arch = sm_52
code version = [7,2]
producer = <unknown>
host = linux
compile_size = 64bit
compressed
$ nvcc -cubin t39.cu
$ nvdisasm -c t39.cubin
.headerflags @"EF_CUDA_TEXMODE_UNIFIED EF_CUDA_64BIT_ADDRESS EF_CUDA_SM52 EF_CUDA_VIRTUAL_SM(EF_CUDA_SM52)"
.elftype @"ET_EXEC"
//--------------------- .text._Z7testPtxPiPfPj --------------------------
.section .text._Z7testPtxPiPfPj,"ax",@progbits
.sectioninfo @"SHI_REGISTERS=12"
.align 32
.global _Z7testPtxPiPfPj
.type _Z7testPtxPiPfPj,@function
.size _Z7testPtxPiPfPj,(.L_34 - _Z7testPtxPiPfPj)
.other _Z7testPtxPiPfPj,@"STO_CUDA_ENTRY STV_DEFAULT"
_Z7testPtxPiPfPj:
.text._Z7testPtxPiPfPj:
/*0008*/ MOV R1, c[0x0][0x20] ;
/*0010*/ CS2R R0, SR_CLOCKLO ;
/*0018*/ MOV R2, c[0x0][0x140] ;
/*0028*/ MOV R3, c[0x0][0x144] ;
/*0030*/ LDG.E.CV R3, [R2] ;
/*0038*/ MEMBAR.GL.IVALLD ;
/*0048*/ CS2R R2, SR_CLOCKLO ;
/*0050*/ I2F.F32.U32 R3, R3 ;
/*0058*/ IADD32I R4, R3, 0x1800000 ;
/*0068*/ LOP32I.AND R4, R4, 0x7f800000 ;
/*0070*/ ISETP.GT.U32.AND P0, PT, R4, c[0x2][0x0], PT ;
/*0078*/ @P0 BRA `(.L_1) ;
/*0088*/ CAL `($_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath) ;
/*0090*/ { MOV R7, R4 ;
/*0098*/ BRA `(.L_2) }
.L_1:
/*00a8*/ MUFU.RCP R4, R3 ;
/*00b0*/ FFMA R5, R3, R4, c[0x2][0x4] ;
/*00b8*/ FADD.FTZ R5, -R5, -RZ ;
/*00c8*/ FFMA R7, R4, R5, R4 ;
.L_2:
/*00d0*/ IADD R6, -R0, R2 ;
/*00d8*/ MOV R2, c[0x0][0x148] ;
/*00e8*/ MOV R3, c[0x0][0x14c] ;
/*00f0*/ { MOV R4, c[0x0][0x150] ;
/*00f8*/ STG.E [R2], R7 }
/*0108*/ MOV R5, c[0x0][0x154] ;
/*0110*/ STG.E [R4], R6 ;
/*0118*/ EXIT ;
.weak $_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath
.type $_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath,@function
.size $_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath,(.L_34 - $_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath)
$_Z7testPtxPiPfPj$__cuda_sm20_rcp_rn_f32_slowpath:
/*0128*/ SHL R4, R3, 0x1 ;
/*0130*/ SHR.U32 R4, R4, 0x18 ;
/*0138*/ ISETP.NE.U32.AND P0, PT, R4, RZ, PT ;
/*0148*/ @P0 BRA `(.L_3) ;
/*0150*/ SHL R4, R3, 0x1 ;
/*0158*/ ISETP.NE.AND P0, PT, R4, RZ, PT ;
/*0168*/ @!P0 MUFU.RCP R4, R3 ;
/*0170*/ @!P0 RET ;
/*0178*/ FFMA R3, R3, 1.84467440737095516160e+19, RZ ;
/*0188*/ MUFU.RCP R4, R3 ;
/*0190*/ FFMA R5, R3, R4, c[0x2][0x4] ;
/*0198*/ FADD.FTZ R5, -R5, -RZ ;
/*01a8*/ FFMA R4, R4, R5, R4 ;
/*01b0*/ { FFMA R4, R4, 1.84467440737095516160e+19, RZ ;
/*01b8*/ RET }
.L_3:
/*01c8*/ IADD32I R5, R4, -0xfd ;
/*01d0*/ ISETP.GT.U32.AND P0, PT, R5, 0x1, PT ;
/*01d8*/ @P0 BRA `(.L_4) ;
/*01e8*/ LOP32I.AND R11, R3, 0x7fffff ;
/*01f0*/ MOV32I R10, 0x3 ;
/*01f8*/ IADD32I R4, R4, -0xfc ;
/*0208*/ LOP32I.OR R6, R11, 0x3f800000 ;
/*0210*/ { SHL R10, R10, R5 ;
/*0218*/ MUFU.RCP R7, R6 }
/*0228*/ FFMA R8, R6, R7, c[0x2][0x4] ;
/*0230*/ FADD.FTZ R8, -R8, -RZ ;
/*0238*/ FFMA.RM R9, R7.reuse, R8.reuse, R7.reuse ;
/*0248*/ FFMA.RP R8, R7, R8, R7 ;
/*0250*/ LOP32I.AND R7, R9.reuse, 0x7fffff ;
/*0258*/ FSET.NEU.FTZ.AND R8, R9, R8, PT ;
/*0268*/ LOP32I.OR R7, R7, 0x800000 ;
/*0270*/ IADD R8, -R8, RZ ;
/*0278*/ LOP.AND R10, R10, R7 ;
/*0288*/ LOP3.LUT.NZ P1, RZ, R8, R5.reuse, R7, 0xf8 ;
/*0290*/ SHR.U32 R4, R7, R4 ;
/*0298*/ SHR.U32 R10, R10, R5 ;
/*02a8*/ LOP.AND.NZ P2, RZ, R10.reuse, 0x2 ;
/*02b0*/ LOP.AND.NZ P0, RZ, R10, 0x1 ;
/*02b8*/ PSETP.OR.AND P1, PT, P1, P2, PT ;
/*02c8*/ ISETP.EQ.U32.AND P2, PT, R11, RZ, PT ;
/*02d0*/ PSETP.AND.AND P0, PT, P0, P1, PT ;
/*02d8*/ @P0 IADD32I R4, R4, 0x1 ;
/*02e8*/ @P2 SHL R4, R4, 0x1 ;
/*02f0*/ { LOP3.LUT R4, R4, c[0x2][0x8], R3, 0xf8 ;
/*02f8*/ RET }
.L_4:
/*0308*/ MUFU.RCP R4, R3 ;
/*0310*/ RET ;
.L_5:
/*0318*/ BRA `(.L_5) ;
.L_34:
$
Those two sets of SASS are the same.
gencode
switches and just do something like-arch=sm_60
– Robert Crovellagencode
switches and compiled the source code. However, the problem didn't get resolved. I still need to generate the cubin file separately for using nvdisasm (first usingnvcc -cubin sourcefile.cu
and thennvdisasm -c sourcefile.cubin
) for disassembly which doesn't contain any compilation flag information and so it is still different from sass code generated usingcuobjdump
from the host binary file. – duttasankha