I am writing an expression evaluator that produces x86-64 assembly. I chose to mimic the C++ compiler's rules for variable types (char-to-int promotion, the types of literal values, etc.). However, I have run into an issue that perplexes me. To evaluate a complex mathematical expression, I first convert it to postfix. Then, as each operation is evaluated, my logic tracks the data type of every intermediate result according to the conversions applied and/or the types of the input operands. My question is: when do signed values become unsigned, and vice versa, in the intermediate results computed by a C++ compiler? Looking at the generated assembly, I noticed that clang will sometimes use a plain movl where, based on the preceding data types, you might expect a sign-extending instruction (e.g. movswl). In general, I want to mimic a C++ compiler's type-conversion conventions, including for intermediate results.
As an example, consider this input into my program (the data types mimic c++):
short s_A=658
long long ll_B=293
unsigned int ui_C=94
print s_A
print ll_B
print ui_C
print "-------------------------"
ll_B=((394)-ui_C*(649)+(917)+ll_B-(80))
print ll_B
print "-------------------------"
print s_A
print ll_B
print ui_C
The equivalent C++ program is:
#include <stdio.h>

/*
 * Reference program for the expression-evaluator test case.
 *
 * Prints the three variables, evaluates one mixed-type expression into
 * ll_B, and prints the variables again so the result can be compared with
 * the evaluator's output.
 */
int main(void) {
    short s_A=658;
    long long ll_B=293;
    unsigned int ui_C=94;
    printf("%hi\n",s_A);
    printf("%lld\n",ll_B);
    printf("%u\n",ui_C);
    printf("-------------------------\n");
    /*
     * Types of the intermediate results (usual arithmetic conversions):
     *   ui_C*(649)      unsigned int  (the int literal is converted to unsigned int)
     *   (394) - ...     unsigned int  (wraps modulo 2^32)
     *   ... + (917)     unsigned int
     *   ... + ll_B      long long     (the unsigned int value is preserved, i.e. zero-extended)
     *   ... - (80)      long long
     */
    ll_B=((394)-ui_C*(649)+(917)+ll_B-(80));
    printf("%lld\n",ll_B);
    printf("-------------------------\n");
    printf("%hi\n",s_A);
    printf("%lld\n",ll_B);
    printf("%u\n",ui_C);
}
My logic produces the following assembly:
# Output of the author's expression evaluator for the sample input (AT&T/GAS syntax, x86-64).
# Entry point is _start; the code calls printf and exit, so it presumably links against libc - confirm the link step.
.section .data
# printf format strings and the separator text
LC1:
.asciz "%hi\n"
LC2:
.asciz "%lld\n"
LC5:
.asciz "%s\n"
LC3:
.asciz "%u\n"
LC4:
.asciz "-------------------------"
# Program variables with their initial values
ll_B:
.quad 293
s_A:
.word 658 # 16-bit storage
ui_C:
.long 94 # 32-bit storage
.section .text
.globl _start
_start:
# Clear the low four bits of %rsp so the stack is 16-byte aligned before the calls below.
and $0xfffffffffffffff0,%rsp
movq %rsp, %rbp
finit
# print s_A using format LC1
# NOTE(review): with a 64-bit destination register this is an 8-byte load, but s_A is declared as a 2-byte .word;
# gcc and clang load the same variable with movswl.
mov s_A, %rsi
# %rax is zeroed before each printf call (variadic call with no vector-register arguments).
xor %rax, %rax
lea LC1, %rdi
call printf
# print ll_B using format LC2
mov ll_B, %rsi
xor %rax, %rax
lea LC2, %rdi
call printf
# print ui_C using format LC3
# NOTE(review): 8-byte load from the 4-byte .long ui_C; gcc and clang use a 32-bit movl here.
mov ui_C, %rsi
xor %rax, %rax
lea LC3, %rdi
call printf
# print the separator LC4 through the string format LC5
lea LC4, %rsi
xor %rax, %rax
lea LC5, %rdi
call printf
# Expression: ((394)-ui_C*(649)+(917)+ll_B-(80))
# Postfix: 394 ui_C 649 * - 917 + ll_B + 80 -
# EVAL(1): ui_C*649
# MOVE ui_C ( UnsignedInt - 32bits ) to ( Int - 32bits )
# NOTE(review): the generator relabels the unsigned operand as Int here; in C++ the product of an
# unsigned int and an int literal stays unsigned int, so the later widening should not sign-extend.
# instructionType=STATE_x86
movl ui_C, %eax
imull $649, %eax
# EVAL(2): 394-%eax
movl $394, %ebx
subl %eax, %ebx
# free register %eax
# EVAL(3): %ebx+917
addl $917, %ebx
# EVAL(4): %ebx+ll_B
# CONVERT %ebx ( Int - 32bits ) to ( LongLong - 64bits )
# instructionType=STATE_x86
movslq %ebx, %rax #ISSUE: sign-extends %ebx; gcc and clang zero-extend at this step (movl into a 32-bit register)
addq ll_B, %rax
# free register %ebx
# EVAL(5): %rax-80
subq $80, %rax
# STORE RESULT : %rax -> ll_B
# MOVE %rax ( LongLong - 64bits ) to ( LongLong - 64bits )
# instructionType=STATE_x86
movq %rax, ll_B
#
# print the updated ll_B
mov ll_B, %rsi
xor %rax, %rax
lea LC2, %rdi
call printf
# print the separator again
lea LC4, %rsi
xor %rax, %rax
lea LC5, %rdi
call printf
# print s_A, ll_B and ui_C once more
mov s_A, %rsi
xor %rax, %rax
lea LC1, %rdi
call printf
mov ll_B, %rsi
xor %rax, %rax
lea LC2, %rdi
call printf
mov ui_C, %rsi
xor %rax, %rax
lea LC3, %rdi
call printf
# exit with status 0
done:
mov $0, %rdi
call exit
while gcc produces
# gcc output for the C++ program (AT&T syntax). Stack slots, per the initial stores in main:
# -14(%rbp) = s_A (16-bit), -8(%rbp) = ll_B (64-bit), -12(%rbp) = ui_C (32-bit).
.file "whole_short_int_longlong_simple_33.cpp"
.section .rodata
.LC0:
.string "%hi\n"
.LC1:
.string "%lld\n"
.LC2:
.string "%u\n"
.LC3:
.string "-------------------------"
.text
.globl main
.type main, @function
main:
.LFB0:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
movw $658, -14(%rbp)
movq $293, -8(%rbp)
movl $94, -12(%rbp)
# s_A is sign-extended from 16 to 32 bits before being passed to printf
movswl -14(%rbp), %eax
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movq -8(%rbp), %rax
movq %rax, %rsi
movl $.LC1, %edi
movl $0, %eax
call printf
movl -12(%rbp), %eax
movl %eax, %esi
movl $.LC2, %edi
movl $0, %eax
call printf
# the separator line is emitted with puts; .LC3 carries no trailing newline
movl $.LC3, %edi
call puts
# Expression: ll_B=((394)-ui_C*(649)+(917)+ll_B-(80))
# The literals are folded (394 + 917 = 1311), so the 32-bit part is computed as 1311 - ui_C*649.
movl -12(%rbp), %eax
imull $649, %eax, %eax
movl $1311, %edx
subl %eax, %edx
movl %edx, %eax
# The 32-bit result is placed in %edx; a write to a 32-bit register clears the upper half of the
# 64-bit register, so %rdx holds the zero-extended value that is added to ll_B below.
movl %eax, %edx
movq -8(%rbp), %rax
addq %rdx, %rax
subq $80, %rax
movq %rax, -8(%rbp)
movq -8(%rbp), %rax
movq %rax, %rsi
movl $.LC1, %edi
movl $0, %eax
call printf
movl $.LC3, %edi
call puts
movswl -14(%rbp), %eax
movl %eax, %esi
movl $.LC0, %edi
movl $0, %eax
call printf
movq -8(%rbp), %rax
movq %rax, %rsi
movl $.LC1, %edi
movl $0, %eax
call printf
movl -12(%rbp), %eax
movl %eax, %esi
movl $.LC2, %edi
movl $0, %eax
call printf
movl $0, %eax
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE0:
.size main, .-main
.ident "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
.section .note.GNU-stack,"",@progbits
and clang produces:
# clang output for the C++ program (AT&T syntax). Stack slots, per the initial stores in main:
# -2(%rbp) = s_A (16-bit), -16(%rbp) = ll_B (64-bit), -20(%rbp) = ui_C (32-bit).
.text
.file "/home/anthony/comptest/simple/whole_short_int_longlong_simple/whole_short_int_longlong_simple_33.cpp"
.globl main
.align 16, 0x90
.type main,@function
main: # @main
.cfi_startproc
# BB#0:
pushq %rbp
.Ltmp0:
.cfi_def_cfa_offset 16
.Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
.Ltmp2:
.cfi_def_cfa_register %rbp
subq $64, %rsp
movabsq $.L.str, %rdi
movw $658, -2(%rbp) # imm = 0x292
movq $293, -16(%rbp) # imm = 0x125
movl $94, -20(%rbp)
# s_A is sign-extended from 16 to 32 bits before being passed to printf
movswl -2(%rbp), %esi
movb $0, %al
callq printf
movabsq $.L.str1, %rdi
movq -16(%rbp), %rsi
movl %eax, -24(%rbp) # 4-byte Spill
movb $0, %al
callq printf
movabsq $.L.str2, %rdi
movl -20(%rbp), %esi
movl %eax, -28(%rbp) # 4-byte Spill
movb $0, %al
callq printf
movabsq $.L.str3, %rdi
movl %eax, -32(%rbp) # 4-byte Spill
movb $0, %al
callq printf
movabsq $.L.str1, %rdi
# Expression: ll_B=((394)-ui_C*(649)+(917)+ll_B-(80))
# The first three operations are done in 32-bit registers.
movl $394, %esi # imm = 0x18A
imull $649, -20(%rbp), %ecx # imm = 0x289
subl %ecx, %esi
addl $917, %esi # imm = 0x395
movl %esi, %ecx
# Widening step: a plain 32-bit movl clears the upper half of %rdx, so the value is zero-extended
# (not sign-extended) before the 64-bit add of ll_B.
movl %ecx, %edx
addq -16(%rbp), %rdx
subq $80, %rdx
movq %rdx, -16(%rbp)
movq -16(%rbp), %rsi
movl %eax, -36(%rbp) # 4-byte Spill
movb $0, %al
callq printf
movabsq $.L.str3, %rdi
movl %eax, -40(%rbp) # 4-byte Spill
movb $0, %al
callq printf
movabsq $.L.str, %rdi
movswl -2(%rbp), %esi
movl %eax, -44(%rbp) # 4-byte Spill
movb $0, %al
callq printf
movabsq $.L.str1, %rdi
movq -16(%rbp), %rsi
movl %eax, -48(%rbp) # 4-byte Spill
movb $0, %al
callq printf
movabsq $.L.str2, %rdi
movl -20(%rbp), %esi
movl %eax, -52(%rbp) # 4-byte Spill
movb $0, %al
callq printf
xorl %ecx, %ecx
movl %eax, -56(%rbp) # 4-byte Spill
movl %ecx, %eax
addq $64, %rsp
popq %rbp
retq
.Ltmp3:
.size main, .Ltmp3-main
.cfi_endproc
.type .L.str,@object # @.str
.section .rodata.str1.1,"aMS",@progbits,1
.L.str:
.asciz "%hi\n"
.size .L.str, 5
.type .L.str1,@object # @.str1
.L.str1:
.asciz "%lld\n"
.size .L.str1, 6
.type .L.str2,@object # @.str2
.L.str2:
.asciz "%u\n"
.size .L.str2, 4
.type .L.str3,@object # @.str3
.L.str3:
.asciz "-------------------------\n"
.size .L.str3, 27
.ident "Ubuntu clang version 3.6.2-1 (tags/RELEASE_362/final) (based on LLVM 3.6.2)"
.section ".note.GNU-stack","",@progbits
Notice that the problem is in how the value in the rbx register (used here as the 32-bit EBX) is handled. Because the last operands were literals, my logic internally records the value as a signed 32-bit int. Therefore, when widening to 64 bits, it emits movslq to sign-extend the value. gcc (following the C compiler rules), on the other hand, zero-extends it by moving the 32-bit value into another 32-bit register (which zero-fills the upper half) and then uses that register as a 64-bit value.
There are two other, somewhat more complex problems that I also want to solve (carry and overflow handling for integer, SSE, and FPU arithmetic; unsigned/signed handling; etc.), so if you are good at x86-64 assembly, I am a well-behaved conversationalist. Searching the Internet does turn up some results on these topics. However, I have not found a definitive, comprehensive text on two's complement arithmetic.