0
votes

I am writing an expression evaluator which produces x86 64-bit assembly. I chose to mimic the C++ compiler rules for the variable types (char-to-int promotion etc., and literal values). However, I have an issue which is perplexing. When evaluating a complex mathematical expression, I first convert it to postfix. Next, the logic tracks the data type of each intermediate result according to the conversions applied and/or the input operand types. My question is: when do signed numbers become unsigned, and vice versa, within the intermediate representation of a C++ compiler? While comparing the generated assembly, I noticed that clang will sometimes use a movl where, based upon the previous data types, you might expect a sign-extending instruction (such as movswl). In general, I want to mimic a C++ compiler's type-conversion rules, including during intermediate processing.

As an example, consider this input into my program (the data types mimic c++):

short s_A=658
long long ll_B=293
unsigned int ui_C=94

print s_A
print ll_B
print ui_C
print "-------------------------"

ll_B=((394)-ui_C*(649)+(917)+ll_B-(80))
print ll_B


print "-------------------------"
print s_A
print ll_B
print ui_C

The equivalent program written in C++ is:

#include <stdio.h>

// Test program: prints a short, a long long and an unsigned int, assigns a
// mixed signed/unsigned expression to ll_B, then prints all three again.
int main(void) {
    short s_A=658;
    long long ll_B=293;
    unsigned int ui_C=94;

    // s_A is promoted to int when passed through the variadic argument list.
    printf("%hi\n",s_A);
    printf("%lld\n",ll_B);
    printf("%u\n",ui_C);
    printf("-------------------------\n");

    // Usual arithmetic conversions: ui_C is unsigned int, so the int literals
    // 394, 649 and 917 are converted to unsigned int and
    // 394 - ui_C*649 + 917 is computed modulo 2^32. Only when that result is
    // added to ll_B is it converted to long long; that conversion keeps the
    // unsigned value (a zero-extension on a target with 32-bit unsigned int
    // and 64-bit long long). With ui_C == 94 this stores 4294907814.
    ll_B=((394)-ui_C*(649)+(917)+ll_B-(80));
    printf("%lld\n",ll_B);


    printf("-------------------------\n");
    printf("%hi\n",s_A);
    printf("%lld\n",ll_B);
    printf("%u\n",ui_C);

}

My logic produces the following assembly:

# x86-64 AT&T-syntax (GAS) program emitted by the author's expression
# evaluator. It prints s_A, ll_B and ui_C through printf, evaluates
#   ll_B = ((394)-ui_C*(649)+(917)+ll_B-(80))
# prints the result and the variables again, then calls exit(0).
# printf receives its format string in %rdi and its single argument in %rsi.
.section .data
# printf format strings
LC1:
    .asciz "%hi\n"
LC2:
    .asciz "%lld\n"
LC5:
    .asciz "%s\n"
LC3:
    .asciz "%u\n"
# Separator text; it has no trailing newline and is printed through LC5.
LC4:
    .asciz "-------------------------"
# Program variables
ll_B:
    .quad 293   # long long, 64 bits
s_A:
    .word 658   # short, 16 bits
ui_C:
    .long 94    # unsigned int, 32 bits
.section .text
.globl _start
_start:
    # Clear the low four bits of %rsp so the stack is 16-byte aligned.
    and $0xfffffffffffffff0,%rsp
    movq    %rsp, %rbp
    # Initialise the x87 FPU; no FPU instruction follows in this listing.
    finit
    # print s_A
    # NOTE(review): the operand size comes from %rsi, so this is a 64-bit
    # load from a 16-bit variable. gcc and clang use movswl into %esi here.
    mov s_A, %rsi
    # %rax = 0, so %al (vector-register count for a variadic call) is 0.
    xor %rax, %rax
    lea LC1, %rdi
    call printf
    # print ll_B
    mov ll_B, %rsi
    xor %rax, %rax
    lea LC2, %rdi
    call printf
    # print ui_C
    # NOTE(review): 64-bit load from a 32-bit variable, which is also the
    # last object defined in .data in this listing.
    mov ui_C, %rsi
    xor %rax, %rax
    lea LC3, %rdi
    call printf
    # print the separator line as printf("%s\n", LC4)
    lea LC4, %rsi
    xor %rax, %rax
    lea LC5, %rdi
    call printf
# Expression: ((394)-ui_C*(649)+(917)+ll_B-(80))
# Postfix: 394 ui_C 649 * - 917 + ll_B + 80 - 
# EVAL(1): ui_C*649
# MOVE ui_C ( UnsignedInt - 32bits ) to ( Int - 32bits  )
# NOTE(review): this is where the evaluator drops the unsigned type. Under
# the C++ usual arithmetic conversions the int literal is converted to
# unsigned int instead, and the product stays unsigned int.
# instructionType=STATE_x86
    movl ui_C, %eax
    imull $649, %eax
# EVAL(2): 394-%eax
    movl $394, %ebx
    subl %eax, %ebx
# free register %eax
# EVAL(3): %ebx+917
    addl $917, %ebx
# EVAL(4): %ebx+ll_B
# CONVERT %ebx ( Int - 32bits ) to ( LongLong - 64bits  )
# instructionType=STATE_x86
# NOTE(review): the line marked ISSUE sign-extends %ebx. gcc and clang
# zero-extend at this point with a 32-bit register move, because the
# intermediate is unsigned int. With ui_C = 94 the sign-extended result is
# -59482, whereas the C++ program stores 4294907814.
    movslq %ebx, %rax  #ISSUE
    addq ll_B, %rax
# free register %ebx
# EVAL(5): %rax-80
    subq $80, %rax
# STORE RESULT :  %rax -> ll_B
# MOVE %rax ( LongLong - 64bits ) to ( LongLong - 64bits  )
# instructionType=STATE_x86
    movq %rax, ll_B
#
    # print the new ll_B
    mov ll_B, %rsi
    xor %rax, %rax
    lea LC2, %rdi
    call printf
    # print the separator line
    lea LC4, %rsi
    xor %rax, %rax
    lea LC5, %rdi
    call printf
    # print s_A (same 64-bit load from a 16-bit variable as at the start)
    mov s_A, %rsi
    xor %rax, %rax
    lea LC1, %rdi
    call printf
    # print ll_B
    mov ll_B, %rsi
    xor %rax, %rax
    lea LC2, %rdi
    call printf
    # print ui_C (same 64-bit load from a 32-bit variable as at the start)
    mov ui_C, %rsi
    xor %rax, %rax
    lea LC3, %rdi
    call printf
done:
    # exit(0)
    mov $0, %rdi
    call exit

while gcc produces

    # gcc output (GCC 5.2.1 per the .ident line) for the C++ test program.
    # Stack slots used below: s_A at -14(%rbp), ll_B at -8(%rbp),
    # ui_C at -12(%rbp).
    .file   "whole_short_int_longlong_simple_33.cpp"
    .section    .rodata
.LC0:
    .string "%hi\n"
.LC1:
    .string "%lld\n"
.LC2:
    .string "%u\n"
.LC3:
    .string "-------------------------"
    .text
    .globl  main
    .type   main, @function
main:
.LFB0:
    .cfi_startproc
    pushq   %rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    movq    %rsp, %rbp
    .cfi_def_cfa_register 6
    subq    $16, %rsp
    # s_A = 658, ll_B = 293, ui_C = 94
    movw    $658, -14(%rbp)
    movq    $293, -8(%rbp)
    movl    $94, -12(%rbp)
    # printf("%hi\n", s_A): the short is sign-extended to a 32-bit int.
    movswl  -14(%rbp), %eax
    movl    %eax, %esi
    movl    $.LC0, %edi
    # %eax = 0: no vector registers are used by this variadic call.
    movl    $0, %eax
    call    printf
    # printf("%lld\n", ll_B)
    movq    -8(%rbp), %rax
    movq    %rax, %rsi
    movl    $.LC1, %edi
    movl    $0, %eax
    call    printf
    # printf("%u\n", ui_C)
    movl    -12(%rbp), %eax
    movl    %eax, %esi
    movl    $.LC2, %edi
    movl    $0, %eax
    call    printf
    # The separator is printed with puts; .LC3 carries no "\n" of its own.
    movl    $.LC3, %edi
    call    puts
    # ll_B = ((394)-ui_C*(649)+(917)+ll_B-(80))
    # %eax = ui_C * 649 in 32 bits
    movl    -12(%rbp), %eax
    imull   $649, %eax, %eax
    # The constants 394 and 917 are folded into 1311; %edx = 1311 - %eax.
    movl    $1311, %edx
    subl    %eax, %edx
    # Writing a 32-bit register clears the upper 32 bits of the full
    # register, so %rdx ends up holding the zero-extended 32-bit result.
    # This is the unsigned int -> long long conversion; no movslq is used.
    movl    %edx, %eax
    movl    %eax, %edx
    # %rax = ll_B + zero-extended intermediate, then minus 80 in 64 bits
    movq    -8(%rbp), %rax
    addq    %rdx, %rax
    subq    $80, %rax
    movq    %rax, -8(%rbp)
    # printf("%lld\n", ll_B)
    movq    -8(%rbp), %rax
    movq    %rax, %rsi
    movl    $.LC1, %edi
    movl    $0, %eax
    call    printf
    movl    $.LC3, %edi
    call    puts
    # printf("%hi\n", s_A)
    movswl  -14(%rbp), %eax
    movl    %eax, %esi
    movl    $.LC0, %edi
    movl    $0, %eax
    call    printf
    # printf("%lld\n", ll_B)
    movq    -8(%rbp), %rax
    movq    %rax, %rsi
    movl    $.LC1, %edi
    movl    $0, %eax
    call    printf
    # printf("%u\n", ui_C)
    movl    -12(%rbp), %eax
    movl    %eax, %esi
    movl    $.LC2, %edi
    movl    $0, %eax
    call    printf
    # return 0
    movl    $0, %eax
    leave
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE0:
    .size   main, .-main
    .ident  "GCC: (Ubuntu 5.2.1-22ubuntu2) 5.2.1 20151010"
    .section    .note.GNU-stack,"",@progbits

and clang produces:

# clang output (clang 3.6.2 per the .ident line) for the C++ test program.
# Stack slots used below: s_A at -2(%rbp), ll_B at -16(%rbp),
# ui_C at -20(%rbp). Each printf return value is spilled to its own slot.
.text
    .file   "/home/anthony/comptest/simple/whole_short_int_longlong_simple/whole_short_int_longlong_simple_33.cpp"
    .globl  main
    .align  16, 0x90
    .type   main,@function
main:                                   # @main
    .cfi_startproc
# BB#0:
    pushq   %rbp
.Ltmp0:
    .cfi_def_cfa_offset 16
.Ltmp1:
    .cfi_offset %rbp, -16
    movq    %rsp, %rbp
.Ltmp2:
    .cfi_def_cfa_register %rbp
    subq    $64, %rsp
    movabsq $.L.str, %rdi
    movw    $658, -2(%rbp)          # imm = 0x292
    movq    $293, -16(%rbp)         # imm = 0x125
    movl    $94, -20(%rbp)
    # printf("%hi\n", s_A): the short is sign-extended to a 32-bit int.
    movswl  -2(%rbp), %esi
    # %al = 0: no vector registers are used by this variadic call.
    movb    $0, %al
    callq   printf
    # printf("%lld\n", ll_B)
    movabsq $.L.str1, %rdi
    movq    -16(%rbp), %rsi
    movl    %eax, -24(%rbp)         # 4-byte Spill
    movb    $0, %al
    callq   printf
    # printf("%u\n", ui_C)
    movabsq $.L.str2, %rdi
    movl    -20(%rbp), %esi
    movl    %eax, -28(%rbp)         # 4-byte Spill
    movb    $0, %al
    callq   printf
    # printf("-------------------------\n")
    movabsq $.L.str3, %rdi
    movl    %eax, -32(%rbp)         # 4-byte Spill
    movb    $0, %al
    callq   printf
    # ll_B = ((394)-ui_C*(649)+(917)+ll_B-(80))
    movabsq $.L.str1, %rdi
    # %esi = 394 - ui_C*649 + 917, all in 32-bit registers
    movl    $394, %esi              # imm = 0x18A
    imull   $649, -20(%rbp), %ecx   # imm = 0x289
    subl    %ecx, %esi
    addl    $917, %esi              # imm = 0x395
    # Writing a 32-bit register clears the upper 32 bits of the full
    # register, so %rdx ends up holding the zero-extended 32-bit result.
    # This is the unsigned int -> long long conversion; no movslq is used.
    movl    %esi, %ecx
    movl    %ecx, %edx
    # 64-bit part: add ll_B, subtract 80, store back to ll_B
    addq    -16(%rbp), %rdx
    subq    $80, %rdx
    movq    %rdx, -16(%rbp)
    # printf("%lld\n", ll_B)
    movq    -16(%rbp), %rsi
    movl    %eax, -36(%rbp)         # 4-byte Spill
    movb    $0, %al
    callq   printf
    # printf("-------------------------\n")
    movabsq $.L.str3, %rdi
    movl    %eax, -40(%rbp)         # 4-byte Spill
    movb    $0, %al
    callq   printf
    # printf("%hi\n", s_A)
    movabsq $.L.str, %rdi
    movswl  -2(%rbp), %esi
    movl    %eax, -44(%rbp)         # 4-byte Spill
    movb    $0, %al
    callq   printf
    # printf("%lld\n", ll_B)
    movabsq $.L.str1, %rdi
    movq    -16(%rbp), %rsi
    movl    %eax, -48(%rbp)         # 4-byte Spill
    movb    $0, %al
    callq   printf
    # printf("%u\n", ui_C)
    movabsq $.L.str2, %rdi
    movl    -20(%rbp), %esi
    movl    %eax, -52(%rbp)         # 4-byte Spill
    movb    $0, %al
    callq   printf
    # return 0
    xorl    %ecx, %ecx
    movl    %eax, -56(%rbp)         # 4-byte Spill
    movl    %ecx, %eax
    addq    $64, %rsp
    popq    %rbp
    retq
.Ltmp3:
    .size   main, .Ltmp3-main
    .cfi_endproc

    .type   .L.str,@object          # @.str
    .section    .rodata.str1.1,"aMS",@progbits,1
.L.str:
    .asciz  "%hi\n"
    .size   .L.str, 5

    .type   .L.str1,@object         # @.str1
.L.str1:
    .asciz  "%lld\n"
    .size   .L.str1, 6

    .type   .L.str2,@object         # @.str2
.L.str2:
    .asciz  "%u\n"
    .size   .L.str2, 4

    .type   .L.str3,@object         # @.str3
.L.str3:
    .asciz  "-------------------------\n"
    .size   .L.str3, 27


    .ident  "Ubuntu clang version 3.6.2-1 (tags/RELEASE_362/final) (based on LLVM 3.6.2)"
    .section    ".note.GNU-stack","",@progbits

Notice that the problem is within the evaluation of the rbx register (specifically its 32-bit half, EBX). Because the last operands were literals, internally my logic notes these values as signed 32-bit int. Therefore, when widening to 64 bits, movslq is emitted to sign-extend the value. gcc (following the C compiler rules), on the other hand, zero-extends by moving the 32-bit data into another 32-bit register (which zero-fills the upper half) and then references it as a 64-bit value.

There are two other problems which are a little more complex that I also want to solve (carry, overflow (integer, SSE and FPU), unsigned/signed etc.), so if you are good at assembly on x86-64, I am a well-behaved conversationalist. Researching the Internet does produce some results for issues relating to this subject. However, I have not found a definitive, encompassing text covering these two's-complement topics.

1
Do not add tags for unrelated languages. TL;DR. No need to post your whole program code. Provide a minimal reproducible example. – too honest for this site
Understand integer conversion rules. The first link in google when searching for pretty much that term. You really must've exhausted yourself greatly while searching. – Voo
Technically, you only ask one question about the rules of integral type conversion in C++, and the rest is context. And that question is a duplicate. stackoverflow.com/questions/5563000/… – CinchBlue

1 Answers

0
votes

Section 4.5 explains how a value is converted to its own type:

4.5 Integral promotions [conv.prom]

  • A prvalue of an integer type other than bool, char16_t, char32_t, or wchar_t whose integer conversion rank (4.13) is less than the rank of int can be converted to a prvalue of type int if int can represent all the values of the source type; otherwise, the source prvalue can be converted to a prvalue of type unsigned int.

  • A prvalue of type char16_t, char32_t, or wchar_t (3.9.1) can be converted to a prvalue of the first of the following types that can represent all the values of its underlying type: int, unsigned int, long int, unsigned long int, long long int, or unsigned long long int. If none of the types in that list can represent all the values of its underlying type, a prvalue of type char16_t, char32_t, or wchar_t can be converted to a prvalue of its underlying type.

  • A prvalue of an unscoped enumeration type whose underlying type is not fixed (7.2) can be converted to a prvalue of the first of the following types that can represent all the values of the enumeration (i.e., the values in the range b min to b max as described in 7.2): int, unsigned int, long int, unsigned long int, long long int, or unsigned long long int. If none of the types in that list can represent all the values of the enumeration, a prvalue of an unscoped enumeration type can be converted to a prvalue of the extended integer type with lowest integer conversion rank (4.13) greater than the rank of long long in which all the values of the enumeration can be represented. If there are two such extended types, the signed one is chosen.

  • A prvalue of an unscoped enumeration type whose underlying type is fixed (7.2) can be converted to a prvalue of its underlying type. Moreover, if integral promotion can be applied to its underlying type, a rvalue of an unscoped enumeration type whose underlying type is fixed can also be converted to a prvalue of the promoted underlying type.

  • A prvalue for an integral bit-field (9.6) can be converted to a prvalue of type int if int can represent all the values of the bit-field; otherwise, it can be converted to unsigned int if unsigned int can represent all the values of the bit-field. If the bit-field is larger yet, no integral promotion applies to it. If the bit-field has an enumerated type, it is treated as any other value of that type for promotion purposes.

  • A prvalue of type bool can be converted to a prvalue of type int, with false becoming zero and true becoming one.
  • These conversions are called integral promotions.

Section 5 of the C++ standard explains the promotion within expressions.

... lots of other things ...

  • Otherwise, the integral promotions (4.5) shall be performed on both operands. [59] Then the following rules shall be applied to the promoted operands:

    • If both operands have the same type, no further conversion is needed.

    • Otherwise, if both operands have signed integer types or both have unsigned integer types, the operand with the type of lesser integer conversion rank shall be converted to the type of the operand with greater rank.

    • Otherwise, if the operand that has unsigned integer type has rank greater than or equal to the rank of the type of the other operand, the operand with signed integer type shall be converted to the type of the operand with unsigned integer type.

    • Otherwise, if the type of the operand with signed integer type can represent all of the values of the type of the operand with unsigned integer type, the operand with unsigned integer type shall be converted to the type of the operand with signed integer type.

    • Otherwise, both operands shall be converted to the unsigned integer type corresponding to the type of the operand with signed integer type.

[59] As a consequence, operands of type bool, char16_t, char32_t, wchar_t, or an enumerated type are converted to some integral type.

So, if we take your expression:

ll_B=((394)-ui_C*(649)+(917)+ll_B-(80));

Remove the superfluous parentheses to make it easier to read:

ll_B=394-ui_C*649+917+ll_B-80;

So, we have 394, a signed integer by section 4.5 rules. Then ui_C*649 is an unsigned expression. 917 is a signed value, but it is added to an unsigned value of equal or greater rank, so it is converted to unsigned (and it fits in the unsigned range). Then ll_B is long long, and 80 is a signed integer.

As a summary, we should see:

  • unsigned multiplication of ui_C with 649.
  • Subtracting that from 394.
  • Adding 917 (unsigned).
  • Unsigned extension to 64 bits.
  • Adding ll_B
  • Subtracting 80.