I'm experimenting with an i.MX53 board running Ubuntu Linux. I'm working over SSH and building with a cross compiler (arm-linux-gnueabihf) on my host system.
For a benchmark of floating-point operations on the processor (ARM Cortex-A8), I created the following two programs, which both contain a simple loop. In the first program, the loop contains a single multiplication; in the second one, the same multiplication is followed by an extra addition.
I compiled the two programs with the following compiler call:
arm-linux-gnueabihf-g++-4.8 -O3 -ffast-math -Ofast -Wall -fmessage-length=0 -Wno-multichar -Wno-unknown-pragmas -std=c++11 -mcpu=cortex-a8 -mfpu=neon -mfloat-abi=hard -save-temps loopMul.cpp -o loopMul
Now my question: Why does the compiler output such different assembly code for the loop part? (See the assembly code below.)
I noticed that for the first program the compiler generates NEON instructions for the multiplication, whereas for the second one it generates only the slower VFP instructions.
first program (loopMul.cpp):
#include <iostream>
#include <ctime>
using namespace std;
// Benchmark (loopMul): times a loop that multiplies a float accumulator by
// every index in [1, length), then prints the accumulator and the elapsed time.
// argc/argv are not used.
int main(int argc, char **argv)
{
// 10E7 is a floating-point literal (1e8) that is converted to size_t here.
size_t length = 10E7;
float test = 1;
clock_t start = clock();
for(size_t i=1; i<length; i++)
{
// i is converted from size_t to float for the multiplication.
test *= i;
}
clock_t elapsed = clock() - start;
// The loop result is printed, so the accumulator is actually consumed.
cout << test << endl;
// Convert clock ticks to seconds in single precision.
float elapsed_seconds = float(elapsed) / float(CLOCKS_PER_SEC);
// Note: there is no space before "seconds" in the emitted text.
cout << "loop took " << elapsed_seconds << "seconds" << endl;
return 0;
}
second program (loopMulAdd.cpp):
#include <iostream>
#include <ctime>
using namespace std;
// Benchmark (loopMulAdd): times a loop that, for every index in [1, length),
// multiplies a float accumulator by the index and then adds 1 to it; prints
// the accumulator and the elapsed time. argc/argv are not used.
int main(int argc, char **argv)
{
// 10E7 is a floating-point literal (1e8) that is converted to size_t here.
size_t length = 10E7;
float test = 1;
clock_t start = clock();
for(size_t i=1; i<length; i++)
{
// Each iteration computes test = test * i + 1, so every iteration needs the
// complete result of the previous one.
test *= i;
test += 1;
}
clock_t elapsed = clock() - start;
// The loop result is printed, so the accumulator is actually consumed.
cout << test << endl;
// Convert clock ticks to seconds in single precision.
float elapsed_seconds = float(elapsed) / float(CLOCKS_PER_SEC);
// Note: there is no space before "seconds" in the emitted text.
cout << "loop took " << elapsed_seconds << "seconds" << endl;
return 0;
}
assembly output of the first program (loopMul.s):
@ Assembler setup and EABI build attributes emitted by GCC for loopMul.cpp.
@ Target: Cortex-A8, NEON FPU, Thumb instruction set, unified syntax.
@ NOTE(review): the tag names below follow the ARM EABI build-attribute
@ numbering; confirm against the ABI addenda before relying on them.
.syntax unified
.cpu cortex-a8
@ Tag 27: ABI_HardFP_use
.eabi_attribute 27, 3
@ Tag 28: ABI_VFP_args (the build uses -mfloat-abi=hard)
.eabi_attribute 28, 1
.fpu neon
@ Tag 23: ABI_FP_number_model
.eabi_attribute 23, 1
@ Tag 24: ABI_align_needed
.eabi_attribute 24, 1
@ Tag 25: ABI_align_preserved
.eabi_attribute 25, 1
@ Tag 26: ABI_enum_size
.eabi_attribute 26, 2
@ Tag 30: ABI_optimization_goals
.eabi_attribute 30, 2
@ Tag 34: CPU_unaligned_access
.eabi_attribute 34, 1
@ Tag 18: ABI_PCS_wchar_t
.eabi_attribute 18, 4
.thumb
.file "loopMul.cpp"
@ main (loopMul): compiler output for the multiply-only benchmark.
@ The loop is vectorized: four float partial products are kept in q9 and
@ combined into a single value after the loop.
@ Register roles:
@   r4      = first clock() result, later the elapsed tick count
@   r3/r2   = vector-iteration counter / iteration limit
@   q8      = four consecutive integer indices, q11 = {4,4,4,4}
@   q9      = four running partial products
@   q4, q5  = reduction temporaries (d8-d11, saved in the prologue and
@             live across the second call to clock)
.section .text.startup,"ax",%progbits
.align 2
.global main
.thumb
.thumb_func
.type main, %function
main:
.fnstart
.LFB1265:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, lr}
.save {r4, lr}
@ Save d8-d11 (= q4, q5); they are used below and restored before returning.
fstmfdd sp!, {d8, d9, d10, d11}
.vsave {d8, d9, d10, d11}
@ start = clock()
bl clock
@ q11 = index step of 4 per lane, q9 = partial products initialised to 1.0.
vmov.i32 q11, #4 @ v4si
vmov.f32 q9, #1.0e+0 @ v4sf
@ r2 = (381 << 16) + 30783 = 24999999 vector iterations.
movw r2, #30783
movt r2, 381
movs r3, #0
@ q8 (d16:d17) = {1, 2, 3, 4} loaded from the literal pool at .L8.
vldr d16, .L8
vldr d17, .L8+8
@ Keep the start time in r4 across the loop.
mov r4, r0
.L3:
@ Convert the four integer indices to float.
vcvt.f32.s32 q10, q8
adds r3, r3, #1
cmp r3, r2
@ Advance all four indices by 4.
vadd.i32 q8, q8, q11
@ Multiply each partial product by its index.
vmul.f32 q9, q9, q10
bne .L3
@ Horizontal reduction of q9 = {p0, p1, p2, p3}.
vmov.i32 q8, #0 @ v16qi
@ q4 = {p2, p3, 0, 0}
vext.8 q4, q9, q8, #8
@ q4 = {p0*p2, p1*p3, 0, 0}
vmul.f32 q4, q4, q9
@ q5 = q4 shifted down by one lane, so lane 0 = p1*p3.
vext.8 q5, q4, q8, #4
@ Second clock() call; q4/q5 stay live across it.
bl clock
@ s15 = float constant stored at .L8+16.
@ NOTE(review): presumably the precomputed product of the iterations left
@ over after the 4-wide loop - confirm.
flds s15, .L8+16
@ Lane 0 of q4 = p0*p1*p2*p3.
vmul.f32 q4, q5, q4
@ Move lane 0 (d8[0]) through r3 into s0.
vmov.32 r3, d8[0]
fmsr s0, r3
@ Multiply by the constant, then widen to double for the stream insert.
fmuls s0, s0, s15
fcvtds d0, s0
@ elapsed = second clock() result - start.
subs r4, r0, r4
@ r0 = &std::cout; call std::ostream::_M_insert<double>, then std::endl.
movw r0, #:lower16:_ZSt4cout
movt r0, #:upper16:_ZSt4cout
bl _ZNSo9_M_insertIdEERSoT_
bl _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
@ operator<<(std::cout, .LC0) - the "loop took " string.
movw r0, #:lower16:_ZSt4cout
movw r1, #:lower16:.LC0
movt r0, #:upper16:_ZSt4cout
movt r1, #:upper16:.LC0
bl _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
@ Convert the elapsed tick count to float and scale by the constant at
@ .L8+20. NOTE(review): that bit pattern looks like 1.0e-6f, i.e. the
@ division by CLOCKS_PER_SEC turned into a multiplication - confirm.
fmsr s15, r4 @ int
fsitos s0, s15
flds s15, .L8+20
fmuls s0, s0, s15
fcvtds d0, s0
bl _ZNSo9_M_insertIdEERSoT_
@ Append the "seconds" string (.LC1) and std::endl to the returned stream.
movw r1, #:lower16:.LC1
movt r1, #:upper16:.LC1
bl _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
bl _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
@ Epilogue: restore d8-d11, return 0.
fldmfdd sp!, {d8-d11}
movs r0, #0
pop {r4, pc}
.L9:
.align 3
@ Literal pool: initial index vector, then two float constants stored as
@ raw 32-bit words.
.L8:
.word 1
.word 2
.word 3
.word 4
.word 1733542428
.word 897988541
.fnend
.size main, .-main
@ _GLOBAL__sub_I_main: static-initialisation routine referenced from the
@ .init_array section. It calls the std::ios_base::Init constructor on the
@ object at .LANCHOR0 and then tail-calls __aeabi_atexit with
@ r0 = object, r1 = std::ios_base::Init destructor, r2 = __dso_handle.
.align 2
.thumb
.thumb_func
.type _GLOBAL__sub_I_main, %function
_GLOBAL__sub_I_main:
.fnstart
.LFB1422:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, lr}
@ r4 = address of the object at .LANCHOR0, kept across the constructor call.
movw r4, #:lower16:.LANCHOR0
movt r4, #:upper16:.LANCHOR0
mov r0, r4
bl _ZNSt8ios_base4InitC1Ev
@ Set up the three arguments for __aeabi_atexit.
mov r0, r4
movw r1, #:lower16:_ZNSt8ios_base4InitD1Ev
movw r2, #:lower16:__dso_handle
movt r1, #:upper16:_ZNSt8ios_base4InitD1Ev
movt r2, #:upper16:__dso_handle
@ Restore r4/lr first so the branch below is a tail call.
pop {r4, lr}
b __aeabi_atexit
.cantunwind
.fnend
.size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
@ Data sections for loopMul.s.
@ .init_array entry pointing at the static-initialisation routine.
.section .init_array,"aw",%init_array
.align 2
.word _GLOBAL__sub_I_main(target1)
@ String constants used by main.
.section .rodata.str1.4,"aMS",%progbits,1
.align 2
.LC0:
.ascii "loop took \000"
@ One byte of padding: the string above is 11 bytes including its NUL,
@ so .LC1 starts at offset 12.
.space 1
.LC1:
.ascii "seconds\000"
@ One-byte zero-initialised object _ZStL8__ioinit; .LANCHOR0 is defined at
@ the same address.
.bss
.align 2
.LANCHOR0 = . + 0
.type _ZStL8__ioinit, %object
.size _ZStL8__ioinit, 1
_ZStL8__ioinit:
.space 1
.hidden __dso_handle
.ident "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu7) 4.8.1"
.section .note.GNU-stack,"",%progbits
Assembly output of the second program (loopMulAdd.s):
@ Assembler setup and EABI build attributes emitted by GCC for loopMulAdd.cpp.
@ Target: Cortex-A8, NEON FPU, Thumb instruction set, unified syntax.
@ NOTE(review): the tag names below follow the ARM EABI build-attribute
@ numbering; confirm against the ABI addenda before relying on them.
.syntax unified
.cpu cortex-a8
@ Tag 27: ABI_HardFP_use
.eabi_attribute 27, 3
@ Tag 28: ABI_VFP_args (the build uses -mfloat-abi=hard)
.eabi_attribute 28, 1
.fpu neon
@ Tag 23: ABI_FP_number_model
.eabi_attribute 23, 1
@ Tag 24: ABI_align_needed
.eabi_attribute 24, 1
@ Tag 25: ABI_align_preserved
.eabi_attribute 25, 1
@ Tag 26: ABI_enum_size
.eabi_attribute 26, 2
@ Tag 30: ABI_optimization_goals
.eabi_attribute 30, 2
@ Tag 34: CPU_unaligned_access
.eabi_attribute 34, 1
@ Tag 18: ABI_PCS_wchar_t
.eabi_attribute 18, 4
.thumb
.file "loopMulAdd.cpp"
@ main (loopMulAdd): compiler output for the multiply-then-add benchmark.
@ The loop is scalar: one VFP multiply-accumulate per index, with each
@ iteration consuming the previous iteration's result in s16.
@ Register roles:
@   r4      = first clock() result, later the elapsed tick count
@   r3/r2   = loop index i / loop limit
@   s16     = running accumulator (d8 is saved in the prologue)
@   s14     = constant 1.0 addend
@   s13,s15 = per-iteration temporaries
.section .text.startup,"ax",%progbits
.align 2
.global main
.thumb
.thumb_func
.type main, %function
main:
.fnstart
.LFB1265:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, lr}
.save {r4, lr}
@ Save d8 (contains s16); it is restored before returning.
fstmfdd sp!, {d8}
.vsave {d8}
@ start = clock()
bl clock
@ s16 = initial accumulator. NOTE(review): immediate #112 should encode
@ 1.0, matching `float test = 1` - confirm against the VFP encoding table.
fconsts s16, #112
@ r2 = (1525 << 16) + 57600 = 100000000, the loop limit.
mov r2, #57600
movt r2, 1525
@ i = 1
movs r3, #1
@ s14 = copy of the initial value, used as the addend in every iteration.
fcpys s14, s16
@ Keep the start time in r4 across the loop.
mov r4, r0
.L3:
@ Move i into the FP register file.
fmsr s13, r3 @ int
adds r3, r3, #1
cmp r3, r2
@ s15 = (float)i
fsitos s15, s13
@ s13 = addend, then s13 += s16 * s15, i.e. test * i + 1.
fcpys s13, s14
fmacs s13, s16, s15
@ The result becomes the accumulator for the next iteration.
fcpys s16, s13
bne .L3
@ Second clock() call; the accumulator stays in s16 across it.
bl clock
@ Widen the accumulator to double for the stream insert.
fcvtds d0, s16
@ elapsed = second clock() result - start.
subs r4, r0, r4
@ r0 = &std::cout; call std::ostream::_M_insert<double>, then std::endl.
movw r0, #:lower16:_ZSt4cout
movt r0, #:upper16:_ZSt4cout
bl _ZNSo9_M_insertIdEERSoT_
bl _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
@ operator<<(std::cout, .LC0) - the "loop took " string.
movw r0, #:lower16:_ZSt4cout
movw r1, #:lower16:.LC0
movt r0, #:upper16:_ZSt4cout
movt r1, #:upper16:.LC0
bl _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
@ Convert the elapsed tick count to float and scale by the constant at .L7.
@ NOTE(review): that bit pattern looks like 1.0e-6f, i.e. the division by
@ CLOCKS_PER_SEC turned into a multiplication - confirm.
fmsr s15, r4 @ int
fsitos s0, s15
flds s15, .L7
fmuls s0, s0, s15
fcvtds d0, s0
bl _ZNSo9_M_insertIdEERSoT_
@ Append the "seconds" string (.LC1) and std::endl to the returned stream.
movw r1, #:lower16:.LC1
movt r1, #:upper16:.LC1
bl _ZStlsISt11char_traitsIcEERSt13basic_ostreamIcT_ES5_PKc
bl _ZSt4endlIcSt11char_traitsIcEERSt13basic_ostreamIT_T0_ES6_
@ Epilogue: restore d8, return 0.
fldmfdd sp!, {d8}
movs r0, #0
pop {r4, pc}
.L8:
.align 2
@ Literal pool: one float constant stored as a raw 32-bit word.
.L7:
.word 897988541
.fnend
.size main, .-main
@ _GLOBAL__sub_I_main: static-initialisation routine referenced from the
@ .init_array section. It calls the std::ios_base::Init constructor on the
@ object at .LANCHOR0 and then tail-calls __aeabi_atexit with
@ r0 = object, r1 = std::ios_base::Init destructor, r2 = __dso_handle.
.align 2
.thumb
.thumb_func
.type _GLOBAL__sub_I_main, %function
_GLOBAL__sub_I_main:
.fnstart
.LFB1422:
@ args = 0, pretend = 0, frame = 0
@ frame_needed = 0, uses_anonymous_args = 0
push {r4, lr}
@ r4 = address of the object at .LANCHOR0, kept across the constructor call.
movw r4, #:lower16:.LANCHOR0
movt r4, #:upper16:.LANCHOR0
mov r0, r4
bl _ZNSt8ios_base4InitC1Ev
@ Set up the three arguments for __aeabi_atexit.
mov r0, r4
movw r1, #:lower16:_ZNSt8ios_base4InitD1Ev
movw r2, #:lower16:__dso_handle
movt r1, #:upper16:_ZNSt8ios_base4InitD1Ev
movt r2, #:upper16:__dso_handle
@ Restore r4/lr first so the branch below is a tail call.
pop {r4, lr}
b __aeabi_atexit
.cantunwind
.fnend
.size _GLOBAL__sub_I_main, .-_GLOBAL__sub_I_main
@ Data sections for loopMulAdd.s.
@ .init_array entry pointing at the static-initialisation routine.
.section .init_array,"aw",%init_array
.align 2
.word _GLOBAL__sub_I_main(target1)
@ String constants used by main.
.section .rodata.str1.4,"aMS",%progbits,1
.align 2
.LC0:
.ascii "loop took \000"
@ One byte of padding: the string above is 11 bytes including its NUL,
@ so .LC1 starts at offset 12.
.space 1
.LC1:
.ascii "seconds\000"
@ One-byte zero-initialised object _ZStL8__ioinit; .LANCHOR0 is defined at
@ the same address.
.bss
.align 2
.LANCHOR0 = . + 0
.type _ZStL8__ioinit, %object
.size _ZStL8__ioinit, 1
_ZStL8__ioinit:
.space 1
.hidden __dso_handle
.ident "GCC: (Ubuntu/Linaro 4.8.1-10ubuntu7) 4.8.1"
.section .note.GNU-stack,"",%progbits
-funsafe-math-optimizations
flag? Normally I would have expected a vmla,
but maybe it is due to floating-point correctness. – auselen