(Updated to remove decltype and replace with static_cast, same results)
In the code example below, adding a cast inside the MAX macro makes the code run faster. I can't figure out why, since the two versions look like they should be identical. This happens with two different ARM compilers: GCC, and armclang (in a larger codebase). Any thoughts on this would be very helpful.
In the code below, when WITH_CAST is defined, the compiled code is significantly faster (I see the same effect in my larger codebase). The cast appears to be superfluous. I am running this within Keil 5.25pre2 (as a simulator only). I measured performance in the Keil simulator by looking at how many microseconds the t1 timer reports as elapsed.
Snippet from code:
// MAX(a,b): expands to a conditional expression that yields a when (a) > (b) is true, otherwise b.
// With WITH_CAST defined, the selected operand is wrapped in static_cast<mytype>;
// without it, the operand is used exactly as written.
// Each argument appears twice in the expansion, so an argument with side effects may be evaluated twice.
#if defined (WITH_CAST)
#define MAX(a,b) (((a) > (b)) ? (static_cast<mytype>(a)) : (static_cast<mytype>(b)))
#else
#define MAX(a,b) (((a) > (b)) ? ((a)) : ((b)))
#endif
GNU Arm Tools Embedded v. 7 2017-q4-major.
Compiler options: -c -mcpu=cortex-m4 -mthumb -gdwarf-2 -MD -Wall -O -mapcs-frame -mthumb-interwork -std=c++14 -Ofast -I./RTE/_Target_1 -IC:/Keil_v525pre/ARM/PACK/ARM/CMSIS/5.2.0/CMSIS/Include -IC:/Keil_v525pre/ARM/PACK/ARM/CMSIS/5.2.0/Device/ARM/ARMCM4/Include -I"C:/Program Files (x86)/GNU Tools ARM Embedded/7 2017-q4-major/arm-none-eabi/include" -I"C:/Program Files (x86)/GNU Tools ARM Embedded/7 2017-q4-major/lib/gcc/arm-none-eabi/7.2.1/include" -I"C:/Program Files (x86)/GNU Tools ARM Embedded/7 2017-q4-major/arm-none-eabi/include/c++/7.2.1" -I"C:/Program Files (x86)/GNU Tools ARM Embedded/7 2017-q4-major/arm-none-eabi/include/c++/7.2.1/arm-none-eabi" -D__UVISION_VERSION="525" -D__GCC -D__GCC_VERSION="721" -D_RTE_ -DARMCM4 -Wa,-alhms="*.lst" -o *.o
Assembler options: -mcpu=cortex-m4 -mthumb --gdwarf-2 -mthumb-interwork --MD .d -I./RTE/_Target_1 -IC:/Keil_v525pre/ARM/PACK/ARM/CMSIS/5.2.0/CMSIS/Include -IC:/Keil_v525pre/ARM/PACK/ARM/CMSIS/5.2.0/Device/ARM/ARMCM4/Include -I"C:/Program Files (x86)/GNU Tools ARM Embedded/7 2017-q4-major/arm-none-eabi/include" -I"C:/Program Files (x86)/GNU Tools ARM Embedded/7 2017-q4-major/lib/gcc/arm-none-eabi/7.2.1/include" -I"C:/Program Files (x86)/GNU Tools ARM Embedded/7 2017-q4-major/arm-none-eabi/include/c++/7.2.1" -I"C:/Program Files (x86)/GNU Tools ARM Embedded/7 2017-q4-major/arm-none-eabi/include/c++/7.2.1/arm-none-eabi" -alhms=".lst" -o *.o
Linker options: -T ./RTE/Device/ARMCM4/gcc_arm.ld -mcpu=cortex-m4 -mthumb -mthumb-interwork -Wl,-Map="./Optimization.map" -o Optimization.elf *.o -lm
#include <cstdlib>
#include <cstring>
#include <cstdint>
// Selects the static_cast variant of the MAX macro defined below; remove this define to build the plain variant.
#define WITH_CAST
// Thin wrapper around a 32-bit unsigned value, ordered by that value.
struct mytype {
    uint32_t value;

    // Returns true when t.value is strictly greater than a.value.
    // Marked `pure` rather than `const`: the operands are taken by reference, so the
    // result depends on memory read through them, and GCC's `const` attribute is not
    // meant for functions that examine data behind pointer or reference arguments.
    __attribute__((pure, always_inline)) constexpr friend bool operator>(const mytype & t, const mytype & a) {
        return t.value > a.value;
    }
};
// Output buffer of 32 elements and a pointer to its first element.
static mytype output_buf [32];
static mytype * output_memory_ptr = output_buf;
// Pointer to output_memory_ptr whose pointee is volatile-qualified, so reading
// *output_memory_tmpp is a volatile access to the stored buffer address.
// NOTE(review): presumably this stops the optimizer from assuming which buffer main() writes — confirm.
static mytype * volatile * output_memory_tmpp = &output_memory_ptr;
// Input buffer of 32 elements and a pointer to its first element.
static mytype input_buf [32];
static mytype * input_memory_ptr = input_buf;
// Same volatile-qualified indirection as for the output side, here for the input buffer address.
static mytype * volatile * input_memory_tmpp = &input_memory_ptr;
// MAX(a,b): expands to a conditional expression that yields a when (a) > (b) is true, otherwise b.
// With WITH_CAST defined, the selected operand is wrapped in static_cast<mytype>;
// without it, the operand is used exactly as written.
// Each argument appears twice in the expansion, so an argument with side effects may be evaluated twice.
#if defined (WITH_CAST)
#define MAX(a,b) (((a) > (b)) ? (static_cast<mytype>(a)) : (static_cast<mytype>(b)))
#else
#define MAX(a,b) (((a) > (b)) ? ((a)) : ((b)))
#endif
int main (void) {
const mytype * input = *input_memory_tmpp;
mytype * output = *output_memory_tmpp;
mytype p = input[0];
mytype c = input[1];
mytype pc = MAX(p, c);
output[0] = pc;
for (int i = 1; i < 31; i ++) {
mytype n = input[i + 1];
mytype cn = MAX(c, n);
output[i] = MAX(pc, cn);
p = c;
c = n;
pc = cn;
}
output[31] = pc;
}
With that cast, the result of `?:` is not an lvalue. Without that cast, the result of `?:` is an lvalue. Now, why it makes your code faster is a different story... - AnT