4
votes

I am using a Haswell Core i7-4790K.

When I compile the following toy example with icc -O3 -std=c99 -march=core-avx2 -g:

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

/* One array element: three 256-bit integer vectors (__m256i). */
typedef struct {
  __m256i a;
  __m256i b;
  __m256i c;
} mystruct_t;

/* Number of mystruct_t elements allocated in main() and filled by _do(). */
#define SIZE     1000
/* Base value from which the broadcast byte values in _do() are derived. */
#define TEST_VAL 42

/*
 * Fills the first SIZE elements of `array` and returns a bit count.
 *
 * For element i, the fields a, b and c are each set to a vector produced by
 * _mm256_set1_epi8 from TEST_VAL + i*3, TEST_VAL + i*3 + 1 and
 * TEST_VAL + i*3 + 2 respectively (the value is converted to the intrinsic's
 * byte-sized argument). The return value is the sum, over every vector
 * stored, of the population count of that vector's byte mask.
 *
 * `array` must point to at least SIZE writable elements.
 */
int _do(mystruct_t* array) {
  /* Running total of set mask bits. */
  int value = 0;

  for (size_t i = 0; i < SIZE; ++i) {
    /* Broadcast three consecutive values into the element's three fields. */
    array[i].a = _mm256_set1_epi8(TEST_VAL + i*3    );
    array[i].b = _mm256_set1_epi8(TEST_VAL + i*3 + 1);
    array[i].c = _mm256_set1_epi8(TEST_VAL + i*3 + 2);

    /* movemask gathers the top bit of each byte into a 32-bit mask;
       popcnt counts the bits set in that mask. */
    value += _mm_popcnt_u32(_mm256_movemask_epi8(array[i].a)) +
             _mm_popcnt_u32(_mm256_movemask_epi8(array[i].b)) +
             _mm_popcnt_u32(_mm256_movemask_epi8(array[i].c));
  }

  return value;
}

int main() {
  mystruct_t* array = (mystruct_t*)_mm_malloc(SIZE * sizeof(*array), 32);
  printf("%d\n", _do(array));
  _mm_free(array);
}

The following ASM code is produced for the _do() function:

0x0000000000400bc0 <+0>:    xor    %eax,%eax
0x0000000000400bc2 <+2>:    xor    %ecx,%ecx
0x0000000000400bc4 <+4>:    xor    %edx,%edx
0x0000000000400bc6 <+6>:    nopl   (%rax)
0x0000000000400bc9 <+9>:    nopl    0x0(%rax)
0x0000000000400bd0 <+16>:   lea     0x2b(%rdx),%r8d
0x0000000000400bd4 <+20>:   inc    %ecx
0x0000000000400bd6 <+22>:   lea     0x2a(%rdx),%esi
0x0000000000400bd9 <+25>:   lea     0x2c(%rdx),%r9d
0x0000000000400bdd <+29>:   add    $0x3,%edx
0x0000000000400be0 <+32>:   vmovd  %r8d,%xmm1
0x0000000000400be5 <+37>:   vpbroadcastb %xmm1,%ymm4
0x0000000000400bea <+42>:   vmovd  %esi,%xmm0
0x0000000000400bee <+46>:   vpmovmskb %ymm4,%r11d
0x0000000000400bf2 <+50>:   vmovd  %r9d,%xmm2
0x0000000000400bf7 <+55>:   vmovdqu %ymm4,0x20(%rdi)
0x0000000000400bfc <+60>:   vpbroadcastb %xmm0,%ymm3
0x0000000000400c01 <+65>:   vpbroadcastb %xmm2,%ymm5
0x0000000000400c06 <+70>:   vpmovmskb %ymm3,%r10d
0x0000000000400c0a <+74>:   vmovdqu %ymm3,(%rdi)
0x0000000000400c0e <+78>:   vmovdqu %ymm5,0x40(%rdi)
0x0000000000400c13 <+83>:   popcnt %r11d,%esi
0x0000000000400c18 <+88>:   add    $0x60,%rdi
0x0000000000400c1c <+92>:   vpmovmskb %ymm5,%r11d
0x0000000000400c20 <+96>:   popcnt %r10d,%r9d
0x0000000000400c25 <+101>:  popcnt %r11d,%r8d
0x0000000000400c2a <+106>:  add    %esi,%r9d
0x0000000000400c2d <+109>:  add    %r8d,%r9d
0x0000000000400c30 <+112>:  add    %r9d,%eax
0x0000000000400c33 <+115>:  cmp    $0x3e8,%ecx
0x0000000000400c39 <+121>:  jb      0x400bd0 <_do+16>
0x0000000000400c3b <+123>:  vzeroupper 
0x0000000000400c3e <+126>:  retq   
0x0000000000400c3f <+127>:  nop

If I compile the same code using gcc-5 -O3 -std=c99 -mavx2 -march=native -g, the following ASM code is produced for the _do() function:

0x0000000000400650 <+0>:    lea     0x17700(%rdi),%r9
0x0000000000400657 <+7>:    mov    $0x2a,%r8d
0x000000000040065d <+13>:   xor    %eax,%eax
0x000000000040065f <+15>:   nop
0x0000000000400660 <+16>:   lea     0x1(%r8),%edx
0x0000000000400664 <+20>:   vmovd  %r8d,%xmm2
0x0000000000400669 <+25>:   xor    %esi,%esi
0x000000000040066b <+27>:   vpbroadcastb %xmm2,%ymm2
0x0000000000400670 <+32>:   vmovd  %edx,%xmm1
0x0000000000400674 <+36>:   add    $0x60,%rdi
0x0000000000400678 <+40>:   lea     0x2(%r8),%edx
0x000000000040067c <+44>:   vpbroadcastb %xmm1,%ymm1
0x0000000000400681 <+49>:   vmovdqa %ymm2,-0x60(%rdi)
0x0000000000400686 <+54>:   add    $0x3,%r8d
0x000000000040068a <+58>:   vmovd  %edx,%xmm0
0x000000000040068e <+62>:   vpmovmskb %ymm2,%edx
0x0000000000400692 <+66>:   vmovdqa %ymm1,-0x40(%rdi)
0x0000000000400697 <+71>:   vpbroadcastb %xmm0,%ymm0
0x000000000040069c <+76>:   popcnt %edx,%esi
0x00000000004006a0 <+80>:   vpmovmskb %ymm1,%edx
0x00000000004006a4 <+84>:   popcnt %edx,%edx
0x00000000004006a8 <+88>:   vpmovmskb %ymm0,%ecx
0x00000000004006ac <+92>:   add    %esi,%edx
0x00000000004006ae <+94>:   vmovdqa %ymm0,-0x20(%rdi)
0x00000000004006b3 <+99>:   popcnt %ecx,%ecx
0x00000000004006b7 <+103>:  add    %ecx,%edx
0x00000000004006b9 <+105>:  add    %edx,%eax
0x00000000004006bb <+107>:  cmp    %rdi,%r9
0x00000000004006be <+110>:  jne     0x400660 <_do+16>
0x00000000004006c0 <+112>:  vzeroupper 
0x00000000004006c3 <+115>:  retq

My questions are:

1) Why does icc use unaligned moves (vmovdqu), unlike gcc?

2) Is there a penalty when vmovdqu is used instead of vmovdqa on aligned memory?

P.S: The problem is the same using SSE instructions/registers.

Thanks

2
ICC started doing this in 2012 and MSVC followed suit a year later. The annoyance is that it doesn't crash when the data is misaligned. So you don't even know there's a performance problem. Fortunately, the streaming instructions only have aligned versions. So there's no room for the compiler to "cheat". - Mysticial

2 Answers

7
votes

There is no penalty to using VMOVDQU when the address is aligned. The behavior is identical to using VMOVDQA in that case.

As for "why", there may not be a single clear answer. It's possible that ICC does this deliberately so that users who later call _do with an unaligned argument will not crash, but it's also possible that it's simply emergent behavior of the compiler. Someone on the Intel compiler team could answer this question; the rest of us can only speculate.

3
votes

There are three factors at play that solve the bigger problem:

a) faulting behavior may be good for debugging performance but not as good for production code - especially when a mix of 3rd party libraries is involved - very few would take a crash over slightly slower performance of their software product at a customer site

b) Intel microarchitectures solved the performance problem of "unaligned" instruction forms on aligned data starting with Nehalem: they have the same performance as the "aligned" forms. AMD did it even before that, I think

c) AVX+ improved the architectural behavior of Load+OP forms over SSE to non-faulting, so

VADDPS ymm0, ymm0, ymmword ptr [rax]; // no longer faults when rax is misaligned

Since for AVX+ we want the compiler to still have the liberty to use either standalone or Load+OP instruction forms when generating code from intrinsics, for code such as this:

_mm256_add_ps( a, *(__m256*)data_ptr  );

With AVX+ a compiler can use the vMOVUs (VMOVUPS/VMOVUPD/VMOVDQU) for all loads and maintain uniform behavior with Load+OP forms

This is needed so that when the source code changes slightly, or the code generation for the same code changes (e.g. between different compilers/versions or due to inlining), and the compiler switches from a Load+OP instruction to standalone Load and OP instructions, the behavior of the load stays the same as with Load+OP, i.e. non-faulting.

So AVX, with the above compiler practice and the use of “unaligned” store instruction forms, overall allows uniform non-faulting behavior for SIMD code, without a performance loss on aligned data.

Of course, there are still (relatively rare) special-purpose instructions for non-temporal stores (vMOVNTDQ/vMOVNTPS/vMOVNTPD) and loads from WC types of memory (vMOVNTDQA) that maintain faulting behavior for misaligned addresses.

-Max Locktyukhin, Intel