2
votes

I've been using ICC on my project, and ICC utilizes vector instructions very well. Recently I tried to use GCC (version 5.5) to compile the same code; however, on some modules, GCC's version is 10 times slower than ICC's. This happens when I do complex multiplication, etc.

Sample code looks like this:

definitions:

float *ptr1 = _mm_malloc(1280 , 64);
float *ptr2 = _mm_malloc(1280 , 64);
float complex *realptr1 = (float complex *)&ptr1[storageOffset];
float complex *realptr2 = (float complex *)&ptr2[storageOffset];

Pragma and compiler options:

__assume_aligned(realptr1, 64);
__assume_aligned(realptr2, 64);
#pragma ivdep
#pragma vector aligned

for (j = 0; j < 512; j++) {
  float complex derSlot0 = realptr1[j] * realptr2[j];
  float complex derSlot1 = realptr1[j] + realptr2[j];
  realptr1[j] = derSlot0;
  realptr2[j] = derSlot1;
}

The ICC-compiled output for the main loop looks like this:


    ..B1.6:                         # Preds ..B1.6 ..B1.5
                                    # Execution count [5.12e+02]
            vmovups   32(%r15,%rdx,8), %ymm9                        #35.29
            lea       (%r15,%rdx,8), %rax                           #37.5
            vmovups   (%rax), %ymm3                                 #35.29
            vaddps    32(%rbx,%rdx,8), %ymm9, %ymm11                #36.43
            vaddps    (%rbx,%rdx,8), %ymm3, %ymm5                   #36.43
            vmovshdup 32(%rbx,%rdx,8), %ymm6                        #35.43
            vshufps   $177, %ymm9, %ymm9, %ymm7                     #35.43
            vmulps    %ymm7, %ymm6, %ymm8                           #35.43
            vmovshdup (%rbx,%rdx,8), %ymm0                          #35.43
            vshufps   $177, %ymm3, %ymm3, %ymm1                     #35.43
            vmulps    %ymm1, %ymm0, %ymm2                           #35.43
            vmovsldup 32(%rbx,%rdx,8), %ymm10                       #35.43
            vfmaddsub213ps %ymm8, %ymm9, %ymm10                     #35.43
            vmovups   %ymm11, 32(%rbx,%rdx,8)                       #38.5
            vmovups   %ymm10, 32(%rax)                              #37.5
            vmovsldup (%rbx,%rdx,8), %ymm4                          #35.43
            vfmaddsub213ps %ymm2, %ymm3, %ymm4                      #35.43
            vmovups   %ymm5, (%rbx,%rdx,8)                          #38.5
            vmovups   %ymm4, (%rax)                                 #37.5
            addq      $8, %rdx                                      #32.3
            cmpq      $512, %rdx                                    #32.3
            jb        ..B1.6        # Prob 99%                      #32.3

The command line used for icc is: icc -march=core-avx2 -S -fsource-asm -c test.c

For GCC, what I've already done includes: replacing "#pragma ivdep" with "#pragma GCC ivdep", and replacing "__assume_aligned(realptr1, 64);" with "realptr1 = __builtin_assume_aligned(realptr1, 64);".

The command for GCC is: gcc -c -O2 -ftree-vectorize -mavx2 -g -Wa,-a,-ad gcctest.c

and the result for the same loop is something like this:


     109                .L7:
     110 00d8 C5FA103B      vmovss  (%rbx), %xmm7
     111 00dc 4883C308      addq    $8, %rbx
     112 00e0 C5FA1073      vmovss  -4(%rbx), %xmm6
     112      FC
     113 00e5 4983C408      addq    $8, %r12
     114 00e9 C4C17A10      vmovss  -8(%r12), %xmm5
     114      6C24F8
     115 00f0 C4C17A10      vmovss  -4(%r12), %xmm4
     115      6424FC
     116                .LBB2:
     117                    .loc 1 35 0 discriminator 3
     118 00f7 C5F828C7      vmovaps %xmm7, %xmm0
     119 00fb C5F828CE      vmovaps %xmm6, %xmm1
     120 00ff C5FA1165      vmovss  %xmm4, -80(%rbp)
     120      B0
     121 0104 C5F828DC      vmovaps %xmm4, %xmm3
     122 0108 C5FA116D      vmovss  %xmm5, -76(%rbp)
     122      B4
     123 010d C5F828D5      vmovaps %xmm5, %xmm2
     124 0111 C5FA1175      vmovss  %xmm6, -72(%rbp)
     124      B8
     125 0116 C5FA117D      vmovss  %xmm7, -68(%rbp)
     125      BC
     126 011b E8000000      call    __mulsc3
     126      00
     127                .LVL7:
     128                    .loc 1 38 0 discriminator 3
     129 0120 C5FA107D      vmovss  -68(%rbp), %xmm7
     129      BC
     130 0125 C5FA106D      vmovss  -76(%rbp), %xmm5
     130      B4
     131 012a C5FA1075      vmovss  -72(%rbp), %xmm6
     131      B8
     132 012f C5D258EF      vaddss  %xmm7, %xmm5, %xmm5
     133 0133 C5FA1065      vmovss  -80(%rbp), %xmm4
     133      B0
     134                    .loc 1 35 0 discriminator 3
     135 0138 C5F9D645      vmovq   %xmm0, -56(%rbp)
     135      C8
     136                    .loc 1 38 0 discriminator 3
     137 013d C5DA58E6      vaddss  %xmm6, %xmm4, %xmm4
     138                    .loc 1 35 0 discriminator 3
     139 0141 C5FA1045      vmovss  -52(%rbp), %xmm0
     139      CC
     140                .LVL8:
     141                    .loc 1 37 0 discriminator 3
     142 0146 C5FA104D      vmovss  -56(%rbp), %xmm1
     142      C8
     143 014b C5FA114B      vmovss  %xmm1, -8(%rbx)
     143      F8
     144                .LVL9:
     145 0150 C5FA1143      vmovss  %xmm0, -4(%rbx)
     145      FC
     146                    .loc 1 38 0 discriminator 3
     147 0155 C4C17A11      vmovss  %xmm5, -8(%r12)
     147      6C24F8
     148 015c C4C17A11      vmovss  %xmm4, -4(%r12)
     148      6424FC
     149                .LBE2:
     150                    .loc 1 32 0 discriminator 3
     151 0163 4C39EB        cmpq    %r13, %rbx
     152 0166 0F856CFF      jne .L7
     152      FFFF

So, I can see that GCC uses some kind of vector instructions, but it is still not as good as ICC.

My question is: are there any more options I can use to make GCC perform better?

Thanks a lot.

1

1 Answer

1
votes

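You didn't post the full code to test, but the `call __mulsc3` in your GCC output is the culprit: by default GCC implements complex multiplication with the C99 Annex G NaN/infinity recovery, which goes through that libgcc helper and prevents the loop from being vectorized. -ffast-math (through -fcx-limited-range) drops that check, so you may start by adding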

 -ffast-math 

and optionally

-mfma

so more or less you will end up with

  vmovaps ymm0, YMMWORD PTR [rbx+rax]
  vmovaps ymm3, YMMWORD PTR [r12+rax]
  vpermilps ymm2, ymm0, 177
  vpermilps ymm4, ymm3, 245
  vpermilps ymm1, ymm3, 160
  vmulps ymm2, ymm2, ymm4
  vmovaps ymm4, ymm0
  vfmsub132ps ymm4, ymm2, ymm1
  vfmadd132ps ymm1, ymm2, ymm0
  vaddps ymm0, ymm0, ymm3
  vmovaps YMMWORD PTR [rbx+rax], ymm0
  vblendps ymm1, ymm4, ymm1, 170
  vmovaps YMMWORD PTR [r12+rax], ymm1
  add rax, 32
  cmp rax, 4096
  jne .L6