11
votes

I found this post that explains how to transpose an 8x8 byte matrix with 24 operations, and a few scrolls later there's the code that implements the transpose. However, this method does not exploit the fact that we can block the 8x8 transpose into four 4x4 transposes, each of which can be done with a single shuffle instruction (this post is the reference). So I came up with this solution:

__m128i transpose4x4mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13,  9, 5, 1, 12,  8, 4, 0);
__m128i shuffle8x8Mask = _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 4,  5, 6, 7, 12,  13, 14, 15);

void TransposeBlock8x8(uint8_t *src, uint8_t *dst, int srcStride, int dstStride) {
    __m128i load0 = _mm_set_epi64x(*(uint64_t*)(src + 1 * srcStride), *(uint64_t*)(src + 0 * srcStride));
    __m128i load1 = _mm_set_epi64x(*(uint64_t*)(src + 3 * srcStride), *(uint64_t*)(src + 2 * srcStride));
    __m128i load2 = _mm_set_epi64x(*(uint64_t*)(src + 5 * srcStride), *(uint64_t*)(src + 4 * srcStride));
    __m128i load3 = _mm_set_epi64x(*(uint64_t*)(src + 7 * srcStride), *(uint64_t*)(src + 6 * srcStride));

    __m128i shuffle0 = _mm_shuffle_epi8(load0, shuffle8x8Mask);
    __m128i shuffle1 = _mm_shuffle_epi8(load1, shuffle8x8Mask);
    __m128i shuffle2 = _mm_shuffle_epi8(load2, shuffle8x8Mask);
    __m128i shuffle3 = _mm_shuffle_epi8(load3, shuffle8x8Mask);

    __m128i block0 = _mm_unpacklo_epi64(shuffle0, shuffle1);
    __m128i block1 = _mm_unpackhi_epi64(shuffle0, shuffle1);
    __m128i block2 = _mm_unpacklo_epi64(shuffle2, shuffle3);
    __m128i block3 = _mm_unpackhi_epi64(shuffle2, shuffle3);

    __m128i transposed0 = _mm_shuffle_epi8(block0, transpose4x4mask);   
    __m128i transposed1 = _mm_shuffle_epi8(block1, transpose4x4mask);   
    __m128i transposed2 = _mm_shuffle_epi8(block2, transpose4x4mask);   
    __m128i transposed3 = _mm_shuffle_epi8(block3, transpose4x4mask);   

    __m128i store0 = _mm_unpacklo_epi32(transposed0, transposed2);
    __m128i store1 = _mm_unpackhi_epi32(transposed0, transposed2);
    __m128i store2 = _mm_unpacklo_epi32(transposed1, transposed3);
    __m128i store3 = _mm_unpackhi_epi32(transposed1, transposed3);

    *((uint64_t*)(dst + 0 * dstStride)) = _mm_extract_epi64(store0, 0);
    *((uint64_t*)(dst + 1 * dstStride)) = _mm_extract_epi64(store0, 1);
    *((uint64_t*)(dst + 2 * dstStride)) = _mm_extract_epi64(store1, 0);
    *((uint64_t*)(dst + 3 * dstStride)) = _mm_extract_epi64(store1, 1);
    *((uint64_t*)(dst + 4 * dstStride)) = _mm_extract_epi64(store2, 0);
    *((uint64_t*)(dst + 5 * dstStride)) = _mm_extract_epi64(store2, 1);
    *((uint64_t*)(dst + 6 * dstStride)) = _mm_extract_epi64(store3, 0);
    *((uint64_t*)(dst + 7 * dstStride)) = _mm_extract_epi64(store3, 1);
}

Excluding load/store operations this procedure consists of only 16 instructions instead of 24.

What am I missing?

7
You missed 4 128-bit operations to load vectors and 4 128-bit operations to store vectors. – ErmIg
__m128i load0 = _mm_set_epi64x(*(uint64_t*)(src + 1 * srcStride), *(uint64_t*)(src + 0 * srcStride)); is equal to __m128i load0 = _mm_loadu_si128((__m128i*)(src + 0 * srcStride)); – ErmIg
*((uint64_t*)(dst + 0 * dstStride)) = _mm_extract_epi64(store0, 0); *((uint64_t*)(dst + 1 * dstStride)) = _mm_extract_epi64(store0, 1); is equal to _mm_storeu_si128((__m128i*)(dst + 0 * dstStride), store0); – ErmIg
@ErmIg: Load/store should be excluded from the ops. The post with 24 ops I linked does not take them into account. You have to load data anyway, no? – xmas79
@ErmIg: They are only equivalent if srcStride is 8 and dstStride is 8, that is, if src and dst are both 8x8-byte matrices. They are different otherwise, e.g. for matrices of 128x128 bytes where I'm actually "zooming in" on an 8x8 block only; in that case srcStride and dstStride would both be 128. – xmas79

7 Answers

5
votes

Apart from the loads, stores and pinsrq-s to read from and write to memory, with possibly a stride not equal to 8 bytes, you can do the transpose with only 12 instructions (this code can easily be used in combination with Z boson's test code):

void tran8x8b_SSE_v2(char *A, char *B) {
  __m128i pshufbcnst = _mm_set_epi8(15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0);

  __m128i B0, B1, B2, B3, T0, T1, T2, T3;
  B0 = _mm_loadu_si128((__m128i*)&A[ 0]);
  B1 = _mm_loadu_si128((__m128i*)&A[16]);
  B2 = _mm_loadu_si128((__m128i*)&A[32]);
  B3 = _mm_loadu_si128((__m128i*)&A[48]);


  T0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B0),_mm_castsi128_ps(B1),0b10001000));
  T1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B2),_mm_castsi128_ps(B3),0b10001000));
  T2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B0),_mm_castsi128_ps(B1),0b11011101));
  T3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B2),_mm_castsi128_ps(B3),0b11011101));

  B0 = _mm_shuffle_epi8(T0,pshufbcnst);
  B1 = _mm_shuffle_epi8(T1,pshufbcnst);
  B2 = _mm_shuffle_epi8(T2,pshufbcnst);
  B3 = _mm_shuffle_epi8(T3,pshufbcnst);

  T0 = _mm_unpacklo_epi32(B0,B1);
  T1 = _mm_unpackhi_epi32(B0,B1);
  T2 = _mm_unpacklo_epi32(B2,B3);
  T3 = _mm_unpackhi_epi32(B2,B3);

  _mm_storeu_si128((__m128i*)&B[ 0], T0);
  _mm_storeu_si128((__m128i*)&B[16], T1);
  _mm_storeu_si128((__m128i*)&B[32], T2);
  _mm_storeu_si128((__m128i*)&B[48], T3);
}


Here we use the 32-bit floating point shuffle, which is more flexible than the epi32 shuffle. The casts do not generate extra instructions (code generated with gcc 5.4):

tran8x8b_SSE_v2:
.LFB4885:
    .cfi_startproc
    vmovdqu 48(%rdi), %xmm5
    vmovdqu 32(%rdi), %xmm2
    vmovdqu 16(%rdi), %xmm0
    vmovdqu (%rdi), %xmm1
    vshufps $136, %xmm5, %xmm2, %xmm4
    vshufps $221, %xmm5, %xmm2, %xmm2
    vmovdqa .LC6(%rip), %xmm5
    vshufps $136, %xmm0, %xmm1, %xmm3
    vshufps $221, %xmm0, %xmm1, %xmm1
    vpshufb %xmm5, %xmm3, %xmm3
    vpshufb %xmm5, %xmm1, %xmm0
    vpshufb %xmm5, %xmm4, %xmm4
    vpshufb %xmm5, %xmm2, %xmm1
    vpunpckldq  %xmm4, %xmm3, %xmm5
    vpunpckldq  %xmm1, %xmm0, %xmm2
    vpunpckhdq  %xmm4, %xmm3, %xmm3
    vpunpckhdq  %xmm1, %xmm0, %xmm0
    vmovups %xmm5, (%rsi)
    vmovups %xmm3, 16(%rsi)
    vmovups %xmm2, 32(%rsi)
    vmovups %xmm0, 48(%rsi)
    ret
    .cfi_endproc



On some, but not all, older CPUs there might be a small bypass delay (between 0 and 2 cycles) for moving data between the integer and the floating point units. This increases the latency of the function, but it does not necessarily affect the throughput of the code.

A simple latency test with 1e9 transpositions:

  for (int i=0;i<500000000;i++){
     tran8x8b_SSE(A,C);
     tran8x8b_SSE(C,A);
  }
  print8x8b(A);

This takes about 5.5 seconds (19.7e9 cycles) with tran8x8b_SSE and 4.5 seconds (16.0e9 cycles) with tran8x8b_SSE_v2 (Intel Core i5-6500). Note that the loads and stores were not eliminated by the compiler, although the functions were inlined into the for loop.


Update: AVX2-128 / SSE 4.1 solution with blends.

The 'shuffles' (unpack, shuffle) are handled by port 5, with 1 instruction per CPU cycle on modern CPUs. Sometimes it pays off to replace one 'shuffle' with two blends. On Skylake the 32-bit blend instructions can run on port 0, 1 or 5.

Unfortunately, _mm_blend_epi32 is only AVX2-128. An efficient SSE 4.1 alternative is _mm_blend_ps in combination with a few casts (which are usually free). The 12 'shuffles' are replaced by 8 shuffles in combination with 8 blends.

The simple latency test now runs in about 3.6 seconds (13e9 CPU cycles), which is 18% faster than the results with tran8x8b_SSE_v2.

Code:

/* AVX2-128 version, sse 4.1 version see ---------------->       SSE 4.1 version of tran8x8b_AVX2_128()                                                              */
void tran8x8b_AVX2_128(char *A, char *B) {                   /*  void tran8x8b_SSE4_1(char *A, char *B) {                                                            */                                    
  __m128i pshufbcnst_0 = _mm_set_epi8(15, 7,11, 3,  
               13, 5, 9, 1,  14, 6,10, 2,  12, 4, 8, 0);     /*    __m128i pshufbcnst_0 = _mm_set_epi8(15, 7,11, 3,  13, 5, 9, 1,  14, 6,10, 2,  12, 4, 8, 0);       */                                    
  __m128i pshufbcnst_1 = _mm_set_epi8(13, 5, 9, 1,  
               15, 7,11, 3,  12, 4, 8, 0,  14, 6,10, 2);     /*    __m128i pshufbcnst_1 = _mm_set_epi8(13, 5, 9, 1,  15, 7,11, 3,  12, 4, 8, 0,  14, 6,10, 2);       */                                    
  __m128i pshufbcnst_2 = _mm_set_epi8(11, 3,15, 7,  
                9, 1,13, 5,  10, 2,14, 6,   8, 0,12, 4);     /*    __m128i pshufbcnst_2 = _mm_set_epi8(11, 3,15, 7,   9, 1,13, 5,  10, 2,14, 6,   8, 0,12, 4);       */                                    
  __m128i pshufbcnst_3 = _mm_set_epi8( 9, 1,13, 5,  
               11, 3,15, 7,   8, 0,12, 4,  10, 2,14, 6);     /*    __m128i pshufbcnst_3 = _mm_set_epi8( 9, 1,13, 5,  11, 3,15, 7,   8, 0,12, 4,  10, 2,14, 6);       */                                    
  __m128i B0, B1, B2, B3, T0, T1, T2, T3;                    /*    __m128 B0, B1, B2, B3, T0, T1, T2, T3;                                                            */                                    
                                                             /*                                                                                                      */                                    
  B0 = _mm_loadu_si128((__m128i*)&A[ 0]);                    /*    B0 = _mm_loadu_ps((float*)&A[ 0]);                                                                */                                    
  B1 = _mm_loadu_si128((__m128i*)&A[16]);                    /*    B1 = _mm_loadu_ps((float*)&A[16]);                                                                */                                    
  B2 = _mm_loadu_si128((__m128i*)&A[32]);                    /*    B2 = _mm_loadu_ps((float*)&A[32]);                                                                */                                    
  B3 = _mm_loadu_si128((__m128i*)&A[48]);                    /*    B3 = _mm_loadu_ps((float*)&A[48]);                                                                */                                    
                                                             /*                                                                                                      */                                    
  B1 = _mm_shuffle_epi32(B1,0b10110001);                     /*    B1 = _mm_shuffle_ps(B1,B1,0b10110001);                                                            */                                    
  B3 = _mm_shuffle_epi32(B3,0b10110001);                     /*    B3 = _mm_shuffle_ps(B3,B3,0b10110001);                                                            */                                    
  T0 = _mm_blend_epi32(B0,B1,0b1010);                        /*    T0 = _mm_blend_ps(B0,B1,0b1010);                                                                  */                                    
  T1 = _mm_blend_epi32(B2,B3,0b1010);                        /*    T1 = _mm_blend_ps(B2,B3,0b1010);                                                                  */                                    
  T2 = _mm_blend_epi32(B0,B1,0b0101);                        /*    T2 = _mm_blend_ps(B0,B1,0b0101);                                                                  */                                    
  T3 = _mm_blend_epi32(B2,B3,0b0101);                        /*    T3 = _mm_blend_ps(B2,B3,0b0101);                                                                  */                                    
                                                             /*                                                                                                      */                                    
  B0 = _mm_shuffle_epi8(T0,pshufbcnst_0);                    /*    B0 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T0),pshufbcnst_0));                       */                                    
  B1 = _mm_shuffle_epi8(T1,pshufbcnst_1);                    /*    B1 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T1),pshufbcnst_1));                       */                                    
  B2 = _mm_shuffle_epi8(T2,pshufbcnst_2);                    /*    B2 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T2),pshufbcnst_2));                       */                                    
  B3 = _mm_shuffle_epi8(T3,pshufbcnst_3);                    /*    B3 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T3),pshufbcnst_3));                       */                                    
                                                             /*                                                                                                      */                                    
  T0 = _mm_blend_epi32(B0,B1,0b1010);                        /*    T0 = _mm_blend_ps(B0,B1,0b1010);                                                                  */                                    
  T1 = _mm_blend_epi32(B0,B1,0b0101);                        /*    T1 = _mm_blend_ps(B0,B1,0b0101);                                                                  */                                    
  T2 = _mm_blend_epi32(B2,B3,0b1010);                        /*    T2 = _mm_blend_ps(B2,B3,0b1010);                                                                  */                                    
  T3 = _mm_blend_epi32(B2,B3,0b0101);                        /*    T3 = _mm_blend_ps(B2,B3,0b0101);                                                                  */                                    
  T1 = _mm_shuffle_epi32(T1,0b10110001);                     /*    T1 = _mm_shuffle_ps(T1,T1,0b10110001);                                                            */                                    
  T3 = _mm_shuffle_epi32(T3,0b10110001);                     /*    T3 = _mm_shuffle_ps(T3,T3,0b10110001);                                                            */                                    
                                                             /*                                                                                                      */                                    
  _mm_storeu_si128((__m128i*)&B[ 0], T0);                    /*    _mm_storeu_ps((float*)&B[ 0], T0);                                                                */                                    
  _mm_storeu_si128((__m128i*)&B[16], T1);                    /*    _mm_storeu_ps((float*)&B[16], T1);                                                                */                                    
  _mm_storeu_si128((__m128i*)&B[32], T2);                    /*    _mm_storeu_ps((float*)&B[32], T2);                                                                */                                    
  _mm_storeu_si128((__m128i*)&B[48], T3);                    /*    _mm_storeu_ps((float*)&B[48], T3);                                                                */                                    
}                                                            /*  }                                                                                                   */                                    
4
votes

Posting this as an answer. I'm also going to change the title of the question from "... with SSE" to "... with SIMD" due to some answers and comments received so far.

I succeeded in transposing the matrix with AVX2 in only 8 instructions, 10 including load/store (excluding mask loads). EDIT: I found a shorter version. See below. This is the case where the matrices are all contiguous in memory, so direct loads/stores can be used.

Here's the C code:

void tran8x8b_AVX2(char *src, char *dst) {
    __m256i perm = _mm256_set_epi8(
        0, 0, 0, 7,
        0, 0, 0, 5,
        0, 0, 0, 3,
        0, 0, 0, 1,

        0, 0, 0, 6,
        0, 0, 0, 4,
        0, 0, 0, 2,
        0, 0, 0, 0
    );

    __m256i tm = _mm256_set_epi8(
        15, 11, 7, 3,
        14, 10, 6, 2,
        13,  9, 5, 1,
        12,  8, 4, 0,

        15, 11, 7, 3,
        14, 10, 6, 2,
        13,  9, 5, 1,
        12,  8, 4, 0
    );

    __m256i load0 = _mm256_loadu_si256((__m256i*)&src[ 0]);
    __m256i load1 = _mm256_loadu_si256((__m256i*)&src[32]);  

    __m256i perm0 = _mm256_permutevar8x32_epi32(load0, perm);   
    __m256i perm1 = _mm256_permutevar8x32_epi32(load1, perm);   

    __m256i transpose0 = _mm256_shuffle_epi8(perm0, tm);    
    __m256i transpose1 = _mm256_shuffle_epi8(perm1, tm);    

    __m256i unpack0 = _mm256_unpacklo_epi32(transpose0, transpose1);    
    __m256i unpack1 = _mm256_unpackhi_epi32(transpose0, transpose1);

    perm0 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(unpack0), _mm256_castsi256_ps(unpack1), 32));    
    perm1 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(unpack0), _mm256_castsi256_ps(unpack1), 49));    

    _mm256_storeu_si256((__m256i*)&dst[ 0], perm0);
    _mm256_storeu_si256((__m256i*)&dst[32], perm1);
}

GCC was smart enough to perform a permutation during AVX load, saving two instructions. Here's the compiler output:

tran8x8b_AVX2(char*, char*):
        vmovdqa ymm1, YMMWORD PTR .LC0[rip]
        vmovdqa ymm2, YMMWORD PTR .LC1[rip]
        vpermd  ymm0, ymm1, YMMWORD PTR [rdi]
        vpermd  ymm1, ymm1, YMMWORD PTR [rdi+32]
        vpshufb ymm0, ymm0, ymm2
        vpshufb ymm1, ymm1, ymm2
        vpunpckldq      ymm2, ymm0, ymm1
        vpunpckhdq      ymm0, ymm0, ymm1
        vinsertf128     ymm1, ymm2, xmm0, 1
        vperm2f128      ymm0, ymm2, ymm0, 49
        vmovdqu YMMWORD PTR [rsi], ymm1
        vmovdqu YMMWORD PTR [rsi+32], ymm0
        vzeroupper
        ret

It emitted the vzeroupper instruction with -O3, but going down to -O1 removes it.

In the case of my original problem (a large matrix where I'm zooming in on an 8x8 part of it), handling strides makes the compiler output quite a bit worse:

void tran8x8b_AVX2(char *src, char *dst, int srcStride, int dstStride) {
    __m256i load0 = _mm256_set_epi64x(*(uint64_t*)(src + 3 * srcStride), *(uint64_t*)(src + 2 * srcStride), *(uint64_t*)(src + 1 * srcStride), *(uint64_t*)(src + 0 * srcStride));
    __m256i load1 = _mm256_set_epi64x(*(uint64_t*)(src + 7 * srcStride), *(uint64_t*)(src + 6 * srcStride), *(uint64_t*)(src + 5 * srcStride), *(uint64_t*)(src + 4 * srcStride));

    // ... the same as before, however we can skip the final permutations because we need to handle the destination stride...

    *((uint64_t*)(dst + 0 * dstStride)) = _mm256_extract_epi64(unpack0, 0);
    *((uint64_t*)(dst + 1 * dstStride)) = _mm256_extract_epi64(unpack0, 1);
    *((uint64_t*)(dst + 2 * dstStride)) = _mm256_extract_epi64(unpack1, 0);
    *((uint64_t*)(dst + 3 * dstStride)) = _mm256_extract_epi64(unpack1, 1);
    *((uint64_t*)(dst + 4 * dstStride)) = _mm256_extract_epi64(unpack0, 2);
    *((uint64_t*)(dst + 5 * dstStride)) = _mm256_extract_epi64(unpack0, 3);
    *((uint64_t*)(dst + 6 * dstStride)) = _mm256_extract_epi64(unpack1, 2);
    *((uint64_t*)(dst + 7 * dstStride)) = _mm256_extract_epi64(unpack1, 3);
}

Here's the compiler output:

tran8x8b_AVX2(char*, char*, int, int):
        movsx   rdx, edx
        vmovq   xmm5, QWORD PTR [rdi]
        lea     r9, [rdi+rdx]
        vmovdqa ymm3, YMMWORD PTR .LC0[rip]
        movsx   rcx, ecx
        lea     r11, [r9+rdx]
        vpinsrq xmm0, xmm5, QWORD PTR [r9], 1
        lea     r10, [r11+rdx]
        vmovq   xmm4, QWORD PTR [r11]
        vpinsrq xmm1, xmm4, QWORD PTR [r10], 1
        lea     r8, [r10+rdx]
        lea     rax, [r8+rdx]
        vmovq   xmm7, QWORD PTR [r8]
        vmovq   xmm6, QWORD PTR [rax+rdx]
        vpinsrq xmm2, xmm7, QWORD PTR [rax], 1
        vinserti128     ymm1, ymm0, xmm1, 0x1
        vpinsrq xmm0, xmm6, QWORD PTR [rax+rdx*2], 1
        lea     rax, [rsi+rcx]
        vpermd  ymm1, ymm3, ymm1
        vinserti128     ymm0, ymm2, xmm0, 0x1
        vmovdqa ymm2, YMMWORD PTR .LC1[rip]
        vpshufb ymm1, ymm1, ymm2
        vpermd  ymm0, ymm3, ymm0
        vpshufb ymm0, ymm0, ymm2
        vpunpckldq      ymm2, ymm1, ymm0
        vpunpckhdq      ymm0, ymm1, ymm0
        vmovdqa xmm1, xmm2
        vmovq   QWORD PTR [rsi], xmm1
        vpextrq QWORD PTR [rax], xmm1, 1
        vmovdqa xmm1, xmm0
        add     rax, rcx
        vextracti128    xmm0, ymm0, 0x1
        vmovq   QWORD PTR [rax], xmm1
        add     rax, rcx
        vpextrq QWORD PTR [rax], xmm1, 1
        add     rax, rcx
        vextracti128    xmm1, ymm2, 0x1
        vmovq   QWORD PTR [rax], xmm1
        add     rax, rcx
        vpextrq QWORD PTR [rax], xmm1, 1
        vmovq   QWORD PTR [rax+rcx], xmm0
        vpextrq QWORD PTR [rax+rcx*2], xmm0, 1
        vzeroupper
        ret

However, this doesn't seem like a big deal compared to the output of my original code.


EDIT: I found a shorter version. 4 instructions in total, 8 counting loads and stores. This is possible because I read the matrix in a different way, hiding some "shuffles" in the "gather" instruction during the load. Also, note that the final permutation is needed to perform the store because AVX2 doesn't have a "scatter" instruction. Having a scatter instruction would bring everything down to only 2 instructions. Also, note that I can handle the src stride without hassle by changing the contents of the vindex vector (see the sketch after the compiler output below).

Unfortunately this AVX2_v2 version seems to be slower than the previous one. Here's the code:

void tran8x8b_AVX2_v2(char *src1, char *dst1) {
    __m256i tm = _mm256_set_epi8(
        15, 11, 7, 3,
        14, 10, 6, 2,
        13,  9, 5, 1,
        12,  8, 4, 0,

        15, 11, 7, 3,
        14, 10, 6, 2,
        13,  9, 5, 1,
        12,  8, 4, 0
    );

    __m256i vindex = _mm256_setr_epi32(0, 8, 16, 24, 32, 40, 48, 56);
    __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);

    __m256i load0 = _mm256_i32gather_epi32((int*)src1, vindex, 1);
    __m256i load1 = _mm256_i32gather_epi32((int*)(src1 + 4), vindex, 1); 

    __m256i transpose0 = _mm256_shuffle_epi8(load0, tm);    
    __m256i transpose1 = _mm256_shuffle_epi8(load1, tm);    

    __m256i final0 = _mm256_permutevar8x32_epi32(transpose0, perm);    
    __m256i final1 = _mm256_permutevar8x32_epi32(transpose1, perm);    

    _mm256_storeu_si256((__m256i*)&dst1[ 0], final0);
    _mm256_storeu_si256((__m256i*)&dst1[32], final1);
}

And here's the output of the compiler:

tran8x8b_AVX2_v2(char*, char*):
        vpcmpeqd        ymm3, ymm3, ymm3
        vmovdqa ymm2, YMMWORD PTR .LC0[rip]
        vmovdqa ymm4, ymm3
        vpgatherdd      ymm0, DWORD PTR [rdi+4+ymm2*8], ymm3
        vpgatherdd      ymm1, DWORD PTR [rdi+ymm2*8], ymm4
        vmovdqa ymm2, YMMWORD PTR .LC1[rip]
        vpshufb ymm1, ymm1, ymm2
        vpshufb ymm0, ymm0, ymm2
        vmovdqa ymm2, YMMWORD PTR .LC2[rip]
        vpermd  ymm1, ymm2, ymm1
        vpermd  ymm0, ymm2, ymm0
        vmovdqu YMMWORD PTR [rsi], ymm1
        vmovdqu YMMWORD PTR [rsi+32], ymm0
        vzeroupper
        ret
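As mentioned in the edit above, handling a strided source only requires building vindex from the stride. Here is a hedged sketch of that variant, reusing the same masks and steps as tran8x8b_AVX2_v2 (untested; the function name and the assumption of a contiguous destination are mine):

// Sketch only: same gather/shuffle/permute scheme as tran8x8b_AVX2_v2,
// but the gather indices are scaled by srcStride instead of a fixed 8.
void tran8x8b_AVX2_v2_strided(char *src, char *dst, int srcStride) {
    __m256i tm = _mm256_set_epi8(
        15, 11, 7, 3,  14, 10, 6, 2,  13,  9, 5, 1,  12,  8, 4, 0,
        15, 11, 7, 3,  14, 10, 6, 2,  13,  9, 5, 1,  12,  8, 4, 0);
    __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);

    // byte offsets of the 8 row starts: 0, srcStride, 2*srcStride, ...
    __m256i vindex = _mm256_mullo_epi32(_mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7),
                                        _mm256_set1_epi32(srcStride));

    __m256i load0 = _mm256_i32gather_epi32((int*)src,       vindex, 1);  // left 4 columns of each row
    __m256i load1 = _mm256_i32gather_epi32((int*)(src + 4), vindex, 1);  // right 4 columns of each row

    __m256i transpose0 = _mm256_shuffle_epi8(load0, tm);
    __m256i transpose1 = _mm256_shuffle_epi8(load1, tm);

    __m256i final0 = _mm256_permutevar8x32_epi32(transpose0, perm);
    __m256i final1 = _mm256_permutevar8x32_epi32(transpose1, perm);

    // dst is assumed contiguous here; a strided dst would need the
    // per-row extract/store pattern from the question instead.
    _mm256_storeu_si256((__m256i*)&dst[ 0], final0);
    _mm256_storeu_si256((__m256i*)&dst[32], final1);
}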
3
votes

A simplified one

void tp128_8x8(char *A, char *B) {
  __m128i sv = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8,  0);
  __m128i iv[4], ov[4];

  ov[0] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)A), sv);
  ov[1] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(A+16)), sv);
  ov[2] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(A+32)), sv);
  ov[3] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(A+48)), sv);

  iv[0] = _mm_unpacklo_epi16(ov[0], ov[1]); 
  iv[1] = _mm_unpackhi_epi16(ov[0], ov[1]); 
  iv[2] = _mm_unpacklo_epi16(ov[2], ov[3]); 
  iv[3] = _mm_unpackhi_epi16(ov[2], ov[3]); 

  _mm_storeu_si128((__m128i*)B,      _mm_unpacklo_epi32(iv[0], iv[2]));
  _mm_storeu_si128((__m128i*)(B+16), _mm_unpackhi_epi32(iv[0], iv[2]));
  _mm_storeu_si128((__m128i*)(B+32), _mm_unpacklo_epi32(iv[1], iv[3]));
  _mm_storeu_si128((__m128i*)(B+48), _mm_unpackhi_epi32(iv[1], iv[3]));
}



Benchmark: i5-5300U 2.3GHz (cycles per byte)
tran8x8b           : 2.140
tran8x8b_SSE       : 1.602
tran8x8b_SSE_v2    : 1.551
tp128_8x8          : 1.535
tran8x8b_AVX2      : 1.563
tran8x8b_AVX2_v2   : 1.731
2
votes

Normally, when load and store instructions are not counted, it's because the code is working with the matrix in registers, e.g. doing multiple operations in addition to the transpose in a loop. The loads and stores in that case are not counted because they are not part of the main loop.

But in your code the loads and stores (or rather sets and extracts) are doing part of the transpose.

GCC implements _mm_set_epi64x for SSE4.1 in your code with _mm_insert_epi64 and _mm_loadl_epi64. The insert instructions are doing part of the transpose, i.e. the transpose starts at load0,1,2,3, not at shuffle0,1,2,3. And then your final store0,1,2,3 values don't contain the transpose either. You have to use eight _mm_extract_epi64 instructions to finish the transpose in memory. So it does not really make sense not to count the set and extract intrinsics.
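To make that concrete, each _mm_set_epi64x of two memory operands in the question lowers to roughly the following (a sketch of the idea, not the exact code GCC emits):

// Approximately what _mm_set_epi64x(*(uint64_t*)(src + 1 * srcStride),
//                                   *(uint64_t*)(src + 0 * srcStride)) becomes:
// a movq load plus a pinsrq insert -- the insert is already doing part of the
// row-pairing work that a "pure" transpose instruction count would include.
__m128i lo    = _mm_loadl_epi64((__m128i const*)(src + 0 * srcStride));        // movq
__m128i load0 = _mm_insert_epi64(lo, *(uint64_t*)(src + 1 * srcStride), 1);    // pinsrq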

In any case, it turns out you can do the transpose from register with only 16 instructions using only SSSE3 like this:

//__m128i B0, __m128i B1, __m128i B2, __m128i B3
__m128i mask = _mm_setr_epi8(0x0,0x04,0x01,0x05, 0x02,0x06,0x03,0x07, 0x08,0x0c,0x09,0x0d, 0x0a,0x0e,0x0b,0x0f);

__m128i T0, T1, T2, T3;
T0 = _mm_unpacklo_epi8(B0,B1);
T1 = _mm_unpackhi_epi8(B0,B1);
T2 = _mm_unpacklo_epi8(B2,B3);
T3 = _mm_unpackhi_epi8(B2,B3);

B0 = _mm_unpacklo_epi16(T0,T2);
B1 = _mm_unpackhi_epi16(T0,T2);
B2 = _mm_unpacklo_epi16(T1,T3);
B3 = _mm_unpackhi_epi16(T1,T3);

T0 = _mm_unpacklo_epi32(B0,B2);
T1 = _mm_unpackhi_epi32(B0,B2);
T2 = _mm_unpacklo_epi32(B1,B3);
T3 = _mm_unpackhi_epi32(B1,B3);

B0 = _mm_shuffle_epi8(T0,mask);
B1 = _mm_shuffle_epi8(T1,mask);
B2 = _mm_shuffle_epi8(T2,mask);
B3 = _mm_shuffle_epi8(T3,mask);

I'm not sure if it makes sense to exclude the loads and stores here either, because I'm not sure how convenient it is to work with an 8x8 byte matrix in four 128-bit registers.

Here is code testing this:

#include <stdio.h>
#include <x86intrin.h>

void print8x8b(char *A) {
  for(int i=0; i<8; i++) {
    for(int j=0; j<8; j++) {
      printf("%2d ", A[i*8+j]);
    } puts("");
  } puts("");
}

void tran8x8b(char *A, char *B) {
  for(int i=0; i<8; i++) {
    for(int j=0; j<8; j++) {
      B[j*8+i] = A[i*8+j];
    }
  }
}

void tran8x8b_SSE(char *A, char *B) {
  __m128i mask = _mm_setr_epi8(0x0,0x04,0x01,0x05, 0x02,0x06,0x03,0x07, 0x08,0x0c,0x09,0x0d, 0x0a,0x0e,0x0b,0x0f);

  __m128i B0, B1, B2, B3, T0, T1, T2, T3;
  B0 = _mm_loadu_si128((__m128i*)&A[ 0]);
  B1 = _mm_loadu_si128((__m128i*)&A[16]);
  B2 = _mm_loadu_si128((__m128i*)&A[32]);
  B3 = _mm_loadu_si128((__m128i*)&A[48]);

  T0 = _mm_unpacklo_epi8(B0,B1);
  T1 = _mm_unpackhi_epi8(B0,B1);
  T2 = _mm_unpacklo_epi8(B2,B3);
  T3 = _mm_unpackhi_epi8(B2,B3);

  B0 = _mm_unpacklo_epi16(T0,T2);
  B1 = _mm_unpackhi_epi16(T0,T2);
  B2 = _mm_unpacklo_epi16(T1,T3);
  B3 = _mm_unpackhi_epi16(T1,T3);

  T0 = _mm_unpacklo_epi32(B0,B2);
  T1 = _mm_unpackhi_epi32(B0,B2);
  T2 = _mm_unpacklo_epi32(B1,B3);
  T3 = _mm_unpackhi_epi32(B1,B3);

  B0 = _mm_shuffle_epi8(T0,mask);
  B1 = _mm_shuffle_epi8(T1,mask);
  B2 = _mm_shuffle_epi8(T2,mask);
  B3 = _mm_shuffle_epi8(T3,mask);

  _mm_storeu_si128((__m128i*)&B[ 0], B0);
  _mm_storeu_si128((__m128i*)&B[16], B1);
  _mm_storeu_si128((__m128i*)&B[32], B2);
  _mm_storeu_si128((__m128i*)&B[48], B3);
}

int main(void) {
  char A[64], B[64], C[64];
  for(int i=0; i<64; i++) A[i] = i;
  print8x8b(A);
  tran8x8b(A,B);
  print8x8b(B);
  tran8x8b_SSE(A,C);
  print8x8b(C);
}
1
votes

AVX512VBMI (Cascade Lake / Ice Lake)

AVX512VBMI introduces vpermb, a 64-byte lane-crossing shuffle with byte granularity.
_mm512_permutexvar_epi8( __m512i idx, __m512i a);

Existing CPUs that support it run it as a single uop, with 1/clock throughput. (https://www.uops.info/html-tp/CNL/VPERMB_ZMM_ZMM_M512-Measurements.html)

That trivializes the problem, making it possible with 1 instruction (at least for the stride=8 case where the whole 8x8 block is contiguous). Otherwise you should look at vpermt2b to shuffle together bytes from 2 sources. But that's 3 uops on CannonLake.

// TODO: strided loads / stores somehow for stride != 8
// AVX512VBMI
void TransposeBlock8x8_contiguous(uint8_t *src, uint8_t *dst)
{
    const __m512i trans8x8shuf = _mm512_set_epi8(
        63, 63-8*1, 63-8*2, 63-8*3, 63-8*4, ...
        ...
        57, 49, 41, 33, 25, 17, 9, 1,
        56, 48, 40, 32, 24, 16, 8, 0
   );

    __m512i vsrc = _mm512_loadu_si512(src);
    __m512i shuffled = _mm512_permutexvar_epi8(trans8x8shuf, vsrc);
    _mm512_storeu_si512(dst, shuffled);
}

https://godbolt.org/z/wrfyy3

Apparently _mm512_setr_epi8 doesn't exist for gcc/clang (only the 256 and 128 versions) so you have to define the constant in last-to-first order, opposite of C array initializer order.
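One way to avoid hand-writing the constant in reverse order is to build the index in natural order from a plain byte array and load it. This is a sketch under the assumption that the intended mapping is idx[i] = (i % 8)*8 + i/8, which matches the partial constant shown above:

// Hypothetical helper: build the 8x8 byte-transpose index for vpermb
// from a C array in first-to-last order, then load it.
// idx[i] = (i % 8)*8 + i/8 makes dst byte (r,c) read src byte (c,r).
static inline __m512i transpose8x8_index(void)
{
    uint8_t idx[64];
    for (int i = 0; i < 64; i++)
        idx[i] = (uint8_t)((i % 8) * 8 + i / 8);
    return _mm512_loadu_si512(idx);
}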


vpermb even works with the data as a memory source operand, so it can load+shuffle in a single instruction. But according to https://uops.info/, it doesn't micro-fuse on Cannon Lake: unlike vpermd zmm, zmm, [r14], which decodes to 1 fused-domain uop (note "retire_slots: 1.0"), vpermb zmm, zmm, [r14] decodes to 2 separate uops for the front-end / fused-domain ("retire_slots: 2.0"). This is from experimental testing with perf counters on a real Cannon Lake CPU. uops.info doesn't have a Cascade Lake or Ice Lake available yet, so it's possible it will be even more efficient there.

The uops.info tables only show the total number of unfused-domain uops, so you have to click on an instruction to see whether it micro-fuses or not.


Strided, not contiguous, source or dst data

I guess you'd want to do qword (8-byte) loads into XMM registers and shuffle together pairs of inputs, or concatenate them with movhps or pinsrq. A qword gather load with strided indices is another option, but gathers are often not worth it.

I'm not sure if it's worth combining as far as YMM registers, let alone ZMM, or if it's best to only get as wide as XMM registers so we can efficiently scatter qwords back to memory manually with vmovq and vmovhps (which don't need a shuffle uop, just a store, on Intel CPUs). If the dst is contiguous, merging a non-contiguous strided src makes a lot more sense.
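A minimal sketch of that qword-at-a-time idea, staying at XMM width (helper names and the byte-stride parameters are mine; untested):

// Pair up two strided 8-byte rows in one XMM register, and split an XMM
// back into two strided 8-byte stores.
static inline __m128i load2rows(const uint8_t *src, int srcStride, int r)
{
    __m128i lo = _mm_loadl_epi64((const __m128i*)(src + (r + 0) * srcStride));  // movq
    __m128i hi = _mm_loadl_epi64((const __m128i*)(src + (r + 1) * srcStride));  // movq
    return _mm_unpacklo_epi64(lo, hi);   // row r in the low qword, row r+1 in the high qword
}

static inline void store2rows(uint8_t *dst, int dstStride, int r, __m128i v)
{
    // vmovq / vmovhps stores: no shuffle uop needed, just stores, on Intel CPUs
    _mm_storel_epi64((__m128i*)(dst + (r + 0) * dstStride), v);
    _mm_storeh_pd((double*)(dst + (r + 1) * dstStride), _mm_castsi128_pd(v));
}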

AVX512VBMI vpermt2b ymm looks like it would be useful for shuffle+merge like a lane-crossing punpcklbw, selecting any 32 bytes from the concatenation of two other 32-byte YMM registers. (Or 64 from 2x 64-byte regs for the ZMM version). But unfortunately on CannonLake it costs 3 uops, like vpermt2w on Skylake-X and Cannon Lake.
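Just to illustrate the shape of that, a hedged AVX512VBMI+VL sketch of such a merge (the helper name and index constant are mine):

// Lane-crossing punpcklbw-like merge: each index byte picks from the 64-byte
// concatenation of b:a (0..31 = bytes of a, 32..63 = bytes of b), so this
// interleaves the low 16 bytes of a with the low 16 bytes of b across the YMM.
static inline __m256i interleave_lo_bytes(__m256i a, __m256i b)
{
    const __m256i vidx = _mm256_set_epi8(
        47, 15, 46, 14, 45, 13, 44, 12, 43, 11, 42, 10, 41,  9, 40,  8,
        39,  7, 38,  6, 37,  5, 36,  4, 35,  3, 34,  2, 33,  1, 32,  0);
    return _mm256_permutex2var_epi8(a, vidx, b);   // vpermt2b: 3 uops on Cannon Lake, as noted above
}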

If we can worry about bytes later, vpermt2d is efficient on CPUs that support it (single uop)! Skylake-X and later.

Ice Lake has one per 2-cycle throughput for vpermt2b (instlat), perhaps because it has an extra shuffle unit that can run some (but not all) shuffle uops. Notice for example that vpshufb xmm and ymm is 0.5c throughput, but vpshufb zmm is 1c throughput. But vpermb is always just 1c throughput.

I wonder if we can take advantage of merge-masking? Like maybe vpmovzxbq to zero-extend input bytes to qwords. (one 8-byte row -> 64-byte ZMM register). Then maybe dword left-shift with merge-masking into another register? No, that doesn't help, the useful data is in the same dword elements for both inputs unless you do something to one register first, defeating the purpose.


Overlapped byte-masked stores (vmovdqu8 [rdi + 0..7]{k1}, zmm0..7) of vpmovzxbq load results are also possible, but probably not efficient. All but one of them would be misaligned, at best. The store buffer and/or cache hardware might be able to efficiently commit 8x masked stores, though.
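For illustration only, here is roughly what that could look like (my naming and layout assumptions; AVX512F for vpmovzxbq, AVX512BW for the byte-masked store; as said above, probably not efficient):

// Each src row r is zero-extended so its 8 bytes land at byte offsets
// 0, 8, ..., 56 of a ZMM, then a byte-masked store writes exactly those
// bytes at dst + r, i.e. dst[r + 8*k] = src[8*r + k]. Masked-off bytes are
// not written; 7 of the 8 stores are misaligned and they all overlap.
void tran8x8b_maskstore(const uint8_t *src, uint8_t *dst)
{
    const __mmask64 colmask = 0x0101010101010101ULL;   // low byte of every qword
    for (int r = 0; r < 8; r++) {
        __m128i row    = _mm_loadl_epi64((const __m128i*)(src + 8 * r));
        __m512i spread = _mm512_cvtepu8_epi64(row);            // vpmovzxbq
        _mm512_mask_storeu_epi8(dst + r, colmask, spread);     // vmovdqu8 {k1}
    }
}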

A hybrid strategy doing some moving around in registers and some masked-stores might be interesting to balance shuffle/blend vs. store work for a contiguous dst. Especially if all the stores can be aligned, which would require moving data around in each vector so it's in the right place.

Ice Lake has 2 store execution units. (IDK if L1d cache commit can keep up with that, or if merging in the store buffer usually helps, or if that just helps with bursts of work.)

1
votes

This was really interesting to me, and I was looking to do exactly this, but for various reasons, I ended up needing to do it in Go, instead of C, and I didn't have vector intrinsics, so I thought "well, I'll just write something and see how it does".

My reported times, on a ~3.6GHz CPU, are about 28ns per 64-byte block transposed for a naive implementation, and about 19ns each for one done using bit shifts. I used perf to confirm the numbers, which seemed a bit unlikely to me, and they seem to add up. The fancy bit shift implementation is a bit over 250 instructions, and gets about 3.6 instructions per cycle, so it comes out to about 69-70 cycles per operation.

This is Go, but honestly it should be trivial to implement; it's just treating the input array of 64 bytes as 8 uint64_t.

You can get another nanosecond or so by declaring some of these things as new variables, to hint to the register allocator.


import (
    "unsafe"
)

const (
    hi16 = uint64(0xFFFF0000FFFF0000)
    lo16 = uint64(0x0000FFFF0000FFFF)
    hi8  = uint64(0xFF00FF00FF00FF00)
    lo8  = uint64(0x00FF00FF00FF00FF)
)

// Okay, this might take some explaining. We are working on a logical
// 8x8 matrix of bytes, which we get as a 64-byte array. We want to transpose
// it (row/column).
//
// start:
// [[00 08 16 24 32 40 48 56]
//  [01 09 17 25 33 41 49 57]
//  [02 10 18 26 34 42 50 58]
//  [03 11 19 27 35 43 51 59]
//  [04 12 20 28 36 44 52 60]
//  [05 13 21 29 37 45 53 61]
//  [06 14 22 30 38 46 54 62]
//  [07 15 23 31 39 47 55 63]]
//
// First, let's make sure everything under 32 is in the top four rows,
// and everything over 32 is in the bottom four rows. We do this by
// swapping pairs of 32-bit words.
// swap32:
// [[00 08 16 24 04 12 20 28]
//  [01 09 17 25 05 13 21 29]
//  [02 10 18 26 06 14 22 30]
//  [03 11 19 27 07 15 23 31]
//  [32 40 48 56 36 44 52 60]
//  [33 41 49 57 37 45 53 61]
//  [34 42 50 58 38 46 54 62]
//  [35 43 51 59 39 47 55 63]]
//
// Next, let's make sure everything over 16 or 48 is in the bottom two
// rows of the two four-row sections, and everything under 16 or 48 is
// in the top two rows of the section. We do this by swapping masked
// pairs in much the same way:
// swap16:
// [[00 08 02 10 04 12 06 14]
//  [01 09 03 11 05 13 07 15]
//  [16 24 18 26 20 28 22 30]
//  [17 25 19 27 21 29 23 31]
//  [32 40 34 42 36 44 38 46]
//  [33 41 35 43 37 45 39 47]
//  [48 56 50 58 52 60 54 62]
//  [49 57 51 59 53 61 55 63]]
//
// Now, we will do the same thing to each pair -- but because of
// clever choices in the specific arrangement leading up to this, that's
// just one more byte swap, where each 2x2 block has its upper right
// and lower left corners swapped, and that turns out to be an easy
// shift and mask.
func UnswizzleLazy(m *[64]uint8) {
    // m32 treats the 8x8 array as a 2x8 array, because
    // it turns out we only need to swap a handful of the
    // bits...
    m32 := (*[16]uint32)(unsafe.Pointer(&m[0]))
    m32[1], m32[8] = m32[8], m32[1]
    m32[3], m32[10] = m32[10], m32[3]
    m32[5], m32[12] = m32[12], m32[5]
    m32[7], m32[14] = m32[14], m32[7]
    m64 := (*[8]uint64)(unsafe.Pointer(&m[0]))
    // we're now at the state described above as "swap32"
    tmp0, tmp1, tmp2, tmp3 :=
        (m64[0]&lo16)|(m64[2]&lo16)<<16,
        (m64[1]&lo16)|(m64[3]&lo16)<<16,
        (m64[0]&hi16)>>16|(m64[2]&hi16),
        (m64[1]&hi16)>>16|(m64[3]&hi16)
    tmp4, tmp5, tmp6, tmp7 :=
        (m64[4]&lo16)|(m64[6]&lo16)<<16,
        (m64[5]&lo16)|(m64[7]&lo16)<<16,
        (m64[4]&hi16)>>16|(m64[6]&hi16),
        (m64[5]&hi16)>>16|(m64[7]&hi16)
    // now we're at "swap16".
    lo8 := lo8
    hi8 := hi8
    m64[0], m64[1] = (tmp0&lo8)|(tmp1&lo8)<<8, (tmp0&hi8)>>8|tmp1&hi8
    m64[2], m64[3] = (tmp2&lo8)|(tmp3&lo8)<<8, (tmp2&hi8)>>8|tmp3&hi8
    m64[4], m64[5] = (tmp4&lo8)|(tmp5&lo8)<<8, (tmp4&hi8)>>8|tmp5&hi8
    m64[6], m64[7] = (tmp6&lo8)|(tmp7&lo8)<<8, (tmp6&hi8)>>8|tmp7&hi8
}

What this is doing is, I hope, reasonably obvious: shuffle the half-words around, so the first four words have all the values that belong in them, and the last four have all the values that belong in them. Then do a similar thing to each set of four words, so you end up with the things that belong in the top two words in the top two, etcetera.

I wasn't going to comment until I realized that, if the cycles/byte numbers above are right, this actually outperforms the shuffle/unpack solution.

(Note that this is an in-place transpose, but it's easy to use temps for the intermediate steps and do the final store somewhere else. It's actually probably faster.)

UPDATE: I originally described my algorithm slightly incorrectly, then I realized that I could actually do what I'd described. This one's running at about 65.7 cycles per 64-byte block.

EDIT #2: Tried one of the above AVX versions on this machine. On my hardware (Xeon E3-1505M, nominally 3GHz), I get a little over 10 cycles per 64-byte block, so, about 6 bytes per cycle. That seems a lot more reasonable to me than 1.5 cycles per byte did.

EDIT #3: Got down a bit further, to about 45 cycles per 64-byte block, by just writing the first part as shifts and masks on uint64 instead of trying to be "smart" and just move the 32 bits I cared about.
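In case it helps, here is my reading of that "shifts and masks on uint64" first step, written as a plain C sketch (the original does this step in Go; names are mine):

// Sketch of the "swap32" step done purely with 64-bit shifts and masks:
// the high 32 bits of row i trade places with the low 32 bits of row i+4.
#include <stdint.h>

static void swap32_step(uint64_t m[8])
{
    const uint64_t lo32 = 0x00000000FFFFFFFFULL;
    for (int i = 0; i < 4; i++) {
        uint64_t a = m[i], b = m[i + 4];
        m[i]     = (a & lo32) | (b << 32);   // keep low half of row i, pull in low half of row i+4
        m[i + 4] = (a >> 32) | (b & ~lo32);  // high half of row i moves down, keep high half of row i+4
    }
}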

1
votes

Most answers here use a combination of different-sized shuffles and permutations using _mm_shuffle_epi8, which is only available with SSSE3 and above.

A pure SSE2 implementation with a 12-instruction* kernel can be formed by interleaving the first 32 elements with the last 32 elements three times in a row:

void systolic_kernel(__m128i a[4]) {
    __m128i a0 = _mm_unpacklo_epi8(a[0], a[2]);
    __m128i a1 = _mm_unpackhi_epi8(a[0], a[2]);
    __m128i a2 = _mm_unpacklo_epi8(a[1], a[3]);
    __m128i a3 = _mm_unpackhi_epi8(a[1], a[3]);
    a[0] = a0;
    a[1] = a1;
    a[2] = a2;
    a[3] = a3;
}

void transpose(__m128i a[4]) {
    systolic_kernel(a);
    systolic_kernel(a);
    systolic_kernel(a);
}

*Without VEX encoding (i.e. without three-operand instructions), 6 potentially zero-cost movdqa instructions will be added.

The same strategy can be applied more easily to 4x4, 16x16 and larger transposes, since the calculation of the indices to be permuted and of the block sizes is factored out of the equation.
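For completeness, a hedged usage sketch showing how transpose() above could be wrapped to match the char A[64] -> char B[64] interface used by the other answers (the wrapper name is mine):

// Load the 8x8 byte matrix into four XMM registers, transpose in registers
// with the three systolic passes, then store; after the third pass the
// registers hold the transposed rows 0-1, 2-3, 4-5, 6-7 in order.
void tran8x8b_SSE2(char *A, char *B)
{
    __m128i v[4];
    v[0] = _mm_loadu_si128((__m128i*)&A[ 0]);
    v[1] = _mm_loadu_si128((__m128i*)&A[16]);
    v[2] = _mm_loadu_si128((__m128i*)&A[32]);
    v[3] = _mm_loadu_si128((__m128i*)&A[48]);

    transpose(v);

    _mm_storeu_si128((__m128i*)&B[ 0], v[0]);
    _mm_storeu_si128((__m128i*)&B[16], v[1]);
    _mm_storeu_si128((__m128i*)&B[32], v[2]);
    _mm_storeu_si128((__m128i*)&B[48], v[3]);
}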