(Vec4 x Mat4x4) product using SIMD and improvements

Question

I am writing a complex simulation program and it apprears that the most time consumming routine is the one for multiplying a four-vector (float4) with a 4x4 matrix. I need to run this program on several computers, which are more or less old. That is why I tried to check SIMD capabilities of such operations in the following code :

//#include <xmmintrin.h> // SSE
//#include <pmmintrin.h> // SSE3
//#include <nmmintrin.h> // SSE4.2
  #include <immintrin.h> // AVX

#include <iostream>
#include <ctime>
#include <string>

using namespace std;

// 4-vector.
typedef struct
{
    float x;
    float y;
    float z;
    float w;
}float4;

// typedef to simplify the pointer of function notation.
typedef void(*Function)(float4&,const float4*,const float4&);

float dot( const float4& in_A, const float4& in_x )
{
    return in_A.x*in_x.x + in_A.y*in_x.y + in_A.z*in_x.z + in_A.w*in_x.w; // 7 FLOPS
}

void A_times_x( float4& out_y, const float4* in_A, const float4& in_x )
{
    out_y.x = dot(in_A[0], in_x); // 7 FLOPS
    out_y.y = dot(in_A[1], in_x); // 7 FLOPS
    out_y.z = dot(in_A[2], in_x); // 7 FLOPS
    out_y.w = dot(in_A[3], in_x); // 7 FLOPS
}

void A_times_x_SSE( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));

    // Transpose the matrix and re-order the vector.
    _MM_TRANSPOSE4_PS( A0,A1,A2,A3 );

    __m128 u1 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(0,0,0,0));
    __m128 u2 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(1,1,1,1));
    __m128 u3 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(2,2,2,2));
    __m128 u4 = _mm_shuffle_ps(x,x, _MM_SHUFFLE(3,3,3,3));

    // Multiply each matrix row with the vector x
    __m128 m0 = _mm_mul_ps(A0, u1); // 4 FLOPS
    __m128 m1 = _mm_mul_ps(A1, u2); // 4 FLOPS
    __m128 m2 = _mm_mul_ps(A2, u3); // 4 FLOPS
    __m128 m3 = _mm_mul_ps(A3, u4); // 4 FLOPS

    // Using HADD, we add four floats at a time
    __m128 sum_01 = _mm_add_ps(m0, m1); // 4 FLOPS
    __m128 sum_23 = _mm_add_ps(m2, m3); // 4 FLOPS
    __m128 result = _mm_add_ps(sum_01, sum_23); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}

void A_times_x_SSE3( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Should be 4 (SSE) x 4 (ALU) = 16 times faster than scalar.

    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));

    // Multiply each matrix row with the vector x
    __m128 m0 = _mm_mul_ps(A0, x); // 4 FLOPS
    __m128 m1 = _mm_mul_ps(A1, x); // 4 FLOPS
    __m128 m2 = _mm_mul_ps(A2, x); // 4 FLOPS
    __m128 m3 = _mm_mul_ps(A3, x); // 4 FLOPS

    // Using HADD, we add four floats at a time
    __m128 sum_01 = _mm_hadd_ps(m0, m1); // 4 FLOPS
    __m128 sum_23 = _mm_hadd_ps(m2, m3); // 4 FLOPS
    __m128 result = _mm_hadd_ps(sum_01, sum_23); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}

void A_times_x_SSE4( float4& out_y, const float4* in_A, const float4& in_x ) // 28 FLOPS
{
    // Should be 4 (SSE) x 4 (ALU) = 16 times faster than scalar.

    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m128 A0 = _mm_load_ps((const float*)(in_A + 0));
    __m128 A1 = _mm_load_ps((const float*)(in_A + 1));
    __m128 A2 = _mm_load_ps((const float*)(in_A + 2));
    __m128 A3 = _mm_load_ps((const float*)(in_A + 3));

    // Multiply each matrix row with the vector x
    __m128 m0 = _mm_dp_ps(A0, x, 0xFF); // 4 FLOPS
    __m128 m1 = _mm_dp_ps(A1, x, 0xFF); // 4 FLOPS
    __m128 m2 = _mm_dp_ps(A2, x, 0xFF); // 4 FLOPS
    __m128 m3 = _mm_dp_ps(A3, x, 0xFF); // 4 FLOPS

    // Using HADD, we add four floats at a time
    __m128 mov_01 = _mm_movelh_ps(m0, m1); // 4 FLOPS
    __m128 mov_23 = _mm_movelh_ps(m2, m3); // 4 FLOPS
    __m128 result = _mm_shuffle_ps(mov_01, mov_23, _MM_SHUFFLE(2, 0, 2, 0)); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);
}

void A_times_x_AVX( float4& out_y, const float4* in_A, const float4& in_x )
{
    // Load matrix A and vector x into SSE registers
    __m128 x  = _mm_load_ps((const float*)&in_x); // load/store are almost = 0 FLOPS
    __m256 xx = _mm256_castps128_ps256(x);
           xx = _mm256_insertf128_ps(xx,x,1);
    __m256 A0 = _mm256_load_ps((const float*)(in_A + 0));
    __m256 A2 = _mm256_load_ps((const float*)(in_A + 2));

    // Multiply each matrix row with the vector x
    __m256 m0 = _mm256_mul_ps(A0, xx); // 4 FLOPS
    __m256 m2 = _mm256_mul_ps(A2, xx); // 4 FLOPS

    // Using HADD, we add four floats at a time
    __m256 sum_00 = _mm256_hadd_ps(m0, m2); // 4 FLOPS

  /*__m128 sum_10 = _mm256_extractf128_ps(sum_00,0);
    __m128 sum_01 = _mm256_extractf128_ps(sum_00,1);

    __m128 result = _mm_hadd_ps(sum_10, sum_01); // 4 FLOPS

    // Finally, store the result
    _mm_store_ps((float*)&out_y, result);*/

    // Finally, store the result (no temp variable: direct HADD, this avoid to copy from ALU128 to ALU256)
    _mm_store_ps((float*)&out_y, _mm_hadd_ps(_mm256_extractf128_ps(sum_00,0),
                                             _mm256_extractf128_ps(sum_00,1)));
}

void test_function ( Function f, string simd, unsigned int imax )
{
    float4 Y;
    float4 X1 = {0.5,1,0.2,0.7};
    float4 X2 = {0.7,1,0.2,0.5};
    float4 X3 = {0.5,0.2,1,0.7};
    float4 X4 = {1,0.7,0.2,0.5};
    float4 A[4] = {{0.5,1,0.2,0.7},
                   {0.6,0.4,0.1,0.8},
                   {0.3,0.8,0.2,0.5},
                   {1,0.4,0.6,0.9}};

    clock_t tstart = clock();

    for( unsigned int i=0 ; i<imax ; i++ )
    for( unsigned long int j=0 ; j<250000000 ; j++ )
    // Avoid for loop over long long, it is 2 times slower !
    {
        // Function pointer give a real call, whether the direct
        // call is inlined and thus results are overestimated.
        f( Y,A,X1 );
        f( Y,A,X2 );
        f( Y,A,X3 );
        f( Y,A,X4 );
    }

    clock_t tend = clock();

    double diff = static_cast<double>(tend - tstart) * 1e-3;

    cout << "Time  (" << simd << ") = " << diff << " s" << endl;
    cout << "Nops  (" << simd << ") = " << (double) imax << ".10^9" << endl;
    cout << "Power (" << simd << ") = " << (double) imax * 28. / diff << " GFLOPS" << endl; // 28 FLOPS for std.
    cout << endl;
}

int main ( int argc, char *argv[] )
{
    test_function ( &A_times_x     ,"std" , 1 );
    test_function ( &A_times_x_SSE ,"SSE" , 2 );
    test_function ( &A_times_x_SSE3,"SSE3", 3 );
    test_function ( &A_times_x_SSE4,"SSE4", 1 );
    test_function ( &A_times_x_AVX ,"AVX" , 3 );

    return 0;
}

I have some troubles about the improvements for such problem. When running the code I obtain the following results (Intel Core i5 4670K, 3.4GHz, Haswell, Codeblock+MinGW compiler using -O2 -march=corei7-avx) :

Time  (std) = 6.287 s
Nops  (std) = 1.10^9
Power (std) = 4.45363 GFLOPS

Time  (SSE) = 6.661 s
Nops  (SSE) = 2.10^9
Power (SSE) = 8.40715 GFLOPS

Time  (SSE3) = 8.361 s
Nops  (SSE3) = 3.10^9
Power (SSE3) = 10.0466 GFLOPS

Time  (SSE4) = 6.131 s
Nops  (SSE4) = 1.10^9
Power (SSE4) = 4.56695 GFLOPS

Time  (AVX) = 8.767 s
Nops  (AVX) = 3.10^9
Power (AVX) = 9.58138 GFLOPS

My questions are the following :

Is this possible to improve more the performances/speed up ? It should be x4 (maximum) for SSE and x8 for AVX.
Why the AVX is not faster than SSE3 ?

For those who say : "stop using your stuff, use Intel Math Kernel Library", I reply : "I would not, because I want a small executable file, and I only need to use SIMD for this specific case, not elsewhere" ;-)

What compiler did you use, and what optimisation switches did you enable ? — Paul R
The problem is that you spend more time shuffling data around than doing useful arithmetic. As it is a vector x matrix multiply of size 4 has too little computation to optimize in isolation. You may need to broaden the scope and consider changing your data layout to something a bit more SIMD-friendly. — Mysticial
I tested your code with MSVC2013 on a Haswell, and my results are different. I set all the tests to 10x10^9, and get 11.9s for std, and 2.95s for all the others (std is exactly x4 slower). Also, the std was entirely optimized out, so I added a volatile on the results. Also, it seems that all the calls are inlined, so there is no calls, which could explain the big slow when you do actual calls. — ElderBug
I have a feeling the OP is either using an old compiler or does not have optimisation enabled, as every compiler I've tried here optimises all the loops away, making the test results meaningless. — Paul R
I use MinGW with Codeblock (I know ... the shame), and my compiler flags are : -Wall -O2 -march=corei7-avx. I will try with MSVC2013 tonight to check this with the Intel C++ compiler. Thanks for your useful comment ! — Asohan

Z boson Z boson · Accepted Answer · 2015-06-27T22:12:24

Is this possible to improve more the performances/speed up ? It should be x4 (maximum) for SSE and x8 for AVX.

Yes, I explained this in detail at efficient-4x4-matrix-vector-multiplication-with-sse-horizontal-add-and-dot-product.

The efficient method for multiplying a 4x4 matrix M with a column vector u giving v = M u is:

v = u1*col1 + u2*col2 + u3*col3 + u4*col4.

this requires that you store the column vectors. For example let's assume you have the following 4x4 matrix A:

 0  1  2  3
 4  5  6  7
 8  9 10 11
12 13 14 15

then you store this as

0 4 8 c 1 5 9 d 2 6 a e 3 7 b f

conversely if you want row vector uT times matrix M giving vT = uT*M then you want

vT = uT1*row1 + uT2*row2 + uT3*row3 + uT4*row4.

and in this case you should pack the rows not the columns.

So to optimize your code in your function A_times_x_SSE comment out the line

 _MM_TRANSPOSE4_PS( A0,A1,A2,A3 );

and this function will be faster than your other functions using horizontal operations.

Horizontal operations with SIMD are not efficient. They are in some since not SIMD because they are broken down into scalar micro-ops so they are not parallel. They are only useful when it's inconvenient to pack your data in a form friendly for SIMD. For example when you can't store the columns of M and only have the rows.

Why the AVX is not faster than SSE3 ?

In order to do this efficiently with AVX you have to operate on two 4x4 matrices at once and also pack you matrices so that they are friendly for AVX. Now let's assume that in addition the matrix A defined above you have another matrix B:

16 17 18 19
20 21 22 23
24 25 26 27
28 29 30 31

The optimal way to pack A and B for AVX is

col1A col1B col2A col2B col3A col3B col4A col4B
0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29 2 6 10 14 18 22 26 30 3 7 11 15 19 23 27 31

Let's assume you have two vectors u = {0,1,2,3} and v = {4,5,6,7) and you want y = Au and z = Bv then with AVX you do:

c1 = col1A col1B = {0  4  8 12 16 20 24 28} = _mm256_load_ps
c2 = col2A col2B = {1  5  9 13 17 21 25 29}
c3 = col3A col3B = {2  6 10 14 18 22 26 30}
c4 = col4A col4B = {3  7 11 15 19 23 27 31}
broad1 = {0,0,0,0,4,4,4,4}
broad2 = {1,1,1,1,5,5,5,5}
broad3 = {2,2,2,2,6,6,6,6}
broad4 = {3,3,3,3,7,7,7,7}
w = broad1*c1 + broad2*c2 + broad3*c + broad4*c4;
//w = {y1, y2, y3, y4, z1, z2, z3, z4};

So the resultant 8-wide vector w contains the 4-vectors y and z. This is the most efficient method with AVX. If you have fixed matrices and variable vectors you can run over in a loop then packing before the loop at runtime will be negligible for a large loop. If you know the matrices are fixed at compile time then you can pack at compile time.

(Vec4 x Mat4x4) product using SIMD and improvements

1 Answers