I'm trying to write some SSE code using Eigen, and some behavior eludes me.
Given code:
#ifndef EIGEN_DONT_VECTORIZE // Not needed with Intel C++ Compiler XE 15.0
#define EIGEN_VECTORIZE_SSE4_2
#define EIGEN_VECTORIZE_SSE4_1
#define EIGEN_VECTORIZE_SSSE3
#define EIGEN_VECTORIZE_SSE3
#endif
#include "stdafx.h"
#include <iostream>
#include <unsupported/Eigen/AlignedVector3>
#include <Eigen/StdVector>
#include <chrono>
int _tmain(int argc, _TCHAR* argv[]) {
static const int SIZE = 4000000;
EIGEN_ALIGNED_VECTOR3 Eigen::AlignedVector3<float> A_SSE(1, 1, 1);
//EIGEN_ALIGNED_VECTOR3 Eigen::AlignedVector3<float> B_SSE(2, 2, 2);
//std::vector<Eigen::AlignedVector3<float>> C_SSE(SIZE, Eigen::AlignedVector3<float>(0,0,0));
EIGEN_ALIGNED_VECTOR3 Eigen::AlignedVector3<float> A_SSE1(1, 1, 1);
EIGEN_ALIGNED_VECTOR3 Eigen::AlignedVector3<float> A_SSE2(1, 1, 1);
EIGEN_ALIGNED_VECTOR3 Eigen::AlignedVector3<float> A_SSE3(1, 1, 1);
EIGEN_ALIGNED_VECTOR3 Eigen::AlignedVector3<float> A_SSE4(1, 1, 1);
EIGEN_ALIGNED_VECTOR3 Eigen::AlignedVector3<float> B_SSE(2, 2, 2);
EIGEN_ALIGNED_VECTOR3 Eigen::AlignedVector3<float> B_SSE_increment_unroll(16, 16, 16);
A_SSE2 += B_SSE;
A_SSE3 = A_SSE2 + B_SSE;
A_SSE4 = A_SSE3 + B_SSE;
std::vector<Eigen::AlignedVector3<float>> C_SSE(SIZE, Eigen::AlignedVector3<float>(0, 0, 0));
auto start2 = std::chrono::system_clock::now();
// no unroll
for (int iteration = 0; iteration < SIZE; ++iteration) {
A_SSE += B_SSE;
C_SSE[iteration] = A_SSE;
}
//// own unroll
//for (int iteration = 0; iteration < SIZE / 8; ++iteration){
// A_SSE1 += B_SSE_increment_unroll;
// A_SSE2 += B_SSE_increment_unroll;
// A_SSE3 += B_SSE_increment_unroll;
// A_SSE4 += B_SSE_increment_unroll;
// C_SSE[iteration * 2] = A_SSE1;
// C_SSE[iteration * 2 + 1] = A_SSE2;
// C_SSE[iteration * 2 + 2] = A_SSE3;
// C_SSE[iteration * 2 + 3] = A_SSE4;
//}
auto end2 = std::chrono::system_clock::now();
auto elapsed2 = end2 - start2;
std::cout << "Eigen aligned vector " << elapsed2.count() << '\n';
Eigen::Matrix3Xf A = Eigen::Matrix3Xf::Zero(3, SIZE);
Eigen::Vector3f B(3, 3, 3);
Eigen::Vector3f C(2, 2, 2);
auto start1 = std::chrono::system_clock::now();
for (int iteration = 0; iteration < SIZE; ++iteration) {
B += C;
A.col(iteration) = B;
}
auto end1 = std::chrono::system_clock::now();
auto elapsed1 = end1 - start1;
std::cout << "Eigen matrix " << elapsed1.count() << '\n';
float *pResult = (float*)_aligned_malloc(SIZE * sizeof(float) * 4, 16); // align to 16-byte for SSE
auto start3 = std::chrono::system_clock::now();
__m128 x;
__m128 xDelta = _mm_set1_ps(2.0f); // Set the xDelta to (4,4,4,4)
__m128 *pResultSSE = (__m128*) pResult;
x = _mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f); // Set the initial values of x to (4,3,2,1)
for (int iteration = 0; iteration < SIZE; ++iteration)
{
x = _mm_add_ps(x, xDelta);
pResultSSE[iteration] = x;
}
auto end3 = std::chrono::system_clock::now();
auto elapsed3 = end3 - start3;
std::cout << "Own sse " << elapsed3.count() << '\n';
}
The timings seem odd on my PC:
- Eigen Aligned Vector Unroll: 20057
- Eigen Align Vector no unroll: ~120320
- Eigen Matrix: ~120207 ( same as Align no unroll)
- Own SSE: 160784
When I examine the assembly, the aligned versions and my own SSE code use addps/movaps, but I gain no additional performance until I manually unroll the loops, and even when I do, I get no boost in about 50% of the runs. The version with the Eigen matrix doesn't use SSE, yet it achieves the same performance; the generated assembly shows the loop unrolled by 16 iterations. Is manual unrolling really that impactful? Should we do it manually for SSE code, and if so, which CPU properties does it depend on?
Edit: So, to sum up: the SSE instructions do not perform better because the compiler is not able to prove that the unrolled loop will produce the same result as the non-unrolled one, so it cannot hide the memory store latency. But in the assembly code, the "single" (scalar) instructions use only one register and increment it in the unrolled loop. If the SSE addition is performed vertically (each float in the aligned vector accumulates the same number of additions), the compiler should be able to prove equivalence for unrolling. Are SSE operations not optimized by the compiler by default? If unrolling the loop preserves the order of execution, and therefore preserves non-associative math, automatic unrolling should be possible, so why does it not happen, and how can I force the compiler to do it?
EDIT: As suggested, I ran the test, but the bench unit from Eigen does not work under Visual Studio 2017, so it was replaced by:
#include <algorithm>
#include <chrono>
#include <iostream>
#include <numeric>
#include <vector>
#include <unsupported/Eigen/AlignedVector3>
// Baseline (not manually unrolled) fill: a single accumulator starts at
// (1, 1, 1), is advanced by (2, 2, 2) before every store, and is then copied
// into the next element of `out`, front to back.
EIGEN_DONT_INLINE
void vector_no_unroll(std::vector<Eigen::AlignedVector3<float>>& out)
{
    Eigen::AlignedVector3<float> running(1, 1, 1);
    Eigen::AlignedVector3<float> step(2, 2, 2);

    for (auto slot = out.begin(), last = out.end(); slot != last; ++slot)
    {
        running += step;
        *slot = running;
    }
}
// Manually 4x-unrolled fill of `out` with the sequence 3, 5, 7, 9, 11, ...
// (the same value in every component), i.e. out[i] = 3 + 2 * i.
//
// Four independent accumulators hold the values for four consecutive
// elements; each is advanced by 4 * 2 = 8 per loop trip, so the four
// additions do not depend on each other. Elements left over when out.size()
// is not a multiple of 4 are written by a plain one-at-a-time tail loop, so
// the function never writes past the end of `out`.
EIGEN_DONT_INLINE
void vector_unrolled(std::vector<Eigen::AlignedVector3<float>>& out)
{
    Eigen::AlignedVector3<float> A_SSE1(1, 1, 1);
    Eigen::AlignedVector3<float> A_SSE2(1, 1, 1);
    Eigen::AlignedVector3<float> A_SSE3(1, 1, 1);
    Eigen::AlignedVector3<float> A_SSE4(1, 1, 1);
    Eigen::AlignedVector3<float> B_SSE(2, 2, 2);
    // Step per accumulator: 4 lanes, each element 2 larger than the previous.
    Eigen::AlignedVector3<float> B_SSE_increment_unroll(8, 8, 8);

    // Seed the lanes with the first four output values.
    A_SSE1 += B_SSE;          // 3
    A_SSE2 = A_SSE1 + B_SSE;  // 5
    A_SSE3 = A_SSE2 + B_SSE;  // 7
    A_SSE4 = A_SSE3 + B_SSE;  // 9

    const size_t count = out.size();
    const size_t blocked = count - count % 4;  // largest multiple of 4 <= count

    for (size_t i = 0; i < blocked; i += 4)
    {
        out[i + 0] = A_SSE1;
        out[i + 1] = A_SSE2;
        out[i + 2] = A_SSE3;
        out[i + 3] = A_SSE4;
        A_SSE1 += B_SSE_increment_unroll;
        A_SSE2 += B_SSE_increment_unroll;
        A_SSE3 += B_SSE_increment_unroll;
        A_SSE4 += B_SSE_increment_unroll;
    }

    // Tail: A_SSE1 now holds the value for index `blocked`; continue with a
    // step of 2 for the remaining (at most three) elements.
    for (size_t i = blocked; i < count; ++i)
    {
        out[i] = A_SSE1;
        A_SSE1 += B_SSE;
    }
}
// Column-by-column fill of `out`: a 3-vector accumulator starts at (1, 1, 1),
// is advanced by (2, 2, 2) before every store, and is then assigned to the
// next column, left to right.
EIGEN_DONT_INLINE
void eigen_matrix(Eigen::Matrix3Xf& out)
{
    Eigen::Vector3f running(1, 1, 1);
    Eigen::Vector3f step(2, 2, 2);

    int column = 0;
    while (column < out.cols()) {
        running += step;
        out.col(column) = running;
        ++column;
    }
}
// Fills `out` so that every entry of column i equals 3 + 2 * i, writing
// `unrolling` columns per loop trip.
//
// B is a 3 x unrolling block whose columns hold `unrolling` consecutive
// values (3, 5, 7, ... on the first trip); after each store the whole block
// is advanced by 2 * unrolling. If out.cols() is not a multiple of
// `unrolling`, the remaining columns are written from the leading columns of
// B, so the function never addresses columns past the end of `out`.
template<int unrolling> EIGEN_DONT_INLINE
void eigen_matrix_unrolled(Eigen::Matrix3Xf& out)
{
    static_assert(unrolling > 0, "unrolling factor must be positive");

    Eigen::Matrix<float, 3, unrolling> B = Eigen::Matrix<float, 1, unrolling>::LinSpaced(3.f, 1 + 2 * unrolling).template replicate<3, 1>();

    const int cols = static_cast<int>(out.cols());
    const int full = cols - cols % unrolling;  // columns covered by whole blocks

    for (int i = 0; i < full; i += unrolling) {
        out.middleCols<unrolling>(i) = B;
        B.array() += float(2 * unrolling);
    }

    // Tail: fewer than `unrolling` columns remain; B already holds their values.
    const int rest = cols - full;
    if (rest > 0) {
        out.middleCols(full, rest) = B.leftCols(rest);
    }
}
int main() {
static const int SIZE = 4000000;
int tries = 30;
int rep = 10;
std::vector<int> Timings(tries, 0);
{
Eigen::Matrix3Xf A(3, SIZE);
#pragma loop( 1 )
for (int iter = 0; iter < tries; ++iter)
{
auto start1 = std::chrono::system_clock::now();
eigen_matrix(A);
Timings[iter] = (std::chrono::system_clock::now() - start1).count();
}
}
std::cout << "eigen matrix Min: " << *std::min_element(Timings.begin(), Timings.end()) << " ms\n";
std::cout << "eigen matrix Mean: " << std::accumulate(Timings.begin(), Timings.end(), 0) / tries << " ms\n";
{
Eigen::Matrix3Xf A(3, SIZE);
#pragma loop( 1 )
for (int iter = 0; iter < tries; ++iter)
{
auto start1 = std::chrono::system_clock::now();
eigen_matrix_unrolled<4>(A);
Timings[iter] = (std::chrono::system_clock::now() - start1).count();
}
}
std::cout << "eigen matrix unrolled 4 min: " << *std::min_element(Timings.begin(), Timings.end()) << " ms\n";
std::cout << "eigen matrix unrolled 4 Mean: " << std::accumulate(Timings.begin(), Timings.end(), 0) / tries << " ms\n";
{
Eigen::Matrix3Xf A(3, SIZE);
#pragma loop( 1 )
for (int iter = 0; iter < tries; ++iter)
{
auto start1 = std::chrono::system_clock::now();
eigen_matrix_unrolled<8>(A);
Timings[iter] = (std::chrono::system_clock::now() - start1).count();
}
}
std::cout << "eigen matrix unrolled 8 min: " << *std::min_element(Timings.begin(), Timings.end()) << " ms\n";
std::cout << "eigen matrix unrolled 8 Mean: " << std::accumulate(Timings.begin(), Timings.end(), 0) / tries << " ms\n";
{
std::vector<Eigen::AlignedVector3<float>> A(SIZE, Eigen::AlignedVector3<float>(0, 0, 0));
#pragma loop( 1 )
for (int iter = 0; iter < tries; ++iter)
{
auto start1 = std::chrono::system_clock::now();
vector_no_unroll(A);
Timings[iter] = (std::chrono::system_clock::now() - start1).count();
}
}
std::cout << "eigen vector min: " << *std::min_element(Timings.begin(), Timings.end()) << " ms\n";
std::cout << "eigen vector Mean: " << std::accumulate(Timings.begin(), Timings.end(), 0) / tries << " ms\n";
{
std::vector<Eigen::AlignedVector3<float>> A(SIZE, Eigen::AlignedVector3<float>(0, 0, 0));
#pragma loop( 1 )
for (int iter = 0; iter < tries; ++iter)
{
auto start1 = std::chrono::system_clock::now();
vector_unrolled(A);
Timings[iter] = (std::chrono::system_clock::now() - start1).count();
}
}
std::cout << "eigen vector unrolled min: " << *std::min_element(Timings.begin(), Timings.end()) << " ms\n";
std::cout << "eigen vector unrolled Mean: " << std::accumulate(Timings.begin(), Timings.end(), 0) / tries << " ms\n";
}
I checked the results on 8 different machines (all Windows) and got the following results:
eigen matrix Min: 110477 ms
eigen matrix Mean: 131691 ms
eigen matrix unrolled 4 min: 40099 ms
eigen matrix unrolled 4 Mean: 54812 ms
eigen matrix unrolled 8 min: 40001 ms
eigen matrix unrolled 8 Mean: 51482 ms
eigen vector min: 100270 ms
eigen vector Mean: 117316 ms
eigen vector unrolled min: 59966 ms
eigen vector unrolled Mean: 65847 ms
This held on every machine I tested except one, which was the oldest. It looks like on newer machines a small amount of unrolling can be quite beneficial (the results range from a 1.5x to a 3.5x speed-up with 4x unrolling, and do not increase further even when unrolling 8, 16, 32, or 256 times).