I want to use the std::array
to store the data of N-dimensional vectors and implement arithmetic operations for such vectors. I figured, since the std::array
now has a constexpr
size()
member function, I can use this to unroll the loops that I need for the arithmetic operations over its elements.
Here is a minimal example:
#include <array>
#include <type_traits>
#include <iostream>
#include <cassert>
template<std::size_t N=0, typename Vector>
void plus_equals(Vector& result, Vector const& input)
{
result[N] += input[N];
if constexpr (N + 1 < result.size())
plus_equals<N+1>(result, input);
}
template<typename INT, size_t N>
class Vector
{
std::array<INT, N> data_;
public:
template<typename ... BracketList>
Vector(BracketList ... blist)
:
data_{std::forward<BracketList>(blist)...}
{}
INT& operator[](std::size_t i)
{
return data_[i];
}
INT operator[](std::size_t i) const
{
return data_[i];
}
decltype(auto) begin() const
{
return data_.begin();
}
decltype(auto) end() const
{
return data_.end();
}
decltype(auto) end()
{
return data_.end();
}
constexpr decltype(auto) size()
{
return data_.size();
}
void operator+=(Vector const& other)
{
plus_equals(*this, other);
}
};
template<size_t N = 0, typename Vector>
Vector operator+(Vector const& uVec, Vector const& vVec)
{
Vector result {uVec};
result += vVec;
return result;
}
template<size_t N = 0, typename Vector>
Vector sum(Vector const& uVec, Vector const& vVec)
{
Vector result {uVec};
for (decltype(result.size()) i = 0; i < result.size(); ++i)
result[i] += vVec[i];
return result;
}
template<typename Vector>
void print(Vector&& v)
{
for (const auto& el : v) std::cout << el << " ";
std::cout << std::endl;
}
using namespace std;
int main()
{
Vector<int, 3> c1 = {1,2,3};
Vector<int, 3> c2 = {3,2,1};
auto r1 = c1 + c2;
print (r1);
auto r2 = sum(c2, c2);
print (r2);
Vector<int, 3> s1, s2;
for (std::size_t i = 0; i < 3; ++i)
cin >> s1[i];
for (std::size_t i = 0; i < 3; ++i)
cin >> s2[i];
auto r3 = s1 + s2;
print(r3);
auto r4 = sum(s1, s2);
print(r4);
return 0;
}
The sum
operation is implemented using plus_equals
that should unroll the individual +=
operations on elements of the Vector, and the sum(Vector const&, Vector const&)
function, uses a for
loop.
I compiled the example on godbolt using -O3 -std=c++2a
.
If I comment out everything apart from
Vector<int, 3> c1 = {2,11,7};
Vector<int, 3> c2 = {9,22,5};
auto r1 = c1 + c2;
print (r1);
I get
movabs rax, 141733920779
sub rsp, 24
lea rdi, [rsp+4]
mov QWORD PTR [rsp+4], rax
mov DWORD PTR [rsp+12], 12
call void print<Vector<int, 3ul>&>(Vector<int, 3ul>&)
xor eax, eax
add rsp, 24
ret
What is happening here? Why can't I see the first two constants c1[0] + c2[0]
and c1[1] + c2[1]
? On the other hand 7 + 5 = 12
is moved:
mov DWORD PTR [rsp+12], 12
Why is the assembly of the code
int main()
{
Vector<int, 3> c1 = {2,11,7};
Vector<int, 3> c2 = {9,22,5};
//auto r1 = c1 + c2;
//print (r1);
auto r2 = sum(c1, c2);
print (r2);
exactly the same?
If I try to use runtime variables:
Vector<int, 3> s1, s2;
for (std::size_t i = 0; i < 3; ++i)
cin >> s1[i];
for (std::size_t i = 0; i < 3; ++i)
cin >> s2[i];
auto r3 = s1 + s2;
print(r3);
I get
mov edx, DWORD PTR [rsp+28]
mov eax, DWORD PTR [rsp+32]
lea rdi, [rsp+36]
add eax, DWORD PTR [rsp+20]
add edx, DWORD PTR [rsp+16]
mov ecx, DWORD PTR [rsp+24]
add ecx, DWORD PTR [rsp+12]
mov DWORD PTR [rsp+44], eax
mov DWORD PTR [rsp+36], ecx
mov DWORD PTR [rsp+40], edx
Which links to the plus_equals
function template and unrolls the iterations as expected.
For the sum
:
Vector<int, 3> s1, s2;
for (std::size_t i = 0; i < 3; ++i)
cin >> s1[i];
for (std::size_t i = 0; i < 3; ++i)
cin >> s2[i];
//auto r3 = s1 + s2;
//print(r3);
auto r4 = sum(s1, s2);
print(r4);
The assembly is:
mov edx, DWORD PTR [rsp+32]
add edx, DWORD PTR [rsp+20]
add ecx, eax
shr rax, 32
add eax, DWORD PTR [rsp+28]
mov DWORD PTR [rsp+44], edx
mov DWORD PTR [rsp+40], eax
mov DWORD PTR [rsp+36], ecx
And there are no equality comparisons and jumps, so the loop has been unrolled.
When I look at the assembly code of the sum
template, there are comparison operators and jumps there. This I expected because I figure that the compiler generates a general code first, for any Vector
, and then later on figures out if Vector::size()
is constexpr
and applies further optimizations.
Is the interpretation OK? If so, can one conclude that there is no sense in manually unrolling iterations for fixed-sized arrays, because with -O3
the loops that use constexpr size
member functions will be unrolled anyway by the compiler?
constexpr size()
in the loop definition, it would make the code much more readable with for loops then with recursive funtion templates. This is important because the code is scientific and will possibly be extended by others. – tmaricconstexpr
or not), use flag-funroll-loops
.--param max-unroll-times=...
can control unrolling depth. These days compiler doesn't do aggressive unrolling normally, because the CPU can handle loop pretty efficiently. But there are still some cases adding-funroll-all-loop
will make significant improvement, due to the non-negligible throughput impact ofjcc
. Especially some subpar gcc vectorization, they have the same performance as unrolling a loop, which isn't done by default. – llllllllll