Not really a full answer, but still evidence that the penalty is visible.
MSVC 2022 benchmark, compiled with /std:c++latest
.
#include <chrono>
#include <iostream>
// Pair of pointers handed to the assembly routines by reference.
// alignas(16) raises the alignment of the whole object to 16 bytes.
struct alignas(16) S {
    char* a;  // first member, at offset 0
    int* b;   // second member, placed directly after `a`
};
// Prototypes of the four benchmark loops, implemented outside this
// translation unit. C linkage keeps the symbol names unmangled.
// Every routine takes an iteration count `n` and two S objects by reference.
extern "C" {
    void init_fused_copy_unfused(int n, S & s2, S & s1);
    void init_fused_copy_fused(int n, S & s2, S & s1);
    void init_unfused_copy_unfused(int n, S & s2, S & s1);
    void init_unfused_copy_fused(int n, S & s2, S & s1);
}
// Benchmark driver: runs each of the four init/copy routines N times and
// prints the elapsed wall time of every run in seconds.
int main()
{
    using namespace std::chrono;
    // Intervals are measured with steady_clock: it is monotonic, whereas
    // system_clock may be adjusted while the benchmark runs and would then
    // report a skewed (even negative) duration.
    using bench_clock = steady_clock;

    // Deliberately left uninitialised: every routine stores to s1 before
    // loading from it, and s2 is only ever written.
    S s1, s2;
    constexpr int N = 1'000'000'000;

    const auto t1 = bench_clock::now();
    init_fused_copy_fused(N, s2, s1);
    const auto t2 = bench_clock::now();
    init_fused_copy_unfused(N, s2, s1);
    const auto t3 = bench_clock::now();
    init_unfused_copy_fused(N, s2, s1);
    const auto t4 = bench_clock::now();
    init_unfused_copy_unfused(N, s2, s1);
    const auto t5 = bench_clock::now();

    // Streaming a duration relies on the C++20 operator<< for
    // std::chrono::duration; duration<double> prints as seconds.
    std::cout
        << "init fused copy fused " << duration_cast<duration<double>>(t2 - t1) << "\n"
        << "init fused copy unfused " << duration_cast<duration<double>>(t3 - t2) << "\n"
        << "init unfused copy fused " << duration_cast<duration<double>>(t4 - t3) << "\n"
        << "init unfused copy unfused " << duration_cast<duration<double>>(t5 - t4) << "\n";
}
; Everything that follows, the three data items included, is emitted after
; the .code directive.
.code
; One byte initialised to 0. The procedures in this file only take its address.
c db 0
; One dword initialised to 0. Likewise only its address is taken.
i dd 0
; Two consecutive qwords (16 bytes) built from c and i; the "fused init"
; procedures read them with a single 16-byte load.
; NOTE(review): presumably each qword holds the address of c / i, matching
; what the "unfused init" procedures compute with lea - confirm in a listing.
s dq byte ptr [c], dword ptr [i]
; Place the procedure entry (which is also the loop target) on a 16-byte boundary.
ALIGN 16
; C++ prototype: void init_fused_copy_fused(int n, S& s2, S& s1)
; Assumes the Microsoft x64 calling convention: ecx = n, rdx = &s2, r8 = &s1.
; Per iteration: fill s1 with one 16-byte store, then copy s1 to s2 with one
; 16-byte load and one 16-byte store.
; The loop has no zero check: with n == 0 the dec wraps and it runs 2^32 times.
init_fused_copy_fused PROC
; init: load the 16 bytes at s and store them to s1 in a single store
movups xmm0,xmmword ptr [s]
movups xmmword ptr [r8],xmm0
; copy: reload s1 as one 16-byte value and store it to s2
movups xmm1,xmmword ptr [r8]
movups xmmword ptr [rdx], xmm1
; count down and branch back to the procedure entry while ecx is non-zero
dec ecx
jnz init_fused_copy_fused
ret
init_fused_copy_fused ENDP
; Place the procedure entry (which is also the loop target) on a 16-byte boundary.
ALIGN 16
; C++ prototype: void init_unfused_copy_fused(int n, S& s2, S& s1)
; Assumes the Microsoft x64 calling convention: ecx = n, rdx = &s2, r8 = &s1.
; Per iteration: fill s1 with two separate 8-byte stores, then copy s1 to s2
; with one 16-byte load and one 16-byte store. The 16-byte load therefore
; reads data that was just written by two narrower stores.
; The loop has no zero check: with n == 0 the dec wraps and it runs 2^32 times.
init_unfused_copy_fused PROC
; init: s1 low qword = address of c
lea rax, byte ptr [c]
mov qword ptr[r8], rax
; init: s1 high qword = address of i
lea rax, dword ptr [i]
mov qword ptr[r8 + 8], rax
; copy: reload all 16 bytes of s1 at once and store them to s2
movups xmm1,xmmword ptr [r8]
movups xmmword ptr [rdx], xmm1
; count down and branch back to the procedure entry while ecx is non-zero
dec ecx
jnz init_unfused_copy_fused
ret
init_unfused_copy_fused ENDP
; Place the procedure entry (which is also the loop target) on a 16-byte boundary.
ALIGN 16
; C++ prototype: void init_fused_copy_unfused(int n, S& s2, S& s1)
; Assumes the Microsoft x64 calling convention: ecx = n, rdx = &s2, r8 = &s1.
; Per iteration: fill s1 with one 16-byte store, then copy s1 to s2 as two
; independent 8-byte load/store pairs.
; The loop has no zero check: with n == 0 the dec wraps and it runs 2^32 times.
init_fused_copy_unfused PROC
; init: load the 16 bytes at s and store them to s1 in a single store
movups xmm0,xmmword ptr [s]
movups xmmword ptr [r8],xmm0
; copy: low qword of s1 -> low qword of s2
mov rax, qword ptr[r8]
mov qword ptr[rdx], rax
; copy: high qword of s1 -> high qword of s2
mov rax, qword ptr[r8 + 8]
mov qword ptr[rdx +8], rax
; count down and branch back to the procedure entry while ecx is non-zero
dec ecx
jnz init_fused_copy_unfused
ret
init_fused_copy_unfused ENDP
; Place the procedure entry (which is also the loop target) on a 16-byte boundary.
ALIGN 16
; C++ prototype: void init_unfused_copy_unfused(int n, S& s2, S& s1)
; Assumes the Microsoft x64 calling convention: ecx = n, rdx = &s2, r8 = &s1.
; Per iteration: fill s1 with two separate 8-byte stores, then copy s1 to s2
; as two independent 8-byte load/store pairs, so every load has the same
; width and address as the store that produced its data.
; The loop has no zero check: with n == 0 the dec wraps and it runs 2^32 times.
init_unfused_copy_unfused PROC
; init: s1 low qword = address of c
lea rax, byte ptr [c]
mov qword ptr[r8], rax
; init: s1 high qword = address of i
lea rax, dword ptr [i]
mov qword ptr[r8 + 8], rax
; copy: low qword of s1 -> low qword of s2
mov rax, qword ptr[r8]
mov qword ptr[rdx], rax
; copy: high qword of s1 -> high qword of s2
mov rax, qword ptr[r8 + 8]
mov qword ptr[rdx +8], rax
; count down and branch back to the procedure entry while ecx is non-zero
dec ecx
jnz init_unfused_copy_unfused
ret
init_unfused_copy_unfused ENDP
END
init fused copy fused 0.664739s
init fused copy unfused 0.935631s
init unfused copy fused 4.34326s
init unfused copy unfused 1.02741s
CPU: Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz 2.21 GHz
I interpret the results as follows:
- With fused init, the forwarding never fails; the difference between fused and unfused copy is within the benchmark error.
- With unfused init, the fused copy causes a forwarding failure, resulting in a significant performance difference.