What are the costs of failed store-to-load forwarding on x86?

Question

What are the costs of a failed store-to-load forwarding on recent x86 architectures?

In particular, store-to-load forwarding that fails because the load partly overlaps an earlier store, or because the earlier load or store crosses some alignment boundary that causes the forwarding to fail.

Certainly there is a latency cost: how big is it? Is there also a throughput cost, e.g., does a failed store-to-load forwarding use additional resources that are then unavailable to other loads and stores, or even other non-memory operations?

Is there a difference when all the parts of the load come from the store buffer, versus the case where it's a mix of the store buffer and L1?

Yeah, I've wondered this, too. Did you ever get around to testing it? There's some evidence an SF stall can't pipeline with other SF stalls on Intel IvB (easyperf.net/blog/2018/03/09/… using your own uarch-bench), but I don't know if successful store-forwarding is possible during this. — Peter Cordes

Alex Guteniev Alex Guteniev · Accepted Answer · 2021-10-12T07:01:03

Not really a full answer, but still evidence that the penalty is visible.

MSVC 2022 benchmark, compiled with /std:c++latest.

#include <chrono>
#include <iostream>

// Pair of pointers that the assembly kernels read and write either as a single
// 16-byte access or as two independent 8-byte halves.
struct alignas(16) S
{
    char* a;  // first 8-byte half
    int* b;   // second 8-byte half
};

// The assembly kernels hard-code this layout (qword accesses at offsets 0 and 8,
// xmmword access at offset 0). alignas(16) alone would pad a struct of two
// 4-byte pointers to 16 bytes, so the pointer width is checked explicitly.
static_assert(sizeof(char*) == 8 && sizeof(int*) == 8 && sizeof(S) == 16,
              "S must be exactly two 8-byte pointers to match the assembly kernels");

// Benchmark kernels implemented in the MASM listing of this benchmark; C linkage keeps
// the symbol names unmangled so they match the PROC names there.
// Each kernel repeats "initialize one S, then copy it into the other S" n times:
//   - "fused"   means the step is done with one 16-byte movups access;
//   - "unfused" means the step is done with two 8-byte mov accesses.
// NOTE(review): the assembly takes the iteration count from ecx, writes the initial
// value through r8 and writes the copy through rdx. With this parameter order that
// corresponds to n, s1 and s2 under the Microsoft x64 calling convention — confirm
// before porting to another ABI.
extern "C" void init_fused_copy_unfused(int n, S & s2, S & s1);
extern "C" void init_fused_copy_fused(int n, S & s2, S & s1);
extern "C" void init_unfused_copy_unfused(int n, S & s2, S & s1);
extern "C" void init_unfused_copy_fused(int n, S & s2, S & s1);

// Runs the four assembly kernels back to back, N iterations each, and prints the
// elapsed wall time of every kernel in seconds.
int main()
{
    using namespace std::chrono;

    // Intervals must be measured with a monotonic clock: system_clock follows the
    // wall clock and may be stepped (NTP, manual change) in the middle of a run,
    // which would corrupt or even negate a measured duration.
    using bench_clock = steady_clock;

    S s1, s2;
    constexpr int N = 1'000'000'000;

    // Consecutive time points; each kernel's duration is the difference between
    // the time points surrounding its call.
    const auto t1 = bench_clock::now();
    init_fused_copy_fused(N, s2, s1);
    const auto t2 = bench_clock::now();
    init_fused_copy_unfused(N, s2, s1);
    const auto t3 = bench_clock::now();
    init_unfused_copy_fused(N, s2, s1);
    const auto t4 = bench_clock::now();
    init_unfused_copy_unfused(N, s2, s1);
    const auto t5 = bench_clock::now();

    // Streaming a duration requires C++20 <chrono>; duration<double> prints
    // fractional seconds with an "s" suffix.
    std::cout
     << "init fused copy fused     " << duration_cast<duration<double>>(t2 - t1) << "\n"
     << "init fused copy unfused   " << duration_cast<duration<double>>(t3 - t2) << "\n"
     << "init unfused copy fused   " << duration_cast<duration<double>>(t4 - t3) << "\n"
     << "init unfused copy unfused " << duration_cast<duration<double>>(t5 - t4) << "\n";
}

.code

; Source operands for the benchmark kernels below. They are emitted after the
; .code directive (there is no separate data segment directive); the kernels only
; read them or take their address, never write them.
c     db 0          ; single byte; its address is stored by the "unfused init" kernels
i     dd 0          ; single dword; its address is stored by the "unfused init" kernels

; 16-byte block (two qwords) loaded with one movups by the "fused init" kernels.
; NOTE(review): the two qwords are presumably the addresses of c and i, mirroring
; the lea pair used by the "unfused init" kernels — confirm against the assembler listing.
s     dq byte ptr [c], dword ptr [i]

ALIGN 16
; init_fused_copy_fused
; Per iteration: one 16-byte store to [r8], then one 16-byte reload from the same
; address, which is written to [rdx]. The reload has the same address and width
; as the store that precedes it.
; ecx holds the remaining iteration count; the loop branches back to the
; procedure entry, so the load from s is repeated every iteration as well.
; NOTE(review): ecx/rdx/r8 are presumably n/s2/s1 of the C++ prototype
; (Microsoft x64 calling convention) — confirm.
init_fused_copy_fused PROC
    movups      xmm0,xmmword ptr [s]        ; fetch the 16-byte source block
    movups      xmmword ptr [r8],xmm0       ; init: single 16-byte store
    
    movups      xmm1,xmmword ptr [r8]       ; copy: single 16-byte reload of the bytes just stored
    movups      xmmword ptr [rdx], xmm1     ; write the copy to the destination
    
    dec ecx                                 ; one iteration done
    jnz init_fused_copy_fused               ; repeat from the top until ecx reaches zero
    ret
init_fused_copy_fused ENDP


ALIGN 16
; init_unfused_copy_fused
; Per iteration: two separate 8-byte stores to [r8] and [r8 + 8], then one 16-byte
; reload from [r8] that spans both stores, which is written to [rdx].
; This is the variant with the outlier time in the accompanying results; the
; author attributes it to failed store-to-load forwarding.
; ecx holds the remaining iteration count; the loop branches back to the
; procedure entry.
; NOTE(review): ecx/rdx/r8 are presumably n/s2/s1 of the C++ prototype
; (Microsoft x64 calling convention) — confirm.
init_unfused_copy_fused PROC

    lea         rax, byte ptr [c]           ; rax = address of c
    mov         qword ptr[r8], rax          ; init: 8-byte store of the first half
    lea         rax, dword ptr [i]          ; rax = address of i
    mov         qword ptr[r8 + 8], rax      ; init: 8-byte store of the second half

    movups      xmm1,xmmword ptr [r8]       ; copy: one 16-byte reload covering both 8-byte stores
    movups      xmmword ptr [rdx], xmm1     ; write the copy to the destination

    dec ecx                                 ; one iteration done
    jnz init_unfused_copy_fused             ; repeat from the top until ecx reaches zero
    ret
init_unfused_copy_fused ENDP

ALIGN 16
; init_fused_copy_unfused
; Per iteration: one 16-byte store to [r8], then two 8-byte reloads from [r8] and
; [r8 + 8], each written to the matching offset of [rdx]. Each reload lies
; entirely inside the single preceding 16-byte store.
; ecx holds the remaining iteration count; the loop branches back to the
; procedure entry, so the load from s is repeated every iteration as well.
; NOTE(review): ecx/rdx/r8 are presumably n/s2/s1 of the C++ prototype
; (Microsoft x64 calling convention) — confirm.
init_fused_copy_unfused PROC
    movups      xmm0,xmmword ptr [s]        ; fetch the 16-byte source block
    movups      xmmword ptr [r8],xmm0       ; init: single 16-byte store
    
    mov         rax, qword ptr[r8]          ; copy: 8-byte reload of the first half
    mov         qword ptr[rdx], rax         ; write the first half to the destination
    mov         rax, qword ptr[r8 + 8]      ; copy: 8-byte reload of the second half
    mov         qword ptr[rdx +8], rax      ; write the second half to the destination
    
    dec ecx                                 ; one iteration done
    jnz init_fused_copy_unfused             ; repeat from the top until ecx reaches zero
    ret
init_fused_copy_unfused ENDP


ALIGN 16
; init_unfused_copy_unfused
; Per iteration: two 8-byte stores to [r8] and [r8 + 8], then two 8-byte reloads
; from the same two addresses, each written to the matching offset of [rdx].
; Every reload has the same address and width as one of the preceding stores.
; ecx holds the remaining iteration count; the loop branches back to the
; procedure entry.
; NOTE(review): ecx/rdx/r8 are presumably n/s2/s1 of the C++ prototype
; (Microsoft x64 calling convention) — confirm.
init_unfused_copy_unfused PROC

    lea         rax, byte ptr [c]           ; rax = address of c
    mov         qword ptr[r8], rax          ; init: 8-byte store of the first half
    lea         rax, dword ptr [i]          ; rax = address of i
    mov         qword ptr[r8 + 8], rax      ; init: 8-byte store of the second half

    mov         rax, qword ptr[r8]          ; copy: 8-byte reload of the first half
    mov         qword ptr[rdx], rax         ; write the first half to the destination
    mov         rax, qword ptr[r8 + 8]      ; copy: 8-byte reload of the second half
    mov         qword ptr[rdx +8], rax      ; write the second half to the destination

    dec ecx                                 ; one iteration done
    jnz init_unfused_copy_unfused           ; repeat from the top until ecx reaches zero
    ret
init_unfused_copy_unfused ENDP

END

init fused copy fused     0.664739s
init fused copy unfused   0.935631s
init unfused copy fused   4.34326s
init unfused copy unfused 1.02741s

CPU: Intel(R) Core(TM) i7-8750H CPU @ 2.20GHz 2.21 GHz

I interpret the results as follows:

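With fused init (a single 16-byte store), the forwarding never fails; the difference between fused and unfused copy is within the benchmark error.
With unfused init (two separate 8-byte stores), the fused copy (a single 16-byte load spanning both stores) causes a forwarding failure, resulting in a significant performance difference.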

What are the costs of failed store-to-load forwarding on x86?

1 Answer