I have a simple vector-vector addition algorithm (c = a + b * lambda) written in intel assembly, using AVX instructions. Here is my code:
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Dense to dense
;; Uses cache
;; AVX
;; Without tolerances
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
global _denseToDenseAddAVX_cache_64_linux
_denseToDenseAddAVX_cache_64_linux:
push rbp
mov rbp, rsp
; rdi: address1
; rsi: address2
; rdx: address3
; rcx: count
; xmm0: lambda
mov rax, rcx
shr rcx, 3
and rax, 0x07
vzeroupper
vmovupd ymm5, [abs_mask]
sub rsp, 8
vmovlpd [rbp - 8], xmm0
vbroadcastsd ymm7, [rbp - 8]
vmovapd ymm6, ymm7
cmp rcx, 0
je after_loop_denseToDenseAddAVX_cache_64_linux
start_denseToDenseAddAVX_cache_64_linux:
vmovapd ymm0, [rdi] ; a
vmovapd ymm1, ymm7
vmulpd ymm1, [rsi] ; b
vaddpd ymm0, ymm1 ; ymm0 = c = a + b * lambda
vmovapd [rdx], ymm0
vmovapd ymm2, [rdi + 32] ; a
vmovapd ymm3, ymm6
vmulpd ymm3, [rsi + 32] ; b
vaddpd ymm2, ymm3 ; ymm2 = c = a + b * lambda
vmovapd [rdx + 32], ymm2
add rdi, 64
add rsi, 64
add rdx, 64
loop start_denseToDenseAddAVX_cache_64_linux
after_loop_denseToDenseAddAVX_cache_64_linux:
cmp rax, 0
je end_denseToDenseAddAVX_cache_64_linux
mov rcx, rax
last_loop_denseToDenseAddAVX_cache_64_linux:
vmovlpd xmm0, [rdi] ; a
vmovapd xmm1, xmm7
vmulsd xmm1, [rsi] ; b
vaddsd xmm0, xmm1 ; xmm0 = c = a + b * lambda
vmovlpd [rdx], xmm0
add rdi, 8
add rsi, 8
add rdx, 8
loop last_loop_denseToDenseAddAVX_cache_64_linux
end_denseToDenseAddAVX_cache_64_linux:
mov rsp, rbp
pop rbp
ret
People often suggest me to use intel intrinsics because it is much better and safer. Now I've implemented this algorithm as this:
void denseToDenseAddAVX_cache(const double * __restrict__ a,
const double * __restrict__ b,
double * __restrict__ c,
size_t count, double lambda) {
const size_t firstCount = count / 8;
const size_t rem1 = count % 8;
int i;
__m256d mul = _mm256_broadcast_sd(&lambda);
for (i = 0; i < firstCount; i++) {
// c = a + b * lambda
__m256d dataA1 = _mm256_load_pd(&a[i * 8]);
__m256d dataC1 = _mm256_add_pd(dataA1, _mm256_mul_pd(_mm256_load_pd(&b[i * 8]), mul ));
_mm256_store_pd(&c[i * 8], dataC1);
__m256d dataA2 = _mm256_load_pd(&a[i * 8 + 4]);
__m256d dataC2 = _mm256_add_pd(dataA2, _mm256_mul_pd(_mm256_load_pd(&b[i * 8 + 4]), mul ));
_mm256_store_pd(&c[i * 8 + 4], dataC2);
}
const size_t secondCount = rem1 / 4;
const size_t rem2 = rem1 % 4;
if (secondCount) {
__m256d dataA = _mm256_load_pd(&a[i * 8]);
__m256d dataC = _mm256_add_pd(dataA, _mm256_mul_pd(_mm256_load_pd(&b[i * 8]), mul ));
_mm256_store_pd(&c[i * 8], dataC);
i += 4;
}
for (; i < count; i++) {
c[i] = a[i] + b[i] * lambda;
}
}
My problem is that the assembly version is two times faster than the second one. What is the problem with the c++ version?
-O3) your C++ code? - 1201ProgramAlarmvmovapd ymm1, ymm7is not needed, use 3-operand AVX instructions likevmulpd ymm1, ymm7, [rsi]. Plus you use the slow-on-Intelloopinstruction, bottlenecking this loop at 1 iteration (2 vectors) per 7 clock cycles. agner.org/optimize. I think even if a compiler didn't unroll, and used indexed addressing modes defeating micro-fusion and the port 7 store-AGU on Intel, it would still be at least as good as this. Like GCC does: godbolt.org/z/MHgtfa - Peter Cordesvmovlpdas a load unless you want to merges into the low element of an existing vector. You wantvmovsdto avoid a false dependency and extra ALU uop. - Peter Cordes