I am trying to write very efficient Hamming-distance code. Inspired by Wojciech Muła's extremely clever SSE3 popcount implementation, I coded an AVX2 equivalent solution, this time using 256 bit registers. l was expecting at least a 30%-40% improvement based on the doubled parallelism of the involved operations, however to my surprise, the AVX2 code is a tad slower (around 2%)!
Can someone enlighten me of the possible reasons why I'm not obtaining the expected performance boost?
Unrolled, SSE3 Hamming distance of two 64-byte blocks:
INT32 SSE_PopCount(const UINT32* __restrict pA, const UINT32* __restrict pB) {
__m128i paccum = _mm_setzero_si128();
__m128i a = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA));
__m128i b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB));
__m128i err = _mm_xor_si128 (a, b);
__m128i lo = _mm_and_si128 (err, low_mask);
__m128i hi = _mm_srli_epi16 (err, 4);
hi = _mm_and_si128 (hi, low_mask);
__m128i popcnt1 = _mm_shuffle_epi8(lookup, lo);
__m128i popcnt2 = _mm_shuffle_epi8(lookup, hi);
paccum = _mm_add_epi8(paccum, popcnt1);
paccum = _mm_add_epi8(paccum, popcnt2);
a = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA + 4));
b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB + 4));
err = _mm_xor_si128 (a, b);
lo = _mm_and_si128 (err, low_mask);
hi = _mm_srli_epi16 (err, 4);
hi = _mm_and_si128 (hi, low_mask);
popcnt1 = _mm_shuffle_epi8(lookup, lo);
popcnt2 = _mm_shuffle_epi8(lookup, hi);
paccum = _mm_add_epi8(paccum, popcnt1);
paccum = _mm_add_epi8(paccum, popcnt2);
a = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA + 8));
b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB + 8));
err = _mm_xor_si128 (a, b);
lo = _mm_and_si128 (err, low_mask);
hi = _mm_srli_epi16 (err, 4);
hi = _mm_and_si128 (hi, low_mask);
popcnt1 = _mm_shuffle_epi8(lookup, lo);
popcnt2 = _mm_shuffle_epi8(lookup, hi);
paccum = _mm_add_epi8(paccum, popcnt1);
paccum = _mm_add_epi8(paccum, popcnt2);
a = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA + 12));
b = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB + 12));
err = _mm_xor_si128 (a, b);
lo = _mm_and_si128 (err, low_mask);
hi = _mm_srli_epi16 (err, 4);
hi = _mm_and_si128 (hi, low_mask);
popcnt1 = _mm_shuffle_epi8(lookup, lo);
popcnt2 = _mm_shuffle_epi8(lookup, hi);
paccum = _mm_add_epi8(paccum, popcnt1);
paccum = _mm_add_epi8(paccum, popcnt2);
paccum = _mm_sad_epu8(paccum, _mm_setzero_si128());
UINT64 result = paccum.m128i_u64[0] + paccum.m128i_u64[1];
return (INT32)result;
}
Ununrolled, equivalent version using AVX's 256-bit registers:
INT32 AVX_PopCount(const UINT32* __restrict pA, const UINT32* __restrict pB) {
__m256i paccum = _mm256_setzero_si256();
__m256i a = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pA));
__m256i b = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pB));
__m256i err = _mm256_xor_si256 (a, b);
__m256i lo = _mm256_and_si256 (err, low_mask256);
__m256i hi = _mm256_srli_epi16 (err, 4);
hi = _mm256_and_si256 (hi, low_mask256);
__m256i popcnt1 = _mm256_shuffle_epi8(lookup256, lo);
__m256i popcnt2 = _mm256_shuffle_epi8(lookup256, hi);
paccum = _mm256_add_epi8(paccum, popcnt1);
paccum = _mm256_add_epi8(paccum, popcnt2);
a = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pA + 8));
b = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pB + 8));
err = _mm256_xor_si256 (a, b);
lo = _mm256_and_si256 (err, low_mask256);
hi = _mm256_srli_epi16 (err, 4);
hi = _mm256_and_si256 (hi, low_mask256);
popcnt1 = _mm256_shuffle_epi8(lookup256, lo);
popcnt2 = _mm256_shuffle_epi8(lookup256, hi);
paccum = _mm256_add_epi8(paccum, popcnt1);
paccum = _mm256_add_epi8(paccum, popcnt2);
paccum = _mm256_sad_epu8(paccum, _mm256_setzero_si256());
UINT64 result = paccum.m256i_i64[0] + paccum.m256i_u64[1] + paccum.m256i_i64[2] + paccum.m256i_i64[3];
return (INT32)result;
}
I already verified the output assembly code emitted by the compiler and it looks good, with the expected direct translation of the intrinsic instruction to machine instruction. The only thing I noticed is that on the AVX2 version, the last line where the population count of the 4 quad-words is accumulated, it generates more complex code than the SSE3 version (where only 2 quad-words need to be accumulated to obtain the population count), however I would still expect faster throughput.
AVX2 code generated for quad-word accumulation
vextractf128 xmm0, ymm2, 1
psrldq xmm0, 8
movd ecx, xmm2
movd eax, xmm0
vextractf128 xmm0, ymm2, 1
psrldq xmm2, 8
add eax, ecx
movd ecx, xmm0
add eax, ecx
movd ecx, xmm2
add eax, ecx
SSE3 code generated for quad-word accumulation
movd ecx, xmm2
psrldq xmm2, 8
movd eax, xmm2
add eax, ecx
My test program is calling 1 million times each routine, with different input values, but reusing two static buffers for holding the data of the pA
and pB
parameters. In my limited understanding of CPU architecture, this locality (reusing the same memory buffers over and over) should warm up the CPU caches nicely and not be tied by a memory bandwidth problem, but other than possibly memory bandwidth, I can't understand why there is no performance improvement.
Test routine
int _tmain(int argc, _TCHAR* argv[]) {
lookup = _mm_setr_epi8(
/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
);
low_mask = _mm_set1_epi8(0xf);
lookup256 = _mm256_setr_epi8(
/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,
/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
/* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
/* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
/* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
);
low_mask256 = _mm256_set1_epi8(0xf);
std::default_random_engine generator;
generator.seed(37);
std::uniform_int_distribution<UINT32> distribution(0, ULONG_MAX);
auto dice = std::bind( distribution, generator);
UINT32 a[16];
UINT32 b[16];
int count;
count = 0;
{
cout << "AVX PopCount\r\n";
boost::timer::auto_cpu_timer t;
for( int i = 0; i < 1000000; i++ ) {
for( int j = 0; j < 16; j++ ) {
a[j] = dice();
b[j] = dice();
}
count+= AVX_PopCount(a, b);
}
}
cout << count << "\r\n";
std::default_random_engine generator2;
generator2.seed(37);
std::uniform_int_distribution<UINT32> distribution2(0, ULONG_MAX);
auto dice2 = std::bind( distribution2, generator2);
count = 0;
{
cout << "SSE PopCount\r\n";
boost::timer::auto_cpu_timer t;
for( int i = 0; i < 1000000; i++ ) {
for( int j = 0; j < 16; j++ ) {
a[j] = dice2();
b[j] = dice2();
}
count+= SSE_PopCount(a, b);
}
}
cout << count << "\r\n";
getch();
return 0;
}
Test machine is an Intel Corei7 4790, and I'm using Visual Studio 2012 Pro.
vzeroupper
seems missing if you are otherwise using vanilla SSE (non-VEX) for the rest of your math. software.intel.com/sites/default/files/m/d/4/1/d/8/… Alternatively, make sure you are compiling your entire application to use AVX versions of all FPU instructions (VEX encoded SSE)/arch:AVX
, or-mavx
, etc... whatever your compiler is, and VEX intrinsics for any other SSE routines you have written. – J...m256i_i64
andm256i_u64
(last line)? This seems at first glance incongruous with your SSE code... – J...