If you can either assume that vector<bool>
is using contiguous byte-sized elements for storage, or if you can consider using something like vector<uint8_t>
instead, then this example should give you a good starting point:
static size_t count_equal(const vector<uint8_t> &vec1, const vector<uint8_t> &vec2)
{
assert(vec1.size() == vec2.size()); // vectors must be same size
const size_t n = vec1.size();
const size_t max_block_size = 255 * 16; // max block size before possible overflow
__m128i vcount = _mm_setzero_si128();
size_t i, count = 0;
for (i = 0; i + 16 <= n; ) // for each block
{
size_t m = std::min(n, i + max_block_size);
for ( ; i + 16 <= m; i += 16) // for each vector in block
{
__m128i v1 = _mm_loadu_si128((__m128i *)&vec1[i]);
__m128i v2 = _mm_loadu_si128((__m128i *)&vec2[i]);
__m128i vcmp = _mm_cmpeq_epi8(v1, v2);
vcount = _mm_sub_epi8(vcount, vcmp);
}
vcount = _mm_sad_epu8(vcount, _mm_setzero_si128());
count += _mm_extract_epi16(vcount, 0) + _mm_extract_epi16(vcount, 4);
vcount = _mm_setzero_si128(); // update count from current block
}
vcount = _mm_sad_epu8(vcount, _mm_setzero_si128());
count += _mm_extract_epi16(vcount, 0) + _mm_extract_epi16(vcount, 4);
for ( ; i < n; ++i) // deal with any remaining partial vector
{
count += (vec1[i] == vec2[i]);
}
return count;
}
Note that this is using vector<uint8_t>
. If you really have to use vector<bool>
and can guarantee that the elements will always be contiguous and byte-sized then you'll just need to coerce the vector<bool>
into a const uint8_t *
or similar somehow.
Test harness:
#include <cassert>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <vector>
#include <emmintrin.h> // SSE2
using std::vector;
static size_t count_equal_ref(const vector<uint8_t> &vec1, const vector<uint8_t> &vec2)
{
assert(vec1.size() == vec2.size());
const size_t n = vec1.size();
size_t i, count = 0;
for (i = 0 ; i < n; ++i)
{
count += (vec1[i] == vec2[i]);
}
return count;
}
static size_t count_equal(const vector<uint8_t> &vec1, const vector<uint8_t> &vec2)
{
assert(vec1.size() == vec2.size()); // vectors must be same size
const size_t n = vec1.size();
const size_t max_block_size = 255 * 16; // max block size before possible overflow
__m128i vcount = _mm_setzero_si128();
size_t i, count = 0;
for (i = 0; i + 16 <= n; ) // for each block
{
size_t m = std::min(n, i + max_block_size);
for ( ; i + 16 <= m; i += 16) // for each vector in block
{
__m128i v1 = _mm_loadu_si128((__m128i *)&vec1[i]);
__m128i v2 = _mm_loadu_si128((__m128i *)&vec2[i]);
__m128i vcmp = _mm_cmpeq_epi8(v1, v2);
vcount = _mm_sub_epi8(vcount, vcmp);
}
vcount = _mm_sad_epu8(vcount, _mm_setzero_si128());
count += _mm_extract_epi16(vcount, 0) + _mm_extract_epi16(vcount, 4);
vcount = _mm_setzero_si128(); // update count from current block
}
vcount = _mm_sad_epu8(vcount, _mm_setzero_si128());
count += _mm_extract_epi16(vcount, 0) + _mm_extract_epi16(vcount, 4);
for ( ; i < n; ++i) // deal with any remaining partial vector
{
count += (vec1[i] == vec2[i]);
}
return count;
}
int main(int argc, char * argv[])
{
size_t n = 100;
if (argc > 1)
{
n = atoi(argv[1]);
}
vector<uint8_t> vec1(n);
vector<uint8_t> vec2(n);
srand((unsigned int)time(NULL));
for (size_t i = 0; i < n; ++i)
{
vec1[i] = rand() & 1;
vec2[i] = rand() & 1;
}
size_t n_ref = count_equal_ref(vec1, vec2);
size_t n_test = count_equal(vec1, vec2);
if (n_ref == n_test)
{
std::cout << "PASS" << std::endl;
}
else
{
std::cout << "FAIL: n_ref = " << n_ref << ", n_test = " << n_test << std::endl;
}
return 0;
}
Compile and run:
$ g++ -Wall -msse3 -O3 test.cpp && ./a.out
PASS
xor
and do apopcount()
? – EOFvector<bool>
has special properties and isn't the same as an array ofbool
objects. I'm not sure if you have taken this into account because I haven't used those instructions before. – Neil Kirkvector<bool>
may be bytes, or packed bits, or anything else that the implementer sees fit to use. If you want to do this portably then use e.g.vector<uint8_t>
, and then an SSE implementation will be relatively straightforward. – Paul Rvector<bool>
? And do you want to store each bool as a byte or could you use bits? – Z boson