Trying to XOR a huge uint32_t array, I decided to use the NEON coprocessor.
I implemented two C versions:
version 1:
/*
 * XOR-reduce `size` 32-bit words starting at `array`, using 64-bit
 * (two-lane) NEON vectors.
 *
 * array: pointer to the first element; must reference at least `size` words.
 * size:  number of elements to fold. Any value is accepted; zero or a
 *        negative count yields 0.
 *
 * Returns the XOR of array[0] .. array[size - 1].
 */
uint32_t xor_array_ver_1(uint32_t *array, int size)
{
    uint32x2_t acc = vmov_n_u32(0);
    uint32_t acc1 = 0;

    /* Vector body: only run while a full pair of elements remains, so an
     * odd (or negative) size can never step past zero and read beyond the
     * end of the array. */
    for (; size >= 2; size -= 2) {
        acc = veor_u32(acc, vld1_u32(array));
        array += 2;
    }

    /* Scalar tail: at most one element is left when size is odd. */
    if (size == 1) {
        acc1 = array[0];
    }

    /* Horizontal reduction of the two accumulator lanes. */
    acc1 ^= vget_lane_u32(acc, 0) ^ vget_lane_u32(acc, 1);
    return acc1;
}
version 2:
/*
 * XOR-reduce `size` 32-bit words starting at `array`, using 128-bit
 * (four-lane) NEON vectors.
 *
 * array: pointer to the first element; must reference at least `size` words.
 * size:  number of elements to fold. Any value is accepted; zero or a
 *        negative count yields 0.
 *
 * Returns the XOR of array[0] .. array[size - 1].
 */
uint32_t xor_array_ver_2(uint32_t *array, int size)
{
    uint32x4_t acc = vmovq_n_u32(0);
    uint32_t acc1 = 0;

    /* Vector body: only run while a full group of four elements remains, so
     * a size that is not a multiple of 4 can never step past zero and read
     * beyond the end of the array. */
    for (; size >= 4; size -= 4) {
        acc = veorq_u32(acc, vld1q_u32(array));
        array += 4;
    }

    /* Scalar tail: fold the 0-3 leftover elements one at a time. */
    for (; size > 0; size--) {
        acc1 ^= *array++;
    }

    /* Horizontal reduction of the four accumulator lanes. */
    acc1 ^= vgetq_lane_u32(acc, 0);
    acc1 ^= vgetq_lane_u32(acc, 1);
    acc1 ^= vgetq_lane_u32(acc, 2);
    acc1 ^= vgetq_lane_u32(acc, 3);
    return acc1;
}
Comparing the above 2 versions to the traditional xor implementation:
for (i=0; i<arr_size; i++)
val ^= my_array[i];
I observed 2 issues:
- Version 1 has the same performance.
- Version 2 is a bit more than 30% better.
- Can I rewrite it to be even better? Here my_array is declared as uint32_t my_array[BIG_LENGTH];
- Is there a non-NEON way to improve the performance of the regular XOR code? Unrolling the loop doesn't give any improvement.