We can eliminate both loops and let the per-row computations run independently. Note that I'm issuing all the loads up front so that load latency doesn't create dependencies that stall the arithmetic.
#include <arm_neon.h>
#include <stdint.h>

unsigned int sadCalculator_Neon_not_basic(void* sDPointer, int source_stride,
                                          void* pDPointer, int pred_stride,
                                          int w_block, int h_block)
{
    uint8_t* sdPointer = (uint8_t*)sDPointer;
    uint8_t* pdPointer = (uint8_t*)pDPointer;

    // Only the 8x8 case (w_block == 8, h_block == 8) is handled here
    if (w_block == 8 && h_block == 8)
    {
        uint8x8_t sBlock_8_1, sBlock_8_2, sBlock_8_3, sBlock_8_4,
                  sBlock_8_5, sBlock_8_6, sBlock_8_7, sBlock_8_8;
        uint8x8_t pBlock_8_1, pBlock_8_2, pBlock_8_3, pBlock_8_4,
                  pBlock_8_5, pBlock_8_6, pBlock_8_7, pBlock_8_8;
        uint8x8_t res1, res2, res3, res4, res5, res6, res7, res8;
        unsigned int sad = 0;

        // Load all eight source rows up front
        sBlock_8_1 = vld1_u8(sdPointer);
        sBlock_8_2 = vld1_u8(sdPointer + (1 * source_stride));
        sBlock_8_3 = vld1_u8(sdPointer + (2 * source_stride));
        sBlock_8_4 = vld1_u8(sdPointer + (3 * source_stride));
        sBlock_8_5 = vld1_u8(sdPointer + (4 * source_stride));
        sBlock_8_6 = vld1_u8(sdPointer + (5 * source_stride));
        sBlock_8_7 = vld1_u8(sdPointer + (6 * source_stride));
        sBlock_8_8 = vld1_u8(sdPointer + (7 * source_stride));

        // Load all eight prediction rows
        pBlock_8_1 = vld1_u8(pdPointer);
        pBlock_8_2 = vld1_u8(pdPointer + (1 * pred_stride));
        pBlock_8_3 = vld1_u8(pdPointer + (2 * pred_stride));
        pBlock_8_4 = vld1_u8(pdPointer + (3 * pred_stride));
        pBlock_8_5 = vld1_u8(pdPointer + (4 * pred_stride));
        pBlock_8_6 = vld1_u8(pdPointer + (5 * pred_stride));
        pBlock_8_7 = vld1_u8(pdPointer + (6 * pred_stride));
        pBlock_8_8 = vld1_u8(pdPointer + (7 * pred_stride));

        // Row 1: per-lane absolute differences, then widening pairwise adds
        // down to a single 64-bit lane that is added to the scalar total
        res1 = vabd_u8(sBlock_8_1, pBlock_8_1);
        uint16x4_t res16 = vpaddl_u8(res1);
        uint32x2_t res32 = vpaddl_u16(res16);
        uint64x1_t res64 = vpaddl_u32(res32);
        sad += vget_lane_u64(res64, 0);

        // Row 2
        res2 = vabd_u8(sBlock_8_2, pBlock_8_2);
        res16 = vpaddl_u8(res2);
        res32 = vpaddl_u16(res16);
        res64 = vpaddl_u32(res32);
        sad += vget_lane_u64(res64, 0);

        // Row 3
        res3 = vabd_u8(sBlock_8_3, pBlock_8_3);
        res16 = vpaddl_u8(res3);
        res32 = vpaddl_u16(res16);
        res64 = vpaddl_u32(res32);
        sad += vget_lane_u64(res64, 0);

        // Row 4
        res4 = vabd_u8(sBlock_8_4, pBlock_8_4);
        res16 = vpaddl_u8(res4);
        res32 = vpaddl_u16(res16);
        res64 = vpaddl_u32(res32);
        sad += vget_lane_u64(res64, 0);

        // Row 5
        res5 = vabd_u8(sBlock_8_5, pBlock_8_5);
        res16 = vpaddl_u8(res5);
        res32 = vpaddl_u16(res16);
        res64 = vpaddl_u32(res32);
        sad += vget_lane_u64(res64, 0);

        // Row 6
        res6 = vabd_u8(sBlock_8_6, pBlock_8_6);
        res16 = vpaddl_u8(res6);
        res32 = vpaddl_u16(res16);
        res64 = vpaddl_u32(res32);
        sad += vget_lane_u64(res64, 0);

        // Row 7
        res7 = vabd_u8(sBlock_8_7, pBlock_8_7);
        res16 = vpaddl_u8(res7);
        res32 = vpaddl_u16(res16);
        res64 = vpaddl_u32(res32);
        sad += vget_lane_u64(res64, 0);

        // Row 8
        res8 = vabd_u8(sBlock_8_8, pBlock_8_8);
        res16 = vpaddl_u8(res8);
        res32 = vpaddl_u16(res16);
        res64 = vpaddl_u32(res32);
        sad += vget_lane_u64(res64, 0);

        return sad;
    }

    // Other block sizes are not implemented in this snippet
    return 0;
}
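For reference, a possible call site might look like the following; the src and pred buffers are hypothetical 8x8 blocks stored contiguously with a stride of 8 bytes, not part of the original code:

    uint8_t src[8 * 8];   /* hypothetical source block */
    uint8_t pred[8 * 8];  /* hypothetical prediction block */
    /* ... fill src and pred with pixel data ... */
    unsigned int sad = sadCalculator_Neon_not_basic(src, 8, pred, 8, 8, 8);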
Did you try a NEON-capable compiler, which might auto-vectorize for you? – unwind

There is uint16x8_t vabal_u8 (uint16x8_t, uint8x8_t, uint8x8_t) in arm_neon.h – user3528438
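Building on user3528438's pointer to vabal_u8, here is a minimal sketch of how the per-row reductions could collapse into a single widening absolute-difference accumulate with one horizontal sum at the end. The helper name sad8x8_neon_vabal and the fixed 8x8 block size are assumptions for illustration, not part of the original code:

    #include <arm_neon.h>
    #include <stdint.h>

    /* Hypothetical helper: SAD of one 8x8 block using vabal_u8 accumulation. */
    static unsigned int sad8x8_neon_vabal(const uint8_t* src, int source_stride,
                                          const uint8_t* pred, int pred_stride)
    {
        /* Running per-lane sums; each 16-bit lane holds at most 8 * 255 = 2040. */
        uint16x8_t acc = vdupq_n_u16(0);

        for (int row = 0; row < 8; ++row)
        {
            uint8x8_t s = vld1_u8(src + row * source_stride);
            uint8x8_t p = vld1_u8(pred + row * pred_stride);
            /* acc += |s - p|, widened to 16 bits per lane. */
            acc = vabal_u8(acc, s, p);
        }

        /* Horizontal reduction of the eight 16-bit partial sums. */
        uint32x4_t sum32 = vpaddlq_u16(acc);
        uint64x2_t sum64 = vpaddlq_u32(sum32);
        return (unsigned int)(vgetq_lane_u64(sum64, 0) + vgetq_lane_u64(sum64, 1));
    }

Because each 16-bit lane can accumulate at most 2040 for an 8x8 block, the accumulator cannot overflow, so the widening pairwise adds only have to run once instead of once per row.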