I wrote a 16x4 SAD (sum of absolute differences) function and an ARM NEON optimized version of it. The NEON version is written in inline assembly. My problem is that I am getting only a 2x speedup (with -O3 enabled), while ideally I should be getting at least a 6x speedup out of it. Can anyone please explain the internals of what is happening?
/*
 * Scalar reference: sum of absolute differences over a block that is
 * 16 bytes wide and 4 rows high.
 *
 * a, uiStrideOrg : first block and the byte distance between its rows.
 * b, uiStrideCur : second block and the byte distance between its rows.
 *
 * Returns the sum of |a - b| over all 64 byte positions of the block.
 */
static unsigned int f_sad_16x4 ( const unsigned char* a, const unsigned char* b, const unsigned int uiStrideOrg, const unsigned int uiStrideCur )
{
    unsigned int total = 0;

    for (int row = 0; row < 4; ++row)
    {
        /* Start of the current row in each block. */
        const unsigned char* rowOrg = a + row * uiStrideOrg;
        const unsigned char* rowCur = b + row * uiStrideCur;

        for (int col = 0; col < 16; ++col)
        {
            /* Both bytes promote to int, so the difference may be negative. */
            const int diff = rowOrg[col] - rowCur[col];
            total += (diff < 0) ? -diff : diff;
        }
    }

    return total;
}
/*
 * Sum of absolute differences over a block that is 16 bytes wide and
 * 4 rows high, using ARMv7 NEON inline assembly where available.
 *
 * a, uiStrideOrg : first block and the byte distance between its rows.
 * b, uiStrideCur : second block and the byte distance between its rows.
 *
 * Returns the sum of |a - b| over all 64 byte positions of the block.
 *
 * The assembly uses 32-bit ARM NEON syntax, so it is only compiled for
 * 32-bit ARM targets with NEON enabled; every other target gets a scalar
 * loop that produces the same value.
 */
static unsigned int f_sad_16x4_neon(const unsigned char* a, const unsigned char* b, const unsigned int uiStrideOrg, const unsigned int uiStrideCur )
{
    unsigned int uiSum = 0;

#if defined(__arm__) && (defined(__ARM_NEON) || defined(__ARM_NEON__))
    /*
     * Register plan:
     *   q0-q3   rows of a, overwritten in place by the byte-wise |a - b|
     *   q8-q11  rows of b, later reused as 16-bit partial sums
     *   q12-q15 pairwise-widened (u8 -> u16) absolute differences
     * q4-q7 are deliberately left alone: they are callee-saved under the
     * ARM procedure call standard, so clobbering them would make the
     * compiler save and restore them around every call.
     */
    __asm__ volatile(
        /* Load the 4 rows of each block, interleaved with the differences. */
        "vld1.8     {q0},  [%[piOrg]], %[iStrideOrg] \n\t"
        "vld1.8     {q8},  [%[piCur]], %[iStrideCur] \n\t"
        "vld1.8     {q1},  [%[piOrg]], %[iStrideOrg] \n\t"
        "vabd.u8    q0,  q0,  q8                     \n\t"
        "vld1.8     {q9},  [%[piCur]], %[iStrideCur] \n\t"
        "vld1.8     {q2},  [%[piOrg]], %[iStrideOrg] \n\t"
        "vabd.u8    q1,  q1,  q9                     \n\t"
        "vld1.8     {q10}, [%[piCur]], %[iStrideCur] \n\t"
        "vld1.8     {q3},  [%[piOrg]], %[iStrideOrg] \n\t"
        "vabd.u8    q2,  q2,  q10                    \n\t"
        "vld1.8     {q11}, [%[piCur]], %[iStrideCur] \n\t"
        /* Widen to 16 bits and add the four rows together. */
        "vpaddl.u8  q12, q0                          \n\t"
        "vabd.u8    q3,  q3,  q11                    \n\t"
        "vpaddl.u8  q13, q1                          \n\t"
        "vpaddl.u8  q14, q2                          \n\t"
        "vadd.u16   q8,  q12, q13                    \n\t"
        "vpaddl.u8  q15, q3                          \n\t"
        "vadd.u16   q9,  q14, q15                    \n\t"
        "vadd.u16   q0,  q8,  q9                     \n\t"
        /*
         * Horizontal reduction inside NEON instead of storing eight
         * halfwords to the stack and summing them with scalar loads.
         * Each u16 lane holds at most 4 * 510 = 2040 here, so nothing
         * overflows on the way to the final 32-bit value.
         */
        "vadd.u16   d0,  d0,  d1                     \n\t"
        "vpaddl.u16 d0,  d0                          \n\t"
        "vpaddl.u32 d0,  d0                          \n\t"
        "vmov.32    %[uiSum], d0[0]                  \n\t"
        : [piOrg] "+r" (a),
          [piCur] "+r" (b),
          /* Written only by the last instruction, after every input is consumed. */
          [uiSum] "=r" (uiSum)
        : [iStrideCur] "r" (uiStrideCur),
          [iStrideOrg] "r" (uiStrideOrg)
        /*
         * "memory": the asm reads the pixel rows through pointers, which
         * the compiler cannot see from the operand list alone.
         */
        : "q0", "q1", "q2", "q3",
          "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
          "memory"
    );
#else
    /* Portable fallback for targets without 32-bit ARM NEON. */
    for (int row = 0; row < 4; ++row)
    {
        const unsigned char* rowOrg = a + row * uiStrideOrg;
        const unsigned char* rowCur = b + row * uiStrideCur;

        for (int col = 0; col < 16; ++col)
        {
            const int diff = rowOrg[col] - rowCur[col];
            uiSum += (diff < 0) ? -diff : diff;
        }
    }
#endif

    return uiSum;
}