0
votes

I'm fairly new to ARM and NEON programming. I have been given a task to optimize a SAD (Sum of Absolute Differences) function. I don't know where to start; I have tried several ways of producing NEON code without succeeding. The sequential function looks something like this:

/*
 * Scalar reference SAD (Sum of Absolute Differences) over an 8x8 block.
 *
 * block1, block2: base pointers of the two pixel blocks (8-bit samples).
 * stride:         distance in bytes between consecutive rows of each block.
 * result:         out-parameter; receives the total SAD.
 *
 * Accumulate into a local instead of writing through *result on every
 * iteration: avoids 64 memory stores and any aliasing stall if *result
 * overlaps the pixel data.
 */
void sad_block_8x8(uint8_t *block1, uint8_t *block2, int stride, int *result)
{
    int u, v;
    int sum = 0;

    for (v = 0; v < 8; ++v)
    {
        /* Hoist the row offset out of the inner loop. */
        const uint8_t *row1 = &block1[v * stride];
        const uint8_t *row2 = &block2[v * stride];

        for (u = 0; u < 8; ++u)
        {
            sum += abs(row2[u] - row1[u]);
        }
    }

    *result = sum;   /* single store at the end */
}

So my problem is:

  1. How do I load registers for every iteration
  2. how do I do computations and store it to the variable result

Any help will be very much appreciated!

Okay... so my first try was something like this (which works, but I know it is extremely bad NEON code):

/*
 * NEON SAD over an 8x8 block: per-row absolute differences via vabd_u8,
 * then a scalar horizontal sum of the 8 difference bytes.
 *
 * Fixes over the original attempt:
 *  - malloc(v*u*...) read v and u UNINITIALIZED (undefined behavior),
 *    leaked the allocation, and vst1_u8 wrote 8 bytes past its end.
 *    Replaced with a fixed 8-byte stack scratch buffer.
 *  - The loads and vabd_u8 were repeated 8 times per row inside the
 *    inner loop; they are row-invariant, so do them once per row.
 */
void sad_block_8x8_2(uint8_t *block1, uint8_t *block2, int stride, int *result)
{
    int u, v;
    uint8x8_t m_1, m_2, m_o;
    uint8_t row[8];          /* scratch for one row of |differences| */

    *result = 0;
    for (v = 0; v < 8; ++v)
    {
        /* One vector load per operand per row (not per pixel). */
        m_1 = vld1_u8(&block1[v * stride]);
        m_2 = vld1_u8(&block2[v * stride]);

        m_o = vabd_u8(m_2, m_1);     /* lane-wise |block2 - block1| */
        vst1_u8(row, m_o);           /* spill to scalar scratch */

        for (u = 0; u < 8; ++u)
        {
            *result += (int)row[u];
        }
    }
}

any help please?

2
Input data blocks should be const. Did you try a NEON-capable compiler, which might auto-vectorize for you? — unwind
To reduce 8x8 to 8x1 you can use `uint16x8_t vabal_u8(uint16x8_t, uint8x8_t, uint8x8_t)` in arm_neon.h — user3528438

2 Answers

0
votes

This is a little better and clearer implementation of SAD algorithm you wanted:

/*
 * NEON SAD over an 8x8 block.
 *
 * Improvements over the previous version:
 *  - res[0] + res[1] + ... subscripted a NEON vector directly, which is
 *    a GCC vector extension, not a portable intrinsic, and forced eight
 *    scalar lane extracts per row. Instead, accumulate the widened
 *    absolute differences with vabal_u8 (one instruction per row) and
 *    do a single widening pairwise reduction at the end.
 *  - Each u16 accumulator lane sums at most 8 * 255 = 2040, so the
 *    16-bit accumulator cannot overflow for an 8x8 block.
 */
void neon_sad_block_8x8(uint8_t *__restrict block1, uint8_t * __restrict block2, int stride, int *__restrict result)
{
        int i;
        uint16x8_t acc = vdupq_n_u16(0);

        for (i = 0; i < 8; i++) {
            uint8x8_t neon_block1 = vld1_u8(&block1[i * stride]);
            uint8x8_t neon_block2 = vld1_u8(&block2[i * stride]);
            /* acc += |block2 - block1|, widened to 16 bits per lane */
            acc = vabal_u8(acc, neon_block2, neon_block1);
        }

        /* Horizontal reduction: u16x8 -> u32x4 -> u64x2 -> scalar. */
        uint32x4_t s32 = vpaddlq_u16(acc);
        uint64x2_t s64 = vpaddlq_u32(s32);
        *result = (int)(vgetq_lane_u64(s64, 0) + vgetq_lane_u64(s64, 1));
}

This code has:

  • Only one loop
  • There are no break statements in the loop
  • The pointers are guarded by __restrict
0
votes

We can remove both loops and make the rows execute in parallel. Note that I'm doing all the loads at once to hide load latency and remove dependencies.

/*
 * SAD between a source block and a prediction block.
 *
 * sDPointer / pDPointer:        base pointers to 8-bit pixel data.
 * source_stride / pred_stride:  row pitch of each block, in bytes.
 * w_block / h_block:            block dimensions in pixels.
 *
 * Returns the sum of absolute differences.
 *
 * Fixes over the original:
 *  - The original fell off the end of a non-void function whenever
 *    w_block != 8 or h_block != 8 — undefined behavior. A scalar
 *    fallback now handles every block size.
 *  - ~100 lines of copy-pasted per-row code (with rows 6 and 7 loaded
 *    out of order) collapse into one loop; the compiler is free to
 *    unroll and schedule the loads itself.
 *  - Per-row vpaddl_u8/u16/u32 chains are replaced by a single vabal_u8
 *    accumulation (one instruction per row) and one reduction at the
 *    end. Each u16 lane sums at most 8 * 255 = 2040 — no overflow.
 */
unsigned int sadCalculator_Neon_not_basic(void* sDPointer, int source_stride, void*
pDPointer, int pred_stride, int w_block, int h_block)
{
    uint8_t* sdPointer = (uint8_t*)sDPointer;
    uint8_t* pdPointer = (uint8_t*)pDPointer;
    int i, j;

    /* Fast path: 8x8 block, fully vectorized. */
    if (w_block == 8 && h_block == 8)
    {
        uint16x8_t acc = vdupq_n_u16(0);

        for (i = 0; i < 8; i++)
        {
            uint8x8_t s = vld1_u8(sdPointer + i * source_stride);
            uint8x8_t p = vld1_u8(pdPointer + i * pred_stride);
            acc = vabal_u8(acc, s, p);   /* acc += |s - p|, widened */
        }

        /* Horizontal reduction: u16x8 -> u32x4 -> u64x2 -> scalar. */
        uint32x4_t s32 = vpaddlq_u16(acc);
        uint64x2_t s64 = vpaddlq_u32(s32);
        return (unsigned int)(vgetq_lane_u64(s64, 0) + vgetq_lane_u64(s64, 1));
    }

    /* Scalar fallback for all other block sizes (the original returned
     * nothing here, which is undefined behavior). */
    {
        unsigned int sad = 0;
        for (i = 0; i < h_block; i++)
        {
            for (j = 0; j < w_block; j++)
            {
                int d = (int)sdPointer[i * source_stride + j]
                      - (int)pdPointer[i * pred_stride + j];
                sad += (unsigned int)(d < 0 ? -d : d);
            }
        }
        return sad;
    }
}