I'm trying to optimize an image format conversion on iOS using the NEON vector instruction set. I assumed this would map well to that because it processes a bunch of similar data.
My attempts haven't gone that well, though, achieving only a marginal speedup vs the naive c implementation:
for(int i = 0; i < pixelCount; ++i, ++inPixel32) {
const unsigned int r = ((*inPixel32 >> 0 ) & 0xFF);
const unsigned int g = ((*inPixel32 >> 8 ) & 0xFF);
const unsigned int b = ((*inPixel32 >> 16) & 0xFF);
*outPixel16++ = ((r >> 3) << 11) | ((g >> 2) << 5) | ((b >> 3) << 0);
}
1 megapixel image array on iPad 2:
format is [min avg max n=number of timer samples] in milliseconds
C: [14.446 14.632 18.405 n=1000]ms
NEON: [11.920 12.032 15.336 n=1000]ms
My attempt at a NEON implementation is below:
int i;
const int pixelsPerLoop = 8;
for(i = 0; i < pixelCount; i += pixelsPerLoop, inPixel32 += pixelsPerLoop, outPixel16 += pixelsPerLoop) {
//Read all r,g,b pixels into 3 registers
uint8x8x4_t rgba = vld4_u8(inPixel32);
//Right-shift r,g,b as appropriate
uint8x8_t r = vshr_n_u8(rgba.val[0], 3);
uint8x8_t g = vshr_n_u8(rgba.val[1], 2);
uint8x8_t b = vshr_n_u8(rgba.val[2], 3);
//Widen b
uint16x8_t r5_g6_b5 = vmovl_u8(b);
//Widen r
uint16x8_t r16 = vmovl_u8(r);
//Left shift into position within 16-bit int
r16 = vshlq_n_u16(r16, 11);
r5_g6_b5 |= r16;
//Widen g
uint16x8_t g16 = vmovl_u8(g);
//Left shift into position within 16-bit int
g16 = vshlq_n_u16(g16, 5);
r5_g6_b5 |= g16;
//Now write back to memory
vst1q_u16(outPixel16, r5_g6_b5);
}
//Do the remainder on normal flt hardware
Code was compiled via LLVM 3.0 into the following (.loc and extra labels removed):
_DNConvert_ARGB8888toRGB565:
push {r4, r5, r7, lr}
mov r9, r1
mov.w r12, #0
add r7, sp, #8
cmp r2, #0
mov.w r1, #0
it ne
movne r1, #1
cmp r0, #0
mov.w r3, #0
it ne
movne r3, #1
cmp.w r9, #0
mov.w r4, #0
it ne
movne r4, #1
tst.w r9, #3
bne LBB0_8
ands r1, r3
ands r1, r4
cmp r1, #1
bne LBB0_8
movs r1, #0
lsr.w lr, r9, #2
cmp.w r1, r9, lsr #2
bne LBB0_9
mov r3, r2
mov r5, r0
b LBB0_5
LBB0_4:
movw r1, #65528
add.w r0, lr, #7
movt r1, #32767
ands r1, r0
LBB0_5:
mov.w r12, #1
cmp r1, lr
bhs LBB0_8
rsb r0, r1, r9, lsr #2
mov.w r9, #63488
mov.w lr, #2016
mov.w r12, #1
LBB0_7:
ldr r2, [r5], #4
subs r0, #1
and.w r1, r9, r2, lsl #8
and.w r4, lr, r2, lsr #5
ubfx r2, r2, #19, #5
orr.w r2, r2, r4
orr.w r1, r1, r2
strh r1, [r3], #2
bne LBB0_7
LBB0_8:
mov r0, r12
pop {r4, r5, r7, pc}
LBB0_9:
sub.w r1, lr, #1
movs r3, #32
add.w r3, r3, r1, lsl #2
bic r3, r3, #31
adds r5, r0, r3
movs r3, #16
add.w r1, r3, r1, lsl #1
bic r1, r1, #15
adds r3, r2, r1
movs r1, #0
LBB0_10:
vld4.8 {d16, d17, d18, d19}, [r0]!
adds r1, #8
cmp r1, lr
vshr.u8 d20, d16, #3
vshr.u8 d21, d17, #2
vshr.u8 d16, d18, #3
vmovl.u8 q11, d20
vmovl.u8 q9, d21
vmovl.u8 q8, d16
vshl.i16 q10, q11, #11
vshl.i16 q9, q9, #5
vorr q8, q8, q10
vorr q8, q8, q9
vst1.16 {d16, d17}, [r2]!
Ltmp28:
blo LBB0_10
b LBB0_4
Full code is available at https://github.com/darknoon/DNImageConvert I would appreciate any help, thanks!