how to write neon code for the following loop:
float sfx[64], delta = 9.9e-5;
for(int i = 0; i < 64; i++) {
if (sfx[i] < delta) {
abq[i] = 1.0/delta;
} else {
abq[i] = 1.0/sfx[i];
}
}
I try to use vbslq_f32, but I have to construct its parameter one by one. Why NEON doesn't provide a more convenient way to do the job? Is there any better way to do this?
float32x4_t vdelta = vdupq_n_f32((float)1.0/delta);
for(int i = 0; i < 64; i+=4) {
float32x4_t vsfx = vld1q_f32((const float32_t*)(sfx+i));
uint32x4_t vcon;
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,0)<delta), vcon, 0);
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,1)<delta), vcon, 1);
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,2)<delta), vcon, 2);
vcon = vsetq_lane_u32((vgetq_lane_f32(vsfx,3)<delta), vcon, 3);
float32x4_t vsfxdiv;
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,0)), vsfxdiv, 0);
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,1)), vsfxdiv, 1);
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,2)), vsfxdiv, 2);
vsfxdiv = vsetq_lane_f32((1.0/vgetq_lane_f32(vsfx,3)), vsfxdiv, 3);
float32x4_t vabq = vblsq_f32(vcon, vsfxdiv, vdelta);
vst1q_f32((abq+i), vabq);
}