I'm working on an RGBA32 buffer (8 bits per component), and I need to multiply each component by a constant, then sum the results of the multiplications, like so:
Result = r*x + g*y + b*z + a*w (the dot product of the two vectors rgba and xyzw)
I'm trying to use Intel SSE intrinsics to speed up the process, but I don't know how to do such a thing without shuffling the input.
Is there any way to do this, such as building a register containing {x,y,z,w,x,y,z,w,x,y,z,w,x,y,z,w} and performing an 8-bit multiply with saturation?
The final objective is to multiply the RGBA vector by the corresponding color conversion matrix:
[ 66  129   25  0]   [R]
[-38  -74  112  0] * [G]
[112  -94  -18  0]   [B]
[  0    0    0  0]   [A]
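One direction that matches the idea above (a minimal sketch, assuming SSSE3; the helper name dot_u_row and the interleaved-RGBA load are my assumptions): _mm_maddubs_epi16 multiplies unsigned 8-bit pixels by signed 8-bit coefficients and sums adjacent pairs, and a second _mm_madd_epi16 against ones finishes each 4-element dot product. Note that the Y-row coefficient 129 does not fit in a signed 8-bit lane, so the sketch uses the U row {-38, -74, 112, 0}:

#include <tmmintrin.h> // SSSE3: _mm_maddubs_epi16

// Hypothetical helper: dot product of 4 RGBA pixels (16 unsigned bytes)
// with the U-row coefficients {-38,-74,112,0} -> 4 signed 32-bit results.
static inline __m128i dot_u_row(const unsigned char* px)
{
    __m128i pixels = _mm_loadu_si128((const __m128i*)px); // rgba rgba rgba rgba
    __m128i coeffs = _mm_setr_epi8(-38,-74,112,0, -38,-74,112,0,
                                   -38,-74,112,0, -38,-74,112,0);
    // u8 * s8 -> s16, adjacent pairs summed with saturation: {r*x+g*y, b*z+a*w, ...}
    __m128i pairs = _mm_maddubs_epi16(pixels, coeffs);
    // Sum the remaining pairs: 8 x s16 -> 4 x s32 dot products
    return _mm_madd_epi16(pairs, _mm_set1_epi16(1));
}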
Thanks
Edit 1: here's the final function, which uses floating-point math for better color precision and converts an RGBA image to YUV444 using SSE. The function takes between 1.9 and 3.5 ms to convert a full HD image on an Intel i5-3570K, using only one thread (it's easy to thread this function, which could yield a significant performance improvement):
#include <smmintrin.h> // SSE4.1: _mm_packus_epi32 (also pulls in SSSE3's _mm_shuffle_epi8)

// Converts 16 RGBA pixels (64 bytes) to 16 Y, 16 U and 16 V samples (YUV444).
void SSE_rgba2YUV444_FP(char* a, char* y, char* u, char* v)
{
    // Shuffle mask: each uint8 gives the position (as a byte offset within
    // the __m128i being shuffled) of the source byte to place in that lane.
    __m128i mask = _mm_setr_epi8(0x00,0x04,0x08,0x0c, 0x01,0x05,0x09,0x0d, 0x02,0x06,0x0a,0x0e, 0x03,0x07,0x0b,0x0f);
    float m[9] = {0.299, 0.587, 0.114, -0.1687, -0.3313, 0.5, 0.5, -0.4187, -0.0813};
    __m128i row[4];
    for(int i=0; i<4; i++) {
        row[i] = _mm_loadu_si128((__m128i*)&a[16*i]);
        row[i] = _mm_shuffle_epi8(row[i], mask);
    }
    // row[i] = {rrrrggggbbbbaaaa}, all as uint8_t
    __m128i t0 = _mm_unpacklo_epi32(row[0], row[1]); // t0 = {rrrrrrrrgggggggg}
    __m128i t1 = _mm_unpacklo_epi32(row[2], row[3]); // t1 = {rrrrrrrrgggggggg}
    __m128i t2 = _mm_unpackhi_epi32(row[0], row[1]); // t2 = {bbbbbbbbaaaaaaaa}
    __m128i t3 = _mm_unpackhi_epi32(row[2], row[3]); // t3 = {bbbbbbbbaaaaaaaa}
    row[0] = _mm_unpacklo_epi64(t0, t1); // row[0] = {rrrrrrrrrrrrrrrr}
    row[1] = _mm_unpackhi_epi64(t0, t1); // etc.
    row[2] = _mm_unpacklo_epi64(t2, t3);
    __m128i v_lo[3], v_hi[3];
    for(int i=0; i<3; i++) {
        v_lo[i] = _mm_unpacklo_epi8(row[i], _mm_setzero_si128()); // Interleave each row with zeros, widening the
        v_hi[i] = _mm_unpackhi_epi8(row[i], _mm_setzero_si128()); // values from 8 to 16 bits so we can work on them
    }
    __m128 v32_lo1[3], v32_hi1[3], v32_lo2[3], v32_hi2[3];
    for(int i=0; i<3; i++) {
        v32_lo1[i] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_lo[i], _mm_setzero_si128()));
        v32_lo2[i] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_lo[i], _mm_setzero_si128()));
        v32_hi1[i] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_hi[i], _mm_setzero_si128()));
        v32_hi2[i] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_hi[i], _mm_setzero_si128()));
    } // The R, G and B values are now 32-bit floats
    __m128i yuv[3]; // {Y, U, V}
    // Y = 0.299*R + 0.587*G + 0.114*B
    __m128 ylo1 = _mm_add_ps(_mm_mul_ps(v32_lo1[0], _mm_set1_ps(m[0])), _mm_add_ps(_mm_mul_ps(v32_lo1[1], _mm_set1_ps(m[1])), _mm_mul_ps(v32_lo1[2], _mm_set1_ps(m[2]))));
    __m128 ylo2 = _mm_add_ps(_mm_mul_ps(v32_lo2[0], _mm_set1_ps(m[0])), _mm_add_ps(_mm_mul_ps(v32_lo2[1], _mm_set1_ps(m[1])), _mm_mul_ps(v32_lo2[2], _mm_set1_ps(m[2]))));
    __m128 yhi1 = _mm_add_ps(_mm_mul_ps(v32_hi1[0], _mm_set1_ps(m[0])), _mm_add_ps(_mm_mul_ps(v32_hi1[1], _mm_set1_ps(m[1])), _mm_mul_ps(v32_hi1[2], _mm_set1_ps(m[2]))));
    __m128 yhi2 = _mm_add_ps(_mm_mul_ps(v32_hi2[0], _mm_set1_ps(m[0])), _mm_add_ps(_mm_mul_ps(v32_hi2[1], _mm_set1_ps(m[1])), _mm_mul_ps(v32_hi2[2], _mm_set1_ps(m[2]))));
    // Round to int32, then pack back down to uint8 with unsigned saturation
    __m128i ylo1i = _mm_cvtps_epi32(ylo1);
    __m128i ylo2i = _mm_cvtps_epi32(ylo2);
    __m128i yhi1i = _mm_cvtps_epi32(yhi1);
    __m128i yhi2i = _mm_cvtps_epi32(yhi2);
    __m128i ylo = _mm_packus_epi32(ylo1i, ylo2i);
    __m128i yhi = _mm_packus_epi32(yhi1i, yhi2i);
    yuv[0] = _mm_packus_epi16(ylo, yhi);
    // U = -0.1687*R - 0.3313*G + 0.5*B + 128
    ylo1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_lo1[0], _mm_set1_ps(m[3])), _mm_add_ps(_mm_mul_ps(v32_lo1[1], _mm_set1_ps(m[4])), _mm_mul_ps(v32_lo1[2], _mm_set1_ps(m[5])))), _mm_set1_ps(128.0f));
    ylo2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_lo2[0], _mm_set1_ps(m[3])), _mm_add_ps(_mm_mul_ps(v32_lo2[1], _mm_set1_ps(m[4])), _mm_mul_ps(v32_lo2[2], _mm_set1_ps(m[5])))), _mm_set1_ps(128.0f));
    yhi1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_hi1[0], _mm_set1_ps(m[3])), _mm_add_ps(_mm_mul_ps(v32_hi1[1], _mm_set1_ps(m[4])), _mm_mul_ps(v32_hi1[2], _mm_set1_ps(m[5])))), _mm_set1_ps(128.0f));
    yhi2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_hi2[0], _mm_set1_ps(m[3])), _mm_add_ps(_mm_mul_ps(v32_hi2[1], _mm_set1_ps(m[4])), _mm_mul_ps(v32_hi2[2], _mm_set1_ps(m[5])))), _mm_set1_ps(128.0f));
    ylo1i = _mm_cvtps_epi32(ylo1);
    ylo2i = _mm_cvtps_epi32(ylo2);
    yhi1i = _mm_cvtps_epi32(yhi1);
    yhi2i = _mm_cvtps_epi32(yhi2);
    ylo = _mm_packus_epi32(ylo1i, ylo2i);
    yhi = _mm_packus_epi32(yhi1i, yhi2i);
    yuv[1] = _mm_packus_epi16(ylo, yhi);
    // V = 0.5*R - 0.4187*G - 0.0813*B + 128
    ylo1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_lo1[0], _mm_set1_ps(m[6])), _mm_add_ps(_mm_mul_ps(v32_lo1[1], _mm_set1_ps(m[7])), _mm_mul_ps(v32_lo1[2], _mm_set1_ps(m[8])))), _mm_set1_ps(128.0f));
    ylo2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_lo2[0], _mm_set1_ps(m[6])), _mm_add_ps(_mm_mul_ps(v32_lo2[1], _mm_set1_ps(m[7])), _mm_mul_ps(v32_lo2[2], _mm_set1_ps(m[8])))), _mm_set1_ps(128.0f));
    yhi1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_hi1[0], _mm_set1_ps(m[6])), _mm_add_ps(_mm_mul_ps(v32_hi1[1], _mm_set1_ps(m[7])), _mm_mul_ps(v32_hi1[2], _mm_set1_ps(m[8])))), _mm_set1_ps(128.0f));
    yhi2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v32_hi2[0], _mm_set1_ps(m[6])), _mm_add_ps(_mm_mul_ps(v32_hi2[1], _mm_set1_ps(m[7])), _mm_mul_ps(v32_hi2[2], _mm_set1_ps(m[8])))), _mm_set1_ps(128.0f));
    ylo1i = _mm_cvtps_epi32(ylo1);
    ylo2i = _mm_cvtps_epi32(ylo2);
    yhi1i = _mm_cvtps_epi32(yhi1);
    yhi2i = _mm_cvtps_epi32(yhi2);
    ylo = _mm_packus_epi32(ylo1i, ylo2i);
    yhi = _mm_packus_epi32(yhi1i, yhi2i);
    yuv[2] = _mm_packus_epi16(ylo, yhi);
    _mm_storeu_si128((__m128i*)y, yuv[0]);
    _mm_storeu_si128((__m128i*)u, yuv[1]);
    _mm_storeu_si128((__m128i*)v, yuv[2]);
}
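For reference, a minimal driver loop over a full image (a sketch; the function name, the planar output buffers and the assumption of a tightly packed RGBA input are mine, not part of the original). 1920*1080 is a multiple of 16, so no scalar tail is needed for full HD:

// Hypothetical driver: processes the image 16 pixels at a time.
void rgba2YUV444_image(char* rgba, char* y, char* u, char* v,
                       int width, int height)
{
    int n = width * height; // assumed to be a multiple of 16
    for (int i = 0; i < n; i += 16)
        SSE_rgba2YUV444_FP(rgba + 4 * i, y + i, u + i, v + i);
}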
Comments:

…_mm_maddubs_epi16. – Z boson

…u and v as well. But I think it's hard to beat _mm_maddubs_epi16: you get the multiplication, one of the two additions and the 8-bit to 16-bit conversion for the price of one. I have not checked the instruction tables; I know that _mm_hadd_epi32 should be avoided, but in this case I think the horizontal operators are the best solution. – Z boson

…packs instead of packus, and the third was even more subtle: subtracting over 128 was causing overflow beyond -32768. Using 64 instead fixed this (and I proved it always gets it right). If you want to know more, see my edits. – Z boson
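To illustrate the packs/packus distinction from that last comment: _mm_packs_epi16 saturates to signed 8-bit ([-128, 127]) while _mm_packus_epi16 saturates to unsigned 8-bit ([0, 255]), so packing pixel values with packs silently clamps anything above 127. A small standalone check (the value 200 is made up for illustration):

#include <emmintrin.h> // SSE2
#include <stdio.h>

int main(void)
{
    __m128i v = _mm_set1_epi16(200); // 200 fits in uint8 but not int8
    unsigned char out[16];
    _mm_storeu_si128((__m128i*)out, _mm_packs_epi16(v, v));
    printf("packs:  %u\n", out[0]);  // 127: signed saturation clamps
    _mm_storeu_si128((__m128i*)out, _mm_packus_epi16(v, v));
    printf("packus: %u\n", out[0]);  // 200: unsigned saturation keeps it
    return 0;
}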