SSE3 is awesome, but for those who can't use it for whatever reason, here's the conversion in x86 assembler, hand-optimized by yours truly. For completeness, I give the conversion in both directions: RGB32->RGB24 and RGB24->RGB32.
Note that interjay's C code leaves garbage in the most significant byte (the alpha channel) of the destination pixels. This might not matter in some applications, but it matters in mine, so my RGB24->RGB32 code forces the MSB to zero. Similarly, my RGB32->RGB24 code ignores the MSB, which avoids garbage output when the source data has a non-zero alpha channel. Benchmarks verified that these features cost almost nothing in performance.
For RGB32->RGB24 I was able to beat the VC++ optimizer by about 20%. For RGB24->RGB32 the gain was insignificant. Benchmarking was done on an i5 2500K. I omit the benchmarking code here, but if anyone wants it I'll provide it. The most important optimization was bumping the source pointer as soon as possible (see the ASAP comment). My best guess is that this increases parallelism by allowing the instruction pipeline to prefetch sooner. Other than that I just reordered some instructions to reduce dependencies and overlap memory accesses with bit-bashing.
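In the meantime, a bare-bones harness along these lines is enough to compare the two paths (this is only a sketch, not my exact benchmark code; TimeConv is an illustrative name):

#include <windows.h>  // QueryPerformanceCounter, UINT

// Sketch of a timing harness (illustrative, not the exact code used
// for the numbers above). Returns seconds for Reps conversions.
double TimeConv(void (*Conv)(const UINT *, UINT *, UINT),
                const UINT *Src, UINT *Dst, UINT Pixels, int Reps)
{
    LARGE_INTEGER freq, t0, t1;
    QueryPerformanceFrequency(&freq);
    QueryPerformanceCounter(&t0);
    for (int i = 0; i < Reps; i++)
        Conv(Src, Dst, Pixels);
    QueryPerformanceCounter(&t1);
    return double(t1.QuadPart - t0.QuadPart) / double(freq.QuadPart);
}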
void ConvRGB32ToRGB24(const UINT *Src, UINT *Dst, UINT Pixels)
{
#if !USE_ASM
    for (UINT i = 0; i < Pixels; i += 4) {
        UINT sa = Src[i + 0] & 0xffffff;
        UINT sb = Src[i + 1] & 0xffffff;
        UINT sc = Src[i + 2] & 0xffffff;
        UINT sd = Src[i + 3];
        Dst[0] = sa | (sb << 24);
        Dst[1] = (sb >> 8) | (sc << 16);
        Dst[2] = (sc >> 16) | (sd << 8);
        Dst += 3;
    }
#else
    __asm {
        mov     ecx, Pixels
        shr     ecx, 2                  // 4 pixels at once
        jz      ConvRGB32ToRGB24_$2
        mov     esi, Src
        mov     edi, Dst
ConvRGB32ToRGB24_$1:
        mov     ebx, [esi + 4]          // sb
        and     ebx, 0ffffffh           // sb & 0xffffff
        mov     eax, [esi + 0]          // sa
        and     eax, 0ffffffh           // sa & 0xffffff
        mov     edx, ebx                // copy sb
        shl     ebx, 24                 // sb << 24
        or      eax, ebx                // sa | (sb << 24)
        mov     [edi + 0], eax          // Dst[0]
        shr     edx, 8                  // sb >> 8
        mov     eax, [esi + 8]          // sc
        and     eax, 0ffffffh           // sc & 0xffffff
        mov     ebx, eax                // copy sc
        shl     eax, 16                 // sc << 16
        or      eax, edx                // (sb >> 8) | (sc << 16)
        mov     [edi + 4], eax          // Dst[1]
        shr     ebx, 16                 // sc >> 16
        mov     eax, [esi + 12]         // sd
        add     esi, 16                 // Src += 4 (ASAP)
        shl     eax, 8                  // sd << 8
        or      eax, ebx                // (sc >> 16) | (sd << 8)
        mov     [edi + 8], eax          // Dst[2]
        add     edi, 12                 // Dst += 3
        dec     ecx
        jnz     SHORT ConvRGB32ToRGB24_$1
ConvRGB32ToRGB24_$2:
    }
#endif
}
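One caveat before the reverse direction: both versions above assume Pixels is a multiple of 4. The asm silently drops the remainder, and the C loop would actually read past the end of the buffer; the same applies to the reverse conversion below. If your counts aren't guaranteed multiples of 4, finish the last 1-3 pixels with a scalar tail, something like this sketch (ConvRGB32ToRGB24Tail is a hypothetical helper, not part of the code above):

// Sketch: converts the 0-3 pixels left over after the unrolled loop.
// Dst is byte-addressed because the tail isn't dword-sized.
void ConvRGB32ToRGB24Tail(const UINT *Src, BYTE *Dst, UINT Pixels)
{
    for (UINT i = 0; i < Pixels; i++) {
        UINT s = Src[i] & 0xffffff;     // drop the alpha byte
        Dst[0] = BYTE(s);
        Dst[1] = BYTE(s >> 8);
        Dst[2] = BYTE(s >> 16);
        Dst += 3;
    }
}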
void ConvRGB24ToRGB32(const UINT *Src, UINT *Dst, UINT Pixels)
{
#if !USE_ASM
    for (UINT i = 0; i < Pixels; i += 4) {
        UINT sa = Src[0];
        UINT sb = Src[1];
        UINT sc = Src[2];
        Dst[i + 0] = sa & 0xffffff;
        Dst[i + 1] = ((sa >> 24) | (sb << 8)) & 0xffffff;
        Dst[i + 2] = ((sb >> 16) | (sc << 16)) & 0xffffff;
        Dst[i + 3] = sc >> 8;
        Src += 3;
    }
#else
    __asm {
        mov     ecx, Pixels
        shr     ecx, 2                  // 4 pixels at once
        jz      SHORT ConvRGB24ToRGB32_$2
        mov     esi, Src
        mov     edi, Dst
        push    ebp
ConvRGB24ToRGB32_$1:
        mov     ebx, [esi + 4]          // sb
        mov     edx, ebx                // copy sb
        mov     eax, [esi + 0]          // sa
        mov     ebp, eax                // copy sa
        and     ebx, 0ffffh             // sb & 0xffff
        shl     ebx, 8                  // (sb & 0xffff) << 8
        and     eax, 0ffffffh           // sa & 0xffffff
        mov     [edi + 0], eax          // Dst[0]
        shr     ebp, 24                 // sa >> 24
        or      ebx, ebp                // (sa >> 24) | ((sb & 0xffff) << 8)
        mov     [edi + 4], ebx          // Dst[1]
        shr     edx, 16                 // sb >> 16
        mov     eax, [esi + 8]          // sc
        add     esi, 12                 // Src += 3 (ASAP)
        mov     ebx, eax                // copy sc
        and     eax, 0ffh               // sc & 0xff
        shl     eax, 16                 // (sc & 0xff) << 16
        or      eax, edx                // (sb >> 16) | ((sc & 0xff) << 16)
        mov     [edi + 8], eax          // Dst[2]
        shr     ebx, 8                  // sc >> 8
        mov     [edi + 12], ebx         // Dst[3]
        add     edi, 16                 // Dst += 4
        dec     ecx
        jnz     SHORT ConvRGB24ToRGB32_$1
        pop     ebp
ConvRGB24ToRGB32_$2:
    }
#endif
}
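Calling the pair is straightforward; a quick round-trip sanity check looks something like this (a sketch; the buffer math assumes Pixels is a multiple of 4):

#include <cassert>
#include <vector>

// Sketch: RGB32 -> RGB24 -> RGB32 should reproduce the input,
// except that the alpha byte comes back forced to zero.
void RoundTripCheck(const UINT *Rgb32, UINT Pixels)
{
    std::vector<UINT> Rgb24(Pixels * 3 / 4);  // 3 bytes/pixel, packed into UINTs
    std::vector<UINT> Back(Pixels);
    ConvRGB32ToRGB24(Rgb32, Rgb24.data(), Pixels);
    ConvRGB24ToRGB32(Rgb24.data(), Back.data(), Pixels);
    for (UINT i = 0; i < Pixels; i++)
        assert(Back[i] == (Rgb32[i] & 0xffffff));
}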
And while we're at it, here are the same conversions in actual SSE3 assembly (strictly speaking, PSHUFB and PALIGNR are SSSE3 instructions, so that's what you should check for; see the CPUID sketch after the listing). This only works if you have an assembler (FASM is free) and a CPU that supports these instructions (likely, but it's better to check). Note that intrinsics don't necessarily compile to something this efficient; it depends entirely on the tools you use and the platform you're targeting. Here it's straightforward: what you see is what you get. This code generates the same output as the x86 code above (it processes 16 pixels per iteration, so Pixels should be a multiple of 16), and it's about 1.5x faster (on an i5 2500K).
format MS COFF

section '.text' code readable executable

public _ConvRGB32ToRGB24SSE3
; ebp + 8    Src (*RGB32, 16-byte aligned)
; ebp + 12   Dst (*RGB24, 16-byte aligned)
; ebp + 16   Pixels
_ConvRGB32ToRGB24SSE3:
        push    ebp
        mov     ebp, esp
        mov     eax, [ebp + 8]
        mov     edx, [ebp + 12]
        mov     ecx, [ebp + 16]
        shr     ecx, 4                  ; 16 pixels at once
        jz      done1
        movupd  xmm7, [mask1]
top1:
        movupd  xmm0, [eax + 0]         ; sa = Src[0]
        pshufb  xmm0, xmm7              ; sa = _mm_shuffle_epi8(sa, mask)
        movupd  xmm1, [eax + 16]        ; sb = Src[1]
        pshufb  xmm1, xmm7              ; sb = _mm_shuffle_epi8(sb, mask)
        movupd  xmm2, xmm1              ; sb1 = sb
        pslldq  xmm1, 12                ; sb = _mm_slli_si128(sb, 12)
        por     xmm0, xmm1              ; sa = _mm_or_si128(sa, sb)
        movupd  [edx + 0], xmm0         ; Dst[0] = sa
        psrldq  xmm2, 4                 ; sb1 = _mm_srli_si128(sb1, 4)
        movupd  xmm0, [eax + 32]        ; sc = Src[2]
        pshufb  xmm0, xmm7              ; sc = _mm_shuffle_epi8(sc, mask)
        movupd  xmm1, xmm0              ; sc1 = sc
        pslldq  xmm0, 8                 ; sc = _mm_slli_si128(sc, 8)
        por     xmm0, xmm2              ; sc = _mm_or_si128(sb1, sc)
        movupd  [edx + 16], xmm0        ; Dst[1] = sc
        psrldq  xmm1, 8                 ; sc1 = _mm_srli_si128(sc1, 8)
        movupd  xmm0, [eax + 48]        ; sd = Src[3]
        pshufb  xmm0, xmm7              ; sd = _mm_shuffle_epi8(sd, mask)
        pslldq  xmm0, 4                 ; sd = _mm_slli_si128(sd, 4)
        por     xmm0, xmm1              ; sd = _mm_or_si128(sc1, sd)
        movupd  [edx + 32], xmm0        ; Dst[2] = sd
        add     eax, 64                 ; Src += 16 pixels (64 bytes)
        add     edx, 48                 ; Dst += 16 pixels (48 bytes)
        dec     ecx
        jnz     top1
done1:
        pop     ebp
        ret

public _ConvRGB24ToRGB32SSE3
; ebp + 8    Src (*RGB24, 16-byte aligned)
; ebp + 12   Dst (*RGB32, 16-byte aligned)
; ebp + 16   Pixels
_ConvRGB24ToRGB32SSE3:
        push    ebp
        mov     ebp, esp
        mov     eax, [ebp + 8]
        mov     edx, [ebp + 12]
        mov     ecx, [ebp + 16]
        shr     ecx, 4                  ; 16 pixels at once
        jz      done2
        movupd  xmm7, [mask2]
top2:
        movupd  xmm0, [eax + 0]         ; sa = Src[0]
        movupd  xmm1, [eax + 16]        ; sb = Src[1]
        movupd  xmm2, [eax + 32]        ; sc = Src[2]
        movupd  xmm3, xmm0              ; sa1 = sa
        pshufb  xmm0, xmm7              ; sa = _mm_shuffle_epi8(sa, mask)
        movupd  [edx], xmm0             ; Dst[0] = sa
        movupd  xmm4, xmm1              ; sb1 = sb
        palignr xmm1, xmm3, 12          ; sb = _mm_alignr_epi8(sb, sa1, 12)
        pshufb  xmm1, xmm7              ; sb = _mm_shuffle_epi8(sb, mask)
        movupd  [edx + 16], xmm1        ; Dst[1] = sb
        movupd  xmm3, xmm2              ; sc1 = sc
        palignr xmm2, xmm4, 8           ; sc = _mm_alignr_epi8(sc, sb1, 8)
        pshufb  xmm2, xmm7              ; sc = _mm_shuffle_epi8(sc, mask)
        movupd  [edx + 32], xmm2        ; Dst[2] = sc
        palignr xmm3, xmm3, 4           ; sc1 = _mm_alignr_epi8(sc1, sc1, 4)
        pshufb  xmm3, xmm7              ; sc1 = _mm_shuffle_epi8(sc1, mask)
        movupd  [edx + 48], xmm3        ; Dst[3] = sc1
        add     eax, 48                 ; Src += 16 pixels (48 bytes)
        add     edx, 64                 ; Dst += 16 pixels (64 bytes)
        dec     ecx
        jnz     top2
done2:
        pop     ebp
        ret

section '.data' data readable writeable align 16

; pack shuffle: keeps the RGB bytes of 4 RGB32 pixels (0-2, 4-6,
; 8-10, 12-14); -1 zeroes the top 4 bytes
label mask1 dqword
        db 0,1,2,4, 5,6,8,9, 10,12,13,14, -1,-1,-1,-1

; unpack shuffle: spreads 3 source bytes per pixel; -1 gives zero alpha
label mask2 dqword
        db 0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1
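As for the CPU check: CPUID leaf 1 reports SSSE3 (which is what PSHUFB and PALIGNR need) in bit 9 of ECX. With VC++ it's a few lines (a sketch using the __cpuid intrinsic from <intrin.h>):

#include <intrin.h>

// Sketch: true if the CPU supports SSSE3 (PSHUFB, PALIGNR).
// CPUID leaf 1 reports SSSE3 in ECX bit 9.
bool HaveSSSE3()
{
    int Info[4];  // EAX, EBX, ECX, EDX
    __cpuid(Info, 1);
    return (Info[2] & (1 << 9)) != 0;
}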