This question seems similar to Getting max value in a __m128i vector with SSE? but with shorts and minimum instead of integer + maximum. This is what I came up with:
typedef short int weight;
weight horizontal_min_Vec4i(__m128i x) {
__m128i max1 = _mm_shufflehi_epi16(x, _MM_SHUFFLE(0, 0, 3, 2));
__m128i max1b = _mm_shufflelo_epi16(x, _MM_SHUFFLE(0, 0, 3, 2));
__m128i max2 = _mm_min_epi16(max1, max1b);
//max2 = _mm_min_epi16(max2, x);
max1 = _mm_shufflehi_epi16(max2, _MM_SHUFFLE(0, 0, 0, 1));
max1b = _mm_shufflelo_epi16(max2, _MM_SHUFFLE(0, 0, 0, 1));
__m128i max3 = _mm_min_epi16(max1, max1b);
max2 = _mm_min_epi16(max2, max3);
return min(_mm_extract_epi16(max2, 0), _mm_extract_epi16(max2, 4));
}
The function basically does the same as the answer in https://stackoverflow.com/a/18616825/1500111 for the upper and lower parts of x. So, I know the minimum value is either in the position 0 or 4 of the __m128i variable max2. Although it is much faster than the no SIMD function horizontal_min_Vec4i_Plain(__m128i x)
shown below, I am afraid the bottleneck is the _mm_extract_epi16 operation
at the last line. Is there a better way to achieve this, for a better speed up? I am using Haswell so I have access to the latest SSE extensions.
weight horizontal_min_Vec4i_Plain(__m128i x) {
weight result[8] __attribute__((aligned(16)));
_mm_store_si128((__m128i *) result, x);
weight myMin = result[0];
for (int l = 1; l < 8; l++) {
if (myMin > result[l]) {
myMin = result[l];
}
}
return myMin;
}
_mm_minpos_epu16
? – harold