5
votes

I'd like to combine two 256-bit vectors (__m256d) that contain the masks resulting from a comparison operation (such as _mm256_cmp_pd) into one 256-bit vector, by omitting the upper half of every 64-bit double.

So, if in the following, a_i, b_i, ... are 32 bit words, and I have two 256 bit (4 x double) vectors that have the following structure:

a_0, a_0, b_0, b_0, c_0, c_0, d_0, d_0, and a_1, a_1, b_1, b_1, c_1, c_1, d_1, d_1.

I'd like to have a single 256 bit vector with the following structure:

a_0, b_0, c_0, d_0, a_1, b_1, c_1, d_1.

How do I do this efficiently using Intel intrinsics? The instruction set available is everything up to AVX.

2
I assume you intended _mm_cmp_pd in place of __mm_cmp_pd. But _mm_cmp_pd returns __m128d. – user1940376

2 Answers

4
votes

It looks like you can exploit the fact that a bit pattern of all 1s is a NaN in both single and double precision, and similarly a bit pattern of all 0s is a 0.0 in both cases. So to pack your two double mask vectors to a single float vector you can do this:

 /* Argument order matters: the first argument (converted v0) becomes the upper
    128 bits of v, the second (converted v1) becomes the lower 128 bits. */
 __m256 v = _mm256_set_m128(_mm256_cvtpd_ps(v0), _mm256_cvtpd_ps(v1));

Note that if you do not have _mm256_set_m128 then you can define it as:

/*
 * Fallback for toolchains whose <immintrin.h> lacks _mm256_set_m128.
 * Builds a __m256 whose upper 128 bits are va and whose lower 128 bits are vb.
 * The #ifndef guard avoids a macro-redefinition diagnostic when the header
 * already supplies this name as a macro.
 */
#ifndef _mm256_set_m128
#define _mm256_set_m128(va, vb) \
        _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1)
#endif

Here's a demo:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

/*
 * Fallback for toolchains whose <immintrin.h> lacks _mm256_set_m128.
 * Builds a __m256 whose upper 128 bits are va and whose lower 128 bits are vb.
 * The #ifndef guard avoids a macro-redefinition diagnostic when the header
 * already supplies this name as a macro.
 */
#ifndef _mm256_set_m128
#define _mm256_set_m128(va, vb) \
        _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1)
#endif

/*
 * Print the four 64-bit lanes of v, reinterpreted as signed integers, in
 * memory order (lowest lane first), prefixed by "label = ".
 *
 * The lanes are held as int64_t, so the matching PRId64 conversion is used;
 * "%lld" expects long long, which is a different type from int64_t on
 * platforms where int64_t is long.
 */
static void printvd(const char * label, __m256d v)
{
    int64_t a[4];
    /* Unaligned store: the raw bit patterns of the doubles land in a[]. */
    _mm256_storeu_pd((double *)a, v);
    printf("%s = %" PRId64 " %" PRId64 " %" PRId64 " %" PRId64 "\n",
           label, a[0], a[1], a[2], a[3]);
}

/*
 * Print the eight 32-bit lanes of v, reinterpreted as signed integers, in
 * memory order (lowest lane first), prefixed by "label = ".
 *
 * PRId32 is the conversion guaranteed to match int32_t.
 */
static void printvf(const char * label, __m256 v)
{
    int32_t a[8];
    /* Unaligned store: the raw bit patterns of the floats land in a[]. */
    _mm256_storeu_ps((float *)a, v);
    printf("%s = %" PRId32 " %" PRId32 " %" PRId32 " %" PRId32
           " %" PRId32 " %" PRId32 " %" PRId32 " %" PRId32 "\n",
           label, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7]);
}

/*
 * Demo: compare two double vectors in both directions, pack the two
 * 4 x 64-bit masks into one 8 x 32-bit mask vector, and print all three.
 */
int main(void)
{
    __m256d v0 = _mm256_set_pd(0.0, 1.0, 2.0, 3.0);
    __m256d v1 = _mm256_set_pd(3.0, 2.0, 1.0, 0.0);
    /* _CMP_LT_OS (predicate value 1) is the ordered, signaling less-than test. */
    __m256d vcmp0 = _mm256_cmp_pd(v0, v1, _CMP_LT_OS);
    __m256d vcmp1 = _mm256_cmp_pd(v1, v0, _CMP_LT_OS);
    /* Narrowing keeps each mask: all-ones and all-zeros convert to all-ones
       and all-zeros. The first argument fills the upper 128 bits of vcmp,
       the second fills the lower 128 bits. */
    __m256 vcmp = _mm256_set_m128(_mm256_cvtpd_ps(vcmp0), _mm256_cvtpd_ps(vcmp1));
    printvd("vcmp0", vcmp0);
    printvd("vcmp1", vcmp1);
    printvf("vcmp ", vcmp);
    return 0;
}

Test:

$ gcc -Wall -mavx so_avx_test.c && ./a.out
vcmp0 = 0 0 -1 -1
vcmp1 = -1 -1 0 0
vcmp  = -1 -1 0 0 0 0 -1 -1
2
votes

In the code below, function1() performs the operation. The main program supplies sample data and prints results. The FFFFFFFF portions of the sample data are the upper halves to be omitted. The remaining dwords of sample data contain unique patterns. The program output is:

v0=A0000000 FFFFFFFF B0000000 FFFFFFFF C0000000 FFFFFFFF D0000000 FFFFFFFF
v1=A0000001 FFFFFFFF B0000001 FFFFFFFF C0000001 FFFFFFFF D0000001 FFFFFFFF
vr=A0000000 B0000000 C0000000 D0000000 A0000001 B0000001 C0000001 D0000001

The code was tested with VS2013 using command line cl /Ox /arch:AVX sample.c and with gcc 4.9.0 using command line gcc -O3 -mavx -c sample.c.

The limited cross-lane capability of AVX makes the solution relatively complicated.

#include <intrin.h>
#include <stdint.h>
#include <stdio.h>

//---------------------------------------------------------------------------

/* Print the eight consecutive 32-bit words starting at data as zero-padded
   upper-case hex, each followed by a space, then a newline. */
static void dump (void *data)
    {
    uint32_t *word = data;
    uint32_t *end = word + 8;

    while (word != end)
        {
        printf ("%08X ", *word);
        word++;
        }
    printf ("\n");
    }

//---------------------------------------------------------------------------

 /*
  * Pack the even-indexed 32-bit words of v0 and v1 into a single vector:
  * result words 0-3 are words 0, 2, 4, 6 of v0 and result words 4-7 are
  * words 0, 2, 4, 6 of v1. The odd-indexed words are dropped.
  */
 static __m256d function1 (__m256d v0, __m256d v1)
    {
    /* Regroup by 128-bit half so that each lane holds data from one input:
       lows = (v0 low half, v1 low half), highs = (v0 high half, v1 high half). */
    __m256 lows  = _mm256_castpd_ps (_mm256_permute2f128_pd (v0, v1, 0x20));
    __m256 highs = _mm256_castpd_ps (_mm256_permute2f128_pd (v0, v1, 0x31));

    /* Within each lane take words 0 and 2 of lows, then words 0 and 2 of
       highs; _MM_SHUFFLE (2, 0, 2, 0) evaluates to 0x88. */
    __m256 packed = _mm256_shuffle_ps (lows, highs, _MM_SHUFFLE (2, 0, 2, 0));

    return _mm256_castps_pd (packed);
    }

//---------------------------------------------------------------------------

int main (void)
    {
    __m256d v0, v1, vr;

    v0 = _mm256_castsi256_pd (_mm256_set_epi32 (0xffffffff, 0xd0000000, 0xffffffff, 0xc0000000, 0xffffffff, 0xb0000000, 0xffffffff, 0xa0000000));
    v1 = _mm256_castsi256_pd (_mm256_set_epi32 (0xffffffff, 0xd0000001, 0xffffffff, 0xc0000001, 0xffffffff, 0xb0000001, 0xffffffff, 0xa0000001));
    vr = function1 (v0, v1);
    printf ("v0="); dump (&v0);
    printf ("v1="); dump (&v1);
    printf ("vr="); dump (&vr);
    return 0;
    }