Accurate vectorizable implementation of acosf()

Question

Simple implementations of acosf() can easily achieve an error bound of 1.5 ulp with respect to the infinitely precise (mathematical) result, if the platform has support for fused multiply-add (FMA). This means results never differ by more than one ulp from the correctly rounded result in round-to-nearest-or-even mode.

However, such an implementation usually comprises two major code branches that split the primary approximation interval [0,1] roughly in half, as in the exemplary code below. This branchyness inhibits automatic vectorization by compilers when targeting SIMD architectures.

Is there an alternative algorithmic approach that lends itself more readily to automatic vectorization, while maintaining the same error bound of 1.5 ulps? Platform support for FMA can be assumed.

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/* maximum error = 1.45667 ulp */
float my_acosf (float a)
{
    float r;

    r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    if (r > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
    }
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
    return r;
}

njuffa njuffa · Accepted Answer · 2018-03-03T00:27:01

The closest I have come to a satisfactory solution is based on an idea from a news posting by Robert Harley, in which he observed that for x in [0,1], acos(x) ≈ √(2*(1-x)), and that a polynomial can provide the scale factor necessary to make this an accurate approximation across the entire interval. As can be seen in the code below, this approach results in straight-line code with just two uses of the ternary operator to handle arguments in the negative half-plane.

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#define VECTORIZABLE 1
#define ARR_LEN      (1 << 24)
#define MAX_ULP      1 /* deviation from correctly rounded result */

#if VECTORIZABLE  
/* 
 Compute arccos(a) with a maximum error of 1.496766 ulp 
 This uses an idea from Robert Harley's posting in comp.arch.arithmetic on 1996/07/12
 https://groups.google.com/forum/#!original/comp.arch.arithmetic/wqCPkCCXqWs/T9qCkHtGE2YJ
*/
float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}

#else // VECTORIZABLE

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/* maximum error = 1.45667 ulp */
float my_acosf (float a)
{
    float r;

    r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    if (r > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
    }
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
    return r;
}
#endif // VECTORIZABLE

int main (void)
{
    double darg, dref;
    float ref, *a, *b;
    uint32_t argi, resi, refi;

    printf ("%svectorizable implementation of acos\n", 
            VECTORIZABLE ? "" : "non-");

    a = (float *)malloc (sizeof(a[0]) * ARR_LEN);
    b = (float *)malloc (sizeof(b[0]) * ARR_LEN);

    argi = 0x00000000;
    do {

        for (int i = 0; i < ARR_LEN; i++) {
            memcpy (&a[i], &argi, sizeof(a[i]));
            argi++;
        }

        for (int i = 0; i < ARR_LEN; i++) {
            b[i] = my_acosf (a[i]);
        }

        for (int i = 0; i < ARR_LEN; i++) {
            darg = (double)a[i];
            dref = acos (darg);
            ref = (float)dref;
            memcpy (&refi, &ref, sizeof(refi));
            memcpy (&resi, &b[i], sizeof(resi));
            if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
                printf ("error > 1 ulp a[i]=% 14.6a  b[i]=% 14.6a  ref=% 14.6a  dref=% 21.13a\n", 
                        a[i], b[i], ref, dref);
                printf ("test FAILED\n");

                return EXIT_FAILURE;
            }
        }

        printf ("^^^^ argi = %08x\n", argi);
    } while (argi);

    printf ("test PASSED\n");

    free (a);
    free (b);

    return EXIT_SUCCESS;
}

Although the structure of this code appears conducive to auto-vectorization, I have not had much luck when targeting AVX2 with the compilers offered by Compiler Explorer. The only compiler that appears to be able to vectorize this code in the context of the inner loop of my test app above is Clang. But Clang seems to be able to do this only if I specify -ffast-math, which, however, has the undesired side-effect of turning the sqrtf() invocation into an approximate square root computed via rsqrt. I experimented with some less intrusive switches, e.g. -fno-honor-nans, -fno-math-errno, -fno-trapping-math, but my_acosf() did not vectorize even when I used those in combination.

For now I have resorted to manual translation of the above code into AVX2 + FMA intrinsics, like so:

#include "immintrin.h"

/* maximum error = 1.496766 ulp */
__m256 _mm256_acos_ps (__m256 x)
{
    const __m256 zero= _mm256_set1_ps ( 0.0f);
    const __m256 two = _mm256_set1_ps ( 2.0f);
    const __m256 mtwo= _mm256_set1_ps (-2.0f);
    const __m256 c0  = _mm256_set1_ps ( 0x1.c86000p-22f); //  4.25032340e-7
    const __m256 c1  = _mm256_set1_ps (-0x1.0258fap-19f); // -1.92483935e-6
    const __m256 c2  = _mm256_set1_ps ( 0x1.90c5c4p-18f); //  5.97197595e-6
    const __m256 c3  = _mm256_set1_ps (-0x1.55668cp-19f); // -2.54363249e-6
    const __m256 c4  = _mm256_set1_ps ( 0x1.c3f78ap-16f); //  2.69393295e-5
    const __m256 c5  = _mm256_set1_ps ( 0x1.e8f446p-14f); //  1.16575764e-4
    const __m256 c6  = _mm256_set1_ps ( 0x1.6df072p-11f); //  6.97973708e-4
    const __m256 c7  = _mm256_set1_ps ( 0x1.3332a6p-8f);  //  4.68746712e-3
    const __m256 c8  = _mm256_set1_ps ( 0x1.555550p-5f);  //  4.16666567e-2
    const __m256 pi0 = _mm256_set1_ps ( 0x1.ddcb02p+0f);  //  1.86637890e+0
    const __m256 pi1 = _mm256_set1_ps ( 0x1.aee9d6p+0f);  //  1.68325555e+0
    __m256 s, r, t, m;

    s = two;
    t = mtwo;
    m = _mm256_cmp_ps (x, zero, _CMP_LT_OQ);
    t = _mm256_blendv_ps (t, s, m);
    t = _mm256_fmadd_ps (x, t, s);
    s = _mm256_sqrt_ps (t);
    r = c0;
    r = _mm256_fmadd_ps (r, t, c1);
    r = _mm256_fmadd_ps (r, t, c2);
    r = _mm256_fmadd_ps (r, t, c3);
    r = _mm256_fmadd_ps (r, t, c4);
    r = _mm256_fmadd_ps (r, t, c5);
    r = _mm256_fmadd_ps (r, t, c6);
    r = _mm256_fmadd_ps (r, t, c7);
    r = _mm256_fmadd_ps (r, t, c8);
    r = _mm256_mul_ps (r, t);
    r = _mm256_fmadd_ps (r, s, s);
    t = _mm256_sub_ps (zero, r);
    t = _mm256_fmadd_ps (pi0, pi1, t);
    r = _mm256_blendv_ps (r, t, m);
    return r;
}

Accurate vectorizable implementation of acosf()

3 Answers