The closest I have come to a satisfactory solution is based on an idea from a news posting by Robert Harley, in which he observed that for x in [0,1], acos(x) ≈ √(2*(1-x)), and that a polynomial can provide the scale factor necessary to make this an accurate approximation across the entire interval. As can be seen in the code below, this approach results in straight-line code with just two uses of the ternary operator to handle arguments in the negative half-plane.
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#define VECTORIZABLE 1
#define ARR_LEN (1 << 24)
#define MAX_ULP 1 /* deviation from correctly rounded result */
#if VECTORIZABLE
/*
Compute arccos(a) with a maximum error of 1.496766 ulp
This uses an idea from Robert Harley's posting in comp.arch.arithmetic on 1996/07/12
https://groups.google.com/forum/#!original/comp.arch.arithmetic/wqCPkCCXqWs/T9qCkHtGE2YJ
*/
float my_acosf (float a)
{
float r, s, t;
s = (a < 0.0f) ? 2.0f : (-2.0f);
t = fmaf (s, a, 2.0f);
s = sqrtf (t);
r = 0x1.c86000p-22f; // 4.25032340e-7
r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
r = fmaf (r, t, 0x1.90c5c4p-18f); // 5.97197595e-6
r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
r = fmaf (r, t, 0x1.c3f78ap-16f); // 2.69393295e-5
r = fmaf (r, t, 0x1.e8f446p-14f); // 1.16575764e-4
r = fmaf (r, t, 0x1.6df072p-11f); // 6.97973708e-4
r = fmaf (r, t, 0x1.3332a6p-08f); // 4.68746712e-3
r = fmaf (r, t, 0x1.555550p-05f); // 4.16666567e-2
r = r * t;
r = fmaf (r, s, s);
t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
r = (a < 0.0f) ? t : r;
return r;
}
#else // VECTORIZABLE
/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
float r, s;
s = a * a;
r = 0x1.a7f260p-5f; // 5.17513156e-2
r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
r = r * s;
r = fmaf (r, a, a);
return r;
}
/* maximum error = 1.45667 ulp */
float my_acosf (float a)
{
float r;
r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
if (r > -0.5625f) {
/* arccos(x) = pi/2 - arcsin(x) */
r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
} else {
/* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
}
if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
/* arccos (-x) = pi - arccos(x) */
r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
}
return r;
}
#endif // VECTORIZABLE
int main (void)
{
double darg, dref;
float ref, *a, *b;
uint32_t argi, resi, refi;
printf ("%svectorizable implementation of acos\n",
VECTORIZABLE ? "" : "non-");
a = (float *)malloc (sizeof(a[0]) * ARR_LEN);
b = (float *)malloc (sizeof(b[0]) * ARR_LEN);
argi = 0x00000000;
do {
for (int i = 0; i < ARR_LEN; i++) {
memcpy (&a[i], &argi, sizeof(a[i]));
argi++;
}
for (int i = 0; i < ARR_LEN; i++) {
b[i] = my_acosf (a[i]);
}
for (int i = 0; i < ARR_LEN; i++) {
darg = (double)a[i];
dref = acos (darg);
ref = (float)dref;
memcpy (&refi, &ref, sizeof(refi));
memcpy (&resi, &b[i], sizeof(resi));
if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
printf ("error > 1 ulp a[i]=% 14.6a b[i]=% 14.6a ref=% 14.6a dref=% 21.13a\n",
a[i], b[i], ref, dref);
printf ("test FAILED\n");
return EXIT_FAILURE;
}
}
printf ("^^^^ argi = %08x\n", argi);
} while (argi);
printf ("test PASSED\n");
free (a);
free (b);
return EXIT_SUCCESS;
}
Although the structure of this code appears conducive to auto-vectorization, I have not had much luck when targeting AVX2
with the compilers offered by Compiler Explorer. The only compiler that appears to be able to vectorize this code in the context of the inner loop of my test app above is Clang. But Clang seems to be able to do this only if I specify -ffast-math
, which, however, has the undesired side-effect of turning the sqrtf()
invocation into an approximate square root computed via rsqrt
. I experimented with some less intrusive switches, e.g. -fno-honor-nans
, -fno-math-errno
, -fno-trapping-math
, but my_acosf()
did not vectorize even when I used those in combination.
For now I have resorted to manual translation of the above code into AVX2
+ FMA
intrinsics, like so:
#include "immintrin.h"
/* maximum error = 1.496766 ulp */
__m256 _mm256_acos_ps (__m256 x)
{
const __m256 zero= _mm256_set1_ps ( 0.0f);
const __m256 two = _mm256_set1_ps ( 2.0f);
const __m256 mtwo= _mm256_set1_ps (-2.0f);
const __m256 c0 = _mm256_set1_ps ( 0x1.c86000p-22f); // 4.25032340e-7
const __m256 c1 = _mm256_set1_ps (-0x1.0258fap-19f); // -1.92483935e-6
const __m256 c2 = _mm256_set1_ps ( 0x1.90c5c4p-18f); // 5.97197595e-6
const __m256 c3 = _mm256_set1_ps (-0x1.55668cp-19f); // -2.54363249e-6
const __m256 c4 = _mm256_set1_ps ( 0x1.c3f78ap-16f); // 2.69393295e-5
const __m256 c5 = _mm256_set1_ps ( 0x1.e8f446p-14f); // 1.16575764e-4
const __m256 c6 = _mm256_set1_ps ( 0x1.6df072p-11f); // 6.97973708e-4
const __m256 c7 = _mm256_set1_ps ( 0x1.3332a6p-8f); // 4.68746712e-3
const __m256 c8 = _mm256_set1_ps ( 0x1.555550p-5f); // 4.16666567e-2
const __m256 pi0 = _mm256_set1_ps ( 0x1.ddcb02p+0f); // 1.86637890e+0
const __m256 pi1 = _mm256_set1_ps ( 0x1.aee9d6p+0f); // 1.68325555e+0
__m256 s, r, t, m;
s = two;
t = mtwo;
m = _mm256_cmp_ps (x, zero, _CMP_LT_OQ);
t = _mm256_blendv_ps (t, s, m);
t = _mm256_fmadd_ps (x, t, s);
s = _mm256_sqrt_ps (t);
r = c0;
r = _mm256_fmadd_ps (r, t, c1);
r = _mm256_fmadd_ps (r, t, c2);
r = _mm256_fmadd_ps (r, t, c3);
r = _mm256_fmadd_ps (r, t, c4);
r = _mm256_fmadd_ps (r, t, c5);
r = _mm256_fmadd_ps (r, t, c6);
r = _mm256_fmadd_ps (r, t, c7);
r = _mm256_fmadd_ps (r, t, c8);
r = _mm256_mul_ps (r, t);
r = _mm256_fmadd_ps (r, s, s);
t = _mm256_sub_ps (zero, r);
t = _mm256_fmadd_ps (pi0, pi1, t);
r = _mm256_blendv_ps (r, t, m);
return r;
}