AArch64: Improve codegen SVE log1p helper

Improve codegen by packing coefficients.
4% and 2% improvement in throughput microbenchmark on Neoverse V1, for acosh
and atanh respectively.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
This commit is contained in:
Luna Lamb 2025-06-18 16:12:19 +00:00 committed by Wilco Dijkstra
parent dee22d2a81
commit 6849c5b791
3 changed files with 71 additions and 24 deletions

View File

@ -30,10 +30,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
}
/* SVE approximation for double-precision acosh, based on log1p.
The largest observed error is 3.19 ULP in the region where the
The largest observed error is 3.14 ULP in the region where the
argument to log1p falls in the k=0 interval, i.e. x close to 1:
SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
want 0x1.ed23399f51373p-2. */
SV_NAME_D1 (acosh)(0x1.1e80ed12f0ad1p+0) got 0x1.ef0cee7c33ce1p-2
want 0x1.ef0cee7c33ce4p-2. */
svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
{
/* (ix - One) >= (BigBound - One). */

View File

@ -30,7 +30,7 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
}
/* SVE approximation for double-precision atanh, based on log1p.
The greatest observed error is 2.81 ULP:
The greatest observed error is 3.3 ULP:
_ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
want 0x1.ffd8ff31b501cp-6. */
svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
@ -42,7 +42,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
/* It is special if iax >= 1. */
// svbool_t special = svcmpge (pg, iax, One);
svbool_t special = svacge (pg, x, 1.0);
/* Computation is performed based on the following sequence of equality:

View File

@ -21,11 +21,12 @@
#define AARCH64_FPU_SV_LOG1P_INLINE_H
#include "sv_math.h"
#include "poly_sve_f64.h"
static const struct sv_log1p_data
{
double poly[19], ln2[2];
double c0, c2, c4, c6, c8, c10, c12, c14, c16;
double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
double ln2_lo, ln2_hi;
uint64_t hf_rt2_top;
uint64_t one_m_hf_rt2_top;
uint32_t bottom_mask;
@ -33,15 +34,30 @@ static const struct sv_log1p_data
} sv_log1p_data = {
/* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1].
*/
.poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
-0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
-0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
-0x1.cfa7385bdb37ep-6 },
.ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 },
.c0 = -0x1.ffffffffffffbp-2,
.c1 = 0x1.55555555551a9p-2,
.c2 = -0x1.00000000008e3p-2,
.c3 = 0x1.9999999a32797p-3,
.c4 = -0x1.555555552fecfp-3,
.c5 = 0x1.249248e071e5ap-3,
.c6 = -0x1.ffffff8bf8482p-4,
.c7 = 0x1.c71c8f07da57ap-4,
.c8 = -0x1.9999ca4ccb617p-4,
.c9 = 0x1.7459ad2e1dfa3p-4,
.c10 = -0x1.554d2680a3ff2p-4,
.c11 = 0x1.3b4c54d487455p-4,
.c12 = -0x1.2548a9ffe80e6p-4,
.c13 = 0x1.0f389a24b2e07p-4,
.c14 = -0x1.eee4db15db335p-5,
.c15 = 0x1.e95b494d4a5ddp-5,
.c16 = -0x1.15fdf07cb7c73p-4,
.c17 = 0x1.0310b70800fcfp-4,
.c18 = -0x1.cfa7385bdb37ep-6,
.ln2_lo = 0x1.62e42fefa3800p-1,
.ln2_hi = 0x1.ef35793c76730p-45,
/* top32(asuint64(sqrt(2)/2)) << 32. */
.hf_rt2_top = 0x3fe6a09e00000000,
/* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32. */
.one_m_hf_rt2_top = 0x00095f6200000000,
.bottom_mask = 0xffffffff,
.one_top = 0x3ff
@ -51,14 +67,14 @@ static inline svfloat64_t
sv_log1p_inline (svfloat64_t x, const svbool_t pg)
{
/* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which
differs from v_log1p_2u5.c by:
differs from advsimd/log1p.c by:
- No special-case handling - this should be dealt with by the caller.
- Pairwise Horner polynomial evaluation for improved accuracy.
- Optionally simulate the shortcut for k=0, used in the scalar routine,
using svsel, for improved accuracy when the argument to log1p is close
to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1
in the source of the caller before including this file.
See sv_log1p_2u1.c for details of the algorithm. */
See sve/log1p.c for details of the algorithm. */
const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data);
svfloat64_t m = svadd_x (pg, x, 1);
svuint64_t mi = svreinterpret_u64 (m);
@ -79,7 +95,7 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
svfloat64_t cm;
#ifndef WANT_SV_LOG1P_K0_SHORTCUT
#error \
#error \
"Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
#elif WANT_SV_LOG1P_K0_SHORTCUT
/* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
@ -96,14 +112,46 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
#endif
/* Approximate log1p(f) on the reduced input using a polynomial. */
svfloat64_t f2 = svmul_x (pg, f, f);
svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly);
svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f),
f4 = svmul_x (svptrue_b64 (), f2, f2),
f8 = svmul_x (svptrue_b64 (), f4, f4),
f16 = svmul_x (svptrue_b64 (), f8, f8);
svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17);
/* Order-18 Estrin scheme. */
svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0);
svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1);
svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0);
svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1);
svfloat64_t p03 = svmla_x (pg, p01, f2, p23);
svfloat64_t p47 = svmla_x (pg, p45, f2, p67);
svfloat64_t p07 = svmla_x (pg, p03, f4, p47);
svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0);
svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1);
svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0);
svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1);
svfloat64_t p811 = svmla_x (pg, p89, f2, p1011);
svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415);
svfloat64_t p815 = svmla_x (pg, p811, f4, p1215);
svfloat64_t p015 = svmla_x (pg, p07, f8, p815);
svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0);
svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1);
svfloat64_t p = svmla_x (pg, p015, f16, p1618);
/* Assemble log1p(x) = k * log2 + log1p(f) + c/m. */
svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]);
svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]);
svfloat64_t ln2_lo_hi = svld1rq (svptrue_b64 (), &d->ln2_lo);
svfloat64_t ylo = svmla_lane (cm, k, ln2_lo_hi, 0);
svfloat64_t yhi = svmla_lane (f, k, ln2_lo_hi, 1);
return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
return svmad_x (pg, p, f2, svadd_x (pg, ylo, yhi));
}
#endif