aarch64: Optimise AdvSIMD log10

Optimise AdvSIMD log10 by vectorising the special case.
For subnormal input values, use the same scaling technique as
described in the single-precision equivalent.
Then check for inf, nan and x <= 0.
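
The scaling trick rests on the identity log10(x) = 2 * log10(sqrt(x)): the square
root of a subnormal double is a normal number, so the fast core path can evaluate
it, and the result is simply doubled afterwards. A rough scalar sketch of the idea
(hypothetical helper name, not the code in this commit):

#include <math.h>

/* Sketch only: if x is below the smallest normal double (which also
   catches zero and negative inputs), evaluate log10 on sqrt (x), which
   is normal, then double the result, since
   log10 (x) == 2 * log10 (sqrt (x)) for x >= 0.  */
static double
log10_scaled_sketch (double x)
{
  int is_special = x < 0x1p-1022;
  double x_adj = is_special ? sqrt (x) : x;
  double y = log10 (x_adj);   /* Stand-in for the fast core path.  */
  return is_special ? 2.0 * y : y;
}

The explicit inf, nan and x <= 0 fix-ups at the end of the diff handle the lanes
the core path cannot.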
Authored by James Chesterman on 2025-11-19 21:40:43 +00:00, committed by Adhemerval Zanella
parent 59c706b418
commit e3c40c8db0
1 changed file with 70 additions and 32 deletions


@@ -21,11 +21,13 @@
 static const struct data
 {
-  uint64x2_t off, sign_exp_mask, offset_lower_bound;
-  uint32x4_t special_bound;
+  uint64x2_t off, offset_lower_bound;
+  uint32x4_t special_bound_u32;
+  uint64x2_t sign_exp_mask, special_bound;
   double invln10, log10_2;
   double c1, c3;
   float64x2_t c0, c2, c4;
+  float64x2_t pinf, minf, nan;
 } data = {
   /* Computed from log coefficients divided by log(10) then rounded to double
      precision.  */
@@ -37,12 +39,17 @@ static const struct data
   .invln10 = 0x1.bcb7b1526e50ep-2,
   .log10_2 = 0x1.34413509f79ffp-2,
   .off = V2 (0x3fe6900900000000),
-  .sign_exp_mask = V2 (0xfff0000000000000),
   /* Lower bound is 0x0010000000000000. For
      optimised register use subnormals are detected after offset has been
      subtracted, so lower bound - offset (which wraps around).  */
   .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
-  .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000.  */
+  .special_bound_u32
+      = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000.  */
+  .sign_exp_mask = V2 (0xfff0000000000000),
+  .special_bound = V2 (0x7ffe000000000000),
+  .pinf = V2 (INFINITY),
+  .minf = V2 (-INFINITY),
+  .nan = V2 (NAN),
 };

 #define N (1 << V_LOG10_TABLE_BITS)
@@ -70,29 +77,8 @@ lookup (uint64x2_t i)
 }

 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
-              uint32x2_t special, const struct data *d)
+log10_core (uint64x2_t u, uint64x2_t u_off, const struct data *d)
 {
-  float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
-  return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
-}
-
-/* Fast implementation of double-precision vector log10
-   is a slight modification of double-precision vector log.
-   Max ULP error: < 2.5 ulp (nearest rounding.)
-   Maximum measured at 2.46 ulp for x in [0.96, 0.97]
-     _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
-                                           want 0x1.fff6be3cae4b9p-6.  */
-float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
-{
-  const struct data *d = ptr_barrier (&data);
-
-  /* To avoid having to mov x out of the way, keep u after offset has been
-     applied, and recover x by adding the offset back in the special-case
-     handler.  */
-  uint64x2_t u = vreinterpretq_u64_f64 (x);
-  uint64x2_t u_off = vsubq_u64 (u, d->off);
-
   /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
      The range is split into N subintervals.
      The ith subinterval contains z and c is near its center.  */
@@ -102,9 +88,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
   struct entry e = lookup (u_off);

-  uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
-                                 vget_low_u32 (d->special_bound));
-
   /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2).  */
   float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
   float64x2_t kd = vcvtq_f64_s64 (k);
@@ -125,8 +108,63 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
   float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
   y = vfmaq_f64 (y, d->c4, r2);
   y = vfmaq_f64 (p, y, r2);
-
-  if (__glibc_unlikely (v_any_u32h (special)))
-    return special_case (hi, u_off, y, r2, special, d);
   return vfmaq_f64 (hi, y, r2);
 }
+
+static inline float64x2_t VPCS_ATTR
+special_case (uint64x2_t u_off, const struct data *d)
+{
+  float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+
+  /* If x is special, compute 2log(sqrt(x)), else compute log(x).
+     x might be subnormal, and sqrting it makes it larger.
+     And the above two expressions are equivalent.  */
+  uint64x2_t special
+      = vcgeq_u64 (vsubq_u64 (u_off, d->offset_lower_bound), d->special_bound);
+  float64x2_t x_sqrt = vbslq_f64 (special, vsqrtq_f64 (x), x);
+  u_off = vsubq_u64 (vreinterpretq_u64_f64 (x_sqrt), d->off);
+
+  /* Don't pass u into this, it isn't using x_sqrt.  */
+  float64x2_t y = log10_core (vreinterpretq_u64_f64 (x_sqrt), u_off, d);
+  y = vbslq_f64 (special, vmulq_f64 (y, v_f64 (2.0f)), y);
+
+  /* Is true for +/- inf, +/- nan as well as all negative numbers.  */
+  uint64x2_t is_infnan
+      = vcgeq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (d->pinf));
+  uint64x2_t infnan_or_zero = vorrq_u64 (is_infnan, vceqzq_f64 (x));
+  y = vbslq_f64 (infnan_or_zero, d->nan, y);
+
+  uint64x2_t ret_pinf = vceqq_f64 (x, d->pinf);
+  uint64x2_t ret_minf = vceqzq_f64 (x);
+  y = vbslq_f64 (ret_pinf, d->pinf, y);
+  y = vbslq_f64 (ret_minf, d->minf, y);
+  return y;
+}
+
+/* Fast implementation of double-precision vector log10
+   is a slight modification of double-precision vector log.
+   Max ULP error: < 2.5 ulp (nearest rounding.)
+   Maximum measured at 2.46 ulp for x in [0.96, 0.97]
+     _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
+                                           want 0x1.fff6be3cae4b9p-6.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint64x2_t u = vreinterpretq_u64_f64 (x);
+  uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+  uint32x2_t special_u32 = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+                                     vget_low_u32 (d->special_bound_u32));
+  if (__glibc_unlikely (v_any_u32h (special_u32)))
+    return special_case (u_off, d);
+
+  /* Making what would usually be an inline function into NOINLINE helps
+     performance. This is because the register allocation in the fast pass does
+     not consider what registers are used in the special case function.  */
+  return log10_core (u, u_off, d);
+}
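
The select chain at the end of the new special_case resolves the remaining lanes in
a fixed order: any lane whose bit pattern compares >= asuint64(+inf) (all NaNs,
infinities and negative values), together with zero lanes, is first forced to NaN;
x == +inf lanes are then restored to +inf and x == 0 lanes to -inf. A scalar
restatement of that ordering (sketch only, hypothetical names, not part of the
commit):

#include <math.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the special-lane fix-ups: the bit pattern of any negative
   number, NaN or infinity is >= asuint64 (+inf) when compared as an
   unsigned integer, so those lanes (and zero) become NaN first; +inf and
   +/-0 are then patched back to +inf and -inf respectively.  */
static double
fixup_sketch (double x, double y)
{
  uint64_t ux, uinf;
  double pinf = INFINITY;
  memcpy (&ux, &x, sizeof ux);
  memcpy (&uinf, &pinf, sizeof uinf);
  if (ux >= uinf || x == 0.0)   /* inf, nan, negative or zero.  */
    y = NAN;
  if (x == INFINITY)
    y = INFINITY;
  if (x == 0.0)
    y = -INFINITY;
  return y;
}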