From e3c40c8db0992cedcb48c7fd9607c19fc90a3cc3 Mon Sep 17 00:00:00 2001
From: James Chesterman <James.Chesterman@arm.com>
Date: Wed, 19 Nov 2025 21:40:43 +0000
Subject: [PATCH] aarch64: Optimise AdvSIMD log10

Optimise AdvSIMD log10 by vectorising the special case.
For subnormal input values, use the same scaling technique as
described in the single precision equivalent.
Then check for inf, nan and x<=0.
---
 sysdeps/aarch64/fpu/log10_advsimd.c | 102 +++++++++++++++++++---------
 1 file changed, 70 insertions(+), 32 deletions(-)

diff --git a/sysdeps/aarch64/fpu/log10_advsimd.c b/sysdeps/aarch64/fpu/log10_advsimd.c
index faa2b255c0..15bad6ae20 100644
--- a/sysdeps/aarch64/fpu/log10_advsimd.c
+++ b/sysdeps/aarch64/fpu/log10_advsimd.c
@@ -21,11 +21,13 @@
 
 static const struct data
 {
-  uint64x2_t off, sign_exp_mask, offset_lower_bound;
-  uint32x4_t special_bound;
+  uint64x2_t off, offset_lower_bound;
+  uint32x4_t special_bound_u32;
+  uint64x2_t sign_exp_mask, special_bound;
   double invln10, log10_2;
   double c1, c3;
   float64x2_t c0, c2, c4;
+  float64x2_t pinf, minf, nan;
 } data = {
   /* Computed from log coefficients divided by log(10) then rounded to double
      precision.  */
@@ -37,12 +39,17 @@ static const struct data
   .invln10 = 0x1.bcb7b1526e50ep-2,
   .log10_2 = 0x1.34413509f79ffp-2,
   .off = V2 (0x3fe6900900000000),
-  .sign_exp_mask = V2 (0xfff0000000000000),
   /* Lower bound is 0x0010000000000000. For
      optimised register use subnormals are detected after offset has been
      subtracted, so lower bound - offset (which wraps around).  */
   .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
-  .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000.  */
+  .special_bound_u32
+  = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000.  */
+  .sign_exp_mask = V2 (0xfff0000000000000),
+  .special_bound = V2 (0x7ffe000000000000),
+  .pinf = V2 (INFINITY),
+  .minf = V2 (-INFINITY),
+  .nan = V2 (NAN),
 };
 
 #define N (1 << V_LOG10_TABLE_BITS)
@@ -70,29 +77,8 @@ lookup (uint64x2_t i)
 }
 
 static float64x2_t VPCS_ATTR NOINLINE
-special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
-	      uint32x2_t special, const struct data *d)
+log10_core (uint64x2_t u, uint64x2_t u_off, const struct data *d)
 {
-  float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
-  return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
-}
-
-/* Fast implementation of double-precision vector log10
-   is a slight modification of double-precision vector log.
-   Max ULP error: < 2.5 ulp (nearest rounding.)
-   Maximum measured at 2.46 ulp for x in [0.96, 0.97]
-   _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
-				       want 0x1.fff6be3cae4b9p-6.  */
-float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
-{
-  const struct data *d = ptr_barrier (&data);
-
-  /* To avoid having to mov x out of the way, keep u after offset has been
-     applied, and recover x by adding the offset back in the special-case
-     handler.  */
-  uint64x2_t u = vreinterpretq_u64_f64 (x);
-  uint64x2_t u_off = vsubq_u64 (u, d->off);
-
   /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
      The range is split into N subintervals.
      The ith subinterval contains z and c is near its center.  */
@@ -102,9 +88,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
 
   struct entry e = lookup (u_off);
 
-  uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
-				 vget_low_u32 (d->special_bound));
-
   /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2).  */
   float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
   float64x2_t kd = vcvtq_f64_s64 (k);
@@ -125,8 +108,63 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
   float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
   y = vfmaq_f64 (y, d->c4, r2);
   y = vfmaq_f64 (p, y, r2);
-
-  if (__glibc_unlikely (v_any_u32h (special)))
-    return special_case (hi, u_off, y, r2, special, d);
   return vfmaq_f64 (hi, y, r2);
 }
+
+static inline float64x2_t VPCS_ATTR
+special_case (uint64x2_t u_off, const struct data *d)
+{
+  float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
+  /* If x is special, compute 2log(sqrt(x)), else compute log(x).
+     x might be subnormal, and sqrting it makes it larger.
+     And the above two expressions are equivalent.  */
+  uint64x2_t special
+      = vcgeq_u64 (vsubq_u64 (u_off, d->offset_lower_bound), d->special_bound);
+  float64x2_t x_sqrt = vbslq_f64 (special, vsqrtq_f64 (x), x);
+
+  u_off = vsubq_u64 (vreinterpretq_u64_f64 (x_sqrt), d->off);
+
+  /* Don't pass u into this, it isn't using x_sqrt.  */
+  float64x2_t y = log10_core (vreinterpretq_u64_f64 (x_sqrt), u_off, d);
+
+  y = vbslq_f64 (special, vmulq_f64 (y, v_f64 (2.0f)), y);
+
+  /* Is true for +/- inf, +/- nan as well as all negative numbers.  */
+  uint64x2_t is_infnan
+      = vcgeq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (d->pinf));
+  uint64x2_t infnan_or_zero = vorrq_u64 (is_infnan, vceqzq_f64 (x));
+
+  y = vbslq_f64 (infnan_or_zero, d->nan, y);
+  uint64x2_t ret_pinf = vceqq_f64 (x, d->pinf);
+  uint64x2_t ret_minf = vceqzq_f64 (x);
+  y = vbslq_f64 (ret_pinf, d->pinf, y);
+  y = vbslq_f64 (ret_minf, d->minf, y);
+  return y;
+}
+
+/* Fast implementation of double-precision vector log10
+   is a slight modification of double-precision vector log.
+   Max ULP error: < 2.5 ulp (nearest rounding.)
+   Maximum measured at 2.46 ulp for x in [0.96, 0.97]
+   _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
+				       want 0x1.fff6be3cae4b9p-6.  */
+float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
+{
+  const struct data *d = ptr_barrier (&data);
+
+  /* To avoid having to mov x out of the way, keep u after offset has been
+     applied, and recover x by adding the offset back in the special-case
+     handler.  */
+  uint64x2_t u = vreinterpretq_u64_f64 (x);
+  uint64x2_t u_off = vsubq_u64 (u, d->off);
+
+  uint32x2_t special_u32 = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
+				     vget_low_u32 (d->special_bound_u32));
+
+  if (__glibc_unlikely (v_any_u32h (special_u32)))
+    return special_case (u_off, d);
+  /* Making what would usually be an inline function into NOINLINE helps
+     performance. This is because the register allocation in the fast pass does
+     not consider what registers are used in the special case function.  */
+  return log10_core (u, u_off, d);
+}