mirror of git://sourceware.org/git/glibc.git
AArch64: Optimise SVE FP64 Hyperbolics
Rework SVE FP64 hyperbolics to use the SVE FEXPA instruction. Also update the special-case handling for large inputs to be entirely vectorised. Performance improvements on Neoverse V1: cosh_sve: 19% for |x| < 709, 5x otherwise; sinh_sve: 24% for |x| < 709, 5.9x otherwise; tanh_sve: 12% for |x| < 19, 9x otherwise. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
parent
1e3d1ddf97
commit
dee22d2a81
|
@ -21,71 +21,99 @@
|
|||
|
||||
static const struct data
|
||||
{
|
||||
float64_t poly[3];
|
||||
float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
|
||||
double c0, c2;
|
||||
double c1, c3;
|
||||
float64_t inv_ln2, ln2_hi, ln2_lo, shift;
|
||||
uint64_t special_bound;
|
||||
} data = {
|
||||
.poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
|
||||
0x1.5555576a59599p-5, },
|
||||
/* Generated using Remez, in [-log(2)/128, log(2)/128]. */
|
||||
.c0 = 0x1.fffffffffdbcdp-2,
|
||||
.c1 = 0x1.555555555444cp-3,
|
||||
.c2 = 0x1.555573c6a9f7dp-5,
|
||||
.c3 = 0x1.1111266d28935p-7,
|
||||
.ln2_hi = 0x1.62e42fefa3800p-1,
|
||||
.ln2_lo = 0x1.ef35793c76730p-45,
|
||||
/* 1/ln2. */
|
||||
.inv_ln2 = 0x1.71547652b82fep+0,
|
||||
.shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022. */
|
||||
|
||||
.inv_ln2 = 0x1.71547652b82fep8, /* N/ln2. */
|
||||
/* -ln2/N. */
|
||||
.ln2_hi = -0x1.62e42fefa39efp-9,
|
||||
.ln2_lo = -0x1.abc9e3b39803f3p-64,
|
||||
.shift = 0x1.8p+52,
|
||||
.thres = 704.0,
|
||||
|
||||
/* 0x1.6p9, above which exp overflows. */
|
||||
.special_bound = 0x4086000000000000,
|
||||
/* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows. */
|
||||
.special_bound = 0x40862e37e7d8ba72,
|
||||
};
|
||||
|
||||
static svfloat64_t NOINLINE
|
||||
special_case (svfloat64_t x, svbool_t pg, svfloat64_t t, svbool_t special)
|
||||
{
|
||||
svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
|
||||
svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
|
||||
svfloat64_t y = svadd_x (pg, half_t, half_over_t);
|
||||
return sv_call_f64 (cosh, x, y, special);
|
||||
}
|
||||
|
||||
/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
|
||||
special-case handling or tail. */
|
||||
/* Helper for approximating exp(x)/2.
|
||||
Functionally identical to FEXPA exp(x), but an adjustment in
|
||||
the shift value which leads to a reduction in the exponent of scale by 1,
|
||||
thus halving the result at no cost. */
|
||||
static inline svfloat64_t
|
||||
exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
|
||||
exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
|
||||
{
|
||||
/* Calculate exp(x). */
|
||||
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
|
||||
svuint64_t u = svreinterpret_u64 (z);
|
||||
svfloat64_t n = svsub_x (pg, z, d->shift);
|
||||
|
||||
svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
|
||||
r = svmla_x (pg, r, n, d->ln2_lo);
|
||||
svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
|
||||
svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
|
||||
|
||||
svuint64_t u = svreinterpret_u64 (z);
|
||||
svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
|
||||
svuint64_t i = svand_x (svptrue_b64 (), u, 0xff);
|
||||
svfloat64_t r = x;
|
||||
r = svmls_lane (r, n, ln2, 0);
|
||||
r = svmls_lane (r, n, ln2, 1);
|
||||
|
||||
svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
|
||||
y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
|
||||
y = svmla_x (pg, sv_f64 (1.0), r, y);
|
||||
y = svmul_x (svptrue_b64 (), r, y);
|
||||
svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
|
||||
svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
|
||||
svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
|
||||
svfloat64_t p = svmla_x (pg, r, p04, r2);
|
||||
|
||||
/* s = 2^(n/N). */
|
||||
u = svld1_gather_index (pg, __v_exp_tail_data, i);
|
||||
svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
|
||||
svfloat64_t scale = svexpa (u);
|
||||
|
||||
return svmla_x (pg, s, s, y);
|
||||
return svmla_x (pg, scale, scale, p);
|
||||
}
|
||||
|
||||
/* Vectorised special case to handle values past where exp_inline overflows.
|
||||
Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
|
||||
the valid range of inputs, and returns inf for anything past that. */
|
||||
static svfloat64_t NOINLINE
|
||||
special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t,
|
||||
const struct data *d)
|
||||
{
|
||||
/* Finish fast path to compute values for non-special cases. */
|
||||
svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25);
|
||||
svfloat64_t y = svadd_x (pg, t, inv_twoexp);
|
||||
|
||||
/* Halves input value, and then check if any cases
|
||||
are still going to overflow. */
|
||||
ax = svmul_x (special, ax, 0.5);
|
||||
svbool_t is_safe
|
||||
= svcmplt (special, svreinterpret_u64 (ax), d->special_bound);
|
||||
|
||||
/* Computes exp(x/2), and sets any overflowing lanes to inf. */
|
||||
svfloat64_t half_exp = exp_over_two_inline (special, ax, d);
|
||||
half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY));
|
||||
|
||||
/* Construct special case cosh(x) = (exp(x/2)^2)/2. */
|
||||
svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2);
|
||||
svfloat64_t special_y = svmul_x (special, exp, half_exp);
|
||||
|
||||
/* Select correct return values for special and non-special cases. */
|
||||
special_y = svsel (special, special_y, y);
|
||||
|
||||
/* Ensure an input of nan is correctly propagated. */
|
||||
svbool_t is_nan
|
||||
= svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000));
|
||||
return svsel (is_nan, ax, svsel (special, special_y, y));
|
||||
}
|
||||
|
||||
/* Approximation for SVE double-precision cosh(x) using exp_inline.
|
||||
cosh(x) = (exp(x) + exp(-x)) / 2.
|
||||
The greatest observed error is in the scalar fall-back region, so is the
|
||||
same as the scalar routine, 1.93 ULP:
|
||||
_ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
|
||||
want 0x1.fd774e958236fp+1021.
|
||||
The greatest observed error in special case region is 2.66 + 0.5 ULP:
|
||||
_ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023
|
||||
want 0x1.f9b2d3d22399bp+1023
|
||||
|
||||
The greatest observed error in the non-special region is 1.54 ULP:
|
||||
_ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
|
||||
want 0x1.f5e2bb8d5c991p+8. */
|
||||
The greatest observed error in the non-special region is 1.01 + 0.5 ULP:
|
||||
_ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3
|
||||
want 0x1.890b225657f82p+3. */
|
||||
svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
@ -94,14 +122,13 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
|
|||
svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
|
||||
|
||||
/* Up to the point that exp overflows, we can use it to calculate cosh by
|
||||
exp(|x|) / 2 + 1 / (2 * exp(|x|)). */
|
||||
svfloat64_t t = exp_inline (ax, pg, d);
|
||||
(exp(|x|)/2 + 1) / (2 * exp(|x|)). */
|
||||
svfloat64_t half_exp = exp_over_two_inline (pg, ax, d);
|
||||
|
||||
/* Fall back to scalar for any special cases. */
|
||||
/* Falls back to entirely standalone vectorized special case. */
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
return special_case (x, pg, t, special);
|
||||
return special_case (pg, special, ax, half_exp, d);
|
||||
|
||||
svfloat64_t half_t = svmul_x (svptrue_b64 (), t, 0.5);
|
||||
svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
|
||||
return svadd_x (pg, half_t, half_over_t);
|
||||
svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25);
|
||||
return svadd_x (pg, half_exp, inv_twoexp);
|
||||
}
|
||||
|
|
|
@ -18,90 +18,153 @@
|
|||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
#include "poly_sve_f64.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64_t poly[11];
|
||||
float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift;
|
||||
uint64_t halff;
|
||||
int64_t onef;
|
||||
uint64_t large_bound;
|
||||
double c2, c4;
|
||||
double inv_ln2;
|
||||
double ln2_hi, ln2_lo;
|
||||
double c0, c1, c3;
|
||||
double shift, special_bound, bound;
|
||||
uint64_t expm1_data[20];
|
||||
} data = {
|
||||
/* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
|
||||
.poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
|
||||
0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
|
||||
0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
|
||||
0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
|
||||
0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
|
||||
|
||||
.inv_ln2 = 0x1.71547652b82fep0,
|
||||
.m_ln2_hi = -0x1.62e42fefa39efp-1,
|
||||
.m_ln2_lo = -0x1.abc9e3b39803fp-56,
|
||||
.shift = 0x1.8p52,
|
||||
/* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
|
||||
.expm1_data = {
|
||||
0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
|
||||
0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
|
||||
0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
|
||||
0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
|
||||
0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
|
||||
},
|
||||
|
||||
/* Generated using Remez, in [-log(2)/128, log(2)/128]. */
|
||||
.c0 = 0x1p-1,
|
||||
.c1 = 0x1.55555555548f9p-3,
|
||||
.c2 = 0x1.5555555554c22p-5,
|
||||
.c3 = 0x1.111123aaa2fb2p-7,
|
||||
.c4 = 0x1.6c16d77d98e5bp-10,
|
||||
.ln2_hi = 0x1.62e42fefa3800p-1,
|
||||
.ln2_lo = 0x1.ef35793c76730p-45,
|
||||
.inv_ln2 = 0x1.71547652b82fep+0,
|
||||
.shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
|
||||
.halff = 0x3fe0000000000000,
|
||||
.onef = 0x3ff0000000000000,
|
||||
/* 2^9. expm1 helper overflows for large input. */
|
||||
.large_bound = 0x4080000000000000,
|
||||
.special_bound = 0x1.62e37e7d8ba72p+9, /* ln(2^(1024 - 1/128)). */
|
||||
.bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64. */
|
||||
};
|
||||
|
||||
/* A specialised FEXPA expm1 that is only valid for positive inputs and
|
||||
has no special cases. Based on the full FEXPA expm1 implemented for
|
||||
_ZGVsMxv_expm1, with a slightly modified file to keep sinh under 3.5ULP. */
|
||||
static inline svfloat64_t
|
||||
expm1_inline (svfloat64_t x, svbool_t pg)
|
||||
expm1_inline (svbool_t pg, svfloat64_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* Reduce argument:
|
||||
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
|
||||
where i = round(x / ln2)
|
||||
and f = x - i * ln2 (f in [-ln2/2, ln2/2]). */
|
||||
svfloat64_t j
|
||||
= svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
|
||||
svint64_t i = svcvt_s64_x (pg, j);
|
||||
svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi);
|
||||
f = svmla_x (pg, f, j, d->m_ln2_lo);
|
||||
/* Approximate expm1(f) using polynomial. */
|
||||
svfloat64_t f2 = svmul_x (pg, f, f);
|
||||
svfloat64_t f4 = svmul_x (pg, f2, f2);
|
||||
svfloat64_t f8 = svmul_x (pg, f4, f4);
|
||||
svfloat64_t p
|
||||
= svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
|
||||
/* t = 2^i. */
|
||||
svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
|
||||
/* expm1(x) ~= p * t + (t - 1). */
|
||||
return svmla_x (pg, svsub_x (pg, t, 1.0), p, t);
|
||||
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
|
||||
svuint64_t u = svreinterpret_u64 (z);
|
||||
svfloat64_t n = svsub_x (pg, z, d->shift);
|
||||
|
||||
svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
|
||||
svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
|
||||
|
||||
svfloat64_t r = x;
|
||||
r = svmls_lane (r, n, ln2, 0);
|
||||
r = svmls_lane (r, n, ln2, 1);
|
||||
|
||||
svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
|
||||
svfloat64_t p;
|
||||
svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
|
||||
svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
|
||||
p = svmad_x (pg, c34, r2, c12);
|
||||
p = svmad_x (pg, p, r, sv_f64 (d->c0));
|
||||
p = svmad_x (pg, p, r2, r);
|
||||
|
||||
svfloat64_t scale = svexpa (u);
|
||||
|
||||
/* We want to construct expm1(x) = (scale - 1) + scale * poly.
|
||||
However, for values of scale close to 1, scale-1 causes large ULP errors
|
||||
due to cancellation.
|
||||
|
||||
This can be circumvented by using a small lookup for scale-1
|
||||
when our input is below a certain bound, otherwise we can use FEXPA. */
|
||||
svbool_t is_small = svaclt (pg, x, d->bound);
|
||||
|
||||
/* Index via the input of FEXPA, but we only care about the lower 5 bits. */
|
||||
svuint64_t base_idx = svand_x (pg, u, 0x1f);
|
||||
|
||||
/* Compute scale - 1 from FEXPA, and lookup values where this fails. */
|
||||
svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
|
||||
svuint64_t scalem1_lookup
|
||||
= svld1_gather_index (is_small, d->expm1_data, base_idx);
|
||||
|
||||
/* Select the appropriate scale - 1 value based on x. */
|
||||
svfloat64_t scalem1
|
||||
= svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
|
||||
|
||||
/* return expm1 = scale - 1 + (scale * poly). */
|
||||
return svmla_x (pg, scalem1, scale, p);
|
||||
}
|
||||
|
||||
/* Vectorised special case to handle values past where exp_inline overflows.
|
||||
Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
|
||||
the valid range of inputs, and returns inf for anything past that. */
|
||||
static svfloat64_t NOINLINE
|
||||
special_case (svfloat64_t x, svbool_t pg)
|
||||
special_case (svbool_t pg, svbool_t special, svfloat64_t ax,
|
||||
svfloat64_t halfsign, const struct data *d)
|
||||
{
|
||||
return sv_call_f64 (sinh, x, x, pg);
|
||||
/* Halves input value, and then check if any cases
|
||||
are still going to overflow. */
|
||||
ax = svmul_x (special, ax, 0.5);
|
||||
svbool_t is_safe = svaclt (special, ax, d->special_bound);
|
||||
|
||||
svfloat64_t t = expm1_inline (pg, ax);
|
||||
|
||||
/* Finish fast path to compute values for non-special cases. */
|
||||
svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
|
||||
y = svmul_x (pg, y, halfsign);
|
||||
|
||||
/* Computes special lane, and set remaining overflow lanes to inf. */
|
||||
svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign);
|
||||
svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t);
|
||||
|
||||
svuint64_t signed_inf
|
||||
= svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign),
|
||||
sv_u64 (0x7ff0000000000000));
|
||||
special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf));
|
||||
|
||||
/* Join resulting vectors together and return. */
|
||||
return svsel (special, special_y, y);
|
||||
}
|
||||
|
||||
/* Approximation for SVE double-precision sinh(x) using expm1.
|
||||
sinh(x) = (exp(x) - exp(-x)) / 2.
|
||||
The greatest observed error is 2.57 ULP:
|
||||
_ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2
|
||||
want 0x1.ab929fc64bd63p-2. */
|
||||
/* Approximation for SVE double-precision sinh(x) using FEXPA expm1.
|
||||
Uses sinh(x) = (e^(2x) - 1) / (2e^x), rewritten for accuracy.
|
||||
The greatest observed error in the non-special region is 2.63 + 0.5 ULP:
|
||||
_ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2
|
||||
want 0x1.c3587faf97b09p-2
|
||||
|
||||
The greatest observed error in the special region is 2.65 + 0.5 ULP:
|
||||
_ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023
|
||||
want 0x1.fffd30eea0063p+1023. */
|
||||
svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
svbool_t special = svacge (pg, x, d->special_bound);
|
||||
svfloat64_t ax = svabs_x (pg, x);
|
||||
svuint64_t sign
|
||||
= sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
|
||||
svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
|
||||
|
||||
svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound);
|
||||
|
||||
/* Fall back to scalar variant for all lanes if any are special. */
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
return special_case (x, pg);
|
||||
return special_case (pg, special, ax, halfsign, d);
|
||||
|
||||
/* Up to the point that expm1 overflows, we can use it to calculate sinh
|
||||
using a slight rearrangement of the definition of sinh. This allows us to
|
||||
retain acceptable accuracy for very small inputs. */
|
||||
svfloat64_t t = expm1_inline (ax, pg);
|
||||
svfloat64_t t = expm1_inline (pg, ax);
|
||||
t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
|
||||
return svmul_x (pg, t, halfsign);
|
||||
}
|
||||
|
|
|
@ -18,83 +18,117 @@
|
|||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
#include "poly_sve_f64.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float64_t poly[11];
|
||||
float64_t inv_ln2, ln2_hi, ln2_lo, shift;
|
||||
uint64_t thresh, tiny_bound;
|
||||
double ln2_hi, ln2_lo;
|
||||
double c2, c4;
|
||||
double c0, c1, c3;
|
||||
double two_over_ln2, shift;
|
||||
uint64_t tiny_bound;
|
||||
double large_bound, fexpa_bound;
|
||||
uint64_t e2xm1_data[20];
|
||||
} data = {
|
||||
/* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2]. */
|
||||
.poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
|
||||
0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
|
||||
0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
|
||||
0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
|
||||
0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
|
||||
|
||||
.inv_ln2 = 0x1.71547652b82fep0,
|
||||
.ln2_hi = -0x1.62e42fefa39efp-1,
|
||||
.ln2_lo = -0x1.abc9e3b39803fp-56,
|
||||
.shift = 0x1.8p52,
|
||||
|
||||
/* Generated using Remez, in [-log(2)/128, log(2)/128]. */
|
||||
.c0 = 0x1p-1,
|
||||
.c1 = 0x1.55555555548f9p-3,
|
||||
.c2 = 0x1.5555555554c22p-5,
|
||||
.c3 = 0x1.111123aaa2fb2p-7,
|
||||
.c4 = 0x1.6c16d77d98e5bp-10,
|
||||
.ln2_hi = 0x1.62e42fefa3800p-1,
|
||||
.ln2_lo = 0x1.ef35793c76730p-45,
|
||||
.two_over_ln2 = 0x1.71547652b82fep+1,
|
||||
.shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023. */
|
||||
.tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27). */
|
||||
/* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound). */
|
||||
.thresh = 0x01f241bf835f9d5f,
|
||||
.large_bound = 0x1.30fc1931f09cap+4, /* arctanh(1 - 2^-54). */
|
||||
.fexpa_bound = 0x1.a56ef8ec924ccp-4, /* 19/64 * ln2/2. */
|
||||
/* Table lookup of 2^(i/64) - 1, for values of i from 0..19. */
|
||||
.e2xm1_data = {
|
||||
0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
|
||||
0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
|
||||
0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
|
||||
0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
|
||||
0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
|
||||
},
|
||||
};
|
||||
|
||||
/* An expm1 inspired, FEXPA based helper function that returns an
|
||||
accurate estimate for e^2x - 1. With no special case or support for
|
||||
negative inputs of x. */
|
||||
static inline svfloat64_t
|
||||
expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
|
||||
e2xm1_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
|
||||
{
|
||||
/* Helper routine for calculating exp(x) - 1. Vector port of the helper from
|
||||
the scalar variant of tanh. */
|
||||
svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->two_over_ln2);
|
||||
svuint64_t u = svreinterpret_u64 (z);
|
||||
svfloat64_t n = svsub_x (pg, z, d->shift);
|
||||
|
||||
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
|
||||
svfloat64_t j
|
||||
= svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
|
||||
svint64_t i = svcvt_s64_x (pg, j);
|
||||
svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
|
||||
f = svmla_x (pg, f, j, d->ln2_lo);
|
||||
/* r = x - n * ln2/2, r is in [-ln2/(2N), ln2/(2N)]. */
|
||||
svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
|
||||
svfloat64_t r = svadd_x (pg, x, x);
|
||||
r = svmls_lane (r, n, ln2, 0);
|
||||
r = svmls_lane (r, n, ln2, 1);
|
||||
|
||||
/* Approximate expm1(f) using polynomial. */
|
||||
svfloat64_t f2 = svmul_x (pg, f, f);
|
||||
svfloat64_t f4 = svmul_x (pg, f2, f2);
|
||||
svfloat64_t p = svmla_x (
|
||||
pg, f, f2,
|
||||
sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
|
||||
/* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6. */
|
||||
svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
|
||||
|
||||
/* t = 2 ^ i. */
|
||||
svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
|
||||
/* expm1(x) = p * t + (t - 1). */
|
||||
return svmla_x (pg, svsub_x (pg, t, 1), p, t);
|
||||
svfloat64_t p;
|
||||
svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
|
||||
svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
|
||||
p = svmad_x (pg, c34, r2, c12);
|
||||
p = svmad_x (pg, p, r, sv_f64 (d->c0));
|
||||
p = svmad_x (pg, p, r2, r);
|
||||
|
||||
svfloat64_t scale = svexpa (u);
|
||||
|
||||
/* We want to construct e2xm1(x) = (scale - 1) + scale * poly.
|
||||
However, for values of scale close to 1, scale-1 causes large ULP errors
|
||||
due to cancellation.
|
||||
|
||||
This can be circumvented by using a small lookup for scale-1
|
||||
when our input is below a certain bound, otherwise we can use FEXPA. */
|
||||
svbool_t is_small = svaclt (pg, x, d->fexpa_bound);
|
||||
|
||||
/* Index via the input of FEXPA, but we only care about the lower 5 bits. */
|
||||
svuint64_t base_idx = svand_x (pg, u, 0x1f);
|
||||
|
||||
/* Compute scale - 1 from FEXPA, and lookup values where this fails. */
|
||||
svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
|
||||
svuint64_t scalem1_lookup
|
||||
= svld1_gather_index (is_small, d->e2xm1_data, base_idx);
|
||||
|
||||
/* Select the appropriate scale - 1 value based on x. */
|
||||
svfloat64_t scalem1
|
||||
= svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
|
||||
return svmla_x (pg, scalem1, scale, p);
|
||||
}
|
||||
|
||||
static svfloat64_t NOINLINE
|
||||
special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
|
||||
{
|
||||
return sv_call_f64 (tanh, x, y, special);
|
||||
}
|
||||
|
||||
/* SVE approximation for double-precision tanh(x), using a simplified
|
||||
version of expm1. The greatest observed error is 2.77 ULP:
|
||||
_ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
|
||||
want -0x1.bd6a21a163624p-3. */
|
||||
/* SVE approximation for double-precision tanh(x), using a modified version of
|
||||
FEXPA expm1 to calculate e^2x - 1.
|
||||
The greatest observed error is 2.79 + 0.5 ULP:
|
||||
_ZGVsMxv_tanh (0x1.fff868eb3c223p-9) got 0x1.fff7be486cae6p-9
|
||||
want 0x1.fff7be486cae9p-9. */
|
||||
svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
|
||||
svbool_t large = svacge (pg, x, d->large_bound);
|
||||
|
||||
/* Trigger special-cases for tiny, boring and infinity/NaN. */
|
||||
svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh);
|
||||
/* We can use tanh(x) = (e^2x - 1) / (e^2x + 1) to approximate tanh.
|
||||
As an additional optimisation, we can ensure more accurate values of e^x
|
||||
by only using positive inputs. So we calculate tanh(|x|), and restore the
|
||||
sign of the input before returning. */
|
||||
svfloat64_t ax = svabs_x (pg, x);
|
||||
svuint64_t sign_bit
|
||||
= sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
|
||||
|
||||
svfloat64_t u = svadd_x (pg, x, x);
|
||||
svfloat64_t p = e2xm1_inline (pg, ax, d);
|
||||
svfloat64_t q = svadd_x (pg, p, 2);
|
||||
|
||||
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
|
||||
svfloat64_t q = expm1_inline (u, pg, d);
|
||||
svfloat64_t qp2 = svadd_x (pg, q, 2);
|
||||
/* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly
|
||||
rounded, at this point we can return 1 directly, with sign correction.
|
||||
This will also act as a guard against our approximation overflowing. */
|
||||
svfloat64_t y = svsel (large, sv_f64 (1.0), svdiv_x (pg, p, q));
|
||||
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
return special_case (x, svdiv_x (pg, q, qp2), special);
|
||||
return svdiv_x (pg, q, qp2);
|
||||
return svreinterpret_f64 (svorr_x (pg, sign_bit, svreinterpret_u64 (y)));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue