AArch64: Fix and improve SVE pow(f) special cases

powf:

Update the scalar special case function to make best use of the new interface.

pow:

Make specialcase NOINLINE to prevent str/ldr leaking into the fast path.
Remove the dependency on sv_call2, as the new callback implementation is
not a performance gain.
Replace it with a vectorised specialcase, since the structure of the
scalar routine is fairly simple.

Throughput gain of about 5-10% on V1 for large values and 25% for subnormal `x`.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
Pierre Blanchard 2025-11-18 15:09:05 +00:00 committed by Wilco Dijkstra
parent e889160273
commit bb6519de1e
2 changed files with 40 additions and 70 deletions
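
For readers unfamiliar with the scaling trick the diff below relies on: when the
exponent of scale may have overflowed (k > 0), the scalar routine biases the
exponent field down by 1009 and multiplies the finished result by 0x1p1009; when
k < 0 it biases the exponent up by 1022 and multiplies by 0x1p-1022, so the
intermediate scale + scale * tmp stays in normal range. A minimal standalone
sketch of that arithmetic, using illustrative helper names rather than the glibc
ones:

#include <stdint.h>
#include <string.h>

/* Reinterpret a 64-bit pattern as a double (same role as glibc's asdouble).  */
static double
as_double (uint64_t bits)
{
  double d;
  memcpy (&d, &bits, sizeof d);
  return d;
}

/* k > 0: the exponent of scale may have overflowed, so bias it down by 1009
   before forming scale * (1 + tmp), then scale the result back up.  */
static double
finish_overflow (uint64_t sbits, double tmp)
{
  double scale = as_double (sbits - (1009ull << 52));
  return 0x1p1009 * (scale + scale * tmp);
}

/* k < 0: bias the exponent up by 1022 so the intermediate stays normal,
   then multiply by 0x1p-1022 to produce the (possibly subnormal) result.  */
static double
finish_underflow (uint64_t sbits, double tmp)
{
  double scale = as_double (sbits + (1022ull << 52));
  return 0x1p-1022 * (scale + scale * tmp);
}

The new vectorised specialcase in the first changed file performs the same two
cases branch-free: it selects the exponent offset (1009ull << 52 or
-1022ull << 52) and the final factor (0x1p1009 or 0x1p-1022) per lane with
svsel, depending on the sign of k.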

Changed file 1 of 2:

@@ -31,8 +31,8 @@
    The SVE algorithm drops the tail in the exp computation at the price of
    a lower accuracy, slightly above 1ULP.
    The SVE algorithm also drops the special treatement of small (< 2^-65) and
-   large (> 2^63) finite values of |y|, as they only affect non-round to nearest
-   modes.
+   large (> 2^63) finite values of |y|, as they only affect non-round to
+   nearest modes.
 
    Maximum measured error is 1.04 ULPs:
    SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
@@ -156,42 +156,22 @@ sv_zeroinfnan (svbool_t pg, svuint64_t i)
    a double.  (int32_t)KI is the k used in the argument reduction and exponent
    adjustment of scale, positive k here means the result may overflow and
    negative k means the result may underflow.  */
-static inline double
-specialcase (double tmp, uint64_t sbits, uint64_t ki)
-{
-  double scale;
-  if ((ki & 0x80000000) == 0)
-    {
-      /* k > 0, the exponent of scale might have overflowed by <= 460.  */
-      sbits -= 1009ull << 52;
-      scale = asdouble (sbits);
-      return 0x1p1009 * (scale + scale * tmp);
-    }
-  /* k < 0, need special care in the subnormal range.  */
-  sbits += 1022ull << 52;
-  /* Note: sbits is signed scale.  */
-  scale = asdouble (sbits);
-  double y = scale + scale * tmp;
-  return 0x1p-1022 * y;
-}
-
-/* Scalar fallback for special cases of SVE pow's exp.  */
 static inline svfloat64_t
-sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
-                     svfloat64_t y, svbool_t cmp)
+specialcase (svfloat64_t tmp, svuint64_t sbits, svuint64_t ki, svbool_t cmp)
 {
-  svbool_t p = svpfirst (cmp, svpfalse ());
-  while (svptest_any (cmp, p))
-    {
-      double sx1 = svclastb (p, 0, x1);
-      uint64_t su1 = svclastb (p, 0, u1);
-      uint64_t su2 = svclastb (p, 0, u2);
-      double elem = specialcase (sx1, su1, su2);
-      svfloat64_t y2 = sv_f64 (elem);
-      y = svsel (p, y2, y);
-      p = svpnext_b64 (cmp, p);
-    }
-  return y;
+  svbool_t p_pos = svcmpge_n_f64 (cmp, svreinterpret_f64_u64 (ki), 0.0);
+
+  /* Scale up or down depending on sign of k.  */
+  svint64_t offset
+    = svsel_s64 (p_pos, sv_s64 (1009ull << 52), sv_s64 (-1022ull << 52));
+  svfloat64_t factor
+    = svsel_f64 (p_pos, sv_f64 (0x1p1009), sv_f64 (0x1p-1022));
+
+  svuint64_t offset_sbits
+    = svsub_u64_x (cmp, sbits, svreinterpret_u64_s64 (offset));
+  svfloat64_t scale = svreinterpret_f64_u64 (offset_sbits);
+  svfloat64_t res = svmad_f64_x (cmp, scale, tmp, scale);
+  return svmul_f64_x (cmp, res, factor);
 }
 
 /* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
@@ -214,8 +194,8 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
   /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
 
   /* SVE lookup requires 3 separate lookup tables, as opposed to scalar version
-     that uses array of structures.  We also do the lookup earlier in the code to
-     make sure it finishes as early as possible.  */
+     that uses array of structures.  We also do the lookup earlier in the code
+     to make sure it finishes as early as possible.  */
   svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
   svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
   svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
@@ -325,14 +305,14 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
   svbool_t oflow = svcmpge (pg, abstop, HugeExp);
   oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
 
-  /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
-     or underflow.  */
+  /* Handle underflow and overlow in scale.
+     For large |x| values (512 < |x| < 1024), scale * (1 + TMP) can
+     overflow or underflow.  */
   svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
-
-  /* Update result with special and large cases.  */
-  z = sv_call_specialcase (tmp, sbits, ki, z, special);
-
-  /* Handle underflow and overflow.  */
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    z = svsel (special, specialcase (tmp, sbits, ki, special), z);
+
+  /* Handle underflow and overflow in exp.  */
   svbool_t x_is_neg = svcmplt (pg, x, 0);
   svuint64_t sign_mask
     = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
@@ -353,7 +333,7 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
 }
 
 static inline double
-pow_sc (double x, double y)
+pow_specialcase (double x, double y)
 {
   uint64_t ix = asuint64 (x);
   uint64_t iy = asuint64 (y);
@@ -382,6 +362,14 @@ pow_sc (double x, double y)
   return x;
 }
 
+/* Scalar fallback for special case routines with custom signature.  */
+static svfloat64_t NOINLINE
+sv_pow_specialcase (svfloat64_t x1, svfloat64_t x2, svfloat64_t y,
+                    svbool_t cmp)
+{
+  return sv_call2_f64 (pow_specialcase, x1, x2, y, cmp);
+}
+
 svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
@@ -444,7 +432,7 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
 
   /* Cases of zero/inf/nan x or y.  */
   if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
-    vz = sv_call2_f64 (pow_sc, x, y, vz, special);
+    vz = sv_pow_specialcase (x, y, vz, special);
 
   return vz;
 }

Changed file 2 of 2:

@@ -116,11 +116,10 @@ zeroinfnan (uint32_t ix)
    preamble of scalar powf except that we do not update ix and sign_bias.  This
    is done in the preamble of the SVE powf.  */
 static inline float
-powf_specialcase (float x, float y, float z)
+powf_specialcase (float x, float y)
 {
   uint32_t ix = asuint (x);
   uint32_t iy = asuint (y);
-  /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
   if (__glibc_unlikely (zeroinfnan (iy)))
     {
       if (2 * iy == 0)
@@ -142,32 +141,15 @@ powf_specialcase (float x, float y, float z)
        x2 = -x2;
       return iy & 0x80000000 ? 1 / x2 : x2;
     }
-  /* We need a return here in case x<0 and y is integer, but all other tests
-     need to be run.  */
-  return z;
+  /* Return x for convenience, but make sure result is never used.  */
+  return x;
 }
 
 /* Scalar fallback for special case routines with custom signature.  */
 static svfloat32_t NOINLINE
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
 {
-  /* Special cases of x or y: zero, inf and nan.  */
-  svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
-  svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
-  svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
-
-  svbool_t p = svpfirst (cmp, svpfalse ());
-  while (svptest_any (cmp, p))
-    {
-      float sx1 = svclastb (p, 0, x1);
-      float sx2 = svclastb (p, 0, x2);
-      float elem = svclastb (p, 0, y);
-      elem = powf_specialcase (sx1, sx2, elem);
-      svfloat32_t y2 = sv_f32 (elem);
-      y = svsel (p, y2, y);
-      p = svpnext_b32 (cmp, p);
-    }
-  return y;
+  return sv_call2_f32 (powf_specialcase, x1, x2, y, cmp);
 }
 
 /* Compute core for half of the lanes in double precision.  */
@@ -330,7 +312,7 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
   ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
 
   if (__glibc_unlikely (svptest_any (cmp, cmp)))
-    return sv_call_powf_sc (x, y, ret);
+    return sv_call_powf_sc (x, y, ret, cmp);
 
   return ret;
 }