AArch64: Fix and improve SVE pow(f) special cases

powf:

Update the scalar special case function to make best use of the new interface.

pow:

Make specialcase NOINLINE to prevent str/ldr leaking into the fast path.
Remove the dependency on sv_call2, as the new callback implementation is
not a performance gain.
Replace it with a vectorised specialcase, since the structure of the
scalar routine is fairly simple.

Throughput gain of about 5-10% on V1 for large values and 25% for subnormal `x`.

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
Pierre Blanchard 2025-11-18 15:09:05 +00:00 committed by Wilco Dijkstra
parent e889160273
commit bb6519de1e
2 changed files with 40 additions and 70 deletions
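
For readers unfamiliar with the scaling trick the diff below relies on: when the
exponent of scale may have overflowed (k > 0), the scalar routine biases the
exponent field down by 1009 and multiplies the finished result by 0x1p1009; when
k < 0 it biases the exponent up by 1022 and multiplies by 0x1p-1022, so the
intermediate scale + scale * tmp stays in normal range. A minimal standalone
sketch of that arithmetic, using illustrative helper names rather than the glibc
ones:

#include <stdint.h>
#include <string.h>

/* Reinterpret a 64-bit pattern as a double (same role as glibc's asdouble).  */
static double
as_double (uint64_t bits)
{
  double d;
  memcpy (&d, &bits, sizeof d);
  return d;
}

/* k > 0: the exponent of scale may have overflowed, so bias it down by 1009
   before forming scale * (1 + tmp), then scale the result back up.  */
static double
finish_overflow (uint64_t sbits, double tmp)
{
  double scale = as_double (sbits - (1009ull << 52));
  return 0x1p1009 * (scale + scale * tmp);
}

/* k < 0: bias the exponent up by 1022 so the intermediate stays normal,
   then multiply by 0x1p-1022 to produce the (possibly subnormal) result.  */
static double
finish_underflow (uint64_t sbits, double tmp)
{
  double scale = as_double (sbits + (1022ull << 52));
  return 0x1p-1022 * (scale + scale * tmp);
}

The new vectorised specialcase in the first changed file performs the same two
cases branch-free: it selects the exponent offset (1009ull << 52 or
-1022ull << 52) and the final factor (0x1p1009 or 0x1p-1022) per lane with
svsel, depending on the sign of k.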

Changed file 1 of 2:

@@ -31,8 +31,8 @@
    The SVE algorithm drops the tail in the exp computation at the price of
    a lower accuracy, slightly above 1ULP.
    The SVE algorithm also drops the special treatement of small (< 2^-65) and
-   large (> 2^63) finite values of |y|, as they only affect non-round to nearest
-   modes.
+   large (> 2^63) finite values of |y|, as they only affect non-round to
+   nearest modes.
 
    Maximum measured error is 1.04 ULPs:
    SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
@@ -156,42 +156,22 @@ sv_zeroinfnan (svbool_t pg, svuint64_t i)
    a double.  (int32_t)KI is the k used in the argument reduction and exponent
    adjustment of scale, positive k here means the result may overflow and
    negative k means the result may underflow.  */
-static inline double
-specialcase (double tmp, uint64_t sbits, uint64_t ki)
-{
-  double scale;
-  if ((ki & 0x80000000) == 0)
-    {
-      /* k > 0, the exponent of scale might have overflowed by <= 460.  */
-      sbits -= 1009ull << 52;
-      scale = asdouble (sbits);
-      return 0x1p1009 * (scale + scale * tmp);
-    }
-  /* k < 0, need special care in the subnormal range.  */
-  sbits += 1022ull << 52;
-  /* Note: sbits is signed scale.  */
-  scale = asdouble (sbits);
-  double y = scale + scale * tmp;
-  return 0x1p-1022 * y;
-}
-
-/* Scalar fallback for special cases of SVE pow's exp.  */
 static inline svfloat64_t
-sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
-                     svfloat64_t y, svbool_t cmp)
+specialcase (svfloat64_t tmp, svuint64_t sbits, svuint64_t ki, svbool_t cmp)
 {
-  svbool_t p = svpfirst (cmp, svpfalse ());
-  while (svptest_any (cmp, p))
-    {
-      double sx1 = svclastb (p, 0, x1);
-      uint64_t su1 = svclastb (p, 0, u1);
-      uint64_t su2 = svclastb (p, 0, u2);
-      double elem = specialcase (sx1, su1, su2);
-      svfloat64_t y2 = sv_f64 (elem);
-      y = svsel (p, y2, y);
-      p = svpnext_b64 (cmp, p);
-    }
-  return y;
+  svbool_t p_pos = svcmpge_n_f64 (cmp, svreinterpret_f64_u64 (ki), 0.0);
+
+  /* Scale up or down depending on sign of k.  */
+  svint64_t offset
+    = svsel_s64 (p_pos, sv_s64 (1009ull << 52), sv_s64 (-1022ull << 52));
+  svfloat64_t factor
+    = svsel_f64 (p_pos, sv_f64 (0x1p1009), sv_f64 (0x1p-1022));
+
+  svuint64_t offset_sbits
+    = svsub_u64_x (cmp, sbits, svreinterpret_u64_s64 (offset));
+  svfloat64_t scale = svreinterpret_f64_u64 (offset_sbits);
+  svfloat64_t res = svmad_f64_x (cmp, scale, tmp, scale);
+  return svmul_f64_x (cmp, res, factor);
 }
 
 /* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
@@ -214,8 +194,8 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
   /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
 
   /* SVE lookup requires 3 separate lookup tables, as opposed to scalar version
-     that uses array of structures.  We also do the lookup earlier in the code to
-     make sure it finishes as early as possible.  */
+     that uses array of structures.  We also do the lookup earlier in the code
+     to make sure it finishes as early as possible.  */
   svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
   svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
   svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
@@ -325,14 +305,14 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
   svbool_t oflow = svcmpge (pg, abstop, HugeExp);
   oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
 
-  /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
-     or underflow.  */
+  /* Handle underflow and overlow in scale.
+     For large |x| values (512 < |x| < 1024), scale * (1 + TMP) can
+     overflow or underflow.  */
   svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
-
-  /* Update result with special and large cases.  */
-  z = sv_call_specialcase (tmp, sbits, ki, z, special);
-
-  /* Handle underflow and overflow.  */
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    z = svsel (special, specialcase (tmp, sbits, ki, special), z);
+
+  /* Handle underflow and overflow in exp.  */
   svbool_t x_is_neg = svcmplt (pg, x, 0);
   svuint64_t sign_mask
     = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
@@ -353,7 +333,7 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
 }
 
 static inline double
-pow_sc (double x, double y)
+pow_specialcase (double x, double y)
 {
   uint64_t ix = asuint64 (x);
   uint64_t iy = asuint64 (y);
@@ -382,6 +362,14 @@ pow_sc (double x, double y)
   return x;
 }
 
+/* Scalar fallback for special case routines with custom signature.  */
+static svfloat64_t NOINLINE
+sv_pow_specialcase (svfloat64_t x1, svfloat64_t x2, svfloat64_t y,
+                    svbool_t cmp)
+{
+  return sv_call2_f64 (pow_specialcase, x1, x2, y, cmp);
+}
+
 svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
@@ -444,7 +432,7 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
 
   /* Cases of zero/inf/nan x or y.  */
   if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
-    vz = sv_call2_f64 (pow_sc, x, y, vz, special);
+    vz = sv_pow_specialcase (x, y, vz, special);
 
   return vz;
 }

Changed file 2 of 2:

@@ -116,11 +116,10 @@ zeroinfnan (uint32_t ix)
    preamble of scalar powf except that we do not update ix and sign_bias.  This
    is done in the preamble of the SVE powf.  */
 static inline float
-powf_specialcase (float x, float y, float z)
+powf_specialcase (float x, float y)
 {
   uint32_t ix = asuint (x);
   uint32_t iy = asuint (y);
-  /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
   if (__glibc_unlikely (zeroinfnan (iy)))
     {
       if (2 * iy == 0)
@@ -142,32 +141,15 @@ powf_specialcase (float x, float y, float z)
        x2 = -x2;
       return iy & 0x80000000 ? 1 / x2 : x2;
     }
-  /* We need a return here in case x<0 and y is integer, but all other tests
-     need to be run.  */
-  return z;
+  /* Return x for convenience, but make sure result is never used.  */
+  return x;
 }
 
 /* Scalar fallback for special case routines with custom signature.  */
 static svfloat32_t NOINLINE
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
 {
-  /* Special cases of x or y: zero, inf and nan.  */
-  svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
-  svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
-  svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
-
-  svbool_t p = svpfirst (cmp, svpfalse ());
-  while (svptest_any (cmp, p))
-    {
-      float sx1 = svclastb (p, 0, x1);
-      float sx2 = svclastb (p, 0, x2);
-      float elem = svclastb (p, 0, y);
-      elem = powf_specialcase (sx1, sx2, elem);
-      svfloat32_t y2 = sv_f32 (elem);
-      y = svsel (p, y2, y);
-      p = svpnext_b32 (cmp, p);
-    }
-  return y;
+  return sv_call2_f32 (powf_specialcase, x1, x2, y, cmp);
 }
 
 /* Compute core for half of the lanes in double precision.  */
@@ -330,7 +312,7 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
   ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
 
   if (__glibc_unlikely (svptest_any (cmp, cmp)))
-    return sv_call_powf_sc (x, y, ret);
+    return sv_call_powf_sc (x, y, ret, cmp);
 
   return ret;
 }