AArch64: Fix and improve SVE pow(f) special cases

powf:

Update the scalar special-case function to make best use of the new interface.

pow:

Make specialcase NOINLINE to prevent str/ldr spills leaking into the
fast path (see the sketch below).
Remove the dependency on sv_call2, as the new callback implementation
is not a performance gain; replace it with a vectorised specialcase,
since the structure of the scalar routine is fairly simple.

Throughput gain of about 5-10% on V1 for large values and 25% for subnormal `x`.
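
As a rough illustration of the NOINLINE point above (the names and constants below are invented for the sketch, and glibc's NOINLINE macro is written out as the plain GCC attribute so the snippet stands alone):

/* Keeping the cold handler out of line means the hot caller does not
   inherit its register pressure, so the compiler has no reason to emit
   callee-saved str/ldr spills on the path that never takes the branch.  */
__attribute__ ((noinline)) static double
cold_case (double x)
{
  /* Stand-in for the expensive, register-hungry special-case handling.  */
  return x * 0x1p-1022;
}

static inline double
hot_path (double x)
{
  if (__builtin_expect (x > 0x1p1009, 0)) /* hypothetical special case */
    return cold_case (x);
  return x + x; /* common case stays free of spill code */
}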

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
Pierre Blanchard 2025-11-18 15:09:05 +00:00 committed by Wilco Dijkstra
parent e889160273
commit bb6519de1e
2 changed files with 40 additions and 70 deletions


@@ -31,8 +31,8 @@
The SVE algorithm drops the tail in the exp computation at the price of
a lower accuracy, slightly above 1ULP.
The SVE algorithm also drops the special treatment of small (< 2^-65) and
large (> 2^63) finite values of |y|, as they only affect non-round to nearest
modes.
large (> 2^63) finite values of |y|, as they only affect non-round to
nearest modes.
Maximum measured error is 1.04 ULPs:
SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
@@ -156,42 +156,22 @@ sv_zeroinfnan (svbool_t pg, svuint64_t i)
a double. (int32_t)KI is the k used in the argument reduction and exponent
adjustment of scale, positive k here means the result may overflow and
negative k means the result may underflow. */
static inline double
specialcase (double tmp, uint64_t sbits, uint64_t ki)
{
double scale;
if ((ki & 0x80000000) == 0)
{
/* k > 0, the exponent of scale might have overflowed by <= 460. */
sbits -= 1009ull << 52;
scale = asdouble (sbits);
return 0x1p1009 * (scale + scale * tmp);
}
/* k < 0, need special care in the subnormal range. */
sbits += 1022ull << 52;
/* Note: sbits is signed scale. */
scale = asdouble (sbits);
double y = scale + scale * tmp;
return 0x1p-1022 * y;
}
/* Scalar fallback for special cases of SVE pow's exp. */
static inline svfloat64_t
sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
svfloat64_t y, svbool_t cmp)
specialcase (svfloat64_t tmp, svuint64_t sbits, svuint64_t ki, svbool_t cmp)
{
svbool_t p = svpfirst (cmp, svpfalse ());
while (svptest_any (cmp, p))
{
double sx1 = svclastb (p, 0, x1);
uint64_t su1 = svclastb (p, 0, u1);
uint64_t su2 = svclastb (p, 0, u2);
double elem = specialcase (sx1, su1, su2);
svfloat64_t y2 = sv_f64 (elem);
y = svsel (p, y2, y);
p = svpnext_b64 (cmp, p);
}
return y;
svbool_t p_pos = svcmpge_n_f64 (cmp, svreinterpret_f64_u64 (ki), 0.0);
/* Scale up or down depending on sign of k. */
svint64_t offset
= svsel_s64 (p_pos, sv_s64 (1009ull << 52), sv_s64 (-1022ull << 52));
svfloat64_t factor
= svsel_f64 (p_pos, sv_f64 (0x1p1009), sv_f64 (0x1p-1022));
svuint64_t offset_sbits
= svsub_u64_x (cmp, sbits, svreinterpret_u64_s64 (offset));
svfloat64_t scale = svreinterpret_f64_u64 (offset_sbits);
svfloat64_t res = svmad_f64_x (cmp, scale, tmp, scale);
return svmul_f64_x (cmp, res, factor);
}
/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
@@ -214,8 +194,8 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
/* log(x) = k*Ln2 + log(c) + log1p(z/c-1). */
/* SVE lookup requires 3 separate lookup tables, as opposed to scalar version
that uses array of structures. We also do the lookup earlier in the code to
make sure it finishes as early as possible. */
that uses array of structures. We also do the lookup earlier in the code
to make sure it finishes as early as possible. */
svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
@@ -325,14 +305,14 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
svbool_t oflow = svcmpge (pg, abstop, HugeExp);
oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
/* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
or underflow. */
/* Handle underflow and overflow in scale.
For large |x| values (512 < |x| < 1024), scale * (1 + TMP) can
overflow or underflow. */
svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
if (__glibc_unlikely (svptest_any (pg, special)))
z = svsel (special, specialcase (tmp, sbits, ki, special), z);
/* Update result with special and large cases. */
z = sv_call_specialcase (tmp, sbits, ki, z, special);
/* Handle underflow and overflow. */
/* Handle underflow and overflow in exp. */
svbool_t x_is_neg = svcmplt (pg, x, 0);
svuint64_t sign_mask
= svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
@@ -353,7 +333,7 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
}
static inline double
pow_sc (double x, double y)
pow_specialcase (double x, double y)
{
uint64_t ix = asuint64 (x);
uint64_t iy = asuint64 (y);
@@ -382,6 +362,14 @@ pow_sc (double x, double y)
return x;
}
/* Scalar fallback for special case routines with custom signature. */
static svfloat64_t NOINLINE
sv_pow_specialcase (svfloat64_t x1, svfloat64_t x2, svfloat64_t y,
svbool_t cmp)
{
return sv_call2_f64 (pow_specialcase, x1, x2, y, cmp);
}
svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
{
const struct data *d = ptr_barrier (&data);
@@ -444,7 +432,7 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
/* Cases of zero/inf/nan x or y. */
if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
vz = sv_call2_f64 (pow_sc, x, y, vz, special);
vz = sv_pow_specialcase (x, y, vz, special);
return vz;
}
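
An aside on the exponent re-bias trick that both the removed scalar specialcase and its vectorised replacement rely on, restated as standalone scalar C for the k > 0 branch (asdouble is assumed to be the usual bit-cast helper and is spelled out with memcpy; the arithmetic mirrors the removed code above). Shifting the biased exponent of sbits down by 1009 keeps the intermediate scale finite, and the single multiply by 0x1p1009 at the end restores the magnitude, so any overflow happens once, in the final correctly-rounded operation.

#include <stdint.h>
#include <string.h>

static double
asdouble (uint64_t bits)
{
  /* Bit-cast u64 -> double without violating aliasing rules.  */
  double d;
  memcpy (&d, &bits, sizeof d);
  return d;
}

/* k > 0: the exponent field encoded in sbits may be too large by up to
   460, so build a 2^-1009-scaled value first and re-apply 0x1p1009.  */
static double
scale_up (double tmp, uint64_t sbits)
{
  double scale = asdouble (sbits - (1009ull << 52));
  return 0x1p1009 * (scale + scale * tmp);
}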


@@ -116,11 +116,10 @@ zeroinfnan (uint32_t ix)
preamble of scalar powf except that we do not update ix and sign_bias. This
is done in the preamble of the SVE powf. */
static inline float
powf_specialcase (float x, float y, float z)
powf_specialcase (float x, float y)
{
uint32_t ix = asuint (x);
uint32_t iy = asuint (y);
/* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan). */
if (__glibc_unlikely (zeroinfnan (iy)))
{
if (2 * iy == 0)
@@ -142,32 +141,15 @@ powf_specialcase (float x, float y, float z)
x2 = -x2;
return iy & 0x80000000 ? 1 / x2 : x2;
}
/* We need a return here in case x<0 and y is integer, but all other tests
need to be run. */
return z;
/* Return x for convenience, but make sure result is never used. */
return x;
}
/* Scalar fallback for special case routines with custom signature. */
static svfloat32_t NOINLINE
sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
{
/* Special cases of x or y: zero, inf and nan. */
svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
svbool_t p = svpfirst (cmp, svpfalse ());
while (svptest_any (cmp, p))
{
float sx1 = svclastb (p, 0, x1);
float sx2 = svclastb (p, 0, x2);
float elem = svclastb (p, 0, y);
elem = powf_specialcase (sx1, sx2, elem);
svfloat32_t y2 = sv_f32 (elem);
y = svsel (p, y2, y);
p = svpnext_b32 (cmp, p);
}
return y;
return sv_call2_f32 (powf_specialcase, x1, x2, y, cmp);
}
/* Compute core for half of the lanes in double precision. */
@@ -330,7 +312,7 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
if (__glibc_unlikely (svptest_any (cmp, cmp)))
return sv_call_powf_sc (x, y, ret);
return sv_call_powf_sc (x, y, ret, cmp);
return ret;
}
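
For context on the sv_call2 helpers that the new NOINLINE wrappers route through: a sketch of the general shape of such a scalar-fallback helper (the signature is assumed and the body is modelled on the per-lane loops removed by this patch; sv_f32 and the real sv_call2_f32 live in sv_math.h and may differ). It walks the lanes selected by cmp one at a time, calls the scalar routine on the extracted elements, and merges each result back into y.

static inline svfloat32_t
sv_call2_f32_sketch (float (*f) (float, float), svfloat32_t x1,
                     svfloat32_t x2, svfloat32_t y, svbool_t cmp)
{
  svbool_t p = svpfirst (cmp, svpfalse ()); /* first active lane */
  while (svptest_any (cmp, p))
    {
      float elem = f (svclastb (p, 0, x1), svclastb (p, 0, x2));
      y = svsel (p, sv_f32 (elem), y); /* overwrite only this lane */
      p = svpnext_b32 (cmp, p);        /* advance to the next lane */
    }
  return y;
}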