AArch64: Fix and improve SVE pow(f) special cases
powf: Update the scalar special-case function to make best use of the new
interface.

pow: Make specialcase NOINLINE to prevent str/ldr leaking into the fast
path.  Remove the dependency on sv_call2, as the new callback
implementation is not a performance gain.  Replace it with a vectorised
specialcase, since the structure of the scalar routine is fairly simple.

Throughput gain of about 5-10% on V1 for large values and 25% for
subnormal `x`.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
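For context, the sv_call2 interface mentioned above applies a scalar binary function lane by lane under a predicate. Below is a minimal stand-alone sketch of that pattern using standard ACLE intrinsics; it is illustrative only (call2_sketch is a hypothetical name, and glibc's actual helper lives in its internal sv_math.h):

#include <arm_sve.h>

#define NOINLINE __attribute__ ((noinline))

/* Illustrative sketch of a lane-by-lane scalar fallback in the shape of
   sv_call2_f64: extract each active lane of CMP, call F on it, and merge
   the result back into Y.  Marking the wrapper NOINLINE keeps the
   str/ldr register spills needed around the call out of the caller's
   fast path.  */
static svfloat64_t NOINLINE
call2_sketch (double (*f) (double, double), svfloat64_t x1, svfloat64_t x2,
              svfloat64_t y, svbool_t cmp)
{
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      double elem = f (svclastb (p, 0.0, x1), svclastb (p, 0.0, x2));
      y = svsel (p, svdup_f64 (elem), y);
      p = svpnext_b64 (cmp, p);
    }
  return y;
}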
parent e889160273
commit bb6519de1e
@@ -31,8 +31,8 @@
    The SVE algorithm drops the tail in the exp computation at the price of
    a lower accuracy, slightly above 1 ULP.
    The SVE algorithm also drops the special treatment of small (< 2^-65) and
-   large (> 2^63) finite values of |y|, as they only affect non-round to nearest
-   modes.
+   large (> 2^63) finite values of |y|, as they only affect non-round to
+   nearest modes.
 
    Maximum measured error is 1.04 ULPs:
    SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
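As a quick check of the quoted error point, the worst-case input can be evaluated with the scalar pow; this is a small stand-alone harness for illustration, not part of the commit (ULP figures are measured against the correctly rounded result, scalar pow merely gives a nearby reference):

#include <math.h>
#include <stdio.h>

int
main (void)
{
  /* Worst-case input quoted above.  */
  double x = 0x1.3d2d45bc848acp+63;
  double y = -0x1.a48a38b40cd43p-12;
  printf ("pow(x, y) = %a\n", pow (x, y));
  return 0;
}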
@@ -156,42 +156,22 @@ sv_zeroinfnan (svbool_t pg, svuint64_t i)
    a double. (int32_t)KI is the k used in the argument reduction and exponent
    adjustment of scale, positive k here means the result may overflow and
    negative k means the result may underflow.  */
-static inline double
-specialcase (double tmp, uint64_t sbits, uint64_t ki)
-{
-  double scale;
-  if ((ki & 0x80000000) == 0)
-    {
-      /* k > 0, the exponent of scale might have overflowed by <= 460.  */
-      sbits -= 1009ull << 52;
-      scale = asdouble (sbits);
-      return 0x1p1009 * (scale + scale * tmp);
-    }
-  /* k < 0, need special care in the subnormal range.  */
-  sbits += 1022ull << 52;
-  /* Note: sbits is signed scale.  */
-  scale = asdouble (sbits);
-  double y = scale + scale * tmp;
-  return 0x1p-1022 * y;
-}
-
 /* Scalar fallback for special cases of SVE pow's exp.  */
 static inline svfloat64_t
-sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
-                     svfloat64_t y, svbool_t cmp)
+specialcase (svfloat64_t tmp, svuint64_t sbits, svuint64_t ki, svbool_t cmp)
 {
-  svbool_t p = svpfirst (cmp, svpfalse ());
-  while (svptest_any (cmp, p))
-    {
-      double sx1 = svclastb (p, 0, x1);
-      uint64_t su1 = svclastb (p, 0, u1);
-      uint64_t su2 = svclastb (p, 0, u2);
-      double elem = specialcase (sx1, su1, su2);
-      svfloat64_t y2 = sv_f64 (elem);
-      y = svsel (p, y2, y);
-      p = svpnext_b64 (cmp, p);
-    }
-  return y;
+  svbool_t p_pos = svcmpge_n_f64 (cmp, svreinterpret_f64_u64 (ki), 0.0);
+
+  /* Scale up or down depending on sign of k.  */
+  svint64_t offset
+      = svsel_s64 (p_pos, sv_s64 (1009ull << 52), sv_s64 (-1022ull << 52));
+  svfloat64_t factor
+      = svsel_f64 (p_pos, sv_f64 (0x1p1009), sv_f64 (0x1p-1022));
+
+  svuint64_t offset_sbits
+      = svsub_u64_x (cmp, sbits, svreinterpret_u64_s64 (offset));
+  svfloat64_t scale = svreinterpret_f64_u64 (offset_sbits);
+  svfloat64_t res = svmad_f64_x (cmp, scale, tmp, scale);
+  return svmul_f64_x (cmp, res, factor);
 }
 
 /* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
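The exponent-rebiasing trick used by both the removed scalar routine and the new vector specialcase can be shown in isolation. Here is a minimal scalar sketch (rebias_sketch is a hypothetical name, and memcpy stands in for glibc's asdouble helper):

#include <stdint.h>
#include <string.h>

/* SBITS holds the bit pattern of scale, whose biased exponent may have
   overflowed by up to 460.  Subtracting 1009 from the exponent field
   keeps the intermediate value finite; the final multiply by 0x1p1009
   restores the magnitude, overflowing to infinity only in the rounded
   result.  */
static double
rebias_sketch (uint64_t sbits, double tmp)
{
  sbits -= 1009ull << 52;
  double scale;
  memcpy (&scale, &sbits, sizeof scale);
  return 0x1p1009 * (scale + scale * tmp);
}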
@@ -214,8 +194,8 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
 
   /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
   /* SVE lookup requires 3 separate lookup tables, as opposed to scalar version
-     that uses array of structures. We also do the lookup earlier in the code to
-     make sure it finishes as early as possible.  */
+     that uses array of structures. We also do the lookup earlier in the code
+     to make sure it finishes as early as possible.  */
   svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
   svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
   svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);
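The structure-of-arrays point in that comment is what makes the gather form possible. A minimal sketch of one such lookup, with a hypothetical table name:

#include <arm_sve.h>

/* With three separate arrays (structure-of-arrays), each field is
   fetched with a single predicated gather.  An array-of-structures
   layout would force strided or serial loads instead.  */
static svfloat64_t
table_lookup_sketch (const double *invc_tbl, svbool_t pg, svuint64_t i)
{
  return svld1_gather_index (pg, invc_tbl, i);
}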
@@ -325,14 +305,14 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
   svbool_t oflow = svcmpge (pg, abstop, HugeExp);
   oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
 
-  /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
-     or underflow.  */
+  /* Handle underflow and overflow in scale.
+     For large |x| values (512 < |x| < 1024), scale * (1 + TMP) can
+     overflow or underflow.  */
   svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    z = svsel (special, specialcase (tmp, sbits, ki, special), z);
 
-  /* Update result with special and large cases.  */
-  z = sv_call_specialcase (tmp, sbits, ki, z, special);
-
-  /* Handle underflow and overflow.  */
+  /* Handle underflow and overflow in exp.  */
   svbool_t x_is_neg = svcmplt (pg, x, 0);
   svuint64_t sign_mask
       = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
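The predicate arithmetic that builds the special mask composes three lane masks. In isolation it reads as follows; this sketch uses hypothetical names for the inputs:

#include <arm_sve.h>

/* Lanes needing the scale fix-up are those flagged by the coarse UOFLOW
   test but not already claimed by the dedicated underflow or overflow
   paths: special = uoflow & ~(uflow | oflow).  */
static svbool_t
special_lanes_sketch (svbool_t pg, svbool_t uoflow, svbool_t uflow,
                      svbool_t oflow)
{
  return svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
}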
@@ -353,7 +333,7 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
 }
 
 static inline double
-pow_sc (double x, double y)
+pow_specialcase (double x, double y)
 {
   uint64_t ix = asuint64 (x);
   uint64_t iy = asuint64 (y);
@@ -382,6 +362,14 @@ pow_sc (double x, double y)
   return x;
 }
 
+/* Scalar fallback for special case routines with custom signature.  */
+static svfloat64_t NOINLINE
+sv_pow_specialcase (svfloat64_t x1, svfloat64_t x2, svfloat64_t y,
+                    svbool_t cmp)
+{
+  return sv_call2_f64 (pow_specialcase, x1, x2, y, cmp);
+}
+
 svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
@@ -444,7 +432,7 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
 
   /* Cases of zero/inf/nan x or y.  */
   if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
-    vz = sv_call2_f64 (pow_sc, x, y, vz, special);
+    vz = sv_pow_specialcase (x, y, vz, special);
 
   return vz;
 }
@@ -116,11 +116,10 @@ zeroinfnan (uint32_t ix)
   preamble of scalar powf except that we do not update ix and sign_bias. This
   is done in the preamble of the SVE powf.  */
 static inline float
-powf_specialcase (float x, float y, float z)
+powf_specialcase (float x, float y)
 {
   uint32_t ix = asuint (x);
   uint32_t iy = asuint (y);
   /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
   if (__glibc_unlikely (zeroinfnan (iy)))
     {
       if (2 * iy == 0)
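The zeroinfnan test named in the hunk header relies on a well-known bit trick. A stand-alone scalar sketch of the same shape, shown here for illustration:

#include <stdint.h>

/* Doubling the bit pattern shifts out the sign, so one comparison covers
   both signs: 2*ix - 1 wraps to UINT32_MAX for +-0, and values at or
   above 2 * 0x7f800000 - 1 are +-inf or NaN.  */
static inline int
zeroinfnan_sketch (uint32_t ix)
{
  return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
}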
@@ -142,32 +141,15 @@ powf_specialcase (float x, float y, float z)
         x2 = -x2;
       return iy & 0x80000000 ? 1 / x2 : x2;
     }
-  /* We need a return here in case x<0 and y is integer, but all other tests
-     need to be run.  */
-  return z;
+  /* Return x for convenience, but make sure result is never used.  */
+  return x;
 }
 
 /* Scalar fallback for special case routines with custom signature.  */
 static svfloat32_t NOINLINE
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
 {
-  /* Special cases of x or y: zero, inf and nan.  */
-  svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
-  svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
-  svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
-
-  svbool_t p = svpfirst (cmp, svpfalse ());
-  while (svptest_any (cmp, p))
-    {
-      float sx1 = svclastb (p, 0, x1);
-      float sx2 = svclastb (p, 0, x2);
-      float elem = svclastb (p, 0, y);
-      elem = powf_specialcase (sx1, sx2, elem);
-      svfloat32_t y2 = sv_f32 (elem);
-      y = svsel (p, y2, y);
-      p = svpnext_b32 (cmp, p);
-    }
-  return y;
+  return sv_call2_f32 (powf_specialcase, x1, x2, y, cmp);
 }
 
 /* Compute core for half of the lanes in double precision.  */
@@ -330,7 +312,7 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
   ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
 
   if (__glibc_unlikely (svptest_any (cmp, cmp)))
-    return sv_call_powf_sc (x, y, ret);
+    return sv_call_powf_sc (x, y, ret, cmp);
 
   return ret;
 }