mirror of git://sourceware.org/git/glibc.git
AArch64: Improve codegen for SVE powf
Improve memory access with indexed/unpredicated instructions. Eliminate register spills. Speedup on Neoverse V1: 3%. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
parent
0b195651db
commit
95e807209b
|
@ -26,7 +26,6 @@
|
|||
#define Tlogc __v_powf_data.logc
|
||||
#define Texp __v_powf_data.scale
|
||||
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
|
||||
#define Shift 0x1.8p52
|
||||
#define Norm 0x1p23f /* 0x4b000000. */
|
||||
|
||||
/* Overall ULP error bound for pow is 2.6 ulp
|
||||
|
@ -36,7 +35,7 @@ static const struct data
|
|||
double log_poly[4];
|
||||
double exp_poly[3];
|
||||
float uflow_bound, oflow_bound, small_bound;
|
||||
uint32_t sign_bias, sign_mask, subnormal_bias, off;
|
||||
uint32_t sign_bias, subnormal_bias, off;
|
||||
} data = {
|
||||
/* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of
|
||||
V_POWF_EXP2_N. */
|
||||
|
@ -53,7 +52,6 @@ static const struct data
|
|||
.small_bound = 0x1p-126f,
|
||||
.off = 0x3f35d000,
|
||||
.sign_bias = SignBias,
|
||||
.sign_mask = 0x80000000,
|
||||
.subnormal_bias = 0x0b800000, /* 23 << 23. */
|
||||
};
|
||||
|
||||
|
@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x)
|
|||
static inline svbool_t
|
||||
sv_zeroinfnan (svbool_t pg, svuint32_t i)
|
||||
{
|
||||
return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
|
||||
return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
|
||||
2u * 0x7f800000 - 1);
|
||||
}
|
||||
|
||||
|
@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z)
|
|||
}
|
||||
|
||||
/* Scalar fallback for special case routines with custom signature. */
|
||||
static inline svfloat32_t
|
||||
sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
|
||||
static svfloat32_t NOINLINE
|
||||
sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
|
||||
{
|
||||
/* Special cases of x or y: zero, inf and nan. */
|
||||
svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
|
||||
svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
|
||||
svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
|
||||
|
||||
svbool_t p = svpfirst (cmp, svpfalse ());
|
||||
while (svptest_any (cmp, p))
|
||||
{
|
||||
|
@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
|
|||
|
||||
/* Polynomial to approximate log1p(r)/ln2. */
|
||||
svfloat64_t logx = A (0);
|
||||
logx = svmla_x (pg, A (1), r, logx);
|
||||
logx = svmla_x (pg, A (2), r, logx);
|
||||
logx = svmla_x (pg, A (3), r, logx);
|
||||
logx = svmla_x (pg, y0, r, logx);
|
||||
logx = svmad_x (pg, r, logx, A (1));
|
||||
logx = svmad_x (pg, r, logx, A (2));
|
||||
logx = svmad_x (pg, r, logx, A (3));
|
||||
logx = svmad_x (pg, r, logx, y0);
|
||||
*pylogx = svmul_x (pg, y, logx);
|
||||
|
||||
/* z - kd is in [-1, 1] in non-nearest rounding modes. */
|
||||
svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
|
||||
svuint64_t ki = svreinterpret_u64 (kd);
|
||||
kd = svsub_x (pg, kd, Shift);
|
||||
svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
|
||||
svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));
|
||||
|
||||
r = svsub_x (pg, *pylogx, kd);
|
||||
|
||||
/* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
|
||||
svuint64_t t
|
||||
= svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
|
||||
svuint64_t ski = svadd_x (pg, ki, sign_bias);
|
||||
t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
|
||||
svuint64_t t = svld1_gather_index (
|
||||
svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
|
||||
svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
|
||||
t = svadd_x (svptrue_b64 (), t,
|
||||
svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
|
||||
svfloat64_t s = svreinterpret_f64 (t);
|
||||
|
||||
svfloat64_t p = C (0);
|
||||
p = svmla_x (pg, C (1), p, r);
|
||||
p = svmla_x (pg, C (2), p, r);
|
||||
p = svmla_x (pg, s, p, svmul_x (pg, s, r));
|
||||
p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));
|
||||
|
||||
return p;
|
||||
}
|
||||
|
@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
|
|||
{
|
||||
const svbool_t ptrue = svptrue_b64 ();
|
||||
|
||||
/* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
|
||||
order to perform core computation in double precision. */
|
||||
/* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
|
||||
* in order to perform core computation in double precision. */
|
||||
const svbool_t pg_lo = svunpklo (pg);
|
||||
const svbool_t pg_hi = svunpkhi (pg);
|
||||
svfloat64_t y_lo = svcvt_f64_x (
|
||||
ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
|
||||
svfloat64_t y_hi = svcvt_f64_x (
|
||||
ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
|
||||
svfloat32_t z = svreinterpret_f32 (iz);
|
||||
svfloat64_t z_lo = svcvt_f64_x (
|
||||
ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
|
||||
svfloat64_t z_hi = svcvt_f64_x (
|
||||
ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
|
||||
svfloat64_t y_lo
|
||||
= svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
|
||||
svfloat64_t y_hi
|
||||
= svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
|
||||
svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
|
||||
svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
|
||||
svuint64_t i_lo = svunpklo (i);
|
||||
svuint64_t i_hi = svunpkhi (i);
|
||||
svint64_t k_lo = svunpklo (k);
|
||||
|
@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
|
|||
/* Implementation of SVE powf.
|
||||
Provides the same accuracy as AdvSIMD powf, since it relies on the same
|
||||
algorithm. The theoretical maximum error is under 2.60 ULPs.
|
||||
Maximum measured error is 2.56 ULPs:
|
||||
SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
|
||||
want 0x1.fd4b06p+127. */
|
||||
Maximum measured error is 2.57 ULPs:
|
||||
SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
|
||||
want 0x1.fff862p+127. */
|
||||
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
|
|||
svuint32_t viy0 = svreinterpret_u32 (y);
|
||||
|
||||
/* Negative x cases. */
|
||||
svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
|
||||
svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
|
||||
svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));
|
||||
|
||||
/* Set sign_bias and ix depending on sign of x and nature of y. */
|
||||
svbool_t yisnotint_xisneg = svpfalse_b ();
|
||||
svbool_t yint_or_xpos = pg;
|
||||
svuint32_t sign_bias = sv_u32 (0);
|
||||
svuint32_t vix = vix0;
|
||||
if (__glibc_unlikely (svptest_any (pg, xisneg)))
|
||||
{
|
||||
/* Determine nature of y. */
|
||||
yisnotint_xisneg = svisnotint (xisneg, y);
|
||||
svbool_t yisint_xisneg = svisint (xisneg, y);
|
||||
yint_or_xpos = svisint (xisneg, y);
|
||||
svbool_t yisodd_xisneg = svisodd (xisneg, y);
|
||||
/* ix set to abs(ix) if y is integer. */
|
||||
vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
|
||||
vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
|
||||
/* Set to SignBias if x is negative and y is odd. */
|
||||
sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
|
||||
}
|
||||
|
@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
|
|||
svbool_t cmp = svorr_z (pg, xspecial, yspecial);
|
||||
|
||||
/* Small cases of x: |x| < 0x1p-126. */
|
||||
svbool_t xsmall = svaclt (pg, x, d->small_bound);
|
||||
if (__glibc_unlikely (svptest_any (pg, xsmall)))
|
||||
svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
|
||||
if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
|
||||
{
|
||||
/* Normalize subnormal x so exponent becomes negative. */
|
||||
svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
|
||||
|
@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
|
|||
vix = svsel (xsmall, vix_norm, vix);
|
||||
}
|
||||
/* Part of core computation carried in working precision. */
|
||||
svuint32_t tmp = svsub_x (pg, vix, d->off);
|
||||
svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
|
||||
V_POWF_LOG2_N - 1);
|
||||
svuint32_t top = svand_x (pg, tmp, 0xff800000);
|
||||
svuint32_t iz = svsub_x (pg, vix, top);
|
||||
svint32_t k
|
||||
= svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
|
||||
svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
|
||||
svuint32_t i = svand_x (
|
||||
yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
|
||||
V_POWF_LOG2_N - 1);
|
||||
svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
|
||||
svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
|
||||
svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
|
||||
(23 - V_POWF_EXP2_TABLE_BITS));
|
||||
|
||||
/* Compute core in extended precision and return intermediate ylogx results to
|
||||
handle cases of underflow and underflow in exp. */
|
||||
/* Compute core in extended precision and return intermediate ylogx results
|
||||
* to handle cases of underflow and underflow in exp. */
|
||||
svfloat32_t ylogx;
|
||||
svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
|
||||
svfloat32_t ret
|
||||
= sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
|
||||
|
||||
/* Handle exp special cases of underflow and overflow. */
|
||||
svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
|
||||
svuint32_t sign
|
||||
= svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
|
||||
svfloat32_t ret_oflow
|
||||
= svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
|
||||
= svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
|
||||
svfloat32_t ret_uflow = svreinterpret_f32 (sign);
|
||||
ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
|
||||
ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
|
||||
ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
|
||||
ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);
|
||||
|
||||
/* Cases of finite y and finite negative x. */
|
||||
ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
|
||||
ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
|
||||
|
||||
if (__glibc_unlikely (svptest_any (pg, cmp)))
|
||||
return sv_call_powf_sc (x, y, ret, cmp);
|
||||
if (__glibc_unlikely (svptest_any (cmp, cmp)))
|
||||
return sv_call_powf_sc (x, y, ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue