mirror of git://sourceware.org/git/glibc.git
aarch64: Optimise AdvSIMD log10
Optimise AdvSIMD log10 by vectorising the special case. For subnormal input values, use the same scaling technique as described in the single precision equivalent. Then check for inf, nan and x<=0.
This commit is contained in:
parent
59c706b418
commit
e3c40c8db0
|
|
@ -21,11 +21,13 @@
|
||||||
|
|
||||||
static const struct data
|
static const struct data
|
||||||
{
|
{
|
||||||
uint64x2_t off, sign_exp_mask, offset_lower_bound;
|
uint64x2_t off, offset_lower_bound;
|
||||||
uint32x4_t special_bound;
|
uint32x4_t special_bound_u32;
|
||||||
|
uint64x2_t sign_exp_mask, special_bound;
|
||||||
double invln10, log10_2;
|
double invln10, log10_2;
|
||||||
double c1, c3;
|
double c1, c3;
|
||||||
float64x2_t c0, c2, c4;
|
float64x2_t c0, c2, c4;
|
||||||
|
float64x2_t pinf, minf, nan;
|
||||||
} data = {
|
} data = {
|
||||||
/* Computed from log coefficients divided by log(10) then rounded to double
|
/* Computed from log coefficients divided by log(10) then rounded to double
|
||||||
precision. */
|
precision. */
|
||||||
|
|
@ -37,12 +39,17 @@ static const struct data
|
||||||
.invln10 = 0x1.bcb7b1526e50ep-2,
|
.invln10 = 0x1.bcb7b1526e50ep-2,
|
||||||
.log10_2 = 0x1.34413509f79ffp-2,
|
.log10_2 = 0x1.34413509f79ffp-2,
|
||||||
.off = V2 (0x3fe6900900000000),
|
.off = V2 (0x3fe6900900000000),
|
||||||
.sign_exp_mask = V2 (0xfff0000000000000),
|
|
||||||
/* Lower bound is 0x0010000000000000. For
|
/* Lower bound is 0x0010000000000000. For
|
||||||
optimised register use subnormals are detected after offset has been
|
optimised register use subnormals are detected after offset has been
|
||||||
subtracted, so lower bound - offset (which wraps around). */
|
subtracted, so lower bound - offset (which wraps around). */
|
||||||
.offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
|
.offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
|
||||||
.special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */
|
.special_bound_u32
|
||||||
|
= V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000. */
|
||||||
|
.sign_exp_mask = V2 (0xfff0000000000000),
|
||||||
|
.special_bound = V2 (0x7ffe000000000000),
|
||||||
|
.pinf = V2 (INFINITY),
|
||||||
|
.minf = V2 (-INFINITY),
|
||||||
|
.nan = V2 (NAN),
|
||||||
};
|
};
|
||||||
|
|
||||||
#define N (1 << V_LOG10_TABLE_BITS)
|
#define N (1 << V_LOG10_TABLE_BITS)
|
||||||
|
|
@ -70,29 +77,8 @@ lookup (uint64x2_t i)
|
||||||
}
|
}
|
||||||
|
|
||||||
static float64x2_t VPCS_ATTR NOINLINE
|
static float64x2_t VPCS_ATTR NOINLINE
|
||||||
special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
|
log10_core (uint64x2_t u, uint64x2_t u_off, const struct data *d)
|
||||||
uint32x2_t special, const struct data *d)
|
|
||||||
{
|
{
|
||||||
float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
|
|
||||||
return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Fast implementation of double-precision vector log10
|
|
||||||
is a slight modification of double-precision vector log.
|
|
||||||
Max ULP error: < 2.5 ulp (nearest rounding.)
|
|
||||||
Maximum measured at 2.46 ulp for x in [0.96, 0.97]
|
|
||||||
_ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
|
|
||||||
want 0x1.fff6be3cae4b9p-6. */
|
|
||||||
float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
|
|
||||||
{
|
|
||||||
const struct data *d = ptr_barrier (&data);
|
|
||||||
|
|
||||||
/* To avoid having to mov x out of the way, keep u after offset has been
|
|
||||||
applied, and recover x by adding the offset back in the special-case
|
|
||||||
handler. */
|
|
||||||
uint64x2_t u = vreinterpretq_u64_f64 (x);
|
|
||||||
uint64x2_t u_off = vsubq_u64 (u, d->off);
|
|
||||||
|
|
||||||
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
|
/* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
|
||||||
The range is split into N subintervals.
|
The range is split into N subintervals.
|
||||||
The ith subinterval contains z and c is near its center. */
|
The ith subinterval contains z and c is near its center. */
|
||||||
|
|
@ -102,9 +88,6 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
|
||||||
|
|
||||||
struct entry e = lookup (u_off);
|
struct entry e = lookup (u_off);
|
||||||
|
|
||||||
uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
|
|
||||||
vget_low_u32 (d->special_bound));
|
|
||||||
|
|
||||||
/* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
|
/* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2). */
|
||||||
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
|
float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
|
||||||
float64x2_t kd = vcvtq_f64_s64 (k);
|
float64x2_t kd = vcvtq_f64_s64 (k);
|
||||||
|
|
@ -125,8 +108,63 @@ float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
|
||||||
float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
|
float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
|
||||||
y = vfmaq_f64 (y, d->c4, r2);
|
y = vfmaq_f64 (y, d->c4, r2);
|
||||||
y = vfmaq_f64 (p, y, r2);
|
y = vfmaq_f64 (p, y, r2);
|
||||||
|
|
||||||
if (__glibc_unlikely (v_any_u32h (special)))
|
|
||||||
return special_case (hi, u_off, y, r2, special, d);
|
|
||||||
return vfmaq_f64 (hi, y, r2);
|
return vfmaq_f64 (hi, y, r2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline float64x2_t VPCS_ATTR
|
||||||
|
special_case (uint64x2_t u_off, const struct data *d)
|
||||||
|
{
|
||||||
|
float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
|
||||||
|
/* If x is special, compute 2log(sqrt(x)), else compute log(x).
|
||||||
|
x might be subnormal, and sqrting it makes it larger.
|
||||||
|
And the above two expressions are equivalent. */
|
||||||
|
uint64x2_t special
|
||||||
|
= vcgeq_u64 (vsubq_u64 (u_off, d->offset_lower_bound), d->special_bound);
|
||||||
|
float64x2_t x_sqrt = vbslq_f64 (special, vsqrtq_f64 (x), x);
|
||||||
|
|
||||||
|
u_off = vsubq_u64 (vreinterpretq_u64_f64 (x_sqrt), d->off);
|
||||||
|
|
||||||
|
/* Don't pass u into this, it isn't using x_sqrt. */
|
||||||
|
float64x2_t y = log10_core (vreinterpretq_u64_f64 (x_sqrt), u_off, d);
|
||||||
|
|
||||||
|
y = vbslq_f64 (special, vmulq_f64 (y, v_f64 (2.0f)), y);
|
||||||
|
|
||||||
|
/* Is true for +/- inf, +/- nan as well as all negative numbers. */
|
||||||
|
uint64x2_t is_infnan
|
||||||
|
= vcgeq_u64 (vreinterpretq_u64_f64 (x), vreinterpretq_u64_f64 (d->pinf));
|
||||||
|
uint64x2_t infnan_or_zero = vorrq_u64 (is_infnan, vceqzq_f64 (x));
|
||||||
|
|
||||||
|
y = vbslq_f64 (infnan_or_zero, d->nan, y);
|
||||||
|
uint64x2_t ret_pinf = vceqq_f64 (x, d->pinf);
|
||||||
|
uint64x2_t ret_minf = vceqzq_f64 (x);
|
||||||
|
y = vbslq_f64 (ret_pinf, d->pinf, y);
|
||||||
|
y = vbslq_f64 (ret_minf, d->minf, y);
|
||||||
|
return y;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Fast implementation of double-precision vector log10
|
||||||
|
is a slight modification of double-precision vector log.
|
||||||
|
Max ULP error: < 2.5 ulp (nearest rounding.)
|
||||||
|
Maximum measured at 2.46 ulp for x in [0.96, 0.97]
|
||||||
|
_ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
|
||||||
|
want 0x1.fff6be3cae4b9p-6. */
|
||||||
|
float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
|
||||||
|
{
|
||||||
|
const struct data *d = ptr_barrier (&data);
|
||||||
|
|
||||||
|
/* To avoid having to mov x out of the way, keep u after offset has been
|
||||||
|
applied, and recover x by adding the offset back in the special-case
|
||||||
|
handler. */
|
||||||
|
uint64x2_t u = vreinterpretq_u64_f64 (x);
|
||||||
|
uint64x2_t u_off = vsubq_u64 (u, d->off);
|
||||||
|
|
||||||
|
uint32x2_t special_u32 = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
|
||||||
|
vget_low_u32 (d->special_bound_u32));
|
||||||
|
|
||||||
|
if (__glibc_unlikely (v_any_u32h (special_u32)))
|
||||||
|
return special_case (u_off, d);
|
||||||
|
/* Making what would usually be an inline function into NOINLINE helps
|
||||||
|
performance. This is because the register allocation in the fast pass does
|
||||||
|
not consider what registers are used in the special case function. */
|
||||||
|
return log10_core (u, u_off, d);
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue