mirror of git://sourceware.org/git/glibc.git
aarch64: Optimise AdvSIMD atanhf
Optimise AdvSIMD atanhf by vectorising the special case. There are asymptotes at x = -1 and x = 1. So return inf for these. Values for which |x| > 1, return NaN. R.Throughput difference on V2 with GCC@15: 58-60% improvement in special cases. No regression in fast pass.
This commit is contained in:
parent
0e734b2b0c
commit
f9bb6bcff6
|
|
@ -23,46 +23,68 @@
|
||||||
const static struct data
|
const static struct data
|
||||||
{
|
{
|
||||||
struct v_log1pf_data log1pf_consts;
|
struct v_log1pf_data log1pf_consts;
|
||||||
|
uint32x4_t abs_mask;
|
||||||
|
float32x4_t half;
|
||||||
uint32x4_t one;
|
uint32x4_t one;
|
||||||
|
float32x4_t pinf, minf, nan;
|
||||||
} data = {
|
} data = {
|
||||||
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
|
.log1pf_consts = V_LOG1PF_CONSTANTS_TABLE,
|
||||||
|
.abs_mask = V4 (0x7fffffff),
|
||||||
|
.half = V4 (0.5f),
|
||||||
.one = V4 (0x3f800000),
|
.one = V4 (0x3f800000),
|
||||||
|
.pinf = V4 (INFINITY),
|
||||||
|
.minf = V4 (-INFINITY),
|
||||||
|
.nan = V4 (NAN),
|
||||||
};
|
};
|
||||||
|
|
||||||
#define AbsMask v_u32 (0x7fffffff)
|
static float32x4_t VPCS_ATTR NOINLINE
|
||||||
#define Half v_u32 (0x3f000000)
|
special_case (float32x4_t x, float32x4_t y, float32x4_t halfsign,
|
||||||
|
uint32x4_t special, const struct data *d)
|
||||||
static float32x4_t NOINLINE VPCS_ATTR
|
|
||||||
special_case (float32x4_t x, float32x4_t halfsign, float32x4_t y,
|
|
||||||
uint32x4_t special)
|
|
||||||
{
|
{
|
||||||
return v_call_f32 (atanhf, vbslq_f32 (AbsMask, x, halfsign),
|
y = log1pf_inline (y, &d->log1pf_consts);
|
||||||
vmulq_f32 (halfsign, y), special);
|
y = vmulq_f32 (halfsign, y);
|
||||||
|
|
||||||
|
float32x4_t float_one = vreinterpretq_f32_u32 (d->one);
|
||||||
|
/* Need the NOT here to make an input of NaN return true. */
|
||||||
|
uint32x4_t ret_nan = vmvnq_u32 (vcaltq_f32 (x, float_one));
|
||||||
|
uint32x4_t ret_pinf = vceqzq_f32 (vsubq_f32 (x, float_one));
|
||||||
|
uint32x4_t ret_minf = vceqzq_f32 (vaddq_f32 (x, float_one));
|
||||||
|
|
||||||
|
y = vbslq_f32 (ret_nan, d->nan, y);
|
||||||
|
y = vbslq_f32 (ret_pinf, d->pinf, y);
|
||||||
|
y = vbslq_f32 (ret_minf, d->minf, y);
|
||||||
|
return y;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Approximation for vector single-precision atanh(x) using modified log1p.
|
/* Approximation for vector single-precision atanh(x) using modified log1p.
|
||||||
The maximum error is 2.93 ULP:
|
The maximum error is 2.93 ULP:
|
||||||
_ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
|
_ZGVnN4v_atanhf(0x1.f43d7p-5) got 0x1.f4dcfep-5
|
||||||
want 0x1.f4dcf8p-5. */
|
want 0x1.f4dcf8p-5. */
|
||||||
VPCS_ATTR float32x4_t NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
|
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atanh) (float32x4_t x)
|
||||||
{
|
{
|
||||||
const struct data *d = ptr_barrier (&data);
|
const struct data *d = ptr_barrier (&data);
|
||||||
|
|
||||||
float32x4_t halfsign = vbslq_f32 (AbsMask, v_f32 (0.5), x);
|
/* This computes a 1/2 with the sign bit being the sign bit of x.
|
||||||
|
This is because:
|
||||||
|
atanh(x) = 1/2 ln ((1+x)/(1-x))
|
||||||
|
= 1/2 log1pf ((1+x)/(1-x) - 1)
|
||||||
|
= 1/2 log1pf (2x/(1-x)). */
|
||||||
|
float32x4_t halfsign = vbslq_f32 (d->abs_mask, d->half, x);
|
||||||
float32x4_t ax = vabsq_f32 (x);
|
float32x4_t ax = vabsq_f32 (x);
|
||||||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||||
|
|
||||||
|
/* Values between but not including -1 and 1 are treated appropriately by the
|
||||||
|
fast pass. -1 and 1 are asymptotes of atanh(x), so they are treated as
|
||||||
|
special cases along with any input outside of these bounds. */
|
||||||
uint32x4_t special = vcgeq_u32 (iax, d->one);
|
uint32x4_t special = vcgeq_u32 (iax, d->one);
|
||||||
|
|
||||||
float32x4_t y = vdivq_f32 (vaddq_f32 (ax, ax),
|
float32x4_t r = vdivq_f32 (vaddq_f32 (ax, ax),
|
||||||
vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
|
vsubq_f32 (vreinterpretq_f32_u32 (d->one), ax));
|
||||||
y = log1pf_inline (y, &d->log1pf_consts);
|
|
||||||
|
|
||||||
/* If exceptions not required, pass ax to special-case for shorter dependency
|
|
||||||
chain. If exceptions are required ax will have been zerofied, so have to
|
|
||||||
pass x. */
|
|
||||||
if (__glibc_unlikely (v_any_u32 (special)))
|
if (__glibc_unlikely (v_any_u32 (special)))
|
||||||
return special_case (ax, halfsign, y, special);
|
return special_case (x, r, halfsign, special, d);
|
||||||
|
|
||||||
|
float32x4_t y = log1pf_inline (r, &d->log1pf_consts);
|
||||||
return vmulq_f32 (halfsign, y);
|
return vmulq_f32 (halfsign, y);
|
||||||
}
|
}
|
||||||
libmvec_hidden_def (V_NAME_F1 (atanh))
|
libmvec_hidden_def (V_NAME_F1 (atanh))
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue