mirror of git://sourceware.org/git/glibc.git
AArch64: Improve codegen in users of ADVSIMD expm1f helper
Rearrange operations so MOV is not necessary in reduction or around the special-case handler. Reduce memory access by using more indexed MLAs in polynomial. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
parent
5bc100bd4b
commit
7900ac490d
|
|
@ -18,27 +18,18 @@
|
||||||
<https://www.gnu.org/licenses/>. */
|
<https://www.gnu.org/licenses/>. */
|
||||||
|
|
||||||
#include "v_math.h"
|
#include "v_math.h"
|
||||||
#include "poly_advsimd_f32.h"
|
#include "v_expm1f_inline.h"
|
||||||
|
|
||||||
static const struct data
|
static const struct data
|
||||||
{
|
{
|
||||||
float32x4_t poly[5];
|
struct v_expm1f_data d;
|
||||||
float invln2_and_ln2[4];
|
|
||||||
float32x4_t shift;
|
|
||||||
int32x4_t exponent_bias;
|
|
||||||
#if WANT_SIMD_EXCEPT
|
#if WANT_SIMD_EXCEPT
|
||||||
uint32x4_t thresh;
|
uint32x4_t thresh;
|
||||||
#else
|
#else
|
||||||
float32x4_t oflow_bound;
|
float32x4_t oflow_bound;
|
||||||
#endif
|
#endif
|
||||||
} data = {
|
} data = {
|
||||||
/* Generated using fpminimax with degree=5 in [-log(2)/2, log(2)/2]. */
|
.d = V_EXPM1F_DATA,
|
||||||
.poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5),
|
|
||||||
V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) },
|
|
||||||
/* Stores constants: invln2, ln2_hi, ln2_lo, 0. */
|
|
||||||
.invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 },
|
|
||||||
.shift = V4 (0x1.8p23f),
|
|
||||||
.exponent_bias = V4 (0x3f800000),
|
|
||||||
#if !WANT_SIMD_EXCEPT
|
#if !WANT_SIMD_EXCEPT
|
||||||
/* Value above which expm1f(x) should overflow. Absolute value of the
|
/* Value above which expm1f(x) should overflow. Absolute value of the
|
||||||
underflow bound is greater than this, so it catches both cases - there is
|
underflow bound is greater than this, so it catches both cases - there is
|
||||||
|
|
@ -55,67 +46,38 @@ static const struct data
|
||||||
#define TinyBound v_u32 (0x34000000 << 1)
|
#define TinyBound v_u32 (0x34000000 << 1)
|
||||||
|
|
||||||
static float32x4_t VPCS_ATTR NOINLINE
|
static float32x4_t VPCS_ATTR NOINLINE
|
||||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
special_case (float32x4_t x, uint32x4_t special, const struct data *d)
|
||||||
{
|
{
|
||||||
return v_call_f32 (expm1f, x, y, special);
|
return v_call_f32 (
|
||||||
|
expm1f, x, expm1f_inline (v_zerofy_f32 (x, special), &d->d), special);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Single-precision vector exp(x) - 1 function.
|
/* Single-precision vector exp(x) - 1 function.
|
||||||
The maximum error is 1.51 ULP:
|
The maximum error is 1.62 ULP:
|
||||||
_ZGVnN4v_expm1f (0x1.8baa96p-2) got 0x1.e2fb9p-2
|
_ZGVnN4v_expm1f(0x1.85f83p-2) got 0x1.da9f4p-2
|
||||||
want 0x1.e2fb94p-2. */
|
want 0x1.da9f44p-2. */
|
||||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
|
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (expm1) (float32x4_t x)
|
||||||
{
|
{
|
||||||
const struct data *d = ptr_barrier (&data);
|
const struct data *d = ptr_barrier (&data);
|
||||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
|
||||||
|
|
||||||
#if WANT_SIMD_EXCEPT
|
#if WANT_SIMD_EXCEPT
|
||||||
|
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||||
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|
/* If fp exceptions are to be triggered correctly, fall back to scalar for
|
||||||
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
|
|x| < 2^-23, |x| > oflow_bound, Inf & NaN. Add ix to itself for
|
||||||
shift-left by 1, and compare with thresh which was left-shifted offline -
|
shift-left by 1, and compare with thresh which was left-shifted offline -
|
||||||
this is effectively an absolute compare. */
|
this is effectively an absolute compare. */
|
||||||
uint32x4_t special
|
uint32x4_t special
|
||||||
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
|
= vcgeq_u32 (vsubq_u32 (vaddq_u32 (ix, ix), TinyBound), d->thresh);
|
||||||
if (__glibc_unlikely (v_any_u32 (special)))
|
|
||||||
x = v_zerofy_f32 (x, special);
|
|
||||||
#else
|
#else
|
||||||
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
|
/* Handles very large values (+ve and -ve), +/-NaN, +/-Inf. */
|
||||||
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
|
uint32x4_t special = vcagtq_f32 (x, d->oflow_bound);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Reduce argument to smaller range:
|
|
||||||
Let i = round(x / ln2)
|
|
||||||
and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
|
|
||||||
exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
|
|
||||||
where 2^i is exact because i is an integer. */
|
|
||||||
float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
|
|
||||||
float32x4_t j
|
|
||||||
= vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
|
|
||||||
int32x4_t i = vcvtq_s32_f32 (j);
|
|
||||||
float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
|
|
||||||
f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
|
|
||||||
|
|
||||||
/* Approximate expm1(f) using polynomial.
|
|
||||||
Taylor expansion for expm1(x) has the form:
|
|
||||||
x + ax^2 + bx^3 + cx^4 ....
|
|
||||||
So we calculate the polynomial P(f) = a + bf + cf^2 + ...
|
|
||||||
and assemble the approximation expm1(f) ~= f + f^2 * P(f). */
|
|
||||||
float32x4_t p = v_horner_4_f32 (f, d->poly);
|
|
||||||
p = vfmaq_f32 (f, vmulq_f32 (f, f), p);
|
|
||||||
|
|
||||||
/* Assemble the result.
|
|
||||||
expm1(x) ~= 2^i * (p + 1) - 1
|
|
||||||
Let t = 2^i. */
|
|
||||||
int32x4_t u = vaddq_s32 (vshlq_n_s32 (i, 23), d->exponent_bias);
|
|
||||||
float32x4_t t = vreinterpretq_f32_s32 (u);
|
|
||||||
|
|
||||||
if (__glibc_unlikely (v_any_u32 (special)))
|
if (__glibc_unlikely (v_any_u32 (special)))
|
||||||
return special_case (vreinterpretq_f32_u32 (ix),
|
return special_case (x, special, d);
|
||||||
vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t),
|
|
||||||
special);
|
|
||||||
|
|
||||||
/* expm1(x) ~= p * t + (t - 1). */
|
/* expm1(x) ~= p * t + (t - 1). */
|
||||||
return vfmaq_f32 (vsubq_f32 (t, v_f32 (1.0f)), p, t);
|
return expm1f_inline (x, &d->d);
|
||||||
}
|
}
|
||||||
libmvec_hidden_def (V_NAME_F1 (expm1))
|
libmvec_hidden_def (V_NAME_F1 (expm1))
|
||||||
HALF_WIDTH_ALIAS_F1 (expm1)
|
HALF_WIDTH_ALIAS_F1 (expm1)
|
||||||
|
|
|
||||||
|
|
@ -23,15 +23,13 @@
|
||||||
static const struct data
|
static const struct data
|
||||||
{
|
{
|
||||||
struct v_expm1f_data expm1f_consts;
|
struct v_expm1f_data expm1f_consts;
|
||||||
uint32x4_t halff;
|
|
||||||
#if WANT_SIMD_EXCEPT
|
#if WANT_SIMD_EXCEPT
|
||||||
uint32x4_t tiny_bound, thresh;
|
uint32x4_t tiny_bound, thresh;
|
||||||
#else
|
#else
|
||||||
uint32x4_t oflow_bound;
|
float32x4_t oflow_bound;
|
||||||
#endif
|
#endif
|
||||||
} data = {
|
} data = {
|
||||||
.expm1f_consts = V_EXPM1F_DATA,
|
.expm1f_consts = V_EXPM1F_DATA,
|
||||||
.halff = V4 (0x3f000000),
|
|
||||||
#if WANT_SIMD_EXCEPT
|
#if WANT_SIMD_EXCEPT
|
||||||
/* 0x1.6a09e8p-32, below which expm1f underflows. */
|
/* 0x1.6a09e8p-32, below which expm1f underflows. */
|
||||||
.tiny_bound = V4 (0x2fb504f4),
|
.tiny_bound = V4 (0x2fb504f4),
|
||||||
|
|
@ -39,14 +37,15 @@ static const struct data
|
||||||
.thresh = V4 (0x12fbbbb3),
|
.thresh = V4 (0x12fbbbb3),
|
||||||
#else
|
#else
|
||||||
/* 0x1.61814ep+6, above which expm1f helper overflows. */
|
/* 0x1.61814ep+6, above which expm1f helper overflows. */
|
||||||
.oflow_bound = V4 (0x42b0c0a7),
|
.oflow_bound = V4 (0x1.61814ep+6),
|
||||||
#endif
|
#endif
|
||||||
};
|
};
|
||||||
|
|
||||||
static float32x4_t NOINLINE VPCS_ATTR
|
static float32x4_t NOINLINE VPCS_ATTR
|
||||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
special_case (float32x4_t x, float32x4_t t, float32x4_t halfsign,
|
||||||
|
uint32x4_t special)
|
||||||
{
|
{
|
||||||
return v_call_f32 (sinhf, x, y, special);
|
return v_call_f32 (sinhf, x, vmulq_f32 (t, halfsign), special);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Approximation for vector single-precision sinh(x) using expm1.
|
/* Approximation for vector single-precision sinh(x) using expm1.
|
||||||
|
|
@ -60,15 +59,15 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
|
||||||
|
|
||||||
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
uint32x4_t ix = vreinterpretq_u32_f32 (x);
|
||||||
float32x4_t ax = vabsq_f32 (x);
|
float32x4_t ax = vabsq_f32 (x);
|
||||||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
float32x4_t halfsign = vreinterpretq_f32_u32 (
|
||||||
uint32x4_t sign = veorq_u32 (ix, iax);
|
vbslq_u32 (v_u32 (0x80000000), ix, vreinterpretq_u32_f32 (v_f32 (0.5))));
|
||||||
float32x4_t halfsign = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->halff));
|
|
||||||
|
|
||||||
#if WANT_SIMD_EXCEPT
|
#if WANT_SIMD_EXCEPT
|
||||||
uint32x4_t special = vcgeq_u32 (vsubq_u32 (iax, d->tiny_bound), d->thresh);
|
uint32x4_t special = vcgeq_u32 (
|
||||||
|
vsubq_u32 (vreinterpretq_u32_f32 (ax), d->tiny_bound), d->thresh);
|
||||||
ax = v_zerofy_f32 (ax, special);
|
ax = v_zerofy_f32 (ax, special);
|
||||||
#else
|
#else
|
||||||
uint32x4_t special = vcgeq_u32 (iax, d->oflow_bound);
|
uint32x4_t special = vcageq_f32 (x, d->oflow_bound);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
|
/* Up to the point that expm1f overflows, we can use it to calculate sinhf
|
||||||
|
|
@ -80,7 +79,7 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (sinh) (float32x4_t x)
|
||||||
/* Fall back to the scalar variant for any lanes that should trigger an
|
/* Fall back to the scalar variant for any lanes that should trigger an
|
||||||
exception. */
|
exception. */
|
||||||
if (__glibc_unlikely (v_any_u32 (special)))
|
if (__glibc_unlikely (v_any_u32 (special)))
|
||||||
return special_case (x, vmulq_f32 (t, halfsign), special);
|
return special_case (x, t, halfsign, special);
|
||||||
|
|
||||||
return vmulq_f32 (t, halfsign);
|
return vmulq_f32 (t, halfsign);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -28,13 +28,16 @@ static const struct data
|
||||||
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
|
/* 0x1.205966p+3, above which tanhf rounds to 1 (or -1 for negative). */
|
||||||
.boring_bound = V4 (0x41102cb3),
|
.boring_bound = V4 (0x41102cb3),
|
||||||
.large_bound = V4 (0x7f800000),
|
.large_bound = V4 (0x7f800000),
|
||||||
.onef = V4 (0x3f800000),
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static float32x4_t NOINLINE VPCS_ATTR
|
static float32x4_t NOINLINE VPCS_ATTR
|
||||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
|
special_case (float32x4_t x, uint32x4_t is_boring, float32x4_t boring,
|
||||||
|
float32x4_t q, uint32x4_t special)
|
||||||
{
|
{
|
||||||
return v_call_f32 (tanhf, x, y, special);
|
return v_call_f32 (
|
||||||
|
tanhf, x,
|
||||||
|
vbslq_f32 (is_boring, boring, vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)))),
|
||||||
|
special);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Approximation for single-precision vector tanh(x), using a simplified
|
/* Approximation for single-precision vector tanh(x), using a simplified
|
||||||
|
|
@ -50,7 +53,9 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
|
||||||
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
uint32x4_t iax = vreinterpretq_u32_f32 (ax);
|
||||||
uint32x4_t sign = veorq_u32 (ix, iax);
|
uint32x4_t sign = veorq_u32 (ix, iax);
|
||||||
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
|
uint32x4_t is_boring = vcgtq_u32 (iax, d->boring_bound);
|
||||||
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (sign, d->onef));
|
/* expm1 exponent bias is 1.0f reinterpreted to int. */
|
||||||
|
float32x4_t boring = vreinterpretq_f32_u32 (vorrq_u32 (
|
||||||
|
sign, vreinterpretq_u32_s32 (d->expm1f_consts.exponent_bias)));
|
||||||
|
|
||||||
#if WANT_SIMD_EXCEPT
|
#if WANT_SIMD_EXCEPT
|
||||||
/* If fp exceptions are to be triggered properly, set all special and boring
|
/* If fp exceptions are to be triggered properly, set all special and boring
|
||||||
|
|
@ -66,10 +71,12 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanh) (float32x4_t x)
|
||||||
|
|
||||||
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
|
/* tanh(x) = (e^2x - 1) / (e^2x + 1). */
|
||||||
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
|
float32x4_t q = expm1f_inline (vmulq_n_f32 (x, 2), &d->expm1f_consts);
|
||||||
float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
|
|
||||||
if (__glibc_unlikely (v_any_u32 (special)))
|
if (__glibc_unlikely (v_any_u32 (special)))
|
||||||
return special_case (vreinterpretq_f32_u32 (ix),
|
return special_case (vreinterpretq_f32_u32 (ix), is_boring, boring, q,
|
||||||
vbslq_f32 (is_boring, boring, y), special);
|
special);
|
||||||
|
|
||||||
|
float32x4_t y = vdivq_f32 (q, vaddq_f32 (q, v_f32 (2.0)));
|
||||||
return vbslq_f32 (is_boring, boring, y);
|
return vbslq_f32 (is_boring, boring, y);
|
||||||
}
|
}
|
||||||
libmvec_hidden_def (V_NAME_F1 (tanh))
|
libmvec_hidden_def (V_NAME_F1 (tanh))
|
||||||
|
|
|
||||||
|
|
@ -21,48 +21,47 @@
|
||||||
#define AARCH64_FPU_V_EXPM1F_INLINE_H
|
#define AARCH64_FPU_V_EXPM1F_INLINE_H
|
||||||
|
|
||||||
#include "v_math.h"
|
#include "v_math.h"
|
||||||
#include "poly_advsimd_f32.h"
|
#include "math_config.h"
|
||||||
|
|
||||||
struct v_expm1f_data
|
struct v_expm1f_data
|
||||||
{
|
{
|
||||||
float32x4_t poly[5];
|
float32x4_t c0, c2;
|
||||||
float invln2_and_ln2[4];
|
|
||||||
float32x4_t shift;
|
|
||||||
int32x4_t exponent_bias;
|
int32x4_t exponent_bias;
|
||||||
|
float c1, c3, inv_ln2, c4;
|
||||||
|
float ln2_hi, ln2_lo;
|
||||||
};
|
};
|
||||||
|
|
||||||
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
|
/* Coefficients generated using fpminimax with degree=5 in [-log(2)/2,
|
||||||
log(2)/2]. Exponent bias is asuint(1.0f).
|
log(2)/2]. Exponent bias is asuint(1.0f). */
|
||||||
invln2_and_ln2 Stores constants: invln2, ln2_lo, ln2_hi, 0. */
|
|
||||||
#define V_EXPM1F_DATA \
|
#define V_EXPM1F_DATA \
|
||||||
{ \
|
{ \
|
||||||
.poly = { V4 (0x1.fffffep-2), V4 (0x1.5554aep-3), V4 (0x1.555736p-5), \
|
.c0 = V4 (0x1.fffffep-2), .c1 = 0x1.5554aep-3, .c2 = V4 (0x1.555736p-5), \
|
||||||
V4 (0x1.12287cp-7), V4 (0x1.6b55a2p-10) }, \
|
.c3 = 0x1.12287cp-7, .c4 = 0x1.6b55a2p-10, \
|
||||||
.shift = V4 (0x1.8p23f), .exponent_bias = V4 (0x3f800000), \
|
.exponent_bias = V4 (0x3f800000), .inv_ln2 = 0x1.715476p+0f, \
|
||||||
.invln2_and_ln2 = { 0x1.715476p+0f, 0x1.62e4p-1f, 0x1.7f7d1cp-20f, 0 }, \
|
.ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f, \
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline float32x4_t
|
static inline float32x4_t
|
||||||
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
|
expm1f_inline (float32x4_t x, const struct v_expm1f_data *d)
|
||||||
{
|
{
|
||||||
/* Helper routine for calculating exp(x) - 1.
|
/* Helper routine for calculating exp(x) - 1. */
|
||||||
Copied from v_expm1f_1u6.c, with all special-case handling removed - the
|
|
||||||
calling routine should handle special values if required. */
|
float32x2_t ln2 = vld1_f32 (&d->ln2_hi);
|
||||||
|
float32x4_t lane_consts = vld1q_f32 (&d->c1);
|
||||||
|
|
||||||
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
|
/* Reduce argument: f in [-ln2/2, ln2/2], i is exact. */
|
||||||
float32x4_t invln2_and_ln2 = vld1q_f32 (d->invln2_and_ln2);
|
float32x4_t j = vrndaq_f32 (vmulq_laneq_f32 (x, lane_consts, 2));
|
||||||
float32x4_t j
|
|
||||||
= vsubq_f32 (vfmaq_laneq_f32 (d->shift, x, invln2_and_ln2, 0), d->shift);
|
|
||||||
int32x4_t i = vcvtq_s32_f32 (j);
|
int32x4_t i = vcvtq_s32_f32 (j);
|
||||||
float32x4_t f = vfmsq_laneq_f32 (x, j, invln2_and_ln2, 1);
|
float32x4_t f = vfmsq_lane_f32 (x, j, ln2, 0);
|
||||||
f = vfmsq_laneq_f32 (f, j, invln2_and_ln2, 2);
|
f = vfmsq_lane_f32 (f, j, ln2, 1);
|
||||||
|
|
||||||
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f).
|
/* Approximate expm1(f) with polynomial P, expm1(f) ~= f + f^2 * P(f). */
|
||||||
Uses Estrin scheme, where the main _ZGVnN4v_expm1f routine uses
|
|
||||||
Horner. */
|
|
||||||
float32x4_t f2 = vmulq_f32 (f, f);
|
float32x4_t f2 = vmulq_f32 (f, f);
|
||||||
float32x4_t f4 = vmulq_f32 (f2, f2);
|
float32x4_t f4 = vmulq_f32 (f2, f2);
|
||||||
float32x4_t p = v_estrin_4_f32 (f, f2, f4, d->poly);
|
float32x4_t p01 = vfmaq_laneq_f32 (d->c0, f, lane_consts, 0);
|
||||||
|
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, f, lane_consts, 1);
|
||||||
|
float32x4_t p = vfmaq_f32 (p01, f2, p23);
|
||||||
|
p = vfmaq_laneq_f32 (p, f4, lane_consts, 3);
|
||||||
p = vfmaq_f32 (f, f2, p);
|
p = vfmaq_f32 (f, f2, p);
|
||||||
|
|
||||||
/* t = 2^i. */
|
/* t = 2^i. */
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue