mirror of git://sourceware.org/git/glibc.git
AArch64: Implement exp2m1 and exp10m1 routines
Vector variants of the new C23 exp2m1 & exp10m1 routines. Note: Benchmark inputs for exp2m1 & exp10m1 are identical to exp2 & exp10 respectively, this also includes the floating point variations. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
parent
54bd776f99
commit
8ced7815fb
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -187,6 +187,28 @@
|
|||
#define __DECL_SIMD_expm1f64x
|
||||
#define __DECL_SIMD_expm1f128x
|
||||
|
||||
#define __DECL_SIMD_exp2m1
|
||||
#define __DECL_SIMD_exp2m1f
|
||||
#define __DECL_SIMD_exp2m1l
|
||||
#define __DECL_SIMD_exp2m1f16
|
||||
#define __DECL_SIMD_exp2m1f32
|
||||
#define __DECL_SIMD_exp2m1f64
|
||||
#define __DECL_SIMD_exp2m1f128
|
||||
#define __DECL_SIMD_exp2m1f32x
|
||||
#define __DECL_SIMD_exp2m1f64x
|
||||
#define __DECL_SIMD_exp2m1f128x
|
||||
|
||||
#define __DECL_SIMD_exp10m1
|
||||
#define __DECL_SIMD_exp10m1f
|
||||
#define __DECL_SIMD_exp10m1l
|
||||
#define __DECL_SIMD_exp10m1f16
|
||||
#define __DECL_SIMD_exp10m1f32
|
||||
#define __DECL_SIMD_exp10m1f64
|
||||
#define __DECL_SIMD_exp10m1f128
|
||||
#define __DECL_SIMD_exp10m1f32x
|
||||
#define __DECL_SIMD_exp10m1f64x
|
||||
#define __DECL_SIMD_exp10m1f128x
|
||||
|
||||
#define __DECL_SIMD_sinh
|
||||
#define __DECL_SIMD_sinhf
|
||||
#define __DECL_SIMD_sinhl
|
||||
|
|
|
|||
|
|
@ -136,10 +136,10 @@ __MATHCALL (modf,, (_Mdouble_ __x, _Mdouble_ *__iptr)) __nonnull ((2));
|
|||
__MATHCALL_VEC (exp10,, (_Mdouble_ __x));
|
||||
|
||||
/* Return exp2(X) - 1. */
|
||||
__MATHCALL (exp2m1,, (_Mdouble_ __x));
|
||||
__MATHCALL_VEC (exp2m1,, (_Mdouble_ __x));
|
||||
|
||||
/* Return exp10(X) - 1. */
|
||||
__MATHCALL (exp10m1,, (_Mdouble_ __x));
|
||||
__MATHCALL_VEC (exp10m1,, (_Mdouble_ __x));
|
||||
|
||||
/* Return log2(1 + X). */
|
||||
__MATHCALL (log2p1,, (_Mdouble_ __x));
|
||||
|
|
|
|||
|
|
@ -19,6 +19,8 @@ libmvec-supported-funcs = acos \
|
|||
exp10 \
|
||||
exp2 \
|
||||
expm1 \
|
||||
exp2m1 \
|
||||
exp10m1 \
|
||||
hypot \
|
||||
log \
|
||||
log10 \
|
||||
|
|
|
|||
|
|
@ -179,4 +179,16 @@ libmvec {
|
|||
_ZGVsMxvv_atan2pi;
|
||||
_ZGVsMxvv_atan2pif;
|
||||
}
|
||||
GLIBC_2.43 {
|
||||
_ZGVnN2v_exp2m1;
|
||||
_ZGVnN2v_exp2m1f;
|
||||
_ZGVnN4v_exp2m1f;
|
||||
_ZGVsMxv_exp2m1;
|
||||
_ZGVsMxv_exp2m1f;
|
||||
_ZGVnN2v_exp10m1;
|
||||
_ZGVnN2v_exp10m1f;
|
||||
_ZGVnN4v_exp10m1f;
|
||||
_ZGVsMxv_exp10m1;
|
||||
_ZGVsMxv_exp10m1f;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -36,6 +36,8 @@ libmvec_hidden_proto (V_NAME_F1(exp10));
|
|||
libmvec_hidden_proto (V_NAME_F1(exp2));
|
||||
libmvec_hidden_proto (V_NAME_F1(exp));
|
||||
libmvec_hidden_proto (V_NAME_F1(expm1));
|
||||
libmvec_hidden_proto (V_NAME_F1(exp2m1));
|
||||
libmvec_hidden_proto (V_NAME_F1(exp10m1));
|
||||
libmvec_hidden_proto (V_NAME_F2(hypot));
|
||||
libmvec_hidden_proto (V_NAME_F1(log10));
|
||||
libmvec_hidden_proto (V_NAME_F1(log1p));
|
||||
|
|
|
|||
|
|
@ -113,6 +113,14 @@
|
|||
# define __DECL_SIMD_expm1 __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_expm1f
|
||||
# define __DECL_SIMD_expm1f __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_exp2m1
|
||||
# define __DECL_SIMD_exp2m1 __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_exp2m1f
|
||||
# define __DECL_SIMD_exp2m1f __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_exp10m1
|
||||
# define __DECL_SIMD_exp10m1 __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_exp10m1f
|
||||
# define __DECL_SIMD_exp10m1f __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_hypot
|
||||
# define __DECL_SIMD_hypot __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_hypotf
|
||||
|
|
@ -212,6 +220,8 @@ __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
|
|||
__vpcs __f32x4_t _ZGVnN4v_exp10f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_exp2f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_expm1f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_exp2m1f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_exp10m1f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4vv_hypotf (__f32x4_t, __f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
|
||||
|
|
@ -247,6 +257,8 @@ __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
|
|||
__vpcs __f64x2_t _ZGVnN2v_exp10 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_exp2 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_expm1 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_exp2m1 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_exp10m1 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2vv_hypot (__f64x2_t, __f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
|
||||
|
|
@ -287,6 +299,8 @@ __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
|
|||
__sv_f32_t _ZGVsMxv_exp10f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_exp2f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_expm1f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_exp2m1f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_exp10m1f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxvv_hypotf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
|
||||
|
|
@ -322,6 +336,8 @@ __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
|
|||
__sv_f64_t _ZGVsMxv_exp10 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_exp2 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_expm1 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_exp2m1 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_exp10m1 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxvv_hypot (__sv_f64_t, __sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,202 @@
|
|||
/* Double-precision (Advanced SIMD) exp10m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 306.0 /* floor (log10 (2^1023)) - 1. */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 163840.0 /* 1280.0 * N. */
|
||||
|
||||
/* Value of |x| below which scale - 1 contributes produces large error. */
|
||||
#define TableBound 0x1.a308a4198c9d7p-4 /* log10(2) * 87/256. */
|
||||
|
||||
#define N (1 << V_EXP_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
const static struct data
|
||||
{
|
||||
double c6, c0;
|
||||
double c2, c4;
|
||||
double log2_10_hi, log2_10_lo;
|
||||
float64x2_t c1, c3, c5;
|
||||
float64x2_t log10_2, shift;
|
||||
float64x2_t special_bound, scale_thresh;
|
||||
uint64x2_t sm1_tbl_off, sm1_tbl_mask;
|
||||
float64x2_t rnd2zero;
|
||||
uint64_t scalem1[88];
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm. */
|
||||
.c0 = 0x1.26bb1bbb55516p1,
|
||||
.c1 = V2 (0x1.53524c73cea69p1),
|
||||
.c2 = 0x1.0470591b30b6bp1,
|
||||
.c3 = V2 (0x1.2bd7609b57e25p0),
|
||||
.c4 = 0x1.1517d41bddcbep-1,
|
||||
.c5 = V2 (0x1.a9a12d6f0755cp-3),
|
||||
.c6 = -0x1.76a8d3abd7025p9,
|
||||
|
||||
/* Values of x which should round to the zeroth index of the exp10m1 table. */
|
||||
.rnd2zero = V2 (-0x1.34413509f79ffp-10), /* (2^-8)/log2(10). */
|
||||
.sm1_tbl_off = V2 (24),
|
||||
.sm1_tbl_mask = V2 (0x3f),
|
||||
|
||||
.log10_2 = V2 (0x1.a934f0979a371p8), /* N/log2(10). */
|
||||
.log2_10_hi = 0x1.34413509f79ffp-9, /* log2(10)/N. */
|
||||
.log2_10_lo = -0x1.9dc1da994fd21p-66,
|
||||
.shift = V2 (0x1.8p+52),
|
||||
.scale_thresh = V2 (ScaleBound),
|
||||
.special_bound = V2 (SpecialBound),
|
||||
|
||||
/* Table containing 2^x - 1, for 2^x values close to 1.
|
||||
The table holds values of 2^(i/128) - 1, computed in
|
||||
arbitrary precision.
|
||||
The 1st half contains values associated to i=0..43.
|
||||
The 2nd half contains values associated to i=-44..-1. */
|
||||
.scalem1 = {
|
||||
0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077,
|
||||
0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772,
|
||||
0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec,
|
||||
0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e,
|
||||
0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb,
|
||||
0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c,
|
||||
0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8,
|
||||
0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1,
|
||||
0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9,
|
||||
0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80,
|
||||
0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9,
|
||||
0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86,
|
||||
0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7,
|
||||
0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd,
|
||||
0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c,
|
||||
0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119,
|
||||
0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d,
|
||||
0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e,
|
||||
0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b,
|
||||
0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a,
|
||||
0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7,
|
||||
0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d,
|
||||
0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7,
|
||||
0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074,
|
||||
0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4,
|
||||
0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76,
|
||||
0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268,
|
||||
0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16,
|
||||
0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4,
|
||||
0xbf761eea3847077b,
|
||||
}
|
||||
};
|
||||
|
||||
#define SpecialOffset v_u64 (0x6000000000000000) /* 0x1p513. */
|
||||
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
|
||||
#define SpecialBias1 v_u64 (0x7000000000000000) /* 0x1p769. */
|
||||
#define SpecialBias2 v_u64 (0x3010000000000000) /* 0x1p-254. */
|
||||
|
||||
static inline float64x2_t VPCS_ATTR
|
||||
special_case (float64x2_t s, float64x2_t y, float64x2_t n,
|
||||
const struct data *d)
|
||||
{
|
||||
/* 2^(n/N) may overflow, break it up into s1*s2. */
|
||||
uint64x2_t b = vandq_u64 (vcltzq_f64 (n), SpecialOffset);
|
||||
float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (SpecialBias1, b));
|
||||
float64x2_t s2 = vreinterpretq_f64_u64 (
|
||||
vaddq_u64 (vsubq_u64 (vreinterpretq_u64_f64 (s), SpecialBias2), b));
|
||||
uint64x2_t cmp = vcagtq_f64 (n, d->scale_thresh);
|
||||
float64x2_t r1 = vmulq_f64 (s1, s1);
|
||||
float64x2_t r0 = vmulq_f64 (vfmaq_f64 (s2, y, s2), s1);
|
||||
return vsubq_f64 (vbslq_f64 (cmp, r1, r0), v_f64 (1.0));
|
||||
}
|
||||
|
||||
static inline uint64x2_t
|
||||
lookup_sbits (uint64x2_t i)
|
||||
{
|
||||
return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
|
||||
__v_exp_data[i[1] & IndexMask] };
|
||||
}
|
||||
|
||||
static inline float64x2_t
|
||||
lookup_sm1bits (float64x2_t x, uint64x2_t u, const struct data *d)
|
||||
{
|
||||
/* Extract sign bit and use as offset into table. */
|
||||
uint64x2_t is_neg = vcltq_f64 (x, d->rnd2zero);
|
||||
uint64x2_t offset = vandq_u64 (is_neg, d->sm1_tbl_off);
|
||||
uint64x2_t base_idx = vandq_u64 (u, d->sm1_tbl_mask);
|
||||
uint64x2_t idx = vaddq_u64 (base_idx, offset);
|
||||
|
||||
uint64x2_t sm1 = { d->scalem1[idx[0]], d->scalem1[idx[1]] };
|
||||
return vreinterpretq_f64_u64 (sm1);
|
||||
}
|
||||
|
||||
/* Fast vector implementation of exp10m1.
|
||||
Maximum measured error is 2.53 + 0.5 ulp.
|
||||
_ZGVnN2v_exp10m1 (0x1.347007ffed62cp-10) got 0x1.63955944d1d83p-9
|
||||
want 0x1.63955944d1d80p-9. */
|
||||
float64x2_t VPCS_ATTR V_NAME_D1 (exp10m1) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
uint64x2_t cmp = vcageq_f64 (x, d->special_bound);
|
||||
|
||||
/* n = round(x/(log10(2)/N)). */
|
||||
float64x2_t z = vfmaq_f64 (d->shift, x, d->log10_2);
|
||||
uint64x2_t u = vreinterpretq_u64_f64 (z);
|
||||
float64x2_t n = vsubq_f64 (z, d->shift);
|
||||
|
||||
float64x2_t c24 = vld1q_f64 (&d->c2);
|
||||
float64x2_t c60 = vld1q_f64 (&d->c6);
|
||||
float64x2_t log_2_10 = vld1q_f64 (&d->log2_10_hi);
|
||||
|
||||
/* r = x - n*log10(2)/N. */
|
||||
float64x2_t r = x;
|
||||
r = vfmsq_laneq_f64 (r, n, log_2_10, 0);
|
||||
r = vfmsq_laneq_f64 (r, n, log_2_10, 1);
|
||||
|
||||
/* y = exp10(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4. */
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
float64x2_t p12 = vfmaq_laneq_f64 (d->c1, r, c24, 0);
|
||||
float64x2_t p34 = vfmaq_laneq_f64 (d->c3, r, c24, 1);
|
||||
float64x2_t p56 = vfmaq_laneq_f64 (d->c5, r, c60, 0);
|
||||
|
||||
float64x2_t p36 = vfmaq_f64 (p34, r2, p56);
|
||||
float64x2_t p16 = vfmaq_f64 (p12, r2, p36);
|
||||
|
||||
float64x2_t p0 = vmulq_laneq_f64 (r, c60, 1);
|
||||
float64x2_t p = vfmaq_f64 (p0, r2, p16);
|
||||
|
||||
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
|
||||
|
||||
/* scale = 2^(n/N). */
|
||||
uint64x2_t scale_bits = lookup_sbits (u);
|
||||
float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (scale_bits, e));
|
||||
float64x2_t scalem1 = vsubq_f64 (scale, v_f64 (1.0));
|
||||
|
||||
/* Use table to gather scalem1 for small values of x. */
|
||||
uint64x2_t is_small = vcaltq_f64 (x, v_f64 (TableBound));
|
||||
if (v_any_u64 (is_small))
|
||||
scalem1 = vbslq_f64 (is_small, lookup_sm1bits (x, u, d), scalem1);
|
||||
|
||||
/* Construct exp10m1 = (scale - 1) + scale * poly. */
|
||||
float64x2_t y = vfmaq_f64 (scalem1, scale, p);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (v_any_u64 (cmp)))
|
||||
return vbslq_f64 (cmp, special_case (scale, p, n, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
|
|
@ -0,0 +1,184 @@
|
|||
/* Double-precision (SVE) exp10m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 0x1.33f4bedd4fa70p+8 /* log10(2^(1023 + 1/128)). */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 1280.0
|
||||
|
||||
/* Value of |x| below which scale - 1 contributes produces large error. */
|
||||
#define FexpaBound 0x1.2a9f2b61a7e2p-4 /* 31*(log10(2)/128). */
|
||||
|
||||
static const struct data
|
||||
{
|
||||
double log2_10_hi, log2_10_lo;
|
||||
double c3, c5;
|
||||
double c0, c1, c2, c4;
|
||||
double shift, log10_2, special_bound;
|
||||
uint64_t scalem1[32];
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm. */
|
||||
.c0 = 0x1.26bb1bbb55516p1,
|
||||
.c1 = 0x1.53524c73cea6ap1,
|
||||
.c2 = 0x1.0470591d8bd2ep1,
|
||||
.c3 = 0x1.2bd7609dfea43p0,
|
||||
.c4 = 0x1.142d0d89058f1p-1,
|
||||
.c5 = 0x1.a80cf2ddd513p-3,
|
||||
|
||||
/* 1.5*2^46+1023. This value is further explained below. */
|
||||
.shift = 0x1.800000000ffc0p+46,
|
||||
.log10_2 = 0x1.a934f0979a371p1, /* 1/log2(10). */
|
||||
.log2_10_hi = 0x1.34413509f79ffp-2, /* log2(10). */
|
||||
.log2_10_lo = -0x1.9dc1da994fd21p-59,
|
||||
.special_bound = SpecialBound,
|
||||
|
||||
/* Table emulating FEXPA - 1, for values of FEXPA close to 1.
|
||||
The table holds values of 2^(i/64) - 1, computed in arbitrary precision.
|
||||
The 1st half contains values associated to i=0..+15.
|
||||
The 2nd half contains values associated to i=0..-15. */
|
||||
.scalem1 = {
|
||||
0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f,
|
||||
0x3fa0e8a30eb37901, 0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440,
|
||||
0x3fb1301d0125b50a, 0x3fb429aaea92ddfb, 0x3fb72b83c7d517ae,
|
||||
0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
|
||||
0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709,
|
||||
0x3fc6942d3720185a, 0x0000000000000000, 0xbfc331751ec3a814,
|
||||
0xbfc20224341286e4, 0xbfc0cf85bed0f8b7, 0xbfbf332113d56b1f,
|
||||
0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424,
|
||||
0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a,
|
||||
0xbfaafd11874c009e, 0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89,
|
||||
0xbf95f134923757f3, 0xbf860f9f985bc9f4,
|
||||
},
|
||||
};
|
||||
|
||||
#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
|
||||
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
|
||||
#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
|
||||
#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
|
||||
|
||||
static NOINLINE svfloat64_t
|
||||
special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p,
|
||||
svfloat64_t n)
|
||||
{
|
||||
/* s=2^n may overflow, break it up into s=s1*s2,
|
||||
such that exp = s + s*y can be computed as s1*(s2+s2*y)
|
||||
and s1*s1 overflows only if n>0. */
|
||||
|
||||
/* If n<=0 then set b to 0x6, 0 otherwise. */
|
||||
svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0. */
|
||||
svuint64_t b
|
||||
= svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0. */
|
||||
|
||||
/* Set s1 to generate overflow depending on sign of exponent n,
|
||||
ie. s1 = 0x70...0 - b. */
|
||||
svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
|
||||
/* Offset s to avoid overflow in final result if n is below threshold.
|
||||
ie. s2 = as_u64 (s) - 0x3010...0 + b. */
|
||||
svfloat64_t s2 = svreinterpret_f64 (
|
||||
svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
|
||||
|
||||
/* |n| > 1280 => 2^(n) overflows. */
|
||||
svbool_t p_cmp = svacgt (pg, n, ScaleBound);
|
||||
|
||||
svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
|
||||
svfloat64_t r2 = svmla_x (pg, s2, s2, p);
|
||||
svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
|
||||
|
||||
svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes. */
|
||||
return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0));
|
||||
}
|
||||
|
||||
/* FEXPA based SVE exp10m1 algorithm.
|
||||
Maximum measured error is 2.87 + 0.5 ULP:
|
||||
_ZGVsMxv_exp10m1(0x1.64645f11e94c6p-4) got 0x1.c64d54eb7658dp-3
|
||||
want 0x1.c64d54eb7658ap-3. */
|
||||
svfloat64_t SV_NAME_D1 (exp10m1) (svfloat64_t x, svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
svbool_t special = svacgt (pg, x, d->special_bound);
|
||||
|
||||
/* n = round(x/(log10(2)/N)). */
|
||||
svfloat64_t shift = sv_f64 (d->shift);
|
||||
svfloat64_t z = svmla_x (pg, shift, x, d->log10_2);
|
||||
svfloat64_t n = svsub_x (pg, z, shift);
|
||||
|
||||
/* r = x - n*log10(2)/N. */
|
||||
svfloat64_t log2_10 = svld1rq (svptrue_b64 (), &d->log2_10_hi);
|
||||
svfloat64_t r = x;
|
||||
r = svmls_lane (r, n, log2_10, 0);
|
||||
r = svmls_lane (r, n, log2_10, 1);
|
||||
|
||||
/* scale = 2^(n/N), computed using FEXPA. FEXPA does not propagate NaNs, so
|
||||
for consistent NaN handling we have to manually propagate them. This
|
||||
comes at significant performance cost. */
|
||||
svuint64_t u = svreinterpret_u64 (z);
|
||||
svfloat64_t scale = svexpa (u);
|
||||
svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c3);
|
||||
/* Approximate exp10(r) using polynomial. */
|
||||
svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
svfloat64_t p01 = svmla_x (pg, sv_f64 (d->c0), r, sv_f64 (d->c1));
|
||||
svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c24, 0);
|
||||
svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r, c24, 1);
|
||||
svfloat64_t p25 = svmla_x (pg, p23, p45, r2);
|
||||
svfloat64_t p05 = svmla_x (pg, p01, p25, r2);
|
||||
|
||||
svfloat64_t p = svmul_x (pg, p05, r);
|
||||
|
||||
svfloat64_t scalem1 = svsub_x (pg, scale, 1.0);
|
||||
|
||||
/* For small values, use a lookup table for a more accurate scalem1. */
|
||||
svbool_t is_small = svaclt (pg, x, FexpaBound);
|
||||
if (svptest_any (pg, is_small))
|
||||
{
|
||||
/* Use the low 4 bits of the input of FEXPA as index. */
|
||||
svuint64_t base_idx = svand_x (pg, u, 0xf);
|
||||
|
||||
/* We can use the sign of x as a fifth bit to account for the asymmetry
|
||||
of e^x around 0. */
|
||||
svuint64_t signBit
|
||||
= svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4);
|
||||
svuint64_t idx = svorr_x (pg, base_idx, signBit);
|
||||
|
||||
/* Lookup values for scale - 1 for small x. */
|
||||
svfloat64_t lookup
|
||||
= svreinterpret_f64 (svld1_gather_index (is_small, d->scalem1, idx));
|
||||
|
||||
/* Select the appropriate scale - 1 value based on x. */
|
||||
scalem1 = svsel (is_small, lookup, scalem1);
|
||||
}
|
||||
|
||||
svfloat64_t y = svmla_x (pg, scalem1, scale, p);
|
||||
|
||||
/* FEXPA returns nan for large inputs so we special case those. */
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
{
|
||||
/* FEXPA zeroes the sign bit, however the sign is meaningful to the
|
||||
special case function so needs to be copied.
|
||||
e = sign bit of u << 46. */
|
||||
svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
|
||||
/* Copy sign to scale. */
|
||||
scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
|
||||
return special_case (pg, y, scale, p, n);
|
||||
}
|
||||
|
||||
return y;
|
||||
}
|
||||
|
|
@ -0,0 +1,120 @@
|
|||
/* Single-precision (Advanced SIMD) exp10m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 192.0f
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float log10_2_high, log10_2_low;
|
||||
float log10_lo, c2, c4, c6;
|
||||
float32x4_t log10_hi, c1, c3, c5, c7, c8;
|
||||
float32x4_t inv_log10_2, special_bound;
|
||||
uint32x4_t exponent_bias, special_offset, special_bias;
|
||||
float32x4_t scale_thresh;
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm with minimisation of relative
|
||||
error. */
|
||||
.log10_hi = V4 (0x1.26bb1b8000000p+1),
|
||||
.log10_lo = 0x1.daaa8b0000000p-26,
|
||||
.c1 = V4 (0x1.53524ep1),
|
||||
.c2 = 0x1.046fc8p1,
|
||||
.c3 = V4 (0x1.2bd376p0),
|
||||
.c4 = 0x1.156f8p-1,
|
||||
.c5 = V4 (0x1.b28c0ep-3),
|
||||
.c6 = -0x1.05e38ep-4,
|
||||
.c7 = V4 (-0x1.c79f4ap-4),
|
||||
.c8 = V4 (0x1.2d6f34p1),
|
||||
.inv_log10_2 = V4 (0x1.a934fp+1),
|
||||
.log10_2_high = 0x1.344136p-2,
|
||||
.log10_2_low = 0x1.ec10cp-27,
|
||||
.exponent_bias = V4 (0x3f800000),
|
||||
.special_offset = V4 (0x82000000),
|
||||
.special_bias = V4 (0x7f000000),
|
||||
.scale_thresh = V4 (ScaleBound),
|
||||
.special_bound = V4 (SpecialBound),
|
||||
};
|
||||
|
||||
static float32x4_t VPCS_ATTR NOINLINE
|
||||
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
||||
float32x4_t scale, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
|
||||
float32x4_t r2 = vmulq_f32 (s1, s1);
|
||||
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
|
||||
float32x4_t r = vbslq_f32 (cmp1, r1, r0);
|
||||
return vsubq_f32 (vbslq_f32 (cmp2, r2, r), v_f32 (1.0f));
|
||||
}
|
||||
|
||||
/* Fast vector implementation of single-precision exp10m1.
|
||||
Algorithm is accurate to 1.70 + 0.5 ULP.
|
||||
_ZGVnN4v_exp10m1f(0x1.36f94cp-3) got 0x1.ac96acp-2
|
||||
want 0x1.ac96bp-2. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp10m1) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
|
||||
with poly(r) in [1/sqrt(2), sqrt(2)] and
|
||||
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
|
||||
float32x4_t log10_2 = vld1q_f32 (&d->log10_2_high);
|
||||
float32x4_t n = vrndaq_f32 (vmulq_f32 (x, d->inv_log10_2));
|
||||
float32x4_t r = vfmsq_laneq_f32 (x, n, log10_2, 0);
|
||||
r = vfmaq_laneq_f32 (r, n, log10_2, 1);
|
||||
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (n)), 23);
|
||||
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
|
||||
|
||||
/* Pairwise Horner scheme. */
|
||||
float32x4_t log10lo_c246 = vld1q_f32 (&d->log10_lo);
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
float32x4_t p78 = vfmaq_f32 (d->c7, r, d->c8);
|
||||
float32x4_t p56 = vfmaq_laneq_f32 (d->c5, r, log10lo_c246, 3);
|
||||
float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log10lo_c246, 2);
|
||||
float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log10lo_c246, 1);
|
||||
float32x4_t p58 = vfmaq_f32 (p56, r2, p78);
|
||||
float32x4_t p36 = vfmaq_f32 (p34, r2, p58);
|
||||
float32x4_t p16 = vfmaq_f32 (p12, r2, p36);
|
||||
|
||||
float32x4_t poly
|
||||
= vfmaq_laneq_f32 (vmulq_f32 (d->log10_hi, r), r, log10lo_c246, 0);
|
||||
poly = vfmaq_f32 (poly, p16, r2);
|
||||
|
||||
float32x4_t y = vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (v_any_u32 (cmp)))
|
||||
return vbslq_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F1 (exp10m1))
|
||||
HALF_WIDTH_ALIAS_F1 (exp10m1)
|
||||
|
|
@ -0,0 +1,122 @@
|
|||
/* Single-precision (SVE) exp10m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 192.0f
|
||||
|
||||
static const struct data
|
||||
{
|
||||
float log10_2_high, log10_2_low;
|
||||
float log10_lo, c2, c4, c6, c8;
|
||||
float32_t log10_hi, c1, c3, c5, c7;
|
||||
float32_t inv_log10_2, special_bound;
|
||||
uint32_t exponent_bias, special_offset, special_bias;
|
||||
float32_t scale_thresh;
|
||||
} data = {
|
||||
/* Coefficients generated using Remez algorithm with minimisation of relative
|
||||
error. */
|
||||
.log10_hi = 0x1.26bb1b8000000p+1,
|
||||
.log10_lo = 0x1.daaa8b0000000p-26,
|
||||
.c1 = 0x1.53524ep1,
|
||||
.c2 = 0x1.046fc8p1,
|
||||
.c3 = 0x1.2bd376p0,
|
||||
.c4 = 0x1.156f8p-1,
|
||||
.c5 = 0x1.b28c0ep-3,
|
||||
.c6 = -0x1.05e38ep-4,
|
||||
.c7 = -0x1.c79f4ap-4,
|
||||
.c8 = 0x1.2d6f34p1,
|
||||
.inv_log10_2 = 0x1.a934fp+1,
|
||||
.log10_2_high = 0x1.344136p-2,
|
||||
.log10_2_low = 0x1.ec10cp-27,
|
||||
.exponent_bias = 0x3f800000,
|
||||
.special_offset = 0x82000000,
|
||||
.special_bias = 0x7f000000,
|
||||
.scale_thresh = ScaleBound,
|
||||
.special_bound = SpecialBound,
|
||||
};
|
||||
|
||||
static svfloat32_t NOINLINE
|
||||
special_case (svfloat32_t poly, svfloat32_t n, svuint32_t e, svbool_t cmp1,
|
||||
svfloat32_t scale, const struct data *d)
|
||||
{
|
||||
svbool_t b = svcmple (svptrue_b32 (), n, 0.0f);
|
||||
svfloat32_t s1 = svreinterpret_f32 (
|
||||
svsel (b, sv_u32 (d->special_offset + d->special_bias),
|
||||
sv_u32 (d->special_bias)));
|
||||
svfloat32_t s2
|
||||
= svreinterpret_f32 (svsub_m (b, e, sv_u32 (d->special_offset)));
|
||||
svbool_t cmp2 = svacgt (svptrue_b32 (), n, d->scale_thresh);
|
||||
svfloat32_t r2 = svmul_x (svptrue_b32 (), s1, s1);
|
||||
svfloat32_t r1
|
||||
= svmul_x (svptrue_b32 (), svmla_x (svptrue_b32 (), s2, poly, s2), s1);
|
||||
svfloat32_t r0 = svmla_x (svptrue_b32 (), scale, poly, scale);
|
||||
svfloat32_t r = svsel (cmp1, r1, r0);
|
||||
return svsub_x (svptrue_b32 (), svsel (cmp2, r2, r), 1.0f);
|
||||
}
|
||||
|
||||
/* Fast vector implementation of single-precision exp10.
|
||||
Algorithm is accurate to 1.68 + 0.5 ULP.
|
||||
_ZGVnN4v_exp10m1f(0x1.3aeffep-3) got 0x1.b3139p-2
|
||||
want 0x1.b3138cp-2. */
|
||||
svfloat32_t SV_NAME_F1 (exp10m1) (svfloat32_t x, const svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* exp10(x) = 2^n * 10^r = 2^n * (1 + poly (r)),
|
||||
with poly(r) in [1/sqrt(2), sqrt(2)] and
|
||||
x = r + n * log10 (2), with r in [-log10(2)/2, log10(2)/2]. */
|
||||
svfloat32_t log10_2 = svld1rq (svptrue_b32 (), &d->log10_2_high);
|
||||
svfloat32_t n = svrinta_x (pg, svmul_x (pg, x, d->inv_log10_2));
|
||||
svfloat32_t r = svmls_lane_f32 (x, n, log10_2, 0);
|
||||
r = svmla_lane_f32 (r, n, log10_2, 1);
|
||||
|
||||
svuint32_t e = svlsl_x (pg, svreinterpret_u32 (svcvt_s32_x (pg, n)), 23);
|
||||
|
||||
svfloat32_t scale
|
||||
= svreinterpret_f32 (svadd_n_u32_x (pg, e, d->exponent_bias));
|
||||
svbool_t cmp = svacgt_n_f32 (pg, n, d->special_bound);
|
||||
|
||||
/* Pairwise Horner scheme. */
|
||||
svfloat32_t r2 = svmul_x (pg, r, r);
|
||||
svfloat32_t c2468 = svld1rq (svptrue_b32 (), &d->c2);
|
||||
svfloat32_t p78 = svmla_lane (sv_f32 (d->c7), r, c2468, 3);
|
||||
svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), r, c2468, 2);
|
||||
svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, c2468, 1);
|
||||
svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, c2468, 0);
|
||||
svfloat32_t p58 = svmla_x (pg, p56, r2, p78);
|
||||
svfloat32_t p36 = svmla_x (pg, p34, r2, p58);
|
||||
svfloat32_t p16 = svmla_x (pg, p12, r2, p36);
|
||||
|
||||
svfloat32_t poly = svmla_n_f32_x (pg, svmul_x (pg, r, sv_f32 (d->log10_hi)),
|
||||
r, d->log10_lo);
|
||||
poly = svmla_x (pg, poly, p16, r2);
|
||||
|
||||
svfloat32_t y = svmla_x (pg, svsub_x (pg, scale, 1.0f), poly, scale);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (svptest_any (pg, cmp)))
|
||||
return svsel_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
|
|
@ -0,0 +1,194 @@
|
|||
/* Double-precision (Advanced SIMD) exp2m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 1022.0
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 1280.0
|
||||
|
||||
/* 87/256, value of x under which table lookup is used for 2^x-1. */
|
||||
#define TableBound 0x1.5bfffffffffffp-2
|
||||
|
||||
/* Number of bits for each value in the table. */
|
||||
#define N (1 << V_EXP_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint64x2_t exponent_bias, special_offset, special_bias, special_bias2,
|
||||
sm1_tbl_off, sm1_tbl_mask;
|
||||
float64x2_t scale_thresh, special_bound, shift, rnd2zero;
|
||||
float64x2_t log2_hi, c1, c3, c5;
|
||||
double log2_lo, c2, c4, c6;
|
||||
uint64_t scalem1[88];
|
||||
} data = {
|
||||
/* Coefficients generated using remez's algorithm for exp2m1(x). */
|
||||
.log2_hi = V2 (0x1.62e42fefa39efp-1),
|
||||
.log2_lo = 0x1.abc9e3b39803f3p-56,
|
||||
.c1 = V2 (0x1.ebfbdff82c58ep-3),
|
||||
.c2 = 0x1.c6b08d71f5804p-5,
|
||||
.c3 = V2 (0x1.3b2ab6fee7509p-7),
|
||||
.c4 = 0x1.5d1d37eb33b15p-10,
|
||||
.c5 = V2 (0x1.423f35f371d9ap-13),
|
||||
.c6 = 0x1.e7d57ad9a5f93p-5,
|
||||
.exponent_bias = V2 (0x3ff0000000000000),
|
||||
.special_offset = V2 (0x6000000000000000), /* 0x1p513. */
|
||||
.special_bias = V2 (0x7000000000000000), /* 0x1p769. */
|
||||
.special_bias2 = V2 (0x3010000000000000), /* 0x1p-254. */
|
||||
.scale_thresh = V2 (ScaleBound),
|
||||
.special_bound = V2 (SpecialBound),
|
||||
.shift = V2 (0x1.8p52 / N),
|
||||
.rnd2zero = V2 (-0x1p-8),
|
||||
.sm1_tbl_off = V2 (24),
|
||||
.sm1_tbl_mask = V2 (0x3f),
|
||||
|
||||
/* Table containing 2^x - 1, for 2^x values close to 1.
|
||||
The table holds values of 2^(i/128) - 1, computed in
|
||||
arbitrary precision.
|
||||
The 1st half contains values associated to i=0..43.
|
||||
The 2nd half contains values associated to i=-44..-1. */
|
||||
.scalem1 = {
|
||||
0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077,
|
||||
0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772,
|
||||
0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec,
|
||||
0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e,
|
||||
0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb,
|
||||
0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c,
|
||||
0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8,
|
||||
0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1,
|
||||
0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9,
|
||||
0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80,
|
||||
0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9,
|
||||
0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86,
|
||||
0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7,
|
||||
0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd,
|
||||
0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c,
|
||||
0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119,
|
||||
0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d,
|
||||
0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e,
|
||||
0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b,
|
||||
0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a,
|
||||
0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7,
|
||||
0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d,
|
||||
0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7,
|
||||
0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074,
|
||||
0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4,
|
||||
0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76,
|
||||
0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268,
|
||||
0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16,
|
||||
0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4,
|
||||
0xbf761eea3847077b,
|
||||
}
|
||||
};
|
||||
|
||||
static inline uint64x2_t
|
||||
lookup_sbits (uint64x2_t i)
|
||||
{
|
||||
return (uint64x2_t){ __v_exp_data[i[0] & IndexMask],
|
||||
__v_exp_data[i[1] & IndexMask] };
|
||||
}
|
||||
|
||||
static inline float64x2_t
|
||||
lookup_sm1bits (float64x2_t x, uint64x2_t u, const struct data *d)
|
||||
{
|
||||
/* Extract sign bit and use as offset into table. */
|
||||
uint64x2_t is_neg = vcltq_f64 (x, d->rnd2zero);
|
||||
uint64x2_t offset = vandq_u64 (is_neg, d->sm1_tbl_off);
|
||||
uint64x2_t base_idx = vandq_u64 (u, d->sm1_tbl_mask);
|
||||
uint64x2_t idx = vaddq_u64 (base_idx, offset);
|
||||
|
||||
/* Fall back to table lookup for 2^x - 1, when x is close to zero to
|
||||
avoid large errors. */
|
||||
uint64x2_t sm1 = { d->scalem1[idx[0]], d->scalem1[idx[1]] };
|
||||
return vreinterpretq_f64_u64 (sm1);
|
||||
}
|
||||
|
||||
static inline VPCS_ATTR float64x2_t
|
||||
special_case (float64x2_t poly, float64x2_t n, uint64x2_t e, float64x2_t scale,
|
||||
const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint64x2_t b = vandq_u64 (vclezq_f64 (n), d->special_offset);
|
||||
float64x2_t s1 = vreinterpretq_f64_u64 (vsubq_u64 (d->special_bias, b));
|
||||
float64x2_t s2 = vreinterpretq_f64_u64 (vaddq_u64 (
|
||||
vsubq_u64 (vreinterpretq_u64_f64 (scale), d->special_bias2), b));
|
||||
uint64x2_t cmp2 = vcagtq_f64 (n, d->scale_thresh);
|
||||
float64x2_t r1 = vmulq_f64 (s1, s1);
|
||||
float64x2_t r2 = vmulq_f64 (vfmaq_f64 (s2, poly, s2), s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
return vsubq_f64 (vbslq_f64 (cmp2, r1, r2), v_f64 (1.0f));
|
||||
}
|
||||
|
||||
/* Double-precision vector exp2(x) - 1 function.
|
||||
The maximum error is 2.55 + 0.5 ULP.
|
||||
_ZGVnN2v_exp2m1 (0x1.1113e87a035ap-8) got 0x1.7b1d06f0a7d36p-9
|
||||
want 0x1.7b1d06f0a7d33p-9. */
|
||||
VPCS_ATTR float64x2_t V_NAME_D1 (exp2m1) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* exp2(x) = 2^n (1 + poly(r))
|
||||
x = n + r, with r in [-1/2N, 1/2N].
|
||||
n is a floating point number, multiple of 1/N. */
|
||||
float64x2_t z = vaddq_f64 (d->shift, x);
|
||||
uint64x2_t u = vreinterpretq_u64_f64 (z);
|
||||
float64x2_t n = vsubq_f64 (z, d->shift);
|
||||
|
||||
/* Calculate scale, 2^n. */
|
||||
uint64x2_t e = vshlq_n_u64 (u, 52 - V_EXP_TABLE_BITS);
|
||||
uint64x2_t scale_bits = lookup_sbits (u);
|
||||
float64x2_t scale = vreinterpretq_f64_u64 (vaddq_u64 (scale_bits, e));
|
||||
|
||||
uint64x2_t cmp = vcagtq_f64 (x, d->special_bound);
|
||||
|
||||
/* Pairwise Horner scheme. */
|
||||
float64x2_t r = vsubq_f64 (x, n);
|
||||
float64x2_t r2 = vmulq_f64 (r, r);
|
||||
|
||||
float64x2_t log2lo_c2 = vld1q_f64 (&d->log2_lo);
|
||||
float64x2_t c4c6 = vld1q_f64 (&d->c4);
|
||||
|
||||
float64x2_t p56 = vfmaq_laneq_f64 (d->c5, r, c4c6, 1);
|
||||
float64x2_t p34 = vfmaq_laneq_f64 (d->c3, r, c4c6, 0);
|
||||
float64x2_t p12 = vfmaq_laneq_f64 (d->c1, r, log2lo_c2, 1);
|
||||
float64x2_t p36 = vfmaq_f64 (p34, r2, p56);
|
||||
float64x2_t p16 = vfmaq_f64 (p12, r2, p36);
|
||||
float64x2_t poly
|
||||
= vfmaq_laneq_f64 (vmulq_f64 (d->log2_hi, r), r, log2lo_c2, 0);
|
||||
poly = vfmaq_f64 (poly, p16, r2);
|
||||
|
||||
float64x2_t scalem1 = vsubq_f64 (scale, v_f64 (1.0));
|
||||
|
||||
/* Use table to gather scalem1 for small values of x. */
|
||||
uint64x2_t is_small = vcaltq_f64 (x, v_f64 (TableBound));
|
||||
if (v_any_u64 (is_small))
|
||||
scalem1 = vbslq_f64 (is_small, lookup_sm1bits (x, u, d), scalem1);
|
||||
|
||||
/* Construct exp2m1 = (scale - 1) + scale * poly. */
|
||||
float64x2_t y = vfmaq_f64 (scalem1, poly, scale);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (v_any_u64 (cmp)))
|
||||
return vbslq_f64 (cmp, special_case (poly, n, e, scale, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
|
|
@ -0,0 +1,197 @@
|
|||
/* Double-precision (SVE) exp2m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 1022.0
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 1280.0
|
||||
|
||||
/* 87/256, value of x under which table lookup is used for 2^x-1. */
|
||||
#define TableBound 0x1.5bfffffffffffp-2
|
||||
|
||||
/* Number of bits for each value in the table. */
|
||||
#define N (1 << V_EXP_TABLE_BITS)
|
||||
#define IndexMask (N - 1)
|
||||
|
||||
static const struct data
|
||||
{
|
||||
double shift, rnd2zero;
|
||||
double c1, c3, c5;
|
||||
double c0, c2, c4;
|
||||
uint64_t sm1_tbl_mask, sm1_tbl_off;
|
||||
uint64_t scalem1[88];
|
||||
} data = {
|
||||
/* Generated using fpminimax. */
|
||||
.c0 = 0x1.62e42fefa39efp-1,
|
||||
.c1 = 0x1.ebfbdff82c58ep-3,
|
||||
.c2 = 0x1.c6b08d707e662p-5,
|
||||
.c3 = 0x1.3b2ab6fc45f33p-7,
|
||||
.c4 = 0x1.5d86c0ff8618dp-10,
|
||||
.c5 = 0x1.4301374d5d2f5p-13,
|
||||
.shift = 0x1.8p52 / N,
|
||||
.sm1_tbl_mask = 0x3f,
|
||||
.sm1_tbl_off = 24,
|
||||
.rnd2zero = -0x1p-8,
|
||||
/* Table containing 2^x - 1, for 2^x values close to 1.
|
||||
The table holds values of 2^(i/128) - 1, computed in
|
||||
arbitrary precision.
|
||||
The 1st half contains values associated to i=0..43.
|
||||
The 2nd half contains values associated to i=-44..-1. */
|
||||
.scalem1= {
|
||||
0x0000000000000000, 0x3f763da9fb33356e, 0x3f864d1f3bc03077,
|
||||
0x3f90c57a1b9fe12f, 0x3f966c34c5615d0f, 0x3f9c1aca777db772,
|
||||
0x3fa0e8a30eb37901, 0x3fa3c7d958de7069, 0x3fa6ab0d9f3121ec,
|
||||
0x3fa992456e48fee8, 0x3fac7d865a7a3440, 0x3faf6cd5ffda635e,
|
||||
0x3fb1301d0125b50a, 0x3fb2abdc06c31cc0, 0x3fb429aaea92ddfb,
|
||||
0x3fb5a98c8a58e512, 0x3fb72b83c7d517ae, 0x3fb8af9388c8de9c,
|
||||
0x3fba35beb6fcb754, 0x3fbbbe084045cd3a, 0x3fbd4873168b9aa8,
|
||||
0x3fbed5022fcd91cc, 0x3fc031dc431466b2, 0x3fc0fa4c8beee4b1,
|
||||
0x3fc1c3d373ab11c3, 0x3fc28e727d9531fa, 0x3fc35a2b2f13e6e9,
|
||||
0x3fc426ff0fab1c05, 0x3fc4f4efa8fef709, 0x3fc5c3fe86d6cc80,
|
||||
0x3fc6942d3720185a, 0x3fc7657d49f17ab1, 0x3fc837f0518db8a9,
|
||||
0x3fc90b87e266c18a, 0x3fc9e0459320b7fa, 0x3fcab62afc94ff86,
|
||||
0x3fcb8d39b9d54e55, 0x3fcc6573682ec32c, 0x3fcd3ed9a72cffb7,
|
||||
0x3fce196e189d4724, 0x3fcef5326091a112, 0x3fcfd228256400dd,
|
||||
0x3fd0582887dcb8a8, 0x3fd0c7d76542a25b, 0xbfcb23213cc8e86c,
|
||||
0xbfca96ecd0deb7c4, 0xbfca09f58086c6c2, 0xbfc97c3a3cd7e119,
|
||||
0xbfc8edb9f5703dc0, 0xbfc85e7398737374, 0xbfc7ce6612886a6d,
|
||||
0xbfc73d904ed74b33, 0xbfc6abf137076a8e, 0xbfc61987b33d329e,
|
||||
0xbfc58652aa180903, 0xbfc4f25100b03219, 0xbfc45d819a94b14b,
|
||||
0xbfc3c7e359c9266a, 0xbfc331751ec3a814, 0xbfc29a35c86a9b1a,
|
||||
0xbfc20224341286e4, 0xbfc1693f3d7be6da, 0xbfc0cf85bed0f8b7,
|
||||
0xbfc034f690a387de, 0xbfbf332113d56b1f, 0xbfbdfaa500017c2d,
|
||||
0xbfbcc0768d4175a6, 0xbfbb84935fc8c257, 0xbfba46f918837cb7,
|
||||
0xbfb907a55511e032, 0xbfb7c695afc3b424, 0xbfb683c7bf93b074,
|
||||
0xbfb53f391822dbc7, 0xbfb3f8e749b3e342, 0xbfb2b0cfe1266bd4,
|
||||
0xbfb166f067f25cfe, 0xbfb01b466423250a, 0xbfad9b9eb0a5ed76,
|
||||
0xbfaafd11874c009e, 0xbfa85ae0438b37cb, 0xbfa5b505d5b6f268,
|
||||
0xbfa30b7d271980f7, 0xbfa05e4119ea5d89, 0xbf9b5a991288ad16,
|
||||
0xbf95f134923757f3, 0xbf90804a4c683d8f, 0xbf860f9f985bc9f4,
|
||||
0xbf761eea3847077b,
|
||||
}
|
||||
};
|
||||
|
||||
static inline svuint64_t
|
||||
lookup_sbits (svbool_t pg, svuint64_t indices)
|
||||
{
|
||||
/* Mask indices to valid range. */
|
||||
svuint64_t masked = svand_z (pg, indices, svdup_n_u64 (IndexMask));
|
||||
return svld1_gather_index (pg, __v_exp_data, masked);
|
||||
}
|
||||
|
||||
static inline svfloat64_t
|
||||
lookup_sm1bits (svbool_t pg, svfloat64_t x, svuint64_t u, const struct data *d)
|
||||
{
|
||||
/* Extract sign bit and use as offset into table. */
|
||||
svbool_t signBit = svcmplt_f64 (pg, x, sv_f64 (d->rnd2zero));
|
||||
svuint64_t base_idx = svand_x (pg, u, d->sm1_tbl_mask);
|
||||
svuint64_t idx = svadd_m (signBit, base_idx, sv_u64 (d->sm1_tbl_off));
|
||||
|
||||
/* Fall back to table lookup for 2^x - 1, when x is close to zero to
|
||||
avoid large errors. */
|
||||
svuint64_t sm1 = svld1_gather_index (svptrue_b64 (), d->scalem1, idx);
|
||||
return svreinterpret_f64 (sm1);
|
||||
}
|
||||
|
||||
#define SpecialOffset 0x6000000000000000 /* 0x1p513. */
|
||||
/* SpecialBias1 + SpecialBias1 = asuint(1.0). */
|
||||
#define SpecialBias1 0x7000000000000000 /* 0x1p769. */
|
||||
#define SpecialBias2 0x3010000000000000 /* 0x1p-254. */
|
||||
|
||||
static inline svfloat64_t
|
||||
special_case (svfloat64_t s, svfloat64_t y, svfloat64_t n,
|
||||
const struct data *d)
|
||||
{
|
||||
/* s=2^n may overflow, break it up into s=s1*s2,
|
||||
such that exp = s + s*y can be computed as s1*(s2+s2*y)
|
||||
and s1*s1 overflows only if n>0. */
|
||||
svbool_t p_sign = svcmple (svptrue_b64 (), n, 0.0); /* n <= 0. */
|
||||
svuint64_t b = svdup_u64_z (p_sign, SpecialOffset);
|
||||
|
||||
/* Set s1 to generate overflow depending on sign of exponent n. */
|
||||
svfloat64_t s1
|
||||
= svreinterpret_f64 (svsubr_x (svptrue_b64 (), b, SpecialBias1));
|
||||
/* Offset s to avoid overflow in final result if n is below threshold. */
|
||||
svfloat64_t s2 = svreinterpret_f64 (svadd_x (
|
||||
svptrue_b64 (),
|
||||
svsub_x (svptrue_b64 (), svreinterpret_u64 (s), SpecialBias2), b));
|
||||
|
||||
/* |n| > 1280 => 2^(n) overflows. */
|
||||
svbool_t p_cmp = svacle (svptrue_b64 (), n, ScaleBound);
|
||||
|
||||
svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
|
||||
svfloat64_t r2 = svmla_x (svptrue_b64 (), s2, s2, y);
|
||||
svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
|
||||
|
||||
return svsub_x (svptrue_b64 (), svsel (p_cmp, r0, r1), 1.0);
|
||||
}
|
||||
|
||||
/* Double-precision SVE exp2(x) - 1.
|
||||
Maximum error is 2.58 + 0.5 ULP.
|
||||
_ZGVsMxv_exp2m1(0x1.0284a345c99bfp-8) got 0x1.66df630cd2965p-9
|
||||
want 0x1.66df630cd2962p-9. */
|
||||
svfloat64_t SV_NAME_D1 (exp2m1) (svfloat64_t x, svbool_t pg)
|
||||
{
|
||||
/* exp2(x) = 2^n (1 + poly(r))
|
||||
x = n + r, with r in [-1/2N, 1/2N].
|
||||
n is a floating point number, multiple of 1/N. */
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
svbool_t special = svacge (pg, x, SpecialBound);
|
||||
|
||||
svfloat64_t z = svadd_x (pg, x, d->shift);
|
||||
svuint64_t u = svreinterpret_u64 (z);
|
||||
svfloat64_t n = svsub_x (pg, z, d->shift);
|
||||
|
||||
svfloat64_t r = svsub_x (svptrue_b64 (), x, n);
|
||||
svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
|
||||
|
||||
/* Look up table to calculate 2^n. */
|
||||
svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TABLE_BITS);
|
||||
svuint64_t scale_bits = lookup_sbits (pg, u);
|
||||
svfloat64_t scale = svreinterpret_f64_u64 (svadd_x (pg, scale_bits, e));
|
||||
|
||||
/* Pairwise Horner scheme. */
|
||||
svfloat64_t c35 = svld1rq (svptrue_b64 (), &d->c3);
|
||||
|
||||
svfloat64_t p01 = svmla_x (pg, sv_f64 (d->c0), r, d->c1);
|
||||
svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c35, 0);
|
||||
svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), r, c35, 1);
|
||||
|
||||
svfloat64_t p25 = svmla_x (pg, p23, r2, p45);
|
||||
svfloat64_t p05 = svmla_x (pg, p01, r2, p25);
|
||||
svfloat64_t poly = svmul_x (pg, p05, r);
|
||||
|
||||
/* Use table to gather scalem1 for small values of x. */
|
||||
svbool_t is_small = svaclt (pg, x, TableBound);
|
||||
svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0));
|
||||
if (svptest_any (pg, is_small))
|
||||
scalem1 = svsel_f64 (is_small, lookup_sm1bits (pg, x, u, d), scalem1);
|
||||
|
||||
/* Construct exp2m1 = (scale - 1) + scale * poly. */
|
||||
svfloat64_t y = svmla_x (pg, scalem1, scale, poly);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (svptest_any (pg, special)))
|
||||
return svsel_f64 (special, special_case (scale, poly, n, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
/* Single-precision (Advanced SIMD) exp2m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 192.0f
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint32x4_t exponent_bias, special_offset, special_bias;
|
||||
float32x4_t scale_thresh, special_bound;
|
||||
float32x4_t log2_hi, c1, c3, c5;
|
||||
float log2_lo, c2, c4, c6;
|
||||
} data = {
|
||||
/* Coefficients generated using remez's algorithm for exp2m1f(x). */
|
||||
.log2_hi = V4 (0x1.62e43p-1),
|
||||
.log2_lo = -0x1.05c610p-29,
|
||||
.c1 = V4 (0x1.ebfbep-3),
|
||||
.c2 = 0x1.c6b06ep-5,
|
||||
.c3 = V4 (0x1.3b2a5cp-7),
|
||||
.c4 = 0x1.5da59ep-10,
|
||||
.c5 = V4 (0x1.440dccp-13),
|
||||
.c6 = 0x1.e081d6p-17,
|
||||
.exponent_bias = V4 (0x3f800000),
|
||||
.special_offset = V4 (0x82000000),
|
||||
.special_bias = V4 (0x7f000000),
|
||||
.scale_thresh = V4 (ScaleBound),
|
||||
.special_bound = V4 (SpecialBound),
|
||||
};
|
||||
|
||||
static float32x4_t VPCS_ATTR
|
||||
special_case (float32x4_t poly, float32x4_t n, uint32x4_t e, uint32x4_t cmp1,
|
||||
float32x4_t scale, const struct data *d)
|
||||
{
|
||||
/* 2^n may overflow, break it up into s1*s2. */
|
||||
uint32x4_t b = vandq_u32 (vclezq_f32 (n), d->special_offset);
|
||||
float32x4_t s1 = vreinterpretq_f32_u32 (vaddq_u32 (b, d->special_bias));
|
||||
float32x4_t s2 = vreinterpretq_f32_u32 (vsubq_u32 (e, b));
|
||||
uint32x4_t cmp2 = vcagtq_f32 (n, d->scale_thresh);
|
||||
float32x4_t r2 = vmulq_f32 (s1, s1);
|
||||
float32x4_t r1 = vmulq_f32 (vfmaq_f32 (s2, poly, s2), s1);
|
||||
/* Similar to r1 but avoids double rounding in the subnormal range. */
|
||||
float32x4_t r0 = vfmaq_f32 (scale, poly, scale);
|
||||
float32x4_t r = vbslq_f32 (cmp1, r1, r0);
|
||||
return vsubq_f32 (vbslq_f32 (cmp2, r2, r), v_f32 (1.0f));
|
||||
}
|
||||
|
||||
/* Single-precision vector exp2(x) - 1 function.
|
||||
The maximum error is 1.76 + 0.5 ULP.
|
||||
_ZGVnN4v_exp2m1f (0x1.018af8p-1) got 0x1.ab2ebcp-2
|
||||
want 0x1.ab2ecp-2. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (exp2m1) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* exp2(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
|
||||
x = n + r, with r in [-1/2, 1/2]. */
|
||||
float32x4_t n = vrndaq_f32 (x);
|
||||
float32x4_t r = vsubq_f32 (x, n);
|
||||
uint32x4_t e = vshlq_n_u32 (vreinterpretq_u32_s32 (vcvtaq_s32_f32 (x)), 23);
|
||||
float32x4_t scale = vreinterpretq_f32_u32 (vaddq_u32 (e, d->exponent_bias));
|
||||
|
||||
uint32x4_t cmp = vcagtq_f32 (n, d->special_bound);
|
||||
|
||||
float32x4_t log2lo_c246 = vld1q_f32 (&d->log2_lo);
|
||||
float32x4_t r2 = vmulq_f32 (r, r);
|
||||
|
||||
/* Pairwise horner scheme. */
|
||||
float32x4_t p56 = vfmaq_laneq_f32 (d->c5, r, log2lo_c246, 3);
|
||||
float32x4_t p34 = vfmaq_laneq_f32 (d->c3, r, log2lo_c246, 2);
|
||||
float32x4_t p12 = vfmaq_laneq_f32 (d->c1, r, log2lo_c246, 1);
|
||||
float32x4_t p36 = vfmaq_f32 (p34, r2, p56);
|
||||
float32x4_t p16 = vfmaq_f32 (p12, r2, p36);
|
||||
float32x4_t poly
|
||||
= vfmaq_laneq_f32 (vmulq_f32 (d->log2_hi, r), r, log2lo_c246, 0);
|
||||
poly = vfmaq_f32 (poly, p16, r2);
|
||||
|
||||
float32x4_t y = vfmaq_f32 (vsubq_f32 (scale, v_f32 (1.0f)), poly, scale);
|
||||
|
||||
if (__glibc_unlikely (v_any_u32 (cmp)))
|
||||
return vbslq_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
libmvec_hidden_def (V_NAME_F1 (exp2m1))
|
||||
HALF_WIDTH_ALIAS_F1 (exp2m1)
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
/* Single-precision (SVE) exp2m1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
/* Value of |x| above which scale overflows without special treatment. */
|
||||
#define SpecialBound 126.0f /* rint (log2 (2^127 / (1 + sqrt (2)))). */
|
||||
|
||||
/* Value of n above which scale overflows even with special treatment. */
|
||||
#define ScaleBound 192.0f
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint32_t exponent_bias, special_offset, special_bias;
|
||||
float32_t scale_thresh, special_bound;
|
||||
float log2_lo, c2, c4, c6;
|
||||
float log2_hi, c1, c3, c5, shift;
|
||||
} data = {
|
||||
/* Coefficients generated using remez's algorithm for exp2m1f(x). */
|
||||
.log2_hi = 0x1.62e43p-1,
|
||||
.log2_lo = -0x1.05c610p-29,
|
||||
.c1 = 0x1.ebfbep-3,
|
||||
.c2 = 0x1.c6b06ep-5,
|
||||
.c3 = 0x1.3b2a5cp-7,
|
||||
.c4 = 0x1.5da59ep-10,
|
||||
.c5 = 0x1.440dccp-13,
|
||||
.c6 = 0x1.e081d6p-17,
|
||||
.exponent_bias = 0x3f800000,
|
||||
.special_offset = 0x82000000,
|
||||
.special_bias = 0x7f000000,
|
||||
.scale_thresh = ScaleBound,
|
||||
.special_bound = SpecialBound,
|
||||
};
|
||||
|
||||
static svfloat32_t NOINLINE
|
||||
special_case (svfloat32_t poly, svfloat32_t n, svuint32_t e, svbool_t cmp1,
|
||||
svfloat32_t scale, const struct data *d)
|
||||
{
|
||||
svbool_t b = svcmple (svptrue_b32 (), n, 0.0f);
|
||||
svfloat32_t s1 = svreinterpret_f32 (
|
||||
svsel (b, sv_u32 (d->special_offset + d->special_bias),
|
||||
sv_u32 (d->special_bias)));
|
||||
svfloat32_t s2
|
||||
= svreinterpret_f32 (svsub_m (b, e, sv_u32 (d->special_offset)));
|
||||
svbool_t cmp2 = svacgt (svptrue_b32 (), n, d->scale_thresh);
|
||||
svfloat32_t r2 = svmul_x (svptrue_b32 (), s1, s1);
|
||||
svfloat32_t r1
|
||||
= svmul_x (svptrue_b32 (), svmla_x (svptrue_b32 (), s2, poly, s2), s1);
|
||||
svfloat32_t r0 = svmla_x (svptrue_b32 (), scale, poly, scale);
|
||||
svfloat32_t r = svsel (cmp1, r1, r0);
|
||||
return svsub_x (svptrue_b32 (), svsel (cmp2, r2, r), 1.0f);
|
||||
}
|
||||
|
||||
/* Single-precision vector exp2(x) - 1 function.
|
||||
The maximum error is 1.76 + 0.5 ULP.
|
||||
_ZGVsMxv_exp2m1f (0x1.018af8p-1) got 0x1.ab2ebcp-2
|
||||
want 0x1.ab2ecp-2. */
|
||||
svfloat32_t SV_NAME_F1 (exp2m1) (svfloat32_t x, const svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
svfloat32_t n = svrinta_x (pg, x);
|
||||
svfloat32_t r = svsub_x (pg, x, n);
|
||||
|
||||
svuint32_t e = svlsl_x (pg, svreinterpret_u32 (svcvt_s32_x (pg, n)), 23);
|
||||
svfloat32_t scale
|
||||
= svreinterpret_f32 (svadd_n_u32_x (pg, e, d->exponent_bias));
|
||||
|
||||
svbool_t cmp = svacgt_n_f32 (pg, n, d->special_bound);
|
||||
|
||||
svfloat32_t r2 = svmul_x (pg, r, r);
|
||||
|
||||
svfloat32_t log2lo_c246 = svld1rq (svptrue_b32 (), &d->log2_lo);
|
||||
svfloat32_t p56 = svmla_lane (sv_f32 (d->c5), r, log2lo_c246, 3);
|
||||
svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, log2lo_c246, 2);
|
||||
svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, log2lo_c246, 1);
|
||||
|
||||
svfloat32_t p36 = svmla_x (pg, p34, p56, r2);
|
||||
svfloat32_t p16 = svmla_x (pg, p12, p36, r2);
|
||||
|
||||
svfloat32_t poly = svmla_lane (
|
||||
svmul_x (svptrue_b32 (), r, sv_f32 (d->log2_hi)), r, log2lo_c246, 0);
|
||||
poly = svmla_x (pg, poly, p16, r2);
|
||||
|
||||
svfloat32_t y = svmla_x (pg, svsub_x (pg, scale, 1.0f), poly, scale);
|
||||
|
||||
/* Fallback to special case for lanes with overflow. */
|
||||
if (__glibc_unlikely (svptest_any (pg, cmp)))
|
||||
return svsel_f32 (cmp, special_case (poly, n, e, cmp, scale, d), y);
|
||||
|
||||
return y;
|
||||
}
|
||||
|
|
@ -44,6 +44,8 @@ VPCS_VECTOR_WRAPPER (exp_advsimd, _ZGVnN2v_exp)
|
|||
VPCS_VECTOR_WRAPPER (exp10_advsimd, _ZGVnN2v_exp10)
|
||||
VPCS_VECTOR_WRAPPER (exp2_advsimd, _ZGVnN2v_exp2)
|
||||
VPCS_VECTOR_WRAPPER (expm1_advsimd, _ZGVnN2v_expm1)
|
||||
VPCS_VECTOR_WRAPPER (exp2m1_advsimd, _ZGVnN2v_exp2m1)
|
||||
VPCS_VECTOR_WRAPPER (exp10m1_advsimd, _ZGVnN2v_exp10m1)
|
||||
VPCS_VECTOR_WRAPPER_ff (hypot_advsimd, _ZGVnN2vv_hypot)
|
||||
VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
|
||||
VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
|
||||
|
|
|
|||
|
|
@ -63,6 +63,8 @@ SVE_VECTOR_WRAPPER (exp_sve, _ZGVsMxv_exp)
|
|||
SVE_VECTOR_WRAPPER (exp10_sve, _ZGVsMxv_exp10)
|
||||
SVE_VECTOR_WRAPPER (exp2_sve, _ZGVsMxv_exp2)
|
||||
SVE_VECTOR_WRAPPER (expm1_sve, _ZGVsMxv_expm1)
|
||||
SVE_VECTOR_WRAPPER (exp2m1_sve, _ZGVsMxv_exp2m1)
|
||||
SVE_VECTOR_WRAPPER (exp10m1_sve, _ZGVsMxv_exp10m1)
|
||||
SVE_VECTOR_WRAPPER_ff (hypot_sve, _ZGVsMxvv_hypot)
|
||||
SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
|
||||
SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
|
||||
|
|
|
|||
|
|
@ -44,6 +44,8 @@ VPCS_VECTOR_WRAPPER (expf_advsimd, _ZGVnN4v_expf)
|
|||
VPCS_VECTOR_WRAPPER (exp10f_advsimd, _ZGVnN4v_exp10f)
|
||||
VPCS_VECTOR_WRAPPER (exp2f_advsimd, _ZGVnN4v_exp2f)
|
||||
VPCS_VECTOR_WRAPPER (expm1f_advsimd, _ZGVnN4v_expm1f)
|
||||
VPCS_VECTOR_WRAPPER (exp2m1f_advsimd, _ZGVnN4v_exp2m1f)
|
||||
VPCS_VECTOR_WRAPPER (exp10m1f_advsimd, _ZGVnN4v_exp10m1f)
|
||||
VPCS_VECTOR_WRAPPER_ff (hypotf_advsimd, _ZGVnN4vv_hypotf)
|
||||
VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
|
||||
VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
|
||||
|
|
|
|||
|
|
@ -63,6 +63,8 @@ SVE_VECTOR_WRAPPER (expf_sve, _ZGVsMxv_expf)
|
|||
SVE_VECTOR_WRAPPER (exp10f_sve, _ZGVsMxv_exp10f)
|
||||
SVE_VECTOR_WRAPPER (exp2f_sve, _ZGVsMxv_exp2f)
|
||||
SVE_VECTOR_WRAPPER (expm1f_sve, _ZGVsMxv_expm1f)
|
||||
SVE_VECTOR_WRAPPER (exp2m1f_sve, _ZGVsMxv_exp2m1f)
|
||||
SVE_VECTOR_WRAPPER (exp10m1f_sve, _ZGVsMxv_exp10m1f)
|
||||
SVE_VECTOR_WRAPPER_ff (hypotf_sve, _ZGVsMxvv_hypotf)
|
||||
SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
|
||||
SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
|
||||
|
|
|
|||
|
|
@ -168,3 +168,13 @@ GLIBC_2.42 _ZGVsMxv_atanpi F
|
|||
GLIBC_2.42 _ZGVsMxv_atanpif F
|
||||
GLIBC_2.42 _ZGVsMxvv_atan2pi F
|
||||
GLIBC_2.42 _ZGVsMxvv_atan2pif F
|
||||
GLIBC_2.43 _ZGVnN2v_exp10m1 F
|
||||
GLIBC_2.43 _ZGVnN2v_exp10m1f F
|
||||
GLIBC_2.43 _ZGVnN2v_exp2m1 F
|
||||
GLIBC_2.43 _ZGVnN2v_exp2m1f F
|
||||
GLIBC_2.43 _ZGVnN4v_exp10m1f F
|
||||
GLIBC_2.43 _ZGVnN4v_exp2m1f F
|
||||
GLIBC_2.43 _ZGVsMxv_exp10m1 F
|
||||
GLIBC_2.43 _ZGVsMxv_exp10m1f F
|
||||
GLIBC_2.43 _ZGVsMxv_exp2m1 F
|
||||
GLIBC_2.43 _ZGVsMxv_exp2m1f F
|
||||
|
|
|
|||
Loading…
Reference in New Issue