mirror of git://sourceware.org/git/glibc.git
AArch64: Implement AdvSIMD and SVE log2p1(f) routines
Vector variants of the new C23 log2p1 routines. Note: Benchmark inputs for log2p1(f) are identical to log1p(f). Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
parent
afce5fccdf
commit
db42732474
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -264,6 +264,17 @@
|
|||
#define __DECL_SIMD_log2f64x
|
||||
#define __DECL_SIMD_log2f128x
|
||||
|
||||
#define __DECL_SIMD_log2p1
|
||||
#define __DECL_SIMD_log2p1f
|
||||
#define __DECL_SIMD_log2p1l
|
||||
#define __DECL_SIMD_log2p1f16
|
||||
#define __DECL_SIMD_log2p1f32
|
||||
#define __DECL_SIMD_log2p1f64
|
||||
#define __DECL_SIMD_log2p1f128
|
||||
#define __DECL_SIMD_log2p1f32x
|
||||
#define __DECL_SIMD_log2p1f64x
|
||||
#define __DECL_SIMD_log2p1f128x
|
||||
|
||||
#define __DECL_SIMD_log1p
|
||||
#define __DECL_SIMD_log1pf
|
||||
#define __DECL_SIMD_log1pl
|
||||
|
|
|
|||
|
|
@ -142,7 +142,7 @@ __MATHCALL_VEC (exp2m1,, (_Mdouble_ __x));
|
|||
__MATHCALL_VEC (exp10m1,, (_Mdouble_ __x));
|
||||
|
||||
/* Return log2(1 + X). */
|
||||
__MATHCALL (log2p1,, (_Mdouble_ __x));
|
||||
__MATHCALL_VEC (log2p1,, (_Mdouble_ __x));
|
||||
|
||||
/* Return log10(1 + X). */
|
||||
__MATHCALL (log10p1,, (_Mdouble_ __x));
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ libmvec-supported-funcs = acos \
|
|||
log10 \
|
||||
log1p \
|
||||
log2 \
|
||||
log2p1 \
|
||||
pow \
|
||||
sin \
|
||||
sinh \
|
||||
|
|
|
|||
|
|
@ -190,5 +190,10 @@ libmvec {
|
|||
_ZGVnN4v_exp10m1f;
|
||||
_ZGVsMxv_exp10m1;
|
||||
_ZGVsMxv_exp10m1f;
|
||||
_ZGVnN2v_log2p1;
|
||||
_ZGVnN2v_log2p1f;
|
||||
_ZGVnN4v_log2p1f;
|
||||
_ZGVsMxv_log2p1;
|
||||
_ZGVsMxv_log2p1f;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ libmvec_hidden_proto (V_NAME_F2(hypot));
|
|||
libmvec_hidden_proto (V_NAME_F1(log10));
|
||||
libmvec_hidden_proto (V_NAME_F1(log1p));
|
||||
libmvec_hidden_proto (V_NAME_F1(log2));
|
||||
libmvec_hidden_proto (V_NAME_F1(log2p1));
|
||||
libmvec_hidden_proto (V_NAME_F1(logp1));
|
||||
libmvec_hidden_proto (V_NAME_F1(log));
|
||||
libmvec_hidden_proto (V_NAME_F2(pow));
|
||||
|
|
|
|||
|
|
@ -141,6 +141,10 @@
|
|||
# define __DECL_SIMD_log2 __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_log2f
|
||||
# define __DECL_SIMD_log2f __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_log2p1
|
||||
# define __DECL_SIMD_log2p1 __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_log2p1f
|
||||
# define __DECL_SIMD_log2p1f __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_logp1
|
||||
# define __DECL_SIMD_logp1 __DECL_SIMD_aarch64
|
||||
# undef __DECL_SIMD_logp1f
|
||||
|
|
@ -227,6 +231,7 @@ __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
|
|||
__vpcs __f32x4_t _ZGVnN4v_log10f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_log1pf (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_log2f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_log2p1f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_logp1f (__f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4vv_powf (__f32x4_t, __f32x4_t);
|
||||
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
|
||||
|
|
@ -264,6 +269,7 @@ __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
|
|||
__vpcs __f64x2_t _ZGVnN2v_log10 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_log1p (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_log2 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_log2p1 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_logp1 (__f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2vv_pow (__f64x2_t, __f64x2_t);
|
||||
__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
|
||||
|
|
@ -306,6 +312,7 @@ __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
|
|||
__sv_f32_t _ZGVsMxv_log10f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_log1pf (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_log2f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_log2p1f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_logp1f (__sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxvv_powf (__sv_f32_t, __sv_f32_t, __sv_bool_t);
|
||||
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
|
||||
|
|
@ -343,6 +350,7 @@ __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
|
|||
__sv_f64_t _ZGVsMxv_log10 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_log1p (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_log2 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_log2p1 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_logp1 (__sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxvv_pow (__sv_f64_t, __sv_f64_t, __sv_bool_t);
|
||||
__sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
|
||||
|
|
|
|||
|
|
@ -71,7 +71,11 @@
|
|||
!GCC$ builtin (log1pf) attributes simd (notinbranch)
|
||||
!GCC$ builtin (log2) attributes simd (notinbranch)
|
||||
!GCC$ builtin (log2f) attributes simd (notinbranch)
|
||||
!GCC$ builtin (log2p1) attributes simd (notinbranch)
|
||||
!GCC$ builtin (log2p1f) attributes simd (notinbranch)
|
||||
!GCC$ builtin (logf) attributes simd (notinbranch)
|
||||
!GCC$ builtin (logp1) attributes simd (notinbranch)
|
||||
!GCC$ builtin (logp1f) attributes simd (notinbranch)
|
||||
!GCC$ builtin (pow) attributes simd (notinbranch)
|
||||
!GCC$ builtin (powf) attributes simd (notinbranch)
|
||||
!GCC$ builtin (sin) attributes simd (notinbranch)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,136 @@
|
|||
/* Double-precision (Advanced SIMD) log2p1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
|
||||
float64x2_t c2, c4, c6, c8, c10, c12, c14, c16, c18, c20;
|
||||
double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
|
||||
uint64x2_t hf_rt2_top, one_m_hf_rt2_top, umask;
|
||||
int64x2_t one_top;
|
||||
double inv_ln2;
|
||||
} data = {
|
||||
/* Coefficients generated using FPMinmax deg=20, in
|
||||
[sqrt(2)/2-1, sqrt(2)-1]. */
|
||||
.c1 = 0x1.71547652b82fep0,
|
||||
.c2 = V2 (-0x1.71547652b8303p-1),
|
||||
.c3 = 0x1.ec709dc39ff3bp-2,
|
||||
.c4 = V2 (-0x1.71547652b807fp-2),
|
||||
.c5 = 0x1.2776c50f6352cp-2,
|
||||
.c6 = V2 (-0x1.ec709dc417686p-3),
|
||||
.c7 = 0x1.a61762480fdcap-3,
|
||||
.c8 = V2 (-0x1.715475e0466fp-3),
|
||||
.c9 = 0x1.484b288053b76p-3,
|
||||
.c10 = V2 (-0x1.2776eb394443fp-3),
|
||||
.c11 = 0x1.0c98198fec4f9p-3,
|
||||
.c12 = V2 (-0x1.ec64788a5e48p-4),
|
||||
.c13 = 0x1.c6df9fa8ddd65p-4,
|
||||
.c14 = V2 (-0x1.a72110bd827d7p-4),
|
||||
.c15 = 0x1.875793fc4422dp-4,
|
||||
.c16 = V2 (-0x1.64f6f207a7e15p-4),
|
||||
.c17 = 0x1.60b98dabeba61p-4,
|
||||
.c18 = V2 (-0x1.90f4556c87c7fp-4),
|
||||
.c19 = 0x1.76554651dcda3p-4,
|
||||
.c20 = V2 (-0x1.4f96285ff7616p-5),
|
||||
.hf_rt2_top = V2 (0x3fe6a09e00000000),
|
||||
.one_m_hf_rt2_top = V2 (0x00095f6200000000),
|
||||
.umask = V2 (0x000fffff00000000),
|
||||
.one_top = V2 (0x3ff),
|
||||
.inv_ln2 = 0x1.71547652b82fep+0,
|
||||
};
|
||||
#define BottomMask v_u64 (0xffffffff)
|
||||
static float64x2_t VPCS_ATTR
|
||||
special_case (float64x2_t x, float64x2_t y, uint64x2_t cmp)
|
||||
{
|
||||
|
||||
uint64x2_t ret_inf = vcgeq_f64 (x, v_f64 (INFINITY));
|
||||
uint64x2_t neg_val
|
||||
= vbslq_u64 (vcgeq_f64 (x, v_f64 (-1)), v_u64 (0xfff0000000000000),
|
||||
v_u64 (0x7fffffffffffffff));
|
||||
float64x2_t s = vreinterpretq_f64_u64 (
|
||||
vbslq_u64 (ret_inf, (v_u64 (0x7ff0000000000000)), neg_val));
|
||||
|
||||
return vbslq_f64 (cmp, s, y);
|
||||
}
|
||||
|
||||
/* Vector log2p1 approximation using polynomial on reduced interval.
|
||||
Worst-case error is 3.0 ULP:
|
||||
_ZGVnN2v_log2p1(0x1.07062df05d415p-23) got 0x1.7b76ae4a7f996p-23
|
||||
want 0x1.7b76ae4a7f999p-23 . */
|
||||
VPCS_ATTR float64x2_t V_NAME_D1 (log2p1) (float64x2_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
float64x2_t m = vaddq_f64 (x, v_f64 (1.0));
|
||||
uint64x2_t mi = vreinterpretq_u64_f64 (m);
|
||||
uint64x2_t u = vaddq_u64 (mi, d->one_m_hf_rt2_top);
|
||||
|
||||
int64x2_t ki
|
||||
= vsubq_s64 (vreinterpretq_s64_u64 (vshrq_n_u64 (u, 52)), d->one_top);
|
||||
float64x2_t k = vcvtq_f64_s64 (ki);
|
||||
|
||||
/* Reduce x to f in [sqrt(2)/2, sqrt (2)]. */
|
||||
uint64x2_t utop = vaddq_u64 (vandq_u64 (u, d->umask), d->hf_rt2_top);
|
||||
uint64x2_t u_red = vorrq_u64 (utop, vandq_u64 (mi, BottomMask));
|
||||
|
||||
float64x2_t f = vsubq_f64 (vreinterpretq_f64_u64 (u_red), v_f64 (1.0));
|
||||
|
||||
/* Correction term c/m. */
|
||||
float64x2_t cm = vdivq_f64 (vsubq_f64 (x, vsubq_f64 (m, v_f64 (1.0))), m);
|
||||
|
||||
float64x2_t f2 = vmulq_f64 (f, f);
|
||||
|
||||
float64x2_t c13 = vld1q_f64 (&d->c1);
|
||||
float64x2_t c57 = vld1q_f64 (&d->c5);
|
||||
float64x2_t c911 = vld1q_f64 (&d->c9);
|
||||
float64x2_t c1315 = vld1q_f64 (&d->c13);
|
||||
float64x2_t c1719 = vld1q_f64 (&d->c17);
|
||||
float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, f, c1719, 1);
|
||||
float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, f, c1719, 0);
|
||||
float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, f, c1315, 1);
|
||||
float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, f, c1315, 0);
|
||||
float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, f, c911, 1);
|
||||
float64x2_t p89 = vfmaq_laneq_f64 (d->c8, f, c911, 0);
|
||||
float64x2_t p67 = vfmaq_laneq_f64 (d->c6, f, c57, 1);
|
||||
float64x2_t p45 = vfmaq_laneq_f64 (d->c4, f, c57, 0);
|
||||
float64x2_t p23 = vfmaq_laneq_f64 (d->c2, f, c13, 1);
|
||||
float64x2_t p = vfmaq_f64 (p1819, f2, d->c20);
|
||||
p = vfmaq_f64 (p1617, f2, p);
|
||||
p = vfmaq_f64 (p1415, f2, p);
|
||||
p = vfmaq_f64 (p1213, f2, p);
|
||||
p = vfmaq_f64 (p1011, f2, p);
|
||||
p = vfmaq_f64 (p89, f2, p);
|
||||
p = vfmaq_f64 (p67, f2, p);
|
||||
p = vfmaq_f64 (p45, f2, p);
|
||||
p = vfmaq_f64 (p23, f2, p);
|
||||
p = vfmaq_f64 (v_f64 (d->c1), f, p);
|
||||
|
||||
/* Assemble log2p1(x) = k + log2p1(f) + c/(m * ln2). */
|
||||
float64x2_t cm_ln2 = vmulq_f64 (cm, v_f64 (d->inv_ln2));
|
||||
float64x2_t y = vfmaq_f64 (k, p, f);
|
||||
y = vaddq_f64 (y, cm_ln2);
|
||||
|
||||
uint64x2_t special_cases
|
||||
= vorrq_u64 (vcleq_f64 (x, v_f64 (-1)), vcgeq_f64 (x, v_f64 (INFINITY)));
|
||||
if (__glibc_unlikely (v_any_u64 (special_cases)))
|
||||
return special_case (x, y, special_cases);
|
||||
return y;
|
||||
}
|
||||
|
|
@ -0,0 +1,131 @@
|
|||
/* Double-precision (SVE) log2p1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
/* Constants for the double-precision SVE log2p1 kernel.

   The member order matters: the kernel fetches two adjacent doubles at a
   time with a replicating quad-word load, so c20 must be followed by
   inv_ln2 and the odd coefficients must stay in ascending pairs.  */
static const struct data
{
  /* Even-order coefficients, broadcast individually.  */
  double c2, c4, c6, c8, c10, c12, c14, c16, c18;
  /* Loaded together as one {c20, 1/ln2} pair.  */
  double c20, inv_ln2;
  /* Odd-order coefficients, loaded as {c1,c3}, {c5,c7}, ...  */
  double c1, c3;
  double c5, c7;
  double c9, c11;
  double c13, c15;
  double c17, c19;
  /* Bit patterns used for the argument reduction.  */
  uint64_t hf_rt2_top, one_m_hf_rt2_top;
  int64_t one_top;
} data = {
  /* Coefficients generated using FPMinimax deg=20, in
     [sqrt(2)/2-1, sqrt(2)-1].  */
  .c2 = -0x1.71547652b8303p-1,
  .c4 = -0x1.71547652b807fp-2,
  .c6 = -0x1.ec709dc417686p-3,
  .c8 = -0x1.715475e0466fp-3,
  .c10 = -0x1.2776eb394443fp-3,
  .c12 = -0x1.ec64788a5e48p-4,
  .c14 = -0x1.a72110bd827d7p-4,
  .c16 = -0x1.64f6f207a7e15p-4,
  .c18 = -0x1.90f4556c87c7fp-4,
  .c20 = -0x1.4f96285ff7616p-5,
  .inv_ln2 = 0x1.71547652b82fep+0,
  .c1 = 0x1.71547652b82fep0,
  .c3 = 0x1.ec709dc39ff3bp-2,
  .c5 = 0x1.2776c50f6352cp-2,
  .c7 = 0x1.a61762480fdcap-3,
  .c9 = 0x1.484b288053b76p-3,
  .c11 = 0x1.0c98198fec4f9p-3,
  .c13 = 0x1.c6df9fa8ddd65p-4,
  .c15 = 0x1.875793fc4422dp-4,
  .c17 = 0x1.60b98dabeba61p-4,
  .c19 = 0x1.76554651dcda3p-4,
  /* Top 32 bits of sqrt(2)/2, and of 1 - sqrt(2)/2 expressed as a
     difference of bit patterns (0x3ff00000 - 0x3fe6a09e).  */
  .hf_rt2_top = 0x3fe6a09e00000000,
  .one_m_hf_rt2_top = 0x00095f6200000000,
  /* Exponent bias of binary64.  */
  .one_top = 0x3ff,
};
|
||||
|
||||
static svfloat64_t NOINLINE
|
||||
special_case (svfloat64_t x, svfloat64_t y, svbool_t special, svbool_t pg)
|
||||
{
|
||||
|
||||
y = svsel (special, sv_f64 (NAN), y);
|
||||
svbool_t ret_pinf = svcmpeq_f64 (pg, x, sv_f64 (INFINITY));
|
||||
svbool_t ret_minf = svcmpeq_f64 (pg, x, sv_f64 (-1.0));
|
||||
y = svsel (ret_pinf, sv_f64 (INFINITY), y);
|
||||
return svsel (ret_minf, sv_f64 (-INFINITY), y);
|
||||
}
|
||||
|
||||
/* Vector log2p1 approximation using polynomial on reduced interval.
|
||||
Worst-case error is 3.0 ULP:
|
||||
_ZGVsMxv_log2p1(0x1.62e029c6f784fp-18) got 0x1.fff9d9148a06fp-18
|
||||
want 0x1.fff9d9148a072p-18 . */
|
||||
svfloat64_t SV_NAME_D1 (log2p1) (svfloat64_t x, const svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
svfloat64_t m = svadd_x (pg, x, 1);
|
||||
svuint64_t mi = svreinterpret_u64 (m);
|
||||
svuint64_t u = svadd_x (pg, mi, d->one_m_hf_rt2_top);
|
||||
|
||||
svint64_t ki
|
||||
= svsub_x (pg, svreinterpret_s64 (svlsr_x (pg, u, 52)), d->one_top);
|
||||
svfloat64_t k = svcvt_f64_x (pg, ki);
|
||||
|
||||
/* Reduce x to f in [sqrt(2)/2, sqrt(2)]. */
|
||||
svuint64_t utop
|
||||
= svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hf_rt2_top);
|
||||
svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, 0x00000000ffffffff));
|
||||
svfloat64_t f = svsub_x (svptrue_b64 (), svreinterpret_f64 (u_red), 1);
|
||||
|
||||
/* Correction term c/m. */
|
||||
svfloat64_t c = svsub_x (svptrue_b64 (), x, svsub_x (svptrue_b64 (), m, 1));
|
||||
svfloat64_t cm;
|
||||
cm = svdiv_x (pg, c, m);
|
||||
|
||||
svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f);
|
||||
|
||||
svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
|
||||
svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
|
||||
svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
|
||||
svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
|
||||
svfloat64_t c1719 = svld1rq (svptrue_b64 (), &d->c17);
|
||||
svfloat64_t c20_inv_ln2 = svld1rq (svptrue_b64 (), &d->c20);
|
||||
|
||||
svfloat64_t p1819 = svmla_lane_f64 (sv_f64 (d->c18), f, c1719, 1);
|
||||
svfloat64_t p1617 = svmla_lane_f64 (sv_f64 (d->c16), f, c1719, 0);
|
||||
svfloat64_t p1415 = svmla_lane_f64 (sv_f64 (d->c14), f, c1315, 1);
|
||||
svfloat64_t p1213 = svmla_lane_f64 (sv_f64 (d->c12), f, c1315, 0);
|
||||
svfloat64_t p1011 = svmla_lane_f64 (sv_f64 (d->c10), f, c911, 1);
|
||||
svfloat64_t p89 = svmla_lane_f64 (sv_f64 (d->c8), f, c911, 0);
|
||||
svfloat64_t p67 = svmla_lane_f64 (sv_f64 (d->c6), f, c57, 1);
|
||||
svfloat64_t p45 = svmla_lane_f64 (sv_f64 (d->c4), f, c57, 0);
|
||||
svfloat64_t p23 = svmla_lane_f64 (sv_f64 (d->c2), f, c13, 1);
|
||||
svfloat64_t p = svmla_lane_f64 (p1819, f2, c20_inv_ln2, 0);
|
||||
p = svmla_x (pg, p1617, f2, p);
|
||||
p = svmla_x (pg, p1415, f2, p);
|
||||
p = svmla_x (pg, p1213, f2, p);
|
||||
p = svmla_x (pg, p1011, f2, p);
|
||||
p = svmla_x (pg, p89, f2, p);
|
||||
p = svmla_x (pg, p67, f2, p);
|
||||
p = svmla_x (pg, p45, f2, p);
|
||||
p = svmla_x (pg, p23, f2, p);
|
||||
p = svmla_x (pg, sv_f64 (d->c1), f, p);
|
||||
|
||||
/* Assemble log2p1(x) = k + log2p1(f) + c/(m * ln2). */
|
||||
svfloat64_t cm_k_ln2 = svmla_lane_f64 (k, cm, c20_inv_ln2, 1);
|
||||
svbool_t special_cases
|
||||
= svorn_z (pg, svcmple (svptrue_b64 (), x, sv_f64 (-1.0)),
|
||||
svcmplt (pg, x, sv_f64 (INFINITY)));
|
||||
if (__glibc_unlikely (svptest_any (pg, special_cases)))
|
||||
return special_case (x, svmla_x (pg, cm_k_ln2, p, f), special_cases, pg);
|
||||
return svmla_x (pg, cm_k_ln2, p, f);
|
||||
}
|
||||
|
|
@ -0,0 +1,115 @@
|
|||
/* Single-precision vector (Advanced SIMD) log2p1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "v_math.h"
|
||||
|
||||
static const struct data
|
||||
{
|
||||
uint32x4_t four;
|
||||
int32x4_t three_quarters;
|
||||
float32x4_t c2, c4, c6, c8, c10, c12;
|
||||
float c1, c3, c5, c7, c9, c11, one_quarter, small;
|
||||
float32x4_t pinf, minf, nan;
|
||||
} data = {
|
||||
/* Polynomial generated using FPMinimax in [-0.25, 0.5]. */
|
||||
.c1 = 0x1.715476p0, .c2 = V4 (-0x1.71548p-1),
|
||||
.c3 = 0x1.ec718p-2, .c4 = V4 (-0x1.714fecp-2),
|
||||
.c5 = 0x1.27498ep-2, .c6 = V4 (-0x1.ecd864p-3),
|
||||
.c7 = 0x1.ace5b4p-3, .c8 = V4 (-0x1.7800fcp-3),
|
||||
.c9 = 0x1.226c92p-3, .c10 = V4 (-0x1.92cbb2p-4),
|
||||
.c11 = 0x1.624cb2p-4, .c12 = V4 (-0x1.bb0f1p-5),
|
||||
.four = V4 (0x40800000), .three_quarters = V4 (0x3f400000),
|
||||
.one_quarter = 0.25f, .small = 0x1p-23f,
|
||||
.pinf = V4 (INFINITY), .minf = V4 (-INFINITY),
|
||||
.nan = V4 (NAN)
|
||||
};
|
||||
|
||||
static inline float32x4_t
|
||||
special_case (float32x4_t x, float32x4_t y, uint32x4_t cmp,
|
||||
const struct data *d)
|
||||
{
|
||||
y = vbslq_f32 (cmp, d->nan, y);
|
||||
uint32x4_t ret_pinf = vceqq_f32 (x, d->pinf);
|
||||
uint32x4_t ret_minf = vceqq_f32 (x, v_f32 (-1.0));
|
||||
y = vbslq_f32 (ret_pinf, d->pinf, y);
|
||||
return vbslq_f32 (ret_minf, d->minf, y);
|
||||
}
|
||||
|
||||
/* Vector log2p1f approximation using polynomial on reduced interval.
|
||||
Worst-case error is 1.93 ULP:
|
||||
_ZGVnN4v_log2p1f(0x1.8789fcp-2) got 0x1.de58d4p-2
|
||||
want 0x1.de58d8p-2. */
|
||||
float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (log2p1) (float32x4_t x)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
|
||||
is in [-0.25, 0.5]):
|
||||
log2p1(x) = log2(t) + log(2^k) = log2p1(m) + k.
|
||||
|
||||
We approximate log2p1(m) with a polynomial, then scale by
|
||||
k. Instead of doing this directly, we use an intermediate
|
||||
scale factor s = 4*k to ensure the scale is representable
|
||||
as a normalised fp32 number. */
|
||||
|
||||
float32x4_t m = vaddq_f32 (x, v_f32 (1.0f));
|
||||
/* Choose k to scale x to the range [-1/4, 1/2]. */
|
||||
|
||||
int32x4_t k
|
||||
= vandq_s32 (vsubq_s32 (vreinterpretq_s32_f32 (m), d->three_quarters),
|
||||
vreinterpretq_s32_f32 (d->minf));
|
||||
uint32x4_t ku = vreinterpretq_u32_s32 (k);
|
||||
|
||||
/* Scale up to ensure that the scale factor is representable as normalised
|
||||
fp32 number, and scale m down accordingly. */
|
||||
float32x4_t s = vreinterpretq_f32_u32 (vsubq_u32 (d->four, ku));
|
||||
|
||||
/* Scale x by exponent manipulation. */
|
||||
float32x4_t m_scale
|
||||
= vreinterpretq_f32_u32 (vsubq_u32 (vreinterpretq_u32_f32 (x), ku));
|
||||
float32x4_t consts = vld1q_f32 (&d->c9);
|
||||
m_scale = vaddq_f32 (m_scale, vfmaq_laneq_f32 (v_f32 (-1.0f), s, consts, 2));
|
||||
|
||||
float32x4_t scale_back = vmulq_laneq_f32 (vcvtq_f32_s32 (k), consts, 3);
|
||||
float32x4_t m2 = vmulq_f32 (m_scale, m_scale);
|
||||
|
||||
/* Order-12 pairwise Horner. */
|
||||
float32x4_t c1357 = vld1q_f32 (&d->c1);
|
||||
float32x4_t p23 = vfmaq_laneq_f32 (d->c2, m_scale, c1357, 1);
|
||||
float32x4_t p45 = vfmaq_laneq_f32 (d->c4, m_scale, c1357, 2);
|
||||
float32x4_t p67 = vfmaq_laneq_f32 (d->c6, m_scale, c1357, 3);
|
||||
float32x4_t p89 = vfmaq_laneq_f32 (d->c8, m_scale, consts, 0);
|
||||
float32x4_t p1011 = vfmaq_laneq_f32 (d->c10, m_scale, consts, 1);
|
||||
|
||||
float32x4_t p = vfmaq_f32 (p1011, m2, d->c12);
|
||||
p = vfmaq_f32 (p89, m2, p);
|
||||
p = vfmaq_f32 (p67, m2, p);
|
||||
p = vfmaq_f32 (p45, m2, p);
|
||||
p = vfmaq_f32 (p23, m2, p);
|
||||
float32x4_t scaled_c1 = vfmaq_laneq_f32 (scale_back, m_scale, c1357, 0);
|
||||
uint32x4_t special_cases = vorrq_u32 (vmvnq_u32 (vcaltq_f32 (x, d->pinf)),
|
||||
vcleq_f32 (x, v_f32 (-1.0)));
|
||||
if (__glibc_unlikely (v_any_u32 (special_cases)))
|
||||
return special_case (x, vfmaq_f32 (scaled_c1, m2, p), special_cases, d);
|
||||
|
||||
return vfmaq_f32 (scaled_c1, m2, p);
|
||||
}
|
||||
|
||||
libmvec_hidden_def (V_NAME_F1 (log2p1))
|
||||
HALF_WIDTH_ALIAS_F1 (log2p1)
|
||||
|
|
@ -0,0 +1,111 @@
|
|||
/* Single-precision vector (SVE) log2p1 function
|
||||
|
||||
Copyright (C) 2025 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include "sv_math.h"
|
||||
|
||||
/* Constants for the single-precision SVE log2p1f kernel.

   The member order matters: the odd coefficients are read with two
   replicating 128-bit loads starting at c1 and at c9, and the elements
   are then used by lane.  The load at c9 covers four floats, so two
   floats of padding follow c11; without them the load would read
   8 bytes past the end of this object.  */
static const struct data
{
  /* Bit patterns of 4.0f and 0.75f.  */
  uint32_t four;
  int32_t three_quarters;
  /* Even-order coefficients, broadcast individually.  */
  float c2, c4, c6, c8, c10, c12;
  /* First quad: {c1, c3, c5, c7}.  */
  float c1, c3, c5, c7;
  /* Second quad: {c9, c11, pad, pad}.  */
  float c9, c11;
  float tail_pad[2];
} data = {
  /* Polynomial generated using FPMinimax in [-0.25, 0.5].  */
  .c1 = 0x1.715476p0, .c2 = -0x1.71548p-1, .c3 = 0x1.ec718p-2,
  .c4 = -0x1.714fecp-2, .c5 = 0x1.27498ep-2, .c6 = -0x1.ecd864p-3,
  .c7 = 0x1.ace5b4p-3, .c8 = -0x1.7800fcp-3, .c9 = 0x1.226c92p-3,
  .c10 = -0x1.92cbb2p-4, .c11 = 0x1.624cb2p-4, .c12 = -0x1.bb0f1p-5,
  .four = 0x40800000, .three_quarters = 0x3f400000,
  /* Never read as a value; only keeps the c9 quad in bounds.  */
  .tail_pad = { 0.0f, 0.0f }
};
|
||||
|
||||
static svfloat32_t NOINLINE
|
||||
special_case (svfloat32_t x, svfloat32_t y, const svbool_t pg,
|
||||
svbool_t special)
|
||||
{
|
||||
y = svsel (special, sv_f32 (NAN), y);
|
||||
svbool_t ret_pinf = svcmpeq_f32 (pg, x, sv_f32 (INFINITY));
|
||||
svbool_t ret_minf = svcmpeq_f32 (pg, x, sv_f32 (-1.0));
|
||||
y = svsel (ret_pinf, sv_f32 (INFINITY), y);
|
||||
return svsel (ret_minf, sv_f32 (-INFINITY), y);
|
||||
}
|
||||
|
||||
/* Vector log2p1f approximation using polynomial on reduced interval.
|
||||
Worst-case error is 1.90 ULP:
|
||||
_ZGVsMxv_log2p1f(0x1.8789fcp-2) got 0x1.de58d4p-2
|
||||
want 0x1.de58d8p-2. */
|
||||
svfloat32_t SV_NAME_F1 (log2p1) (svfloat32_t x, const svbool_t pg)
|
||||
{
|
||||
const struct data *d = ptr_barrier (&data);
|
||||
|
||||
/* With x + 1 = t * 2^k (where t = m + 1 and k is chosen such that m
|
||||
is in [-0.25, 0.5]):
|
||||
log2p1(x) = log2(t) + log(2^k) = log2p1(m) + k.
|
||||
|
||||
We approximate log2p1(m) with a polynomial, then scale by
|
||||
k. Instead of doing this directly, we use an intermediate
|
||||
scale factor s = 4*k to ensure the scale is representable
|
||||
as a normalised fp32 number. */
|
||||
|
||||
svfloat32_t m = svadd_x (svptrue_b32 (), x, 1);
|
||||
|
||||
/* Choose k to scale x to the range [-1/4, 1/2]. */
|
||||
svint32_t k = svand_x (
|
||||
pg, svsub_x (svptrue_b32 (), svreinterpret_s32 (m), d->three_quarters),
|
||||
sv_s32 (0xff800000));
|
||||
|
||||
/* Scale up to ensure that the scale factor is representable as normalised
|
||||
fp32 number, and scale m down accordingly. */
|
||||
svfloat32_t s = svreinterpret_f32 (svsubr_x (pg, k, d->four));
|
||||
|
||||
/* Scale x by exponent manipulation. */
|
||||
svfloat32_t m_scale = svreinterpret_f32_u32 (
|
||||
svsub_x (svptrue_b32 (), svreinterpret_u32 (x), svreinterpret_u32 (k)));
|
||||
m_scale = svadd_x (svptrue_b32 (), m_scale,
|
||||
svmla_x (pg, sv_f32 (-1.0f), sv_f32 (0.25f), s));
|
||||
|
||||
svfloat32_t scale_back = svmul_x (
|
||||
svptrue_b32 (), svcvt_f32_x (svptrue_b32 (), k), sv_f32 (0x1.0p-23f));
|
||||
svfloat32_t m2 = svmul_x (svptrue_b32 (), m_scale, m_scale);
|
||||
|
||||
/* Order-12 pairwise Horner. */
|
||||
svfloat32_t c1357 = svld1rq (svptrue_b32 (), &d->c1);
|
||||
svfloat32_t c911 = svld1rq (svptrue_b32 (), &d->c9);
|
||||
svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), m_scale, c1357, 1);
|
||||
svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), m_scale, c1357, 2);
|
||||
svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), m_scale, c1357, 3);
|
||||
svfloat32_t p89 = svmla_lane (sv_f32 (d->c8), m_scale, c911, 0);
|
||||
svfloat32_t p1011 = svmla_lane (sv_f32 (d->c10), m_scale, c911, 1);
|
||||
|
||||
svfloat32_t p = svmla_x (pg, p1011, m2, d->c12);
|
||||
p = svmla_x (pg, p89, m2, p);
|
||||
p = svmla_x (pg, p67, m2, p);
|
||||
p = svmla_x (pg, p45, m2, p);
|
||||
p = svmla_x (pg, p23, m2, p);
|
||||
|
||||
/* Scaling factor m_scale is muld with c1 coeff, then added to p. */
|
||||
svfloat32_t scaled_c1 = svmla_lane (scale_back, m_scale, c1357, 0);
|
||||
|
||||
/* Special cases: x <= -1, x == inf, x == nan. */
|
||||
svbool_t special_cases
|
||||
= svorn_z (pg, svcmple (svptrue_b32 (), x, sv_f32 (-1.0)),
|
||||
svcmplt (pg, x, sv_f32 (INFINITY)));
|
||||
if (__glibc_unlikely (svptest_any (pg, special_cases)))
|
||||
return special_case (x, svmla_x (pg, scaled_c1, m2, p), pg, special_cases);
|
||||
return svmla_x (pg, scaled_c1, m2, p);
|
||||
}
|
||||
|
|
@ -51,6 +51,7 @@ VPCS_VECTOR_WRAPPER (log_advsimd, _ZGVnN2v_log)
|
|||
VPCS_VECTOR_WRAPPER (log10_advsimd, _ZGVnN2v_log10)
|
||||
VPCS_VECTOR_WRAPPER (log1p_advsimd, _ZGVnN2v_log1p)
|
||||
VPCS_VECTOR_WRAPPER (log2_advsimd, _ZGVnN2v_log2)
|
||||
VPCS_VECTOR_WRAPPER (log2p1_advsimd, _ZGVnN2v_log2p1)
|
||||
VPCS_VECTOR_WRAPPER_ff (pow_advsimd, _ZGVnN2vv_pow)
|
||||
VPCS_VECTOR_WRAPPER (sin_advsimd, _ZGVnN2v_sin)
|
||||
VPCS_VECTOR_WRAPPER (sinh_advsimd, _ZGVnN2v_sinh)
|
||||
|
|
|
|||
|
|
@ -70,6 +70,7 @@ SVE_VECTOR_WRAPPER (log_sve, _ZGVsMxv_log)
|
|||
SVE_VECTOR_WRAPPER (log10_sve, _ZGVsMxv_log10)
|
||||
SVE_VECTOR_WRAPPER (log1p_sve, _ZGVsMxv_log1p)
|
||||
SVE_VECTOR_WRAPPER (log2_sve, _ZGVsMxv_log2)
|
||||
SVE_VECTOR_WRAPPER (log2p1_sve, _ZGVsMxv_log2p1)
|
||||
SVE_VECTOR_WRAPPER_ff (pow_sve, _ZGVsMxvv_pow)
|
||||
SVE_VECTOR_WRAPPER (sin_sve, _ZGVsMxv_sin)
|
||||
SVE_VECTOR_WRAPPER (sinh_sve, _ZGVsMxv_sinh)
|
||||
|
|
|
|||
|
|
@ -51,6 +51,7 @@ VPCS_VECTOR_WRAPPER (logf_advsimd, _ZGVnN4v_logf)
|
|||
VPCS_VECTOR_WRAPPER (log10f_advsimd, _ZGVnN4v_log10f)
|
||||
VPCS_VECTOR_WRAPPER (log1pf_advsimd, _ZGVnN4v_log1pf)
|
||||
VPCS_VECTOR_WRAPPER (log2f_advsimd, _ZGVnN4v_log2f)
|
||||
VPCS_VECTOR_WRAPPER (log2p1f_advsimd, _ZGVnN4v_log2p1f)
|
||||
VPCS_VECTOR_WRAPPER_ff (powf_advsimd, _ZGVnN4vv_powf)
|
||||
VPCS_VECTOR_WRAPPER (sinf_advsimd, _ZGVnN4v_sinf)
|
||||
VPCS_VECTOR_WRAPPER (sinhf_advsimd, _ZGVnN4v_sinhf)
|
||||
|
|
|
|||
|
|
@ -70,6 +70,7 @@ SVE_VECTOR_WRAPPER (logf_sve, _ZGVsMxv_logf)
|
|||
SVE_VECTOR_WRAPPER (log10f_sve, _ZGVsMxv_log10f)
|
||||
SVE_VECTOR_WRAPPER (log1pf_sve, _ZGVsMxv_log1pf)
|
||||
SVE_VECTOR_WRAPPER (log2f_sve, _ZGVsMxv_log2f)
|
||||
SVE_VECTOR_WRAPPER (log2p1f_sve, _ZGVsMxv_log2p1f)
|
||||
SVE_VECTOR_WRAPPER_ff (powf_sve, _ZGVsMxvv_powf)
|
||||
SVE_VECTOR_WRAPPER (sinf_sve, _ZGVsMxv_sinf)
|
||||
SVE_VECTOR_WRAPPER (sinhf_sve, _ZGVsMxv_sinhf)
|
||||
|
|
|
|||
|
|
@ -172,9 +172,14 @@ GLIBC_2.43 _ZGVnN2v_exp10m1 F
|
|||
GLIBC_2.43 _ZGVnN2v_exp10m1f F
|
||||
GLIBC_2.43 _ZGVnN2v_exp2m1 F
|
||||
GLIBC_2.43 _ZGVnN2v_exp2m1f F
|
||||
GLIBC_2.43 _ZGVnN2v_log2p1 F
|
||||
GLIBC_2.43 _ZGVnN2v_log2p1f F
|
||||
GLIBC_2.43 _ZGVnN4v_exp10m1f F
|
||||
GLIBC_2.43 _ZGVnN4v_exp2m1f F
|
||||
GLIBC_2.43 _ZGVnN4v_log2p1f F
|
||||
GLIBC_2.43 _ZGVsMxv_exp10m1 F
|
||||
GLIBC_2.43 _ZGVsMxv_exp10m1f F
|
||||
GLIBC_2.43 _ZGVsMxv_exp2m1 F
|
||||
GLIBC_2.43 _ZGVsMxv_exp2m1f F
|
||||
GLIBC_2.43 _ZGVsMxv_log2p1 F
|
||||
GLIBC_2.43 _ZGVsMxv_log2p1f F
|
||||
|
|
|
|||
Loading…
Reference in New Issue