diff --git a/sysdeps/arm/fpu/math_private.h b/sysdeps/arm/fpu/math_private.h
new file mode 100644
index 0000000000..b66ce65c76
--- /dev/null
+++ b/sysdeps/arm/fpu/math_private.h
@@ -0,0 +1,32 @@
+/* Configure optimized libm functions.  ARM version.
+ Copyright (C) 2017-2025 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ . */
+
+#ifndef ARM_MATH_PRIVATE_H
+#define ARM_MATH_PRIVATE_H 1
+
+#include <stdint.h>
+
+/* For int64_t to double conversion, libgcc might not respect the rounding
+ mode [1].
+
+ [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91970 */
+#define TOINT64_INTRINSICS 0
+
+#include_next <math_private.h>
+
+#endif
diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile
index bb4f59094c..11ddbd402d 100644
--- a/sysdeps/i386/Makefile
+++ b/sysdeps/i386/Makefile
@@ -13,6 +13,7 @@ CFLAGS-e_gamma_r.c += -fexcess-precision=standard
CFLAGS-s_erf.c += -fexcess-precision=standard
CFLAGS-s_erfc.c += -fexcess-precision=standard
CFLAGS-s_erf_common.c += -fexcess-precision=standard
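+# The generic fma implementation depends on exactly rounded double
+# arithmetic, so keep x87 excess precision out of it as well.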
+CFLAGS-s_fma.c += -fexcess-precision=standard
endif
ifeq ($(subdir),gmon)
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index 36a47ae7db..6a7b98e1f0 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -85,6 +85,24 @@ static inline int32_t
converttoint (double x);
#endif
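+/* TOINT64_INTRINSICS is nonzero when the plain int64_t to double cast
+   honors the current rounding mode; architectures where it does not
+   (e.g. arm) override this in their math_private.h.  */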
+#ifndef TOINT64_INTRINSICS
+# define TOINT64_INTRINSICS 1
+#endif
+
+static inline double
+convertfromint64 (int64_t a)
+{
+#if !TOINT64_INTRINSICS
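+ /* Split the conversion so that the only inexact (and hence rounding
+ mode dependent) step is the final addition: the high 32 bits are
+ converted exactly via int32_t and scaled by 0x1.0p32, while the low
+ 32 bits are OR-ed into the mantissa of 0x1.0p52, which represents
+ 2^52 + low exactly.  */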
+ union { int64_t x; double d; } low = {.d = 0x1.0p52};
+
+ double high = (int32_t)(a >> 32) * 0x1.0p32;
+ low.x |= a & INT64_C(0x00000000ffffffff);
+
+ return (high - 0x1.0p52) + low.d;
+#else
+ return a;
+#endif
+}
+
static inline uint64_t
asuint64 (double f)
{
diff --git a/sysdeps/ieee754/dbl-64/s_fma.c b/sysdeps/ieee754/dbl-64/s_fma.c
index c7ff3cf447..f89b3bb0cc 100644
--- a/sysdeps/ieee754/dbl-64/s_fma.c
+++ b/sysdeps/ieee754/dbl-64/s_fma.c
@@ -23,17 +23,51 @@
#include <math.h>
#undef dfmal
#undef f32xfmaf64
-#include <fenv.h>
-#include <ieee754.h>
-#include <math-barriers.h>
-#include <math_private.h>
#include <libm-alias-double.h>
#include <math-use-builtins.h>
-#include <tininess.h>
+#include <float.h>
+#include <stdbool.h>
-/* This implementation uses rounding to odd to avoid problems with
- double rounding. See a paper by Boldo and Melquiond:
- http://www.lri.fr/~melquion/doc/08-tc.pdf */
+
+#if !USE_FMA_BUILTIN
+# include <stdbit.h>
+# include "math_config.h"
+# include <math_uint128.h>
+
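+/* After normalize below, an exponent equal to ZEROINFNAN comes from a
+   biased exponent of 0x7ff (Inf or NaN) and a larger one from a zero
+   input.  */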
+# define ZEROINFNAN (0x7ff - EXPONENT_BIAS - MANTISSA_WIDTH - 1)
+
+struct num
+{
+ uint64_t m;
+ int e;
+ int sign;
+};
+
+static inline struct num
+normalize (double x)
+{
+ uint64_t ix = asuint64 (x);
+ int e = ix >> MANTISSA_WIDTH;
+ int sign = e & 0x800;
+ e &= 0x7ff;
+ if (!e)
+ {
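+ /* Subnormal or zero: scale by 0x1p63 to obtain a normal significand;
+ a zero input keeps e == 0 and is mapped to 0x800, above
+ ZEROINFNAN.  */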
+ ix = asuint64 (x * 0x1p63);
+ e = ix >> MANTISSA_WIDTH & 0x7ff;
+ e = e ? e - 63 : 0x800;
+ }
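+ /* Insert the implicit bit and shift the significand up once: its
+ leading bit lands at bit 53, so the top 10 bits and the low bit of
+ m are zero and m * 2^e == |x|.  */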
+ ix &= (UINT64_C(1) << MANTISSA_WIDTH) - 1;
+ ix |= UINT64_C(1) << MANTISSA_WIDTH;
+ ix <<= 1;
+ e -= EXPONENT_BIAS + MANTISSA_WIDTH + 1;
+ return (struct num) {ix, e, sign};
+}
+
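+/* Compute the full 128-bit product of x and y; return the high 64 bits
+   in *hi and the low 64 bits in *lo.  */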
+static void
+mul (uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
+{
+ u128 r = u128_mul (u128_from_u64 (x), u128_from_u64 (y));
+ *hi = u128_high (r);
+ *lo = u128_low (r);
+}
+#endif
double
__fma (double x, double y, double z)
@@ -41,271 +75,171 @@ __fma (double x, double y, double z)
#if USE_FMA_BUILTIN
return __builtin_fma (x, y, z);
#else
- /* Use generic implementation. */
- union ieee754_double u, v, w;
- int adjust = 0;
- u.d = x;
- v.d = y;
- w.d = z;
- if (__builtin_expect (u.ieee.exponent + v.ieee.exponent
- >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG, 0)
- || __builtin_expect (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG, 0)
- || __builtin_expect (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG, 0)
- || __builtin_expect (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG, 0)
- || __builtin_expect (u.ieee.exponent + v.ieee.exponent
- <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG, 0))
+ /* Normalize so that the top 10 bits and the last bit of each
+ significand are 0.  */
+ struct num nx, ny, nz;
+ nx = normalize (x);
+ ny = normalize (y);
+ nz = normalize (z);
+
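+ /* If x or y is zero, Inf or NaN, the plain expression x * y + z
+ already produces the correct result and exceptions.  */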
+ if (nx.e >= ZEROINFNAN || ny.e >= ZEROINFNAN)
+ return x * y + z;
+ if (nz.e >= ZEROINFNAN)
{
- /* If z is Inf, but x and y are finite, the result should be
- z rather than NaN. */
- if (w.ieee.exponent == 0x7ff
- && u.ieee.exponent != 0x7ff
- && v.ieee.exponent != 0x7ff)
- return (z + x) + y;
- /* If z is zero and x are y are nonzero, compute the result
- as x * y to avoid the wrong sign of a zero result if x * y
- underflows to 0. */
- if (z == 0 && x != 0 && y != 0)
+ if (nz.e > ZEROINFNAN)
+ /* z == 0: compute x * y alone so that a zero result carries the
+ sign of the product rather than of z.  */
return x * y;
- /* If x or y or z is Inf/NaN, or if x * y is zero, compute as
- x * y + z. */
- if (u.ieee.exponent == 0x7ff
- || v.ieee.exponent == 0x7ff
- || w.ieee.exponent == 0x7ff
- || x == 0
- || y == 0)
- return x * y + z;
- /* If fma will certainly overflow, compute as x * y. */
- if (u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS)
- return x * y;
- /* If x * y is less than 1/4 of DBL_TRUE_MIN, neither the
- result nor whether there is underflow depends on its exact
- value, only on its sign. */
- if (u.ieee.exponent + v.ieee.exponent
- < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2)
+ else if (isnan (z))
+ return __builtin_nan ("");
+ return z;
+ }
+
+ /* mul: r = x * y.  */
+ uint64_t rhi, rlo, zhi, zlo;
+ mul (&rhi, &rlo, nx.m, ny.m);
+ /* Either the top 20 or 21 bits of rhi and the last 2 bits of rlo
+ are 0.  */
+
+ /* Align the exponents.  */
+ int e = nx.e + ny.e;
+ int d = nz.e - e;
+ /* Shift bits z<<=kz, r>>=kr, so kz+kr == d, set e = e+kr (== ez-kz). */
+ if (d > 0)
+ {
+ if (d < 64)
{
- int neg = u.ieee.negative ^ v.ieee.negative;
- double tiny = neg ? -0x1p-1074 : 0x1p-1074;
- if (w.ieee.exponent >= 3)
- return tiny + z;
- /* Scaling up, adding TINY and scaling down produces the
- correct result, because in round-to-nearest mode adding
- TINY has no effect and in other modes double rounding is
- harmless. But it may not produce required underflow
- exceptions. */
- v.d = z * 0x1p54 + tiny;
- if (TININESS_AFTER_ROUNDING
- ? v.ieee.exponent < 55
- : (w.ieee.exponent == 0
- || (w.ieee.exponent == 1
- && w.ieee.negative != neg
- && w.ieee.mantissa1 == 0
- && w.ieee.mantissa0 == 0)))
+ zlo = nz.m << d;
+ zhi = nz.m >> (64 - d);
+ }
+ else
+ {
+ zlo = 0;
+ zhi = nz.m;
+ e = nz.e - 64;
+ d -= 64;
if (d == 0)
; /* Nothing to shift: r is already aligned.  */
else if (d < 64)
{
- double force_underflow = x * y;
- math_force_eval (force_underflow);
+ rlo = rhi << (64 - d) | rlo >> d | !!(rlo << (64 - d));
+ rhi = rhi >> d;
}
- return v.d * 0x1p-54;
- }
- if (u.ieee.exponent + v.ieee.exponent
- >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
- {
- /* Compute 1p-53 times smaller result and multiply
- at the end. */
- if (u.ieee.exponent > v.ieee.exponent)
- u.ieee.exponent -= DBL_MANT_DIG;
else
- v.ieee.exponent -= DBL_MANT_DIG;
- /* If x + y exponent is very large and z exponent is very small,
- it doesn't matter if we don't adjust it. */
- if (w.ieee.exponent > DBL_MANT_DIG)
- w.ieee.exponent -= DBL_MANT_DIG;
- adjust = 1;
- }
- else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
- {
- /* Similarly.
- If z exponent is very large and x and y exponents are
- very small, adjust them up to avoid spurious underflows,
- rather than down. */
- if (u.ieee.exponent + v.ieee.exponent
- <= IEEE754_DOUBLE_BIAS + 2 * DBL_MANT_DIG)
{
- if (u.ieee.exponent > v.ieee.exponent)
- u.ieee.exponent += 2 * DBL_MANT_DIG + 2;
- else
- v.ieee.exponent += 2 * DBL_MANT_DIG + 2;
+ rlo = 1;
+ rhi = 0;
}
- else if (u.ieee.exponent > v.ieee.exponent)
- {
- if (u.ieee.exponent > DBL_MANT_DIG)
- u.ieee.exponent -= DBL_MANT_DIG;
- }
- else if (v.ieee.exponent > DBL_MANT_DIG)
- v.ieee.exponent -= DBL_MANT_DIG;
- w.ieee.exponent -= DBL_MANT_DIG;
- adjust = 1;
}
- else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
- {
- u.ieee.exponent -= DBL_MANT_DIG;
- if (v.ieee.exponent)
- v.ieee.exponent += DBL_MANT_DIG;
- else
- v.d *= 0x1p53;
- }
- else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
- {
- v.ieee.exponent -= DBL_MANT_DIG;
- if (u.ieee.exponent)
- u.ieee.exponent += DBL_MANT_DIG;
- else
- u.d *= 0x1p53;
- }
- else /* if (u.ieee.exponent + v.ieee.exponent
- <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */
- {
- if (u.ieee.exponent > v.ieee.exponent)
- u.ieee.exponent += 2 * DBL_MANT_DIG + 2;
- else
- v.ieee.exponent += 2 * DBL_MANT_DIG + 2;
- if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 6)
- {
- if (w.ieee.exponent)
- w.ieee.exponent += 2 * DBL_MANT_DIG + 2;
- else
- w.d *= 0x1p108;
- adjust = -1;
- }
- /* Otherwise x * y should just affect inexact
- and nothing else. */
- }
- x = u.d;
- y = v.d;
- z = w.d;
- }
-
- /* Ensure correct sign of exact 0 + 0. */
- if (__glibc_unlikely ((x == 0 || y == 0) && z == 0))
- {
- x = math_opt_barrier (x);
- return x * y + z;
- }
-
- fenv_t env;
- libc_feholdexcept_setround (&env, FE_TONEAREST);
-
- /* Multiplication m1 + m2 = x * y using Dekker's algorithm. */
-#define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
- double x1 = x * C;
- double y1 = y * C;
- double m1 = x * y;
- x1 = (x - x1) + x1;
- y1 = (y - y1) + y1;
- double x2 = x - x1;
- double y2 = y - y1;
- double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
-
- /* Addition a1 + a2 = z + m1 using Knuth's algorithm. */
- double a1 = z + m1;
- double t1 = a1 - z;
- double t2 = a1 - t1;
- t1 = m1 - t1;
- t2 = z - t2;
- double a2 = t1 + t2;
- /* Ensure the arithmetic is not scheduled after feclearexcept call. */
- math_force_eval (m2);
- math_force_eval (a2);
- __feclearexcept (FE_INEXACT);
-
- /* If the result is an exact zero, ensure it has the correct sign. */
- if (a1 == 0 && m2 == 0)
- {
- libc_feupdateenv (&env);
- /* Ensure that round-to-nearest value of z + m1 is not reused. */
- z = math_opt_barrier (z);
- return z + m1;
- }
-
- libc_fesetround (FE_TOWARDZERO);
-
- /* Perform m2 + a2 addition with round to odd. */
- u.d = a2 + m2;
-
- if (__glibc_unlikely (adjust < 0))
- {
- if ((u.ieee.mantissa1 & 1) == 0)
- u.ieee.mantissa1 |= libc_fetestexcept (FE_INEXACT) != 0;
- v.d = a1 + u.d;
- /* Ensure the addition is not scheduled after fetestexcept call. */
- math_force_eval (v.d);
- }
-
- /* Reset rounding mode and test for inexact simultaneously. */
- int j = libc_feupdateenv_test (&env, FE_INEXACT) != 0;
-
- /* Ensure value of a1 + u.d is not reused. */
- a1 = math_opt_barrier (a1);
-
- if (__glibc_likely (adjust == 0))
- {
- if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
- u.ieee.mantissa1 |= j;
- /* Result is a1 + u.d. */
- return a1 + u.d;
- }
- else if (__glibc_likely (adjust > 0))
- {
- if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
- u.ieee.mantissa1 |= j;
- /* Result is a1 + u.d, scaled up. */
- return (a1 + u.d) * 0x1p53;
}
else
{
- /* If a1 + u.d is exact, the only rounding happens during
- scaling down. */
- if (j == 0)
- return v.d * 0x1p-108;
- /* If result rounded to zero is not subnormal, no double
- rounding will occur. */
- if (v.ieee.exponent > 108)
- return (a1 + u.d) * 0x1p-108;
- /* If v.d * 0x1p-108 with round to zero is a subnormal above
- or equal to DBL_MIN / 2, then v.d * 0x1p-108 shifts mantissa
- down just by 1 bit, which means v.ieee.mantissa1 |= j would
- change the round bit, not sticky or guard bit.
- v.d * 0x1p-108 never normalizes by shifting up,
- so round bit plus sticky bit should be already enough
- for proper rounding. */
- if (v.ieee.exponent == 108)
- {
- /* If the exponent would be in the normal range when
- rounding to normal precision with unbounded exponent
- range, the exact result is known and spurious underflows
- must be avoided on systems detecting tininess after
- rounding. */
- if (TININESS_AFTER_ROUNDING)
- {
- w.d = a1 + u.d;
- if (w.ieee.exponent == 109)
- return w.d * 0x1p-108;
- }
- /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
- v.ieee.mantissa1 & 1 is the round bit and j is our sticky
- bit. */
- w.d = 0.0;
- w.ieee.mantissa1 = ((v.ieee.mantissa1 & 3) << 1) | j;
- w.ieee.negative = v.ieee.negative;
- v.ieee.mantissa1 &= ~3U;
- v.d *= 0x1p-108;
- w.d *= 0x1p-2;
- return v.d + w.d;
- }
- v.ieee.mantissa1 |= j;
- return v.d * 0x1p-108;
+ zhi = 0;
+ d = -d;
+ if (d == 0)
+ zlo = nz.m;
+ else if (d < 64)
+ zlo = nz.m >> d | !!(nz.m << (64 - d));
+ else
+ zlo = 1;
}
+
+ /* Add r and z, taking the signs into account.  */
+ int sign = nx.sign ^ ny.sign;
+ bool samesign = !(sign ^ nz.sign);
+ bool nonzero = true;
+ if (samesign)
+ {
+ /* r += z */
+ rlo += zlo;
+ rhi += zhi + (rlo < zlo);
+ }
+ else
+ {
+ /* r -= z */
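+ /* (t < rlo) after the wrapping subtraction is the borrow out of the
+ low 64 bits.  */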
+ uint64_t t = rlo;
+ rlo -= zlo;
+ rhi = rhi - zhi - (t < rlo);
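+ /* A set sign bit means the subtraction went negative: negate the
+ 128-bit value and flip the result sign.  */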
+ if (rhi >> 63)
+ {
+ rlo = -rlo;
+ rhi = -rhi - !!rlo;
+ sign = !sign;
+ }
+ nonzero = !!rhi;
+ }
+
+ /* Set rhi to the top 63 bits of the result (the last bit is
+ sticky).  */
+ if (nonzero)
+ {
+ e += 64;
+ d = stdc_leading_zeros (rhi) - 1;
+ /* Note: d > 0, since the top bits of rhi are zero here.  */
+ rhi = rhi << d | rlo >> (64 - d) | !!(rlo << d);
+ }
+ else if (rlo)
+ {
+ d = stdc_leading_zeros (rlo) - 1;
+ if (d < 0)
+ rhi = rlo >> 1 | (rlo & 1);
+ else
+ rhi = rlo << d;
+ }
+ else
+ {
+ /* Exact +-0: let x * y + z produce the correctly signed zero for
+ the current rounding mode.  */
+ return x * y + z;
+ }
+ e -= d;
+
+ /* Convert to double. */
+ int64_t i = rhi; /* i is in [1<<62,(1<<63)-1] */
+ if (sign)
+ i = -i;
+ double r = convertfromint64 (i); /* |r| is in [0x1p62,0x1p63] */
+
+ if (e < -1022 - 62)
+ {
+ /* Result is subnormal before rounding. */
+ if (e == -1022 - 63)
+ {
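+ /* The scaled result lies in [0x1p-1023, 0x1p-1022]: rounding may
+ carry it up to the smallest normal, where raising underflow
+ depends on how the architecture detects tininess.  */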
+ double c = 0x1p63;
+ if (sign)
+ c = -c;
+ if (r == c)
+ {
+ /* Min normal after rounding, underflow depends
+ on arch behaviour which can be imitated by
+ a double to float conversion. */
+ float fltmin = 0x0.ffffff8p-63 * FLT_MIN * r;
+ return DBL_MIN / FLT_MIN * fltmin;
+ }
+ /* One bit is lost when scaled, add another top bit to
+ only round once at conversion if it is inexact. */
+ if (rhi << 53)
+ {
+ i = rhi >> 1 | (rhi & 1) | 1ull << 62;
+ if (sign)
+ i = -i;
+ r = convertfromint64 (i);
+ r = 2 * r - c; /* remove top bit */
+
+ /* Raise underflow portably, such that it
+ cannot be optimized away. */
+ {
+ double_t tiny = DBL_MIN / FLT_MIN * r;
+ r += (double) (tiny * tiny) * (r - r);
+ }
+ }
+ }
+ else
+ {
+ /* Only round once when scaled. */
+ d = 10;
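+ /* Keep 53 significant bits plus a sticky bit (63 - 53 == 10) so the
+ int64_t to double conversion is exact and only the final __scalbn
+ rounds.  */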
+ i = (rhi >> d | !!(rhi << (64 - d))) << d;
+ if (sign)
+ i = -i;
+ r = convertfromint64 (i);
+ }
+ }
+ return __scalbn (r, e);
#endif /* ! USE_FMA_BUILTIN */
}
+
#ifndef __fma
libm_alias_double (__fma, fma)
libm_alias_double_narrow (__fma, fma)