math: New generic fma implementation

The current implementation relies on setting the rounding mode for
different calculations (first to FE_TONEAREST and then to FE_TOWARDZERO)
to obtain correctly rounded results.  On most CPUs this adds significant
performance overhead: reading and writing the floating-point status is
typically slow, it may require flushing the pipeline, and it defeats
some compiler assumptions and optimizations.
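
(Not from this patch: a minimal sketch of the fenv-based pattern being
replaced, assuming only C99 <fenv.h>.  Every call pays for two
rounding-mode switches plus the save/restore of the environment.)

    #include <fenv.h>

    static double
    old_style_kernel (double x, double y)
    {
      fenv_t env;
      feholdexcept (&env);        /* Save environment, clear flags.  */
      fesetround (FE_TONEAREST);  /* First pass: round to nearest.  */
      double hi = x * y;
      fesetround (FE_TOWARDZERO); /* Second pass: round toward zero.  */
      double lo = x * y - hi;     /* Placeholder for the real kernel.  */
      feupdateenv (&env);         /* Restore mode, re-raise exceptions.  */
      return hi + lo;
    }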

This patch introduces a new implementation, originally written by
Szabolcs Nagy for musl, that uses mostly integer arithmetic.
Floating-point arithmetic is used only to raise the expected exceptions,
without the need for fenv.h operations.
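
(Illustration, not patch code: how ordinary arithmetic can raise the
required flags.  A product of two DBL_MIN values underflows and is
inexact, so evaluating it for its side effect sets FE_UNDERFLOW and
FE_INEXACT without any <fenv.h> call; the volatile qualifier keeps the
compiler from folding it away.)

    #include <float.h>

    static void
    raise_underflow_inexact (void)
    {
      volatile double tiny = DBL_MIN;
      volatile double unused = tiny * tiny; /* Underflows to 0 (inexact):
                                               raises FE_UNDERFLOW and
                                               FE_INEXACT.  */
      (void) unused;
    }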

I added some changes compared to the original code:

  * Fixed some signaling NaN issues when the third argument is NaN.

  * Use math_uint128.h for the 64-bit multiplication operation.  It allows
    the compiler to use 128-bit types where available, which enables some
    optimizations on certain targets (for instance, MIPS64).  A sketch of
    the idea follows this list.

  * Fixed an arm32 issue where the libgcc int64_t-to-double conversion
    routine might not respect the rounding mode [1].  The workaround can
    also be used on other targets to optimize the conversion from
    int64_t to double.

  * Use -fexcess-precision=standard on i686.
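
(Sketch for the math_uint128.h item above: such a helper presumably
reduces to a single widening multiply where the compiler has __int128,
with a portable 32x32 schoolbook fallback elsewhere.  The mul64x64 name
is illustrative, not the exact glibc API.)

    #include <stdint.h>

    static void
    mul64x64 (uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
    {
    #ifdef __SIZEOF_INT128__
      unsigned __int128 r = (unsigned __int128) x * y;
      *hi = r >> 64;
      *lo = (uint64_t) r;
    #else
      /* Schoolbook partial products: x*y = p11<<64 + (p01+p10)<<32 + p00.  */
      uint64_t x0 = x & 0xffffffff, x1 = x >> 32;
      uint64_t y0 = y & 0xffffffff, y1 = y >> 32;
      uint64_t p00 = x0 * y0, p01 = x0 * y1;
      uint64_t p10 = x1 * y0, p11 = x1 * y1;
      /* Carry-save sum of the middle partial products; cannot overflow.  */
      uint64_t mid = (p00 >> 32) + (p01 & 0xffffffff) + (p10 & 0xffffffff);
      *lo = (mid << 32) | (p00 & 0xffffffff);
      *hi = p11 + (p01 >> 32) + (p10 >> 32) + (mid >> 32);
    #endif
    }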

I tested this implementation on various targets (x86_64, i686, arm,
aarch64, powerpc), including runs with the compiler's native fma
instructions manually disabled so the generic code paths are exercised.

Performance-wise, it shows large improvements:

reciprocal-throughput       master       patched   improvement
x86_64 [2]                289.4640       22.4396        12.90x
i686 [2]                  636.8660      169.3640         3.76x
aarch64 [3]                46.0020       11.3281         4.06x
armhf [3]                  63.9890       26.5056         2.41x
powerpc [4]                23.9332        6.40205        3.74x

latency                     master       patched   improvement
x86_64 [2]                293.7360       38.1478         7.70x
i686 [2]                  658.4160      187.9940         3.50x
aarch64 [3]                44.5166       14.7157         3.03x
armhf [3]                  63.7678       28.4116         2.24x
powerpc [4]               23.8561        11.4250         2.09x

Checked on x86_64-linux-gnu and i686-linux-gnu with --disable-multi-arch,
and on arm-linux-gnueabihf.

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91970
[2] gcc 15.2.1, Zen3
[3] gcc 15.2.1, Neoverse N1
[4] gcc 15.2.1, POWER10

Signed-off-by: Szabolcs Nagy <nsz@gcc.gnu.org>
Co-authored-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
4 changed files with 243 additions and 258 deletions

@@ -0,0 +1,32 @@
/* Configure optimized libm functions.  ARM version.
   Copyright (C) 2017-2025 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#ifndef ARM_MATH_PRIVATE_H
#define ARM_MATH_PRIVATE_H 1

#include <stdint.h>

/* For int64_t to double conversion, libgcc might not respect the rounding
   mode [1].

   [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=91970  */
#define TOINT64_INTRINSICS 0

#include_next <math_private.h>

#endif
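
(A standalone restatement of the technique this knob selects; the same
arithmetic appears as convertfromint64 in the math_config.h hunk below.
Splitting the value makes every step exact except the final addition,
so only one rounding happens, and it honours the current rounding mode
even where libgcc's __floatdidf does not.)

    #include <stdint.h>

    static double
    int64_to_double (int64_t a)
    {
      /* 2^52 + (low 32 bits of a), built exactly via the bit pattern.  */
      union { int64_t x; double d; } low = { .d = 0x1.0p52 };
      low.x |= a & INT64_C (0x00000000ffffffff);
      /* High 32 bits: a 31-bit significand scaled by 2^32, exact.  */
      double high = (int32_t) (a >> 32) * 0x1.0p32;
      /* (high - 2^52) is exact; the final addition rounds once.  */
      return (high - 0x1.0p52) + low.d;
    }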

@@ -13,6 +13,7 @@ CFLAGS-e_gamma_r.c += -fexcess-precision=standard
CFLAGS-s_erf.c += -fexcess-precision=standard
CFLAGS-s_erfc.c += -fexcess-precision=standard
CFLAGS-s_erf_common.c += -fexcess-precision=standard
CFLAGS-s_fma.c += -fexcess-precision=standard
endif
ifeq ($(subdir),gmon)

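(Background for the new i686 CFLAGS line, not code from the patch: the
generic s_fma.c depends on error-free double arithmetic, which x87
excess precision silently breaks.  -fexcess-precision=standard forces a
rounding to double at each assignment, so identities like Fast2Sum hold
again.  Hypothetical example, assuming |a| >= |b|.)

    /* With x87 excess precision, s may live in an 80-bit register and
       e = b - (s - a) is no longer the exact rounding error of a + b.
       Under -fexcess-precision=standard each assignment rounds to
       double, restoring the Fast2Sum invariant a + b == s + e.  */
    static double
    sum_error (double a, double b)
    {
      double s = a + b;
      double e = b - (s - a);
      return e;
    }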
@@ -85,6 +85,24 @@ static inline int32_t
converttoint (double x);
#endif

#ifndef TOINT64_INTRINSICS
# define TOINT64_INTRINSICS 1
#endif

static inline double convertfromint64 (int64_t a)
{
#if !TOINT64_INTRINSICS
  union { int64_t x; double d; } low = {.d = 0x1.0p52};
  double high = (int32_t)(a >> 32) * 0x1.0p32;
  low.x |= a & INT64_C(0x00000000ffffffff);
  return (high - 0x1.0p52) + low.d;
#else
  return a;
#endif
}

static inline uint64_t
asuint64 (double f)
{
@@ -23,17 +23,51 @@
#include <math.h>
#undef dfmal
#undef f32xfmaf64
#include <fenv.h>
#include <ieee754.h>
#include <math-barriers.h>
#include <fenv_private.h>
#include <libm-alias-double.h>
#include <math-narrow-alias.h>
#include <tininess.h>
#include <math-use-builtins.h>
/* This implementation uses rounding to odd to avoid problems with
   double rounding.  See a paper by Boldo and Melquiond:
   http://www.lri.fr/~melquion/doc/08-tc.pdf  */
#if !USE_FMA_BUILTIN
# include <stdbit.h>
# include "math_config.h"
# include <math_uint128.h>
# define ZEROINFNAN (0x7ff - EXPONENT_BIAS - MANTISSA_WIDTH - 1)
struct num
{
  uint64_t m;
  int e;
  int sign;
};

static inline struct num normalize (double x)
{
  uint64_t ix = asuint64 (x);
  int e = ix >> MANTISSA_WIDTH;
  int sign = e & 0x800;
  e &= 0x7ff;
  if (!e)
    {
      ix = asuint64 (x * 0x1p63);
      e = ix >> MANTISSA_WIDTH & 0x7ff;
      e = e ? e-63 : 0x800;
    }
  ix &= (UINT64_C(1) << MANTISSA_WIDTH) - 1;
  ix |= UINT64_C(1) << MANTISSA_WIDTH;
  ix <<= 1;
  e -= EXPONENT_BIAS + MANTISSA_WIDTH + 1;
  return (struct num){ix,e,sign};
}

static void mul (uint64_t *hi, uint64_t *lo, uint64_t x, uint64_t y)
{
  u128 r = u128_mul (u128_from_u64 (x), u128_from_u64 (y));
  *hi = u128_high (r);
  *lo = u128_low (r);
}
#endif
double
__fma (double x, double y, double z)
@@ -41,271 +75,171 @@ __fma (double x, double y, double z)
#if USE_FMA_BUILTIN
  return __builtin_fma (x, y, z);
#else
  /* Use generic implementation.  */
  union ieee754_double u, v, w;
  int adjust = 0;
  u.d = x;
  v.d = y;
  w.d = z;
  if (__builtin_expect (u.ieee.exponent + v.ieee.exponent
                        >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG, 0)
      || __builtin_expect (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG, 0)
      || __builtin_expect (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG, 0)
      || __builtin_expect (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG, 0)
      || __builtin_expect (u.ieee.exponent + v.ieee.exponent
                           <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG, 0))
  /* Normalize so top 10bits and last bit are 0.  */
  struct num nx, ny, nz;
  nx = normalize (x);
  ny = normalize (y);
  nz = normalize (z);
  if (nx.e >= ZEROINFNAN || ny.e >= ZEROINFNAN)
    return x * y + z;
  if (nz.e >= ZEROINFNAN)
    {
      /* If z is Inf, but x and y are finite, the result should be
         z rather than NaN.  */
      if (w.ieee.exponent == 0x7ff
          && u.ieee.exponent != 0x7ff
          && v.ieee.exponent != 0x7ff)
        return (z + x) + y;
      /* If z is zero and x and y are nonzero, compute the result
         as x * y to avoid the wrong sign of a zero result if x * y
         underflows to 0.  */
      if (z == 0 && x != 0 && y != 0)
      if (nz.e > ZEROINFNAN) /* z==0 */
        return x * y;
      /* If x or y or z is Inf/NaN, or if x * y is zero, compute as
         x * y + z.  */
      if (u.ieee.exponent == 0x7ff
          || v.ieee.exponent == 0x7ff
          || w.ieee.exponent == 0x7ff
          || x == 0
          || y == 0)
        return x * y + z;
      /* If fma will certainly overflow, compute as x * y.  */
      if (u.ieee.exponent + v.ieee.exponent > 0x7ff + IEEE754_DOUBLE_BIAS)
        return x * y;
      /* If x * y is less than 1/4 of DBL_TRUE_MIN, neither the
         result nor whether there is underflow depends on its exact
         value, only on its sign.  */
      if (u.ieee.exponent + v.ieee.exponent
          < IEEE754_DOUBLE_BIAS - DBL_MANT_DIG - 2)
      else if (isnan (z))
        return __builtin_nan ("");
      return z;
    }
  /* mul: r = x*y */
  uint64_t rhi, rlo, zhi, zlo;
  mul (&rhi, &rlo, nx.m, ny.m);
  /* Either top 20 or 21 bits of rhi and last 2 bits of rlo are 0.  */
  /* align exponents */
  int e = nx.e + ny.e;
  int d = nz.e - e;
  /* Shift bits z<<=kz, r>>=kr, so kz+kr == d, set e = e+kr (== ez-kz).  */
  if (d > 0)
    {
      if (d < 64)
        {
          int neg = u.ieee.negative ^ v.ieee.negative;
          double tiny = neg ? -0x1p-1074 : 0x1p-1074;
          if (w.ieee.exponent >= 3)
            return tiny + z;
          /* Scaling up, adding TINY and scaling down produces the
             correct result, because in round-to-nearest mode adding
             TINY has no effect and in other modes double rounding is
             harmless.  But it may not produce required underflow
             exceptions.  */
          v.d = z * 0x1p54 + tiny;
          if (TININESS_AFTER_ROUNDING
              ? v.ieee.exponent < 55
              : (w.ieee.exponent == 0
                 || (w.ieee.exponent == 1
                     && w.ieee.negative != neg
                     && w.ieee.mantissa1 == 0
                     && w.ieee.mantissa0 == 0)))
          zlo = nz.m << d;
          zhi = nz.m >> (64 - d);
        }
      else
        {
          zlo = 0;
          zhi = nz.m;
          e = nz.e - 64;
          d -= 64;
          if (d < 64)
            {
              double force_underflow = x * y;
              math_force_eval (force_underflow);
              rlo = rhi << (64 - d) | rlo >> d | !!(rlo << (64 - d));
              rhi = rhi >> d;
            }
          return v.d * 0x1p-54;
        }
      if (u.ieee.exponent + v.ieee.exponent
          >= 0x7ff + IEEE754_DOUBLE_BIAS - DBL_MANT_DIG)
        {
          /* Compute 1p-53 times smaller result and multiply
             at the end.  */
          if (u.ieee.exponent > v.ieee.exponent)
            u.ieee.exponent -= DBL_MANT_DIG;
          else
            v.ieee.exponent -= DBL_MANT_DIG;
          /* If x + y exponent is very large and z exponent is very small,
             it doesn't matter if we don't adjust it.  */
          if (w.ieee.exponent > DBL_MANT_DIG)
            w.ieee.exponent -= DBL_MANT_DIG;
          adjust = 1;
        }
      else if (w.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
        {
          /* Similarly.
             If z exponent is very large and x and y exponents are
             very small, adjust them up to avoid spurious underflows,
             rather than down.  */
          if (u.ieee.exponent + v.ieee.exponent
              <= IEEE754_DOUBLE_BIAS + 2 * DBL_MANT_DIG)
            {
              if (u.ieee.exponent > v.ieee.exponent)
                u.ieee.exponent += 2 * DBL_MANT_DIG + 2;
              else
                v.ieee.exponent += 2 * DBL_MANT_DIG + 2;
              rlo = 1;
              rhi = 0;
            }
          else if (u.ieee.exponent > v.ieee.exponent)
            {
              if (u.ieee.exponent > DBL_MANT_DIG)
                u.ieee.exponent -= DBL_MANT_DIG;
            }
          else if (v.ieee.exponent > DBL_MANT_DIG)
            v.ieee.exponent -= DBL_MANT_DIG;
          w.ieee.exponent -= DBL_MANT_DIG;
          adjust = 1;
        }
      else if (u.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
        {
          u.ieee.exponent -= DBL_MANT_DIG;
          if (v.ieee.exponent)
            v.ieee.exponent += DBL_MANT_DIG;
          else
            v.d *= 0x1p53;
        }
      else if (v.ieee.exponent >= 0x7ff - DBL_MANT_DIG)
        {
          v.ieee.exponent -= DBL_MANT_DIG;
          if (u.ieee.exponent)
            u.ieee.exponent += DBL_MANT_DIG;
          else
            u.d *= 0x1p53;
        }
      else /* if (u.ieee.exponent + v.ieee.exponent
              <= IEEE754_DOUBLE_BIAS + DBL_MANT_DIG) */
        {
          if (u.ieee.exponent > v.ieee.exponent)
            u.ieee.exponent += 2 * DBL_MANT_DIG + 2;
          else
            v.ieee.exponent += 2 * DBL_MANT_DIG + 2;
          if (w.ieee.exponent <= 4 * DBL_MANT_DIG + 6)
            {
              if (w.ieee.exponent)
                w.ieee.exponent += 2 * DBL_MANT_DIG + 2;
              else
                w.d *= 0x1p108;
              adjust = -1;
            }
          /* Otherwise x * y should just affect inexact
             and nothing else.  */
        }
      x = u.d;
      y = v.d;
      z = w.d;
    }
  /* Ensure correct sign of exact 0 + 0.  */
  if (__glibc_unlikely ((x == 0 || y == 0) && z == 0))
    {
      x = math_opt_barrier (x);
      return x * y + z;
    }
  fenv_t env;
  libc_feholdexcept_setround (&env, FE_TONEAREST);
  /* Multiplication m1 + m2 = x * y using Dekker's algorithm.  */
#define C ((1 << (DBL_MANT_DIG + 1) / 2) + 1)
  double x1 = x * C;
  double y1 = y * C;
  double m1 = x * y;
  x1 = (x - x1) + x1;
  y1 = (y - y1) + y1;
  double x2 = x - x1;
  double y2 = y - y1;
  double m2 = (((x1 * y1 - m1) + x1 * y2) + x2 * y1) + x2 * y2;
  /* Addition a1 + a2 = z + m1 using Knuth's algorithm.  */
  double a1 = z + m1;
  double t1 = a1 - z;
  double t2 = a1 - t1;
  t1 = m1 - t1;
  t2 = z - t2;
  double a2 = t1 + t2;
  /* Ensure the arithmetic is not scheduled after feclearexcept call.  */
  math_force_eval (m2);
  math_force_eval (a2);
  __feclearexcept (FE_INEXACT);
  /* If the result is an exact zero, ensure it has the correct sign.  */
  if (a1 == 0 && m2 == 0)
    {
      libc_feupdateenv (&env);
      /* Ensure that round-to-nearest value of z + m1 is not reused.  */
      z = math_opt_barrier (z);
      return z + m1;
    }
  libc_fesetround (FE_TOWARDZERO);
  /* Perform m2 + a2 addition with round to odd.  */
  u.d = a2 + m2;
  if (__glibc_unlikely (adjust < 0))
    {
      if ((u.ieee.mantissa1 & 1) == 0)
        u.ieee.mantissa1 |= libc_fetestexcept (FE_INEXACT) != 0;
      v.d = a1 + u.d;
      /* Ensure the addition is not scheduled after fetestexcept call.  */
      math_force_eval (v.d);
    }
  /* Reset rounding mode and test for inexact simultaneously.  */
  int j = libc_feupdateenv_test (&env, FE_INEXACT) != 0;
  /* Ensure value of a1 + u.d is not reused.  */
  a1 = math_opt_barrier (a1);
  if (__glibc_likely (adjust == 0))
    {
      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
        u.ieee.mantissa1 |= j;
      /* Result is a1 + u.d.  */
      return a1 + u.d;
    }
  else if (__glibc_likely (adjust > 0))
    {
      if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
        u.ieee.mantissa1 |= j;
      /* Result is a1 + u.d, scaled up.  */
      return (a1 + u.d) * 0x1p53;
    }
  else
    {
      /* If a1 + u.d is exact, the only rounding happens during
         scaling down.  */
      if (j == 0)
        return v.d * 0x1p-108;
      /* If result rounded to zero is not subnormal, no double
         rounding will occur.  */
      if (v.ieee.exponent > 108)
        return (a1 + u.d) * 0x1p-108;
      /* If v.d * 0x1p-108 with round to zero is a subnormal above
         or equal to DBL_MIN / 2, then v.d * 0x1p-108 shifts mantissa
         down just by 1 bit, which means v.ieee.mantissa1 |= j would
         change the round bit, not sticky or guard bit.
         v.d * 0x1p-108 never normalizes by shifting up,
         so round bit plus sticky bit should be already enough
         for proper rounding.  */
      if (v.ieee.exponent == 108)
        {
          /* If the exponent would be in the normal range when
             rounding to normal precision with unbounded exponent
             range, the exact result is known and spurious underflows
             must be avoided on systems detecting tininess after
             rounding.  */
          if (TININESS_AFTER_ROUNDING)
            {
              w.d = a1 + u.d;
              if (w.ieee.exponent == 109)
                return w.d * 0x1p-108;
            }
          /* v.ieee.mantissa1 & 2 is LSB bit of the result before rounding,
             v.ieee.mantissa1 & 1 is the round bit and j is our sticky
             bit.  */
          w.d = 0.0;
          w.ieee.mantissa1 = ((v.ieee.mantissa1 & 3) << 1) | j;
          w.ieee.negative = v.ieee.negative;
          v.ieee.mantissa1 &= ~3U;
          v.d *= 0x1p-108;
          w.d *= 0x1p-2;
          return v.d + w.d;
        }
      v.ieee.mantissa1 |= j;
      return v.d * 0x1p-108;
      zhi = 0;
      d = -d;
      if (d == 0)
        zlo = nz.m;
      else if (d < 64)
        zlo = nz.m >> d | !!(nz.m << (64 - d));
      else
        zlo = 1;
    }
  /* add */
  int sign = nx.sign ^ ny.sign;
  bool samesign = !(sign ^ nz.sign);
  bool nonzero = true;
  if (samesign)
    {
      /* r += z */
      rlo += zlo;
      rhi += zhi + (rlo < zlo);
    }
  else
    {
      /* r -= z */
      uint64_t t = rlo;
      rlo -= zlo;
      rhi = rhi - zhi - (t < rlo);
      if (rhi >> 63)
        {
          rlo = -rlo;
          rhi = -rhi - !!rlo;
          sign = !sign;
        }
      nonzero = !!rhi;
    }
  /* Set rhi to top 63bit of the result (last bit is sticky).  */
  if (nonzero)
    {
      e += 64;
      d = stdc_leading_zeros (rhi) - 1;
      /* note: d > 0 */
      rhi = rhi << d | rlo >> (64 - d) | !!(rlo << d);
    }
  else if (rlo)
    {
      d = stdc_leading_zeros (rlo) - 1;
      if (d < 0)
        rhi = rlo >> 1 | (rlo & 1);
      else
        rhi = rlo << d;
    }
  else
    {
      /* Exact +-0 */
      return x * y + z;
    }
  e -= d;
  /* Convert to double.  */
  int64_t i = rhi; /* i is in [1<<62,(1<<63)-1] */
  if (sign)
    i = -i;
  double r = convertfromint64 (i); /* |r| is in [0x1p62,0x1p63] */
  if (e < -1022 - 62)
    {
      /* Result is subnormal before rounding.  */
      if (e == -1022 - 63)
        {
          double c = 0x1p63;
          if (sign)
            c = -c;
          if (r == c)
            {
              /* Min normal after rounding, underflow depends
                 on arch behaviour which can be imitated by
                 a double to float conversion.  */
              float fltmin = 0x0.ffffff8p-63 * FLT_MIN * r;
              return DBL_MIN / FLT_MIN * fltmin;
            }
          /* One bit is lost when scaled, add another top bit to
             only round once at conversion if it is inexact.  */
          if (rhi << 53)
            {
              i = rhi >> 1 | (rhi & 1) | 1ull << 62;
              if (sign)
                i = -i;
              r = convertfromint64 (i);
              r = 2 * r - c; /* remove top bit */
              /* Raise underflow portably, such that it
                 cannot be optimized away.  */
              {
                double_t tiny = DBL_MIN / FLT_MIN * r;
                r += (double) (tiny * tiny) * (r - r);
              }
            }
        }
      else
        {
          /* Only round once when scaled.  */
          d = 10;
          i = (rhi >> d | !!(rhi << (64 - d))) << d;
          if (sign)
            i = -i;
          r = convertfromint64 (i);
        }
    }
  return __scalbn (r, e);
#endif /* ! USE_FMA_BUILTIN */
}
#ifndef __fma
libm_alias_double (__fma, fma)
libm_alias_double_narrow (__fma, fma)