Add add_ssaaaa and sub_ddmmss to gmp-arch.h

To enable “longlong.h” removal, add_ssaaaa and sub_ddmmss are moved to
gmp-arch.h.  The generic implementation now uses a static inline function.
This provides better type checking than the GNU extension, which casts the
asm constraint; and it also works better with clang.

Most architectures use the generic implementation, with the exception of
arc, arm, hppa, x86, m68k, powerpc, and sparc.  On 32-bit architectures
the compiler generates good enough code using uint64_t types, whereas
for 64-bit architectures the patch leverages the math_uint128.h definitions,
which use 128-bit integers when available (all 64-bit architectures
on gcc 15).

The strongly typed implementation required some changes.  I adjusted
_FP_W_TYPE, _FP_WS_TYPE, and _FP_I_TYPE to use the same type as
mp_limb_t on aarch64, powerpc64le, x86_64, and riscv64.  This basically
means using “long” instead of “long long.”

Reviewed-by: Wilco Dijkstra  <Wilco.Dijkstra@arm.com>
This commit is contained in:
Adhemerval Zanella 2025-11-20 15:30:06 -03:00
parent 476e962af7
commit 8cd6efca5b
7 changed files with 101 additions and 21 deletions

View File

@ -2,9 +2,9 @@
#include <fpu_control.h>
#define _FP_W_TYPE_SIZE 64
#define _FP_W_TYPE unsigned long long
#define _FP_W_TYPE unsigned long
#define _FP_WS_TYPE signed long long
#define _FP_I_TYPE long long
#define _FP_I_TYPE long
#define _FP_MUL_MEAT_S(R,X,Y) \
_FP_MUL_MEAT_1_imm(_FP_WFRACBITS_S,R,X,Y)

View File

@ -21,6 +21,7 @@
#include <stdint.h>
#include <gmp.h>
#include <math_uint128.h>
#define LL_B ((mp_limb_t) 1 << (BITS_PER_MP_LIMB / 2))
@ -97,4 +98,58 @@ udiv_qrnnd_generic (mp_limb_t *q, mp_limb_t *r, mp_limb_t n1, mp_limb_t n0,
udiv_qrnnd_generic (&__q, &__r, __n1, __n0, __d)
#endif
/* add_ssaaaa_generic (SH, SL, AH, AL, BH, BL) adds the two double-limb
   values AH:AL and BH:BL (high limb first).  The high limb of the sum is
   stored in *SH and the low limb in *SL.  A carry out of the high limb is
   not reported anywhere and is lost.

   NOTE(review): the __WORDSIZE == 32 path packs each operand into a
   uint64_t, which presumably relies on mp_limb_t being 32 bits wide on
   such targets -- confirm for ABIs where word size and limb size differ.  */
static __always_inline void
add_ssaaaa_generic (mp_limb_t *sh, mp_limb_t *sl, mp_limb_t ah,
		    mp_limb_t al, mp_limb_t bh, mp_limb_t bl)
{
#if __WORDSIZE == 32
  /* Glue each high/low pair into one 64-bit value and add natively.  */
  uint64_t lhs = ((uint64_t) ah << 32) | al;
  uint64_t rhs = ((uint64_t) bh << 32) | bl;
  uint64_t sum = lhs + rhs;

  *sh = sum >> 32;
  *sl = sum & 0xFFFFFFFF;
#else
  /* Same idea one size up: go through the 128-bit helper type.  */
  u128 lhs = u128_from_hl (ah, al);
  u128 rhs = u128_from_hl (bh, bl);
  u128 sum = u128_add (lhs, rhs);

  *sh = u128_high (sum);
  *sl = u128_low (sum);
#endif
}
/* Macro front end for add_ssaaaa_generic.  SH and SL must be lvalues: the
   macro takes their addresses so the helper can store the result limbs.
   The #undef discards any definition of add_ssaaaa made earlier.  */
#undef add_ssaaaa
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
add_ssaaaa_generic (&sh, &sl, ah, al, bh, bl)
/* sub_ddmmss_generic (SH, SL, AH, AL, BH, BL) subtracts the double-limb
   value BH:BL (the subtrahend) from AH:AL (the minuend), high limb first.
   The high limb of the difference is stored in *SH and the low limb in
   *SL.  A borrow out of the high limb is not reported anywhere and is
   lost.

   NOTE(review): the __WORDSIZE == 32 path packs each operand into a
   uint64_t, which presumably relies on mp_limb_t being 32 bits wide on
   such targets -- confirm for ABIs where word size and limb size differ.  */
static __always_inline void
sub_ddmmss_generic (mp_limb_t *sh, mp_limb_t *sl, mp_limb_t ah,
		    mp_limb_t al, mp_limb_t bh, mp_limb_t bl)
{
#if __WORDSIZE == 32
  /* Glue each high/low pair into one 64-bit value and subtract natively.  */
  uint64_t minuend = ((uint64_t) ah << 32) | al;
  uint64_t subtrahend = ((uint64_t) bh << 32) | bl;
  uint64_t diff = minuend - subtrahend;

  *sh = diff >> 32;
  *sl = diff & 0xFFFFFFFF;
#else
  /* Same idea one size up: go through the 128-bit helper type.  */
  u128 minuend = u128_from_hl (ah, al);
  u128 subtrahend = u128_from_hl (bh, bl);
  u128 diff = u128_sub (minuend, subtrahend);

  *sh = u128_high (diff);
  *sl = u128_low (diff);
#endif
}
/* Macro front end for sub_ddmmss_generic.  SH and SL must be lvalues: the
   macro takes their addresses so the helper can store the result limbs.
   The #undef discards any definition of sub_ddmmss made earlier.  */
#undef sub_ddmmss
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
sub_ddmmss_generic (&sh, &sl, ah, al, bh, bl)
#endif /* __GMP_ARCH_H */

View File

@ -19,6 +19,8 @@
#ifndef _MATH_INT128_H
#define _MATH_INT128_H
#include <stdbool.h>
/* Limited support for internal 128 bit integer, used on some math
implementations. It uses compiler builtin type if supported, otherwise
it is emulated. Only unsigned and some operations are currently supported:
@ -27,8 +29,11 @@
- u128_high: return the high part of the number.
- u128_low: return the low part of the number.
- u128_from_u64: create a 128 bit number from a 64 bit one.
- u128_from_hl: create a 128 bit number from two 64 bit numbers.
- u128_mul: multiply two 128 bit numbers.
- u128_add: add two 128 bit numbers.
- u128_sub: subtract two 128 bit numbers.
- u128_neg: negate a 128 bit number.
- u128_lshift: left shift a number.
- u128_rshift: right shift a number.
*/
@ -47,8 +52,10 @@ typedef unsigned __int128 u128;
/* Native implementation: u128 is the compiler's unsigned 128-bit integer
   type, so every operation maps directly onto a built-in operator.

   Each expansion is wrapped in its own pair of parentheses so that a macro
   call behaves as a single operand when it is embedded in a larger
   expression, e.g. "a - u128_sub (b, c)" or "2 * u128_add (b, c)".
   Without the outer parentheses those would associate with the
   surrounding operators and silently compute the wrong value.  */
# define u128_high(__x)         ((uint64_t)((__x) >> 64))
# define u128_low(__x)          ((uint64_t)(__x))
# define u128_from_u64(__x)     ((u128)(__x))
# define u128_from_hl(__h, __l) (((u128)(__h) << 64) | (__l))
# define u128_mul(__x, __y)     ((__x) * (__y))
# define u128_add(__x, __y)     ((__x) + (__y))
# define u128_sub(__x, __y)     ((__x) - (__y))
# define u128_lshift(__x, __y)  ((__x) << (__y))
# define u128_rshift(__x, __y)  ((__x) >> (__y))
#else
@ -61,16 +68,28 @@ typedef struct
/* Emulated implementation: u128 is a structure with separate `high' and
   `low' members, so the accessors are plain member reads and the
   constructors are compound literals.  */
# define u128_high(__x) (__x).high
# define u128_low(__x) (__x).low
/* Build a u128 whose low half is __x and whose high half is zero.  */
# define u128_from_u64(__x) (u128){.low = (__x), .high = 0}
/* Build a u128 from an explicit high half __h and low half __l.  */
# define u128_from_hl(__h, __l) (u128){.low = (__l), .high = (__h)}
/* Constant with the low 32 bits set.  */
# define MASK32 (UINT64_C(0xffffffff))
static u128 u128_add (u128 x, u128 y)
/* Return X + Y for the emulated 128-bit type.  The halves are added
   independently and a carry out of the low half is propagated into the
   high half; a carry out of the high half is dropped.  */
static inline u128 u128_add (u128 x, u128 y)
{
  u128 sum = { .low = x.low + y.low, .high = x.high + y.high };

  /* The low half wrapped around iff it ended up smaller than an addend.  */
  if (sum.low < x.low)
    sum.high++;

  return sum;
}
static u128 u128_lshift (u128 x, unsigned int n)
/* Return the two's complement negation of X: flip every bit of both
   halves, then add one.  */
static inline u128 u128_neg (u128 x)
{
  u128 one = u128_from_u64 (1);
  u128 flipped = u128_from_hl (~x.high, ~x.low);

  return u128_add (flipped, one);
}
/* Return X - Y, computed as X plus the negation of Y.  */
static inline u128 u128_sub (u128 x, u128 y)
{
  u128 minus_y = u128_neg (y);

  return u128_add (x, minus_y);
}
static inline u128 u128_lshift (u128 x, unsigned int n)
{
switch (n)
{
@ -82,7 +101,7 @@ static u128 u128_lshift (u128 x, unsigned int n)
}
}
static u128 u128_rshift (u128 x, unsigned int n)
static inline u128 u128_rshift (u128 x, unsigned int n)
{
switch (n)
{
@ -94,7 +113,7 @@ static u128 u128_rshift (u128 x, unsigned int n)
}
}
static u128 u128_mul (u128 x, u128 y)
static inline u128 u128_mul (u128 x, u128 y)
{
if (x.high == 0 && y.high == 0)
{

View File

@ -21,9 +21,9 @@
#include <fpu_control.h>
#define _FP_W_TYPE_SIZE 64
#define _FP_W_TYPE unsigned long long
#define _FP_WS_TYPE signed long long
#define _FP_I_TYPE long long
#define _FP_W_TYPE unsigned long
#define _FP_WS_TYPE signed long
#define _FP_I_TYPE long
#define _FP_MUL_MEAT_S(R, X, Y) _FP_MUL_MEAT_1_imm (_FP_WFRACBITS_S, R, X, Y)
#define _FP_MUL_MEAT_D(R, X, Y) \

View File

@ -1,7 +1,7 @@
#define _FP_W_TYPE_SIZE 64
#define _FP_W_TYPE unsigned long long
#define _FP_WS_TYPE signed long long
#define _FP_I_TYPE long long
#define _FP_W_TYPE unsigned long
#define _FP_WS_TYPE signed long
#define _FP_I_TYPE long
typedef int TItype __attribute__ ((mode (TI)));
typedef unsigned int UTItype __attribute__ ((mode (TI)));

View File

@ -52,9 +52,9 @@
#else
# define _FP_W_TYPE_SIZE 64
# define _FP_W_TYPE unsigned long long
# define _FP_WS_TYPE signed long long
# define _FP_I_TYPE long long
# define _FP_W_TYPE unsigned long
# define _FP_WS_TYPE signed long
# define _FP_I_TYPE long
# define _FP_MUL_MEAT_S(R, X, Y) \
_FP_MUL_MEAT_1_imm (_FP_WFRACBITS_S, R, X, Y)

View File

@ -18,9 +18,15 @@ typedef long int __gcc_CMPtype;
#ifdef __x86_64__
# define _FP_W_TYPE_SIZE 64
# define _FP_W_TYPE unsigned long long
# define _FP_WS_TYPE signed long long
# define _FP_I_TYPE long long
# ifndef __ILP32__
# define _FP_W_TYPE unsigned long
# define _FP_WS_TYPE signed long
# define _FP_I_TYPE long
# else
# define _FP_W_TYPE unsigned long long
# define _FP_WS_TYPE signed long long
# define _FP_I_TYPE long long
# endif
typedef int TItype __attribute__ ((mode (TI)));
typedef unsigned int UTItype __attribute__ ((mode (TI)));
@ -55,9 +61,9 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
} while (0)
#else
# define _FP_W_TYPE_SIZE 32
# define _FP_W_TYPE unsigned int
# define _FP_WS_TYPE signed int
# define _FP_I_TYPE int
# define _FP_W_TYPE unsigned long int
# define _FP_WS_TYPE signed long int
# define _FP_I_TYPE long int
# define __FP_FRAC_ADD_4(r3,r2,r1,r0,x3,x2,x1,x0,y3,y2,y1,y0) \
__asm__ ("add{l} {%11,%3|%3,%11}\n\t" \