Add add_ssaaaa and sub_ddmmss to gmp-arch.h

To enable “longlong.h” removal, add_ssaaaa and sub_ddmmss are moved to gmp-arch.h. The generic implementation now uses a static inline function. This provides better type checking than the GNU extension, which casts the asm constraint, and it also works better with clang. Most architectures use the generic implementation, with the exception of arc, arm, hppa, x86, m68k, powerpc, and sparc. For 32-bit architectures the compiler generates good enough code using uint64_t types, whereas for 64-bit architectures the patch leverages the math_uint128.h definitions, which use 128-bit integers when available (all 64-bit architectures on gcc 15). The strongly typed implementation required some changes. I adjusted _FP_W_TYPE, _FP_WS_TYPE, and _FP_I_TYPE to use the same type as mp_limb_t on aarch64, loongarch, powerpc64le, x86_64, and riscv64. This basically means using “long” instead of “long long”. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2025-11-20 15:30:06 -03:00 · 2025-11-20 15:30:06 -03:00 · 8cd6efca5b
parent 476e962af7
commit 8cd6efca5b
7 changed files with 101 additions and 21 deletions
--- a/sysdeps/aarch64/sfp-machine.h
+++ b/sysdeps/aarch64/sfp-machine.h
@ -2,9 +2,9 @@
 #include <fpu_control.h>

 #define _FP_W_TYPE_SIZE		64
-#define _FP_W_TYPE		unsigned long long
+#define _FP_W_TYPE		unsigned long	/* Same type as mp_limb_t, per the commit message.  */
 #define _FP_WS_TYPE		signed long long
-#define _FP_I_TYPE		long long
+#define _FP_I_TYPE		long	/* NOTE(review): _FP_WS_TYPE above stays 'long long' here, unlike the other ports -- confirm.  */

 #define _FP_MUL_MEAT_S(R,X,Y)					\
  _FP_MUL_MEAT_1_imm(_FP_WFRACBITS_S,R,X,Y)
--- a/sysdeps/generic/gmp-arch.h
+++ b/sysdeps/generic/gmp-arch.h
@ -21,6 +21,7 @@

 #include <stdint.h>
 #include <gmp.h>
+#include <math_uint128.h>

 #define LL_B ((mp_limb_t) 1 << (BITS_PER_MP_LIMB / 2))

@ -97,4 +98,58 @@ udiv_qrnnd_generic (mp_limb_t *q, mp_limb_t *r, mp_limb_t n1, mp_limb_t n0,
  udiv_qrnnd_generic (&__q, &__r, __n1, __n0, __d)
 #endif

+
+/* add_ssaaaa (high_sum, low_sum, high_addend_1, low_addend_1,
+   high_addend_2, low_addend_2) adds two two-limb (mp_limb_t) integers,
+   composed of HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and
+   LOW_ADDEND_2 respectively.  The sum is stored in HIGH_SUM and LOW_SUM;
+   the carry out of the high limb is not stored anywhere and is lost.  */
+static __always_inline void
+add_ssaaaa_generic (mp_limb_t *sh, mp_limb_t *sl, mp_limb_t ah,
+		    mp_limb_t al,  mp_limb_t bh,  mp_limb_t bl)
+{
+#if __WORDSIZE == 32	/* NOTE(review): this path assumes mp_limb_t is 32 bits wide -- confirm.  */
+  uint64_t a = (uint64_t)ah << 32 | al;	/* a = ah:al packed into 64 bits.  */
+  uint64_t b = (uint64_t)bh << 32 | bl;	/* b = bh:bl packed into 64 bits.  */
+  uint64_t r = a + b;	/* Unsigned add; wraps modulo 2^64.  */
+  *sh = r >> 32;	/* Upper 32 bits of the sum.  */
+  *sl = r & 0xFFFFFFFF;	/* Lower 32 bits of the sum.  */
+#else
+  u128 r = u128_add (u128_from_hl (ah, al),	/* (ah:al) + (bh:bl) as u128 values.  */
+                     u128_from_hl (bh, bl));
+  *sh = u128_high (r);
+  *sl = u128_low (r);
+#endif
+}
+#undef add_ssaaaa	/* Drop any earlier definition; SH and SL below must be lvalues.  */
+#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
+  add_ssaaaa_generic (&sh, &sl, ah, al, bh, bl)
+
+/* sub_ddmmss (high_difference, low_difference, high_minuend, low_minuend,
+   high_subtrahend, low_subtrahend) subtracts two two-limb (mp_limb_t)
+   integers, composed of HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND
+   and LOW_SUBTRAHEND respectively.  The result is placed in
+   HIGH_DIFFERENCE and LOW_DIFFERENCE.  The borrow out of the high limb is
+   not stored anywhere, and is lost.  */
+static __always_inline void
+sub_ddmmss_generic (mp_limb_t *sh, mp_limb_t *sl, mp_limb_t ah,
+		    mp_limb_t al,  mp_limb_t bh,  mp_limb_t bl)
+{
+#if __WORDSIZE == 32	/* NOTE(review): this path assumes mp_limb_t is 32 bits wide -- confirm.  */
+  uint64_t a = (uint64_t)ah << 32 | al;	/* Minuend ah:al packed into 64 bits.  */
+  uint64_t b = (uint64_t)bh << 32 | bl;	/* Subtrahend bh:bl packed into 64 bits.  */
+  uint64_t r = a - b;	/* Unsigned subtract; wraps modulo 2^64.  */
+  *sh = r >> 32;	/* Upper 32 bits of the difference.  */
+  *sl = r & 0xFFFFFFFF;	/* Lower 32 bits of the difference.  */
+#else
+  u128 r = u128_sub (u128_from_hl (ah, al),	/* (ah:al) - (bh:bl) as u128 values.  */
+                     u128_from_hl (bh, bl));
+  *sh = u128_high (r);
+  *sl = u128_low (r);
+#endif
+}
+#undef sub_ddmmss	/* Drop any earlier definition; SH and SL below must be lvalues.  */
+#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
+  sub_ddmmss_generic (&sh, &sl, ah, al, bh, bl)
+
 #endif /* __GMP_ARCH_H */
--- a/sysdeps/generic/math_uint128.h
+++ b/sysdeps/generic/math_uint128.h
@ -19,6 +19,8 @@
 #ifndef _MATH_INT128_H
 #define _MATH_INT128_H

+#include <stdbool.h>
+
 /* Limited support for internal 128 bit integer, used on some math
   implementations.  It uses compiler builtin type if supported, otherwise
   it is emulated.  Only unsigned and some operations are currently supported:
@ -27,8 +29,11 @@
   - u128_high:      return the high part of the number.
   - u128_low:       return the low part of the number.
   - u128_from_u64:  create a 128 bit number from a 64 bit one.
+   - u128_from_hl:   create a 128 bit number from two 64 bit numbers.
   - u128_mul:       multiply two 128 bit numbers.
   - u128_add:       add two 128 bit numbers.
+   - u128_sub:       subtract two 128 bit numbers.
+   - u128_neg:       negate a 128 bit number (NOTE: this change only adds it for the emulated struct variant).
   - u128_lshift:    left shift a number.
   - u128_rshift:    right shift a number.
 */
@ -47,8 +52,10 @@ typedef unsigned __int128 u128;
 # define u128_high(__x)         (uint64_t)((__x) >> 64)
 # define u128_low(__x)          (uint64_t)(__x)
 # define u128_from_u64(__x)     (u128)(__x)
+# define u128_from_hl(__h, __l) (((u128)(__h) << 64) | (__l))	/* __h becomes bits 64..127, __l is OR-ed into the low bits.  */
 # define u128_mul(__x, __y)     (__x) * (__y)
 # define u128_add(__x, __y)     (__x) + (__y)
+# define u128_sub(__x, __y)     (__x) - (__y)	/* NOTE(review): expansion is not parenthesized as a whole (same as u128_add above).  */
 # define u128_lshift(__x, __y)  (__x) << (__y)
 # define u128_rshift(__x, __y)  (__x) >> (__y)
 #else
@ -61,16 +68,28 @@ typedef struct
 # define u128_high(__x)         (__x).high
 # define u128_low(__x)          (__x).low
 # define u128_from_u64(__x)     (u128){.low = (__x), .high = 0}
+# define u128_from_hl(__h, __l) (u128){.low = (__l), .high = (__h)}	/* Compound literal: __h fills .high, __l fills .low.  */

 # define MASK32                 (UINT64_C(0xffffffff))

-static u128 u128_add (u128 x, u128 y)
+static inline u128 u128_add (u128 x, u128 y)	/* Emulated add: the carry out of .low is added into .high.  */
 {
   bool carry = x.low + y.low < x.low;
   return (u128) { .high = x.high + y.high + carry, .low = x.low + y.low };
 }

-static u128 u128_lshift (u128 x, unsigned int n)
+static inline u128 u128_neg (u128 x)	/* Negate X as ~X + 1 (two's complement).  */
+{
+  u128 xbitnot = u128_from_hl (~x.high, ~x.low);	/* Bitwise NOT of both halves.  */
+  return u128_add (xbitnot, u128_from_u64 (1));	/* Adding 1 completes the negation.  */
+}
+
+static inline u128 u128_sub (u128 x, u128 y)	/* Return X - Y, computed as X + (-Y).  */
+{
+  return u128_add (x, u128_neg (y));
+}
+
+static inline u128 u128_lshift (u128 x, unsigned int n)
 {
  switch (n)
    {
@ -82,7 +101,7 @@ static u128 u128_lshift (u128 x, unsigned int n)
    }
 }

-static u128 u128_rshift (u128 x, unsigned int n)
+static inline u128 u128_rshift (u128 x, unsigned int n)
 {
  switch (n)
    {
@ -94,7 +113,7 @@ static u128 u128_rshift (u128 x, unsigned int n)
    }
 }

-static u128 u128_mul (u128 x, u128 y)
+static inline u128 u128_mul (u128 x, u128 y)
 {
  if (x.high == 0 && y.high == 0)
    {
--- a/sysdeps/loongarch/sfp-machine.h
+++ b/sysdeps/loongarch/sfp-machine.h
@ -21,9 +21,9 @@
 #include <fpu_control.h>

 #define _FP_W_TYPE_SIZE 64
-#define _FP_W_TYPE unsigned long long
-#define _FP_WS_TYPE signed long long
-#define _FP_I_TYPE long long
+#define _FP_W_TYPE unsigned long /* 'long' flavour, intended to match mp_limb_t (see commit message).  */
+#define _FP_WS_TYPE signed long /* Signed counterpart of _FP_W_TYPE.  */
+#define _FP_I_TYPE long /* 'long' rather than 'long long'.  */

 #define _FP_MUL_MEAT_S(R, X, Y) _FP_MUL_MEAT_1_imm (_FP_WFRACBITS_S, R, X, Y)
 #define _FP_MUL_MEAT_D(R, X, Y) \
--- a/sysdeps/powerpc/powerpc64/le/fpu/sfp-machine.h
+++ b/sysdeps/powerpc/powerpc64/le/fpu/sfp-machine.h
@ -1,7 +1,7 @@
 #define _FP_W_TYPE_SIZE		64
-#define _FP_W_TYPE		unsigned long long
-#define _FP_WS_TYPE		signed long long
-#define _FP_I_TYPE		long long
+#define _FP_W_TYPE		unsigned long	/* Same type as mp_limb_t, per the commit message.  */
+#define _FP_WS_TYPE		signed long	/* Signed counterpart of _FP_W_TYPE.  */
+#define _FP_I_TYPE		long	/* 'long' rather than 'long long'.  */

 typedef int TItype __attribute__ ((mode (TI)));
 typedef unsigned int UTItype __attribute__ ((mode (TI)));
--- a/sysdeps/riscv/sfp-machine.h
+++ b/sysdeps/riscv/sfp-machine.h
@ -52,9 +52,9 @@
 #else

 # define _FP_W_TYPE_SIZE		64
-# define _FP_W_TYPE		unsigned long long
-# define _FP_WS_TYPE		signed long long
-# define _FP_I_TYPE		long long
+# define _FP_W_TYPE		unsigned long	/* Same type as mp_limb_t, per the commit message.  */
+# define _FP_WS_TYPE		signed long	/* Signed counterpart of _FP_W_TYPE.  */
+# define _FP_I_TYPE		long	/* 'long' rather than 'long long'.  */

 # define _FP_MUL_MEAT_S(R, X, Y)					\
  _FP_MUL_MEAT_1_imm (_FP_WFRACBITS_S, R, X, Y)
--- a/sysdeps/x86/fpu/sfp-machine.h
+++ b/sysdeps/x86/fpu/sfp-machine.h
@ -18,9 +18,15 @@ typedef long int __gcc_CMPtype;

 #ifdef __x86_64__
 # define _FP_W_TYPE_SIZE	64
-# define _FP_W_TYPE		unsigned long long
-# define _FP_WS_TYPE		signed long long
-# define _FP_I_TYPE		long long
+# ifndef __ILP32__	/* x86_64 without __ILP32__: use the 'long' flavours.  */
+#  define _FP_W_TYPE		unsigned long
+#  define _FP_WS_TYPE		signed long
+#  define _FP_I_TYPE		long
+# else	/* __ILP32__: keep the previous 'long long' flavours.  */
+#  define _FP_W_TYPE		unsigned long long
+#  define _FP_WS_TYPE		signed long long
+#  define _FP_I_TYPE		long long
+# endif	/* !__ILP32__  */

 typedef int TItype __attribute__ ((mode (TI)));
 typedef unsigned int UTItype __attribute__ ((mode (TI)));
@ -55,9 +61,9 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
  } while (0)
 #else
 # define _FP_W_TYPE_SIZE	32
-# define _FP_W_TYPE		unsigned int
-# define _FP_WS_TYPE		signed int
-# define _FP_I_TYPE		int
+# define _FP_W_TYPE		unsigned long int	/* 'long int' instead of 'int'; presumably to match mp_limb_t -- confirm.  */
+# define _FP_WS_TYPE		signed long int	/* Signed counterpart of _FP_W_TYPE.  */
+# define _FP_I_TYPE		long int	/* _FP_W_TYPE_SIZE above is still 32.  */

 # define __FP_FRAC_ADD_4(r3,r2,r1,r0,x3,x2,x1,x0,y3,y2,y1,y0)	\
  __asm__ ("add{l} {%11,%3|%3,%11}\n\t"				\