Add gmp-arch and udiv_qrnnd

To enable "longlong.h" removal, udiv_qrnnd is moved to a gmp-arch.h
file, which allows each architecture to implement its own arch-specific
optimizations.  The generic implementation now uses a static inline,
which provides better type checking than the GNU extension of casting
the asm constraints (and it works better with clang).

Most architectures use the generic implementation, which is expanded
from a macro, except for alpha, x86, m68k, sh, and sparc.  I kept
alpha, which uses out-of-line implementations, and x86, where there is
no easy way to use the div{q} instruction from C code.  For the rest,
the compiler generates good enough code.

The hppa also provides arch-specific implementations, but they are not
routed through "longlong.h" and thus never used.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2025-11-20 18:30:05 +00:00
/* Multiprecision generic functions.
   Copyright (C) 2025 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#ifndef __GMP_ARCH_H
#define __GMP_ARCH_H

#include <stdint.h>
#include <gmp.h>
Add add_ssaaaa and sub_ssaaaa to gmp-arch.h

To enable "longlong.h" removal, add_ssaaaa and sub_ssaaaa are moved to
gmp-arch.h.  The generic implementation now uses a static inline.  This
provides better type checking than the GNU extension, which casts the
asm constraints; and it also works better with clang.

Most architectures use the generic implementation, except for arc, arm,
hppa, x86, m68k, powerpc, and sparc.  For the 32-bit architectures the
compiler generates good enough code using uint64_t types, while for
64-bit architectures the patch leverages the math_uint128.h definitions,
which use 128-bit integers when available (all 64-bit architectures on
gcc 15).

The strongly typed implementation required some changes.  I adjusted
_FP_W_TYPE, _FP_WS_TYPE, and _FP_I_TYPE to use the same type as
mp_limb_t on aarch64, powerpc64le, x86_64, and riscv64.  This basically
means using "long" instead of "long long".

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2025-11-20 18:30:06 +00:00
#include <math_uint128.h>

#define LL_B ((mp_limb_t) 1 << (BITS_PER_MP_LIMB / 2))

static __always_inline mp_limb_t
ll_lowpart (mp_limb_t t)
{
  return t & (LL_B - 1);
}

static __always_inline mp_limb_t
ll_highpart (mp_limb_t t)
{
  return t >> (BITS_PER_MP_LIMB / 2);
}
Add umul_ppmm to gmp-arch.h

To enable "longlong.h" removal, umul_ppmm is moved to gmp-arch.h.
The generic implementation now uses a static inline, which provides
better type checking than the GNU extension of casting the asm
constraints (and it works better with clang).

Most architectures use the generic implementation, which is expanded
from a macro, except for alpha, arm, hppa, x86, m68k, mips, powerpc,
and sparc.  For the 32-bit architectures the compiler generates good
enough code using uint64_t types, while for 64-bit architectures the
patch leverages the math_uint128.h definitions, which use 128-bit
integers when available (all 64-bit architectures on gcc 15).

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
2025-11-20 18:30:07 +00:00
/* umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
   UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
   word product in HIGH_PROD and LOW_PROD.  */
static __always_inline void
umul_ppmm_generic (mp_limb_t *w1, mp_limb_t *w0, mp_limb_t u, mp_limb_t v)
{
#if BITS_PER_MP_LIMB == 32
  uint64_t t0 = (uint64_t) u * v;
  *w1 = t0 >> 32;
  *w0 = t0;
#else
  u128 r = u128_mul (u128_from_u64 (u), u128_from_u64 (v));
  *w1 = u128_high (r);
  *w0 = u128_low (r);
#endif
}

#define umul_ppmm(__w1, __w0, __u, __v)				\
  ({								\
    __typeof (__w0) __w0t;					\
    __typeof (__w1) __w1t;					\
    umul_ppmm_generic (&__w1t, &__w0t, __u, __v);		\
    __w1 = __w1t;						\
    __w0 = __w0t;						\
  })
/* udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
   denominator) divides a UDWtype, composed by the UWtype integers
   HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
   in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
   than DENOMINATOR for correct operation.  If, in addition, the most
   significant bit of DENOMINATOR must be 1, then the pre-processor symbol
   UDIV_NEEDS_NORMALIZATION is defined to 1.  */
#ifndef udiv_qrnnd
static __always_inline void
udiv_qrnnd_generic (mp_limb_t *q, mp_limb_t *r, mp_limb_t n1, mp_limb_t n0,
		    mp_limb_t d)
{
  mp_limb_t d1 = ll_highpart (d),
	    d0 = ll_lowpart (d),
	    q1, q0;
  mp_limb_t r1, r0, m;

  r1 = n1 % d1;
  q1 = n1 / d1;
  m = q1 * d0;
  r1 = r1 * LL_B | ll_highpart (n0);
  if (r1 < m)
    {
      q1--;
      r1 += d;
      if (r1 >= d)
	if (r1 < m)
	  {
	    q1--;
	    r1 += d;
	  }
    }
  r1 -= m;

  r0 = r1 % d1;
  q0 = r1 / d1;
  m = q0 * d0;
  r0 = r0 * LL_B | ll_lowpart (n0);
  if (r0 < m)
    {
      q0--;
      r0 += d;
      if (r0 >= d)
	if (r0 < m)
	  {
	    q0--;
	    r0 += d;
	  }
    }
  r0 -= m;

  *q = q1 * LL_B | q0;
  *r = r0;
}

# define UDIV_NEEDS_NORMALIZATION 1
# define udiv_qrnnd(__q, __r, __n1, __n0, __d)			\
  udiv_qrnnd_generic (&__q, &__r, __n1, __n0, __d)
#endif
/* add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
   high_addend_2, low_addend_2) adds two UWtype integers, composed by
   HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
   respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
   (i.e. carry out) is not stored anywhere, and is lost.  */
static __always_inline void
add_ssaaaa_generic (mp_limb_t *sh, mp_limb_t *sl, mp_limb_t ah,
		    mp_limb_t al, mp_limb_t bh, mp_limb_t bl)
{
#if BITS_PER_MP_LIMB == 32
  uint64_t a = (uint64_t) ah << 32 | al;
  uint64_t b = (uint64_t) bh << 32 | bl;
  uint64_t r = a + b;
  *sh = r >> 32;
  *sl = r & 0xFFFFFFFF;
#else
  u128 r = u128_add (u128_from_hl (ah, al),
		     u128_from_hl (bh, bl));
  *sh = u128_high (r);
  *sl = u128_low (r);
#endif
}

#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  add_ssaaaa_generic (&sh, &sl, ah, al, bh, bl)

/* sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
   high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
   composed by HIGH_MINUEND_1 and LOW_MINUEND_1, and HIGH_SUBTRAHEND_2 and
   LOW_SUBTRAHEND_2 respectively.  The result is placed in HIGH_DIFFERENCE
   and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
   and is lost.  */
static __always_inline void
sub_ddmmss_generic (mp_limb_t *sh, mp_limb_t *sl, mp_limb_t ah,
		    mp_limb_t al, mp_limb_t bh, mp_limb_t bl)
{
#if BITS_PER_MP_LIMB == 32
  uint64_t a = (uint64_t) ah << 32 | al;
  uint64_t b = (uint64_t) bh << 32 | bl;
  uint64_t r = a - b;
  *sh = r >> 32;
  *sl = r & 0xFFFFFFFF;
#else
  u128 r = u128_sub (u128_from_hl (ah, al),
		     u128_from_hl (bh, bl));
  *sh = u128_high (r);
  *sl = u128_low (r);
#endif
}

#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  sub_ddmmss_generic (&sh, &sl, ah, al, bh, bl)
#endif /* __GMP_ARCH_H */