x86: Adapt "%v" usage on clang to emit VEX encoding

clang does not support the "%v" modifier to select the AVX encoding, nor the
'%d' asm constraint modifier, and for AVX builds it requires all 3 arguments.

This patch adds a new internal header, math-inline-asm.h, which provides
functions that abstract the inline-asm differences required between
gcc and clang.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Adhemerval Zanella 2025-10-31 17:00:46 -03:00
parent d25db12c2a
commit 427c25278d
35 changed files with 210 additions and 121 deletions

View File

@ -19,6 +19,7 @@
#include <fenv.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
__feclearexcept (int excepts)
@ -44,13 +45,13 @@ __feclearexcept (int excepts)
unsigned int xnew_exc;
/* Get the current MXCSR. */
__asm__ ("%vstmxcsr %0" : "=m" (xnew_exc));
stmxcsr_inline_asm (&xnew_exc);
/* Clear the relevant bits. */
xnew_exc &= ~excepts;
/* Put the new data in effect. */
__asm__ ("%vldmxcsr %0" : : "m" (xnew_exc));
ldmxcsr_inline_asm (&xnew_exc);
}
/* Success. */

View File

@ -19,6 +19,7 @@
#include <fenv.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
fedisableexcept (int excepts)
@ -41,11 +42,11 @@ fedisableexcept (int excepts)
unsigned int xnew_exc;
/* Get the current control word. */
__asm__ ("%vstmxcsr %0" : "=m" (xnew_exc));
stmxcsr_inline_asm (&xnew_exc);
xnew_exc |= excepts << 7;
__asm__ ("%vldmxcsr %0" : : "m" (xnew_exc));
ldmxcsr_inline_asm (&xnew_exc);
}
return old_exc;

View File

@ -19,6 +19,7 @@
#include <fenv.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
feenableexcept (int excepts)
@ -41,11 +42,11 @@ feenableexcept (int excepts)
unsigned int xnew_exc;
/* Get the current control word. */
__asm__ ("%vstmxcsr %0" : "=m" (xnew_exc));
stmxcsr_inline_asm (&xnew_exc);
xnew_exc &= ~(excepts << 7);
__asm__ ("%vldmxcsr %0" : : "m" (xnew_exc));
ldmxcsr_inline_asm (&xnew_exc);
}
return old_exc;

View File

@ -19,6 +19,7 @@
#include <fenv.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
__fegetenv (fenv_t *envp)
@ -30,7 +31,7 @@ __fegetenv (fenv_t *envp)
__asm__ ("fldenv %0" : : "m" (*envp));
if (CPU_FEATURE_USABLE (SSE))
__asm__ ("%vstmxcsr %0" : "=m" (envp->__eip));
stmxcsr_inline_asm (&envp->__eip);
/* Success. */
return 0;

View File

@ -20,12 +20,13 @@
#include <fpu_control.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
fegetmode (femode_t *modep)
{
_FPU_GETCW (modep->__control_word);
if (CPU_FEATURE_USABLE (SSE))
__asm__ ("%vstmxcsr %0" : "=m" (modep->__mxcsr));
stmxcsr_inline_asm (&modep->__mxcsr);
return 0;
}

View File

@ -19,6 +19,7 @@
#include <fenv.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
__feholdexcept (fenv_t *envp)
@ -33,12 +34,12 @@ __feholdexcept (fenv_t *envp)
unsigned int xwork;
/* Get the current control word. */
__asm__ ("%vstmxcsr %0" : "=m" (envp->__eip));
stmxcsr_inline_asm (&envp->__eip);
/* Set all exceptions to non-stop and clear them. */
xwork = (envp->__eip | 0x1f80) & ~0x3f;
__asm__ ("%vldmxcsr %0" : : "m" (xwork));
ldmxcsr_inline_asm (&xwork);
}
return 0;

View File

@ -21,6 +21,7 @@
#include <assert.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
/* All exceptions, including the x86-specific "denormal operand"
@ -80,7 +81,7 @@ __fesetenv (const fenv_t *envp)
if (CPU_FEATURE_USABLE (SSE))
{
unsigned int mxcsr;
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
if (envp == FE_DFL_ENV)
{
@ -111,7 +112,7 @@ __fesetenv (const fenv_t *envp)
else
mxcsr = envp->__eip;
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
}
/* Success. */

View File

@ -18,6 +18,7 @@
#include <fenv.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
fesetexcept (int excepts)
@ -31,15 +32,16 @@ fesetexcept (int excepts)
if (CPU_FEATURE_USABLE (SSE))
{
/* Get the control word of the SSE unit. */
unsigned int mxcsr;
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
/* Get the control word of the SSE unit. */
stmxcsr_inline_asm (&mxcsr);
/* Set relevant flags. */
mxcsr |= excepts;
/* Put the new data in effect. */
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
}
else
{

View File

@ -20,6 +20,7 @@
#include <fpu_control.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
/* All exceptions, including the x86-specific "denormal operand"
exception. */
@ -37,7 +38,8 @@ fesetmode (const femode_t *modep)
if (CPU_FEATURE_USABLE (SSE))
{
unsigned int mxcsr;
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
/* Preserve SSE exception flags but restore other state in
MXCSR. */
mxcsr &= FE_ALL_EXCEPT_X86;
@ -47,7 +49,7 @@ fesetmode (const femode_t *modep)
mxcsr |= FE_ALL_EXCEPT_X86 << 7;
else
mxcsr |= modep->__mxcsr & ~FE_ALL_EXCEPT_X86;
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
}
return 0;
}

View File

@ -19,6 +19,7 @@
#include <fenv.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
__fesetround (int round)
@ -38,11 +39,10 @@ __fesetround (int round)
if (CPU_FEATURE_USABLE (SSE))
{
unsigned int xcw;
__asm__ ("%vstmxcsr %0" : "=m" (xcw));
stmxcsr_inline_asm (&xcw);
xcw &= ~0x6000;
xcw |= round << 3;
__asm__ ("%vldmxcsr %0" : : "m" (xcw));
ldmxcsr_inline_asm (&xcw);
}
return 0;

View File

@ -19,6 +19,7 @@
#include <fenv.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
__feupdateenv (const fenv_t *envp)
@ -31,7 +32,7 @@ __feupdateenv (const fenv_t *envp)
/* If the CPU supports SSE we test the MXCSR as well. */
if (CPU_FEATURE_USABLE (SSE))
__asm__ ("%vstmxcsr %0" : "=m" (xtemp));
stmxcsr_inline_asm (&xtemp);
temp = (temp | xtemp) & FE_ALL_EXCEPT;

View File

@ -19,6 +19,7 @@
#include <fenv.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
@ -34,10 +35,9 @@ __fegetexceptflag (fexcept_t *flagp, int excepts)
/* If the CPU supports SSE, we clear the MXCSR as well. */
if (CPU_FEATURE_USABLE (SSE))
{
unsigned int sse_exc;
/* Get the current MXCSR. */
__asm__ ("%vstmxcsr %0" : "=m" (sse_exc));
unsigned int sse_exc;
stmxcsr_inline_asm (&sse_exc);
*flagp |= sse_exc & excepts & FE_ALL_EXCEPT;
}

View File

@ -18,6 +18,7 @@
#include <fenv.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
__fesetexceptflag (const fexcept_t *flagp, int excepts)
@ -50,13 +51,13 @@ __fesetexceptflag (const fexcept_t *flagp, int excepts)
__asm__ ("fldenv %0" : : "m" (temp));
/* And now similarly for SSE. */
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
/* Clear or set relevant flags. */
mxcsr ^= (mxcsr ^ *flagp) & excepts;
/* Put the new data in effect. */
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
}
else
{

View File

@ -19,19 +19,20 @@
#include <fenv.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
int
__fetestexcept (int excepts)
{
short temp;
int xtemp = 0;
unsigned int xtemp = 0;
/* Get current exceptions. */
__asm__ ("fnstsw %0" : "=a" (temp));
/* If the CPU supports SSE we test the MXCSR as well. */
if (CPU_FEATURE_USABLE (SSE))
__asm__ ("%vstmxcsr %0" : "=m" (xtemp));
stmxcsr_inline_asm (&xtemp);
return (temp | xtemp) & excepts & FE_ALL_EXCEPT;
}

View File

@ -21,6 +21,7 @@
#include <fenv.h>
#include <unistd.h>
#include <ldsodefs.h>
#include <math-inline-asm.h>
void
__setfpucw (fpu_control_t set)
@ -40,14 +41,14 @@ __setfpucw (fpu_control_t set)
/* If the CPU supports SSE, we set the MXCSR as well. */
if (CPU_FEATURE_USABLE (SSE))
{
/* Get the current MXCSR. */
unsigned int xnew_exc;
/* Get the current MXCSR. */
__asm__ ("%vstmxcsr %0" : "=m" (xnew_exc));
stmxcsr_inline_asm (&xnew_exc);
xnew_exc &= ~((0xc00 << 3) | (FE_ALL_EXCEPT << 7));
xnew_exc |= ((set & 0xc00) << 3) | ((set & FE_ALL_EXCEPT) << 7);
__asm__ ("%vldmxcsr %0" : : "m" (xnew_exc));
ldmxcsr_inline_asm (&xnew_exc);
}
}

View File

@ -4,6 +4,7 @@
#include <bits/floatn.h>
#include <fenv.h>
#include <fpu_control.h>
#include <math-inline-asm.h>
/* This file is used by both the 32- and 64-bit ports. The 64-bit port
has a field in the fenv_t for the mxcsr; the 32-bit port does not.
@ -22,10 +23,10 @@ static __always_inline void
libc_feholdexcept_sse (fenv_t *e)
{
unsigned int mxcsr;
asm ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
e->__mxcsr = mxcsr;
mxcsr = (mxcsr | 0x1f80) & ~0x3f;
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
}
static __always_inline void
@ -43,9 +44,9 @@ static __always_inline void
libc_fesetround_sse (int r)
{
unsigned int mxcsr;
asm ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
mxcsr = (mxcsr & ~0x6000) | (r << 3);
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
}
static __always_inline void
@ -61,10 +62,10 @@ static __always_inline void
libc_feholdexcept_setround_sse (fenv_t *e, int r)
{
unsigned int mxcsr;
asm ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
e->__mxcsr = mxcsr;
mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
}
/* Set both rounding mode and precision. A convenience function for use
@ -96,7 +97,7 @@ static __always_inline int
libc_fetestexcept_sse (int e)
{
unsigned int mxcsr;
asm volatile ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
return mxcsr & e & FE_ALL_EXCEPT;
}
@ -111,7 +112,7 @@ libc_fetestexcept_387 (int ex)
static __always_inline void
libc_fesetenv_sse (fenv_t *e)
{
asm volatile ("%vldmxcsr %0" : : "m" (e->__mxcsr));
ldmxcsr_inline_asm (&e->__mxcsr);
}
static __always_inline void
@ -129,13 +130,13 @@ static __always_inline int
libc_feupdateenv_test_sse (fenv_t *e, int ex)
{
unsigned int mxcsr, old_mxcsr, cur_ex;
asm volatile ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
cur_ex = mxcsr & FE_ALL_EXCEPT;
/* Merge current exceptions with the old environment. */
old_mxcsr = e->__mxcsr;
mxcsr = old_mxcsr | cur_ex;
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
/* Raise SIGFPE for any new exceptions since the hold. Expect that
the normal environment has all exceptions masked. */
@ -181,10 +182,10 @@ static __always_inline void
libc_feholdsetround_sse (fenv_t *e, int r)
{
unsigned int mxcsr;
asm ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
e->__mxcsr = mxcsr;
mxcsr = (mxcsr & ~0x6000) | (r << 3);
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
}
static __always_inline void
@ -215,9 +216,9 @@ static __always_inline void
libc_feresetround_sse (fenv_t *e)
{
unsigned int mxcsr;
asm ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
mxcsr = (mxcsr & ~0x6000) | (e->__mxcsr & 0x6000);
asm volatile ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
}
static __always_inline void
@ -307,13 +308,13 @@ static __always_inline void
libc_feholdexcept_setround_sse_ctx (struct rm_ctx *ctx, int r)
{
unsigned int mxcsr, new_mxcsr;
asm ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
new_mxcsr = ((mxcsr | 0x1f80) & ~0x603f) | (r << 3);
ctx->env.__mxcsr = mxcsr;
if (__glibc_unlikely (mxcsr != new_mxcsr))
{
asm volatile ("%vldmxcsr %0" : : "m" (new_mxcsr));
ldmxcsr_inline_asm (&new_mxcsr);
ctx->updated_status = true;
}
else
@ -404,13 +405,13 @@ libc_feholdsetround_sse_ctx (struct rm_ctx *ctx, int r)
{
unsigned int mxcsr, new_mxcsr;
asm ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
new_mxcsr = (mxcsr & ~0x6000) | (r << 3);
ctx->env.__mxcsr = mxcsr;
if (__glibc_unlikely (new_mxcsr != mxcsr))
{
asm volatile ("%vldmxcsr %0" : : "m" (new_mxcsr));
ldmxcsr_inline_asm (&new_mxcsr);
ctx->updated_status = true;
}
else

View File

@ -0,0 +1,77 @@
/* Math inline asm compat layer
Copyright (C) 2025 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#ifndef _MATH_INLINE_ASM
#define _MATH_INLINE_ASM
#include <sys/cdefs.h>
/* clang does not support the "%v" modifier to select the AVX encoding, nor
   the '%d' asm constraint modifier, and for AVX builds it requires all 3
   arguments.  */
#ifdef __clang__
#if defined __AVX__
# define VPREFIX "v"
# define VROUND_ARG ", %0"
# else
# define VPREFIX ""
# define VROUND_ARG ""
# endif
# define VARGPREFIX "%"
#else
# define VPREFIX "%v"
# define VARGPREFIX "%d"
# define VROUND_ARG ""
#endif
/* Truncate X toward zero with the SSE4.1 roundsd instruction.  The
   immediate 11 (0b1011) selects round-toward-zero and suppresses the
   inexact exception.  VPREFIX/VARGPREFIX/VROUND_ARG (defined above)
   expand the template to either the legacy-SSE form (gcc "%v"/"%d"
   modifiers) or the explicit 3-operand VEX form that clang requires.  */
__extern_always_inline double
trunc_inline_asm (double x)
{
asm (VPREFIX "roundsd $11, " VARGPREFIX "1, %0" VROUND_ARG : "=v" (x)
: "v" (x));
return x;
}
/* Single-precision counterpart of trunc_inline_asm: truncate X toward
   zero with roundss (immediate 11 = round-toward-zero, suppress the
   inexact exception), using the compiler-appropriate encoding selected
   by the VPREFIX/VARGPREFIX/VROUND_ARG macros above.  */
__extern_always_inline float
truncf_inline_asm (float x)
{
asm (VPREFIX "roundss $11, " VARGPREFIX "1, %0" VROUND_ARG : "=v" (x)
: "v" (x));
return x;
}
/* Store the SSE MXCSR control/status register into *MXCSR, emitting
   either stmxcsr or the VEX-encoded vstmxcsr depending on VPREFIX.
   volatile: reading MXCSR must not be reordered or elided, since callers
   pair it with a later ldmxcsr_inline_asm to update the register.  */
static __always_inline void
stmxcsr_inline_asm (unsigned int *mxcsr)
{
asm volatile (VPREFIX "stmxcsr %0" : "=m" (*mxcsr));
}
/* Load *MXCSR into the SSE MXCSR control/status register, emitting
   either ldmxcsr or the VEX-encoded vldmxcsr depending on VPREFIX.
   volatile: the write changes FP state (rounding mode, exception masks
   and flags), a side effect the compiler cannot see.  */
static __always_inline void
ldmxcsr_inline_asm (unsigned int *mxcsr)
{
asm volatile (VPREFIX "ldmxcsr %0" : : "m" (*mxcsr));
}
/* Compute X / Y with divss and return the quotient.  Used by
   __feraiseexcept to raise SSE exceptions as a side effect
   (0.0/0.0 for FE_INVALID, 1.0/0.0 for FE_DIVBYZERO), hence the
   volatile so the division is not optimized away even when the
   result is unused.  The macros above select the legacy or the
   3-operand VEX encoding as the compiler requires.  */
static __always_inline float
divss_inline_asm (float x, float y)
{
asm volatile (VPREFIX "divss %1, " VARGPREFIX "0" VROUND_ARG
: "+x" (x) : "x" (y));
return x;
}
#endif

View File

@ -20,8 +20,10 @@
#define X86_MATH_PRIVATE_H 1
#include <math.h>
#include <math-inline-asm.h>
#include_next <math_private.h>
__extern_always_inline long double
__NTH (__ieee754_atan2l (long double y, long double x))
{
@ -36,8 +38,7 @@ __trunc (double x)
#if HAVE_X86_INLINE_TRUNC || !defined __SSE4_1__
return trunc (x);
#else
asm ("%vroundsd $11, %d1, %0" : "=v" (x) : "v" (x));
return x;
return trunc_inline_asm (x);
#endif
}
@ -47,8 +48,7 @@ __truncf (float x)
#if HAVE_X86_INLINE_TRUNC || !defined __SSE4_1__
return truncf (x);
#else
asm ("%vroundss $11, %d1, %0" : "=v" (x) : "v" (x));
return x;
return truncf_inline_asm (x);
#endif
}

View File

@ -1,6 +1,8 @@
/* Configure soft-fp for building sqrtf128. Based on sfp-machine.h in
libgcc, with soft-float and other irrelevant parts removed. */
#include <math-inline-asm.h>
#if HAVE_X86_LIBGCC_CMP_RETURN_ATTR
/* The type of the result of a floating point comparison. This must
match `__libgcc_cmp_return__' in GCC for the target. */
@ -49,7 +51,7 @@ typedef unsigned int UTItype __attribute__ ((mode (TI)));
# define FP_INIT_ROUNDMODE \
do { \
__asm__ __volatile__ ("%vstmxcsr\t%0" : "=m" (_fcw)); \
stmxcsr_inline_asm (&_fcw); \
} while (0)
#else
# define _FP_W_TYPE_SIZE 32

View File

@ -24,33 +24,22 @@
#include <stdio.h>
#include <cpu-features.h>
#include <support/check.h>
static uint32_t
get_sse_mxcsr (void)
{
uint32_t temp;
__asm__ __volatile__ ("%vstmxcsr %0" : "=m" (temp));
return temp;
}
static void
set_sse_mxcsr (uint32_t val)
{
__asm__ __volatile__ ("%vldmxcsr %0" : : "m" (val));
}
#include <math-inline-asm.h>
static void
set_sse_mxcsr_bits (uint32_t mask, uint32_t bits)
{
uint32_t mxcsr = get_sse_mxcsr ();
uint32_t mxcsr;
stmxcsr_inline_asm (&mxcsr);
mxcsr = (mxcsr & ~mask) | bits;
set_sse_mxcsr (mxcsr);
ldmxcsr_inline_asm (&mxcsr);
}
static int
test_sse_mxcsr_bits (const char *test, uint32_t mask, uint32_t bits)
{
uint32_t mxcsr = get_sse_mxcsr ();
uint32_t mxcsr;
stmxcsr_inline_asm (&mxcsr);
printf ("Testing %s: mxcsr = %x\n", test, mxcsr);
if ((mxcsr & mask) == bits)
{

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
int
__feclearexcept (int excepts)
@ -38,13 +39,13 @@ __feclearexcept (int excepts)
__asm__ ("fldenv %0" : : "m" (temp));
/* And the same procedure for SSE. */
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
/* Clear the relevant bits. */
mxcsr &= ~excepts;
/* And put them into effect. */
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
/* Success. */
return 0;

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
int
fedisableexcept (int excepts)
@ -35,11 +36,11 @@ fedisableexcept (int excepts)
__asm__ ("fldcw %0" : : "m" (new_exc));
/* And now the same for the SSE MXCSR register. */
__asm__ ("%vstmxcsr %0" : "=m" (new));
stmxcsr_inline_asm (&new);
/* The SSE exception masks are shifted by 7 bits. */
new |= excepts << 7;
__asm__ ("%vldmxcsr %0" : : "m" (new));
ldmxcsr_inline_asm (&new);
return old_exc;
}

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
int
feenableexcept (int excepts)
@ -35,11 +36,11 @@ feenableexcept (int excepts)
__asm__ ("fldcw %0" : : "m" (new_exc));
/* And now the same for the SSE MXCSR register. */
__asm__ ("%vstmxcsr %0" : "=m" (new));
stmxcsr_inline_asm (&new);
/* The SSE exception masks are shifted by 7 bits. */
new &= ~(excepts << 7);
__asm__ ("%vldmxcsr %0" : : "m" (new));
ldmxcsr_inline_asm (&new);
return old_exc;
}

View File

@ -17,15 +17,17 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
int
__fegetenv (fenv_t *envp)
{
__asm__ ("fnstenv %0\n"
/* fnstenv changes the exception mask, so load back the
stored environment. */
"fldenv %0\n"
"%vstmxcsr %1" : "=m" (*envp), "=m" (envp->__mxcsr));
asm volatile ("fnstenv %0\n"
/* fnstenv changes the exception mask, so load back the
stored environment. */
"fldenv %0"
: "=m" (*envp));
stmxcsr_inline_asm (&envp->__mxcsr);
/* Success. */
return 0;

View File

@ -18,11 +18,12 @@
#include <fenv.h>
#include <fpu_control.h>
#include <math-inline-asm.h>
int
fegetmode (femode_t *modep)
{
_FPU_GETCW (modep->__control_word);
__asm__ ("%vstmxcsr %0" : "=m" (modep->__mxcsr));
stmxcsr_inline_asm (&modep->__mxcsr);
return 0;
}

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
int
__feholdexcept (fenv_t *envp)
@ -25,14 +26,13 @@ __feholdexcept (fenv_t *envp)
/* Store the environment. Recall that fnstenv has a side effect of
masking all exceptions. Then clear all exceptions. */
__asm__ ("fnstenv %0\n\t"
"%vstmxcsr %1\n\t"
"fnclex"
: "=m" (*envp), "=m" (envp->__mxcsr));
asm volatile ("fnstenv %0" : "=m" (*envp));
stmxcsr_inline_asm (&envp->__mxcsr);
asm volatile ("fnclex" : "=m" (*envp));
/* Set the SSE MXCSR register. */
mxcsr = (envp->__mxcsr | 0x1f80) & ~0x3f;
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
return 0;
}

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
#include <fpu_control.h>
#include <assert.h>
@ -35,8 +36,8 @@ __fesetenv (const fenv_t *envp)
values which we do not want to come from the saved environment.
Therefore, we get the current environment and replace the values
we want to use from the environment specified by the parameter. */
__asm__ ("fnstenv %0\n"
"%vstmxcsr %1" : "=m" (temp), "=m" (temp.__mxcsr));
asm volatile ("fnstenv %0" : "=m" (temp));
stmxcsr_inline_asm (&temp.__mxcsr);
if (envp == FE_DFL_ENV)
{
@ -103,8 +104,8 @@ __fesetenv (const fenv_t *envp)
temp.__mxcsr = envp->__mxcsr;
}
__asm__ ("fldenv %0\n"
"%vldmxcsr %1" : : "m" (temp), "m" (temp.__mxcsr));
asm volatile ("fldenv %0" : "=m" (temp));
ldmxcsr_inline_asm (&temp.__mxcsr);
/* Success. */
return 0;

View File

@ -17,15 +17,15 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
int
fesetexcept (int excepts)
{
unsigned int mxcsr;
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
mxcsr |= excepts & FE_ALL_EXCEPT;
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
return 0;
}

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
#include <fpu_control.h>
/* All exceptions, including the x86-specific "denormal operand"
@ -28,7 +29,8 @@ fesetmode (const femode_t *modep)
{
fpu_control_t cw;
unsigned int mxcsr;
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
/* Preserve SSE exception flags but restore other state in
MXCSR. */
mxcsr &= FE_ALL_EXCEPT_X86;
@ -45,6 +47,6 @@ fesetmode (const femode_t *modep)
mxcsr |= modep->__mxcsr & ~FE_ALL_EXCEPT_X86;
}
_FPU_SETCW (cw);
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
return 0;
}

View File

@ -17,12 +17,13 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
int
__fesetround (int round)
{
unsigned short int cw;
int mxcsr;
unsigned int mxcsr;
if ((round & ~0xc00) != 0)
/* ROUND is no valid rounding mode. */
@ -36,10 +37,10 @@ __fesetround (int round)
/* And now the MSCSR register for SSE, the precision is at different bit
positions in the different units, we need to shift it 3 bits. */
asm ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
mxcsr &= ~ 0x6000;
mxcsr |= round << 3;
asm ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
return 0;
}

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
int
__feupdateenv (const fenv_t *envp)
@ -25,7 +26,8 @@ __feupdateenv (const fenv_t *envp)
unsigned int xtemp;
/* Save current exceptions. */
__asm__ ("fnstsw %0\n\t%vstmxcsr %1" : "=m" (temp), "=m" (xtemp));
asm volatile ("fnstsw %0" : "=m" (temp));
stmxcsr_inline_asm (&xtemp);
temp = (temp | xtemp) & FE_ALL_EXCEPT;
/* Install new environment. */

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
int
fegetexceptflag (fexcept_t *flagp, int excepts)
@ -25,8 +26,8 @@ fegetexceptflag (fexcept_t *flagp, int excepts)
unsigned int mxscr;
/* Get the current exceptions for the x87 FPU and SSE unit. */
__asm__ ("fnstsw %0\n"
"%vstmxcsr %1" : "=m" (temp), "=m" (mxscr));
__asm__ ("fnstsw %0" : "=m" (temp));
stmxcsr_inline_asm (&mxscr);
*flagp = (temp | mxscr) & FE_ALL_EXCEPT & excepts;

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
#include <math.h>
int
@ -29,23 +30,12 @@ __feraiseexcept (int excepts)
/* First: invalid exception. */
if ((FE_INVALID & excepts) != 0)
{
/* One example of an invalid operation is 0.0 / 0.0. */
float f = 0.0;
__asm__ __volatile__ ("%vdivss %0, %d0 " : "+x" (f));
(void) &f;
}
/* One example of an invalid operation is 0.0 / 0.0. */
divss_inline_asm (0.0f, 0.0f);
/* Next: division by zero. */
if ((FE_DIVBYZERO & excepts) != 0)
{
float f = 1.0;
float g = 0.0;
__asm__ __volatile__ ("%vdivss %1, %d0" : "+x" (f) : "x" (g));
(void) &f;
}
divss_inline_asm (1.0f, 0.0f);
/* Next: overflow. */
if ((FE_OVERFLOW & excepts) != 0)

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
#include <math.h>
int
@ -44,13 +45,13 @@ fesetexceptflag (const fexcept_t *flagp, int excepts)
__asm__ ("fldenv %0" : : "m" (temp));
/* And now similarly for SSE. */
__asm__ ("%vstmxcsr %0" : "=m" (mxcsr));
stmxcsr_inline_asm (&mxcsr);
/* Clear or set relevant flags. */
mxcsr ^= (mxcsr ^ *flagp) & excepts;
/* Put the new data in effect. */
__asm__ ("%vldmxcsr %0" : : "m" (mxcsr));
ldmxcsr_inline_asm (&mxcsr);
/* Success. */
return 0;

View File

@ -17,6 +17,7 @@
<https://www.gnu.org/licenses/>. */
#include <fenv.h>
#include <math-inline-asm.h>
int
__fetestexcept (int excepts)
@ -25,8 +26,8 @@ __fetestexcept (int excepts)
unsigned int mxscr;
/* Get current exceptions. */
__asm__ ("fnstsw %0\n"
"%vstmxcsr %1" : "=m" (temp), "=m" (mxscr));
asm volatile ("fnstsw %0" : "=m" (temp));
stmxcsr_inline_asm (&mxscr);
return (temp | mxscr) & excepts & FE_ALL_EXCEPT;
}