x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]

This patch fixes the accuracy of the SSE4.2 libmvec atan2 function for the
following inputs, reducing the error to less than 4 ulps.

{0x1.bcab29da0e947p-54,0x1.bc41f4d2294b8p-54}   4.19888 ulps
{0x1.b836ed678be29p-588,0x1.b7be6f5a03a8cp-588} 4.09889 ulps

This fixes BZ #28765.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Author: Sunil K Pandey
Date:   2022-01-12 11:02:19 -08:00
Parent: fcfc908681
Commit: 49e2bf58d5

1 changed file with 172 additions and 147 deletions
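
The ulp figures quoted above can be sanity-checked with a small standalone
program.  The sketch below is only an illustration under assumptions, not the
MPFR-based harness the glibc testsuite uses: it calls the exported libmvec
symbol _ZGVbN2vv_atan2 (which should ifunc-resolve to the SSE4 code in this
file on SSE4-capable CPUs) and uses atan2l as the reference, so the reported
ulp values may differ slightly from the ones listed.  It assumes glibc 2.35 or
later and linking with -lmvec -lm.

/* test-atan2-ulp.c: rough standalone ulp check for the inputs above.
   Assumptions: glibc 2.35+ with libmvec, built with
   "gcc test-atan2-ulp.c -lmvec -lm"; atan2l stands in for the
   MPFR-based reference of the real testsuite.  */
#include <immintrin.h>
#include <math.h>
#include <stdio.h>

/* Exported 128-bit (xmm) vector variant; the ifunc should pick the
   SSE4 implementation from this file on SSE4-capable CPUs.  */
extern __m128d _ZGVbN2vv_atan2 (__m128d y, __m128d x);

/* Approximate error of GOT in ulps, measured at the reference WANT.  */
static double
ulp_error (double got, long double want)
{
  double w = (double) want;
  double one_ulp = nextafter (w, INFINITY) - w;
  return (double) (fabsl (want - (long double) got) / one_ulp);
}

int
main (void)
{
  double y[2] = { 0x1.bcab29da0e947p-54, 0x1.b836ed678be29p-588 };
  double x[2] = { 0x1.bc41f4d2294b8p-54, 0x1.b7be6f5a03a8cp-588 };
  double r[2];

  _mm_storeu_pd (r, _ZGVbN2vv_atan2 (_mm_loadu_pd (y), _mm_loadu_pd (x)));

  for (int i = 0; i < 2; i++)
    printf ("atan2 (%a, %a) = %a, %.5f ulps\n",
            y[i], x[i], r[i], ulp_error (r[i], atan2l (y[i], x[i])));
  return 0;
}

With this fix applied, both inputs should report errors below 4 ulps.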


@@ -65,7 +65,7 @@
ENTRY(_ZGVbN2vv_atan2_sse4)
subq $88, %rsp
cfi_def_cfa_offset(96)
movaps %xmm0, %xmm8
movaps %xmm1, %xmm11
/*
* #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
* Cannot be replaced by VQRCP(D, dR0, dB);
* Argument Absolute values
*/
movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
movaps %xmm0, %xmm10
movaps %xmm1, %xmm9
movaps %xmm4, %xmm1
andps %xmm8, %xmm4
andps %xmm9, %xmm1
movaps %xmm4, %xmm2
cmpnltpd %xmm1, %xmm2
andps %xmm10, %xmm1
andps %xmm11, %xmm9
movaps %xmm1, %xmm4
cmpnltpd %xmm9, %xmm4
/* Argument signs */
movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
movaps %xmm2, %xmm0
movups dPIO2+__svml_datan2_data_internal(%rip), %xmm5
movaps %xmm3, %xmm7
movaps %xmm3, %xmm6
movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
movaps %xmm4, %xmm0
movaps %xmm5, %xmm8
movaps %xmm5, %xmm7
/*
* 1) If y<x then a= y, b=x, PIO2=0
* 2) If y>x then a=-x, b=y, PIO2=Pi/2
*/
orps %xmm1, %xmm3
movaps %xmm2, %xmm10
andps %xmm2, %xmm5
andnps %xmm4, %xmm0
andps %xmm2, %xmm3
andnps %xmm1, %xmm10
andps %xmm4, %xmm2
orps %xmm3, %xmm0
orps %xmm2, %xmm10
divpd %xmm10, %xmm0
movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
/* if x<0, dPI = Pi, else dPI =0 */
movaps %xmm9, %xmm3
orps %xmm9, %xmm5
andnps %xmm1, %xmm0
andps %xmm4, %xmm5
andps %xmm11, %xmm8
movups dPIO2+__svml_datan2_data_internal(%rip), %xmm6
orps %xmm5, %xmm0
movaps %xmm4, %xmm5
andps %xmm4, %xmm6
andnps %xmm9, %xmm5
andps %xmm1, %xmm4
orps %xmm4, %xmm5
andps %xmm10, %xmm7
divpd %xmm5, %xmm0
movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
xorl %edx, %edx
/* Check if y and x are on main path. */
pshufd $221, %xmm1, %xmm12
andps %xmm9, %xmm7
psubd %xmm11, %xmm12
andps %xmm8, %xmm6
movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
xorl %edx, %edx
movups %xmm4, 16(%rsp)
pshufd $221, %xmm9, %xmm3
xorl %eax, %eax
pshufd $221, %xmm4, %xmm14
movdqa %xmm12, %xmm4
pcmpgtd %xmm13, %xmm4
pcmpeqd %xmm13, %xmm12
por %xmm12, %xmm4
pshufd $221, %xmm1, %xmm13
psubd %xmm2, %xmm3
psubd %xmm2, %xmm13
movdqa %xmm3, %xmm4
movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
movdqa %xmm13, %xmm14
pcmpgtd %xmm12, %xmm4
pcmpeqd %xmm12, %xmm3
pcmpgtd %xmm12, %xmm14
pcmpeqd %xmm12, %xmm13
/* Polynomial. */
movaps %xmm0, %xmm12
por %xmm3, %xmm4
mulpd %xmm0, %xmm12
cmplepd dZERO+__svml_datan2_data_internal(%rip), %xmm3
psubd %xmm11, %xmm14
movdqa %xmm14, %xmm15
pcmpeqd %xmm13, %xmm14
pcmpgtd %xmm13, %xmm15
por %xmm14, %xmm15
movaps %xmm12, %xmm14
mulpd %xmm12, %xmm14
por %xmm15, %xmm4
movaps %xmm14, %xmm15
mulpd %xmm14, %xmm15
movmskps %xmm4, %ecx
movups %xmm10, (%rsp)
movups dA19+__svml_datan2_data_internal(%rip), %xmm10
mulpd %xmm15, %xmm10
movups dA18+__svml_datan2_data_internal(%rip), %xmm13
movups dA17+__svml_datan2_data_internal(%rip), %xmm11
addpd dA15+__svml_datan2_data_internal(%rip), %xmm10
mulpd %xmm15, %xmm13
mulpd %xmm15, %xmm11
mulpd %xmm15, %xmm10
addpd dA14+__svml_datan2_data_internal(%rip), %xmm13
addpd dA13+__svml_datan2_data_internal(%rip), %xmm11
addpd dA11+__svml_datan2_data_internal(%rip), %xmm10
mulpd %xmm15, %xmm13
mulpd %xmm15, %xmm11
mulpd %xmm15, %xmm10
addpd dA10+__svml_datan2_data_internal(%rip), %xmm13
addpd dA09+__svml_datan2_data_internal(%rip), %xmm11
addpd dA07+__svml_datan2_data_internal(%rip), %xmm10
mulpd %xmm15, %xmm13
mulpd %xmm15, %xmm11
mulpd %xmm15, %xmm10
addpd dA06+__svml_datan2_data_internal(%rip), %xmm13
addpd dA05+__svml_datan2_data_internal(%rip), %xmm11
addpd dA03+__svml_datan2_data_internal(%rip), %xmm10
mulpd %xmm15, %xmm13
mulpd %xmm15, %xmm11
mulpd %xmm12, %xmm10
addpd dA02+__svml_datan2_data_internal(%rip), %xmm13
addpd dA01+__svml_datan2_data_internal(%rip), %xmm11
addpd %xmm10, %xmm13
mulpd %xmm11, %xmm12
mulpd %xmm13, %xmm14
movups dA16+__svml_datan2_data_internal(%rip), %xmm2
mulpd %xmm15, %xmm2
addpd dA12+__svml_datan2_data_internal(%rip), %xmm2
mulpd %xmm15, %xmm2
addpd dA08+__svml_datan2_data_internal(%rip), %xmm2
mulpd %xmm15, %xmm2
addpd dA04+__svml_datan2_data_internal(%rip), %xmm2
/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
mulpd %xmm2, %xmm15
addpd %xmm12, %xmm15
addpd %xmm14, %xmm15
/* P = A19*R2 + A18 */
movups dA19+__svml_datan2_data_internal(%rip), %xmm15
movaps %xmm11, %xmm2
mulpd %xmm12, %xmm15
addpd dA18+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A17 */
mulpd %xmm12, %xmm15
addpd dA17+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A16 */
mulpd %xmm12, %xmm15
addpd dA16+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A15 */
mulpd %xmm12, %xmm15
addpd dA15+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A14 */
mulpd %xmm12, %xmm15
addpd dA14+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A13 */
mulpd %xmm12, %xmm15
addpd dA13+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A12 */
mulpd %xmm12, %xmm15
addpd dA12+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A11 */
mulpd %xmm12, %xmm15
addpd dA11+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A10 */
mulpd %xmm12, %xmm15
addpd dA10+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A09 */
mulpd %xmm12, %xmm15
addpd dA09+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A08 */
mulpd %xmm12, %xmm15
addpd dA08+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A07 */
mulpd %xmm12, %xmm15
addpd dA07+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A06 */
mulpd %xmm12, %xmm15
addpd dA06+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A05 */
mulpd %xmm12, %xmm15
addpd dA05+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A04 */
mulpd %xmm12, %xmm15
addpd dA04+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A03 */
mulpd %xmm12, %xmm15
addpd dA03+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A02 */
mulpd %xmm12, %xmm15
addpd dA02+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A01 */
mulpd %xmm12, %xmm15
addpd dA01+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 */
mulpd %xmm15, %xmm12
/*
* Reconstruction.
* dP=(R+R*dP) + dPIO2
*/
mulpd %xmm0, %xmm15
addpd %xmm15, %xmm0
addpd %xmm5, %xmm0
andps __svml_datan2_data_internal(%rip), %xmm3
mulpd %xmm0, %xmm12
addpd %xmm12, %xmm0
/* if x<0, dPI = Pi, else dPI =0 */
movups dZERO+__svml_datan2_data_internal(%rip), %xmm3
por %xmm13, %xmm14
cmplepd %xmm3, %xmm2
addpd %xmm6, %xmm0
andps __svml_datan2_data_internal(%rip), %xmm2
orps %xmm8, %xmm0
addpd %xmm2, %xmm0
por %xmm14, %xmm4
orps %xmm7, %xmm0
addpd %xmm3, %xmm0
movmskps %xmm4, %ecx
/* Special branch for fast (vector) processing of zero arguments */
movups 16(%rsp), %xmm11
orps %xmm6, %xmm0
testb $3, %cl
/* Go to auxilary branch */
jne L(AUX_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
/* Return from auxilary branch
* for out of main path inputs
@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
/* Restore registers
* and exit the function
@@ -237,8 +264,8 @@ L(EXIT):
*/
L(SPECIAL_VALUES_BRANCH):
movups %xmm8, 32(%rsp)
movups %xmm9, 48(%rsp)
movups %xmm10, 32(%rsp)
movups %xmm11, 48(%rsp)
movups %xmm0, 64(%rsp)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
*/
L(AUX_BRANCH):
/* Check if at least on of Y or Y is zero: iAXAYZERO */
movups dZERO+__svml_datan2_data_internal(%rip), %xmm2
/* Check if both X & Y are not NaNs: iXYnotNAN */
movaps %xmm9, %xmm12
movaps %xmm8, %xmm10
cmpordpd %xmm9, %xmm12
cmpordpd %xmm8, %xmm10
cmpeqpd %xmm2, %xmm1
cmpeqpd %xmm2, %xmm11
andps %xmm10, %xmm12
orps %xmm11, %xmm1
pshufd $221, %xmm1, %xmm1
pshufd $221, %xmm12, %xmm11
movaps %xmm11, %xmm13
movaps %xmm10, %xmm12
cmpordpd %xmm11, %xmm13
cmpordpd %xmm10, %xmm12
/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
pand %xmm11, %xmm1
/* Exclude from previous callout mask zero (and not NaN) arguments */
movdqa %xmm1, %xmm13
pandn %xmm4, %xmm13
/* Check if at least on of Y or Y is zero: iAXAYZERO */
cmpeqpd %xmm3, %xmm9
cmpeqpd %xmm3, %xmm1
/*
* Path for zero arguments (at least one of both)
* Check if both args are zeros (den. is zero)
*/
movups (%rsp), %xmm4
cmpeqpd %xmm2, %xmm4
cmpeqpd %xmm3, %xmm5
andps %xmm12, %xmm13
orps %xmm1, %xmm9
pshufd $221, %xmm9, %xmm1
pshufd $221, %xmm13, %xmm9
/* Go to callout */
movmskps %xmm13, %edx
/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
pand %xmm9, %xmm1
/* Exclude from previous callout mask zero (and not NaN) arguments */
movdqa %xmm1, %xmm14
pandn %xmm4, %xmm14
/* Set sPIO2 to zero if den. is zero */
movaps %xmm4, %xmm15
andps %xmm2, %xmm4
andnps %xmm5, %xmm15
andl $3, %edx
orps %xmm4, %xmm15
pshufd $221, %xmm9, %xmm5
orps %xmm7, %xmm15
movaps %xmm5, %xmm4
andnps %xmm6, %xmm4
andps %xmm3, %xmm5
/* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
pshufd $221, %xmm2, %xmm7
pcmpgtd %xmm5, %xmm7
pshufd $80, %xmm7, %xmm14
andps %xmm3, %xmm14
addpd %xmm14, %xmm15
pshufd $221, %xmm3, %xmm3
orps %xmm5, %xmm4
pshufd $221, %xmm11, %xmm5
orps %xmm8, %xmm4
pcmpgtd %xmm5, %xmm3
pshufd $80, %xmm3, %xmm6
andps %xmm2, %xmm6
addpd %xmm6, %xmm4
/* Go to callout */
movmskps %xmm14, %edx
/* Merge results from main and spec path */
pshufd $80, %xmm1, %xmm3
orps %xmm6, %xmm15
movdqa %xmm3, %xmm6
andps %xmm3, %xmm15
andnps %xmm0, %xmm6
movaps %xmm6, %xmm0
orps %xmm15, %xmm0
pshufd $80, %xmm1, %xmm2
orps %xmm7, %xmm4
movdqa %xmm2, %xmm7
andps %xmm2, %xmm4
andnps %xmm0, %xmm7
andl $3, %edx
movaps %xmm7, %xmm0
orps %xmm4, %xmm0
/* Return to main vector processing path */
jmp L(AUX_BRANCH_RETURN)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
END(_ZGVbN2vv_atan2_sse4)
.section .rodata, "a"
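
For readers who would rather not trace the xmm register shuffling, the scalar
sketch below outlines the main-path algorithm that the in-code comments
describe: the |y|/|x| swap with its PIO2 correction, the straight Horner
evaluation of the A19..A01 polynomial in R2 ("P = A19*R2 + A18",
"P = P*R2 + A17", and so on), and the reconstruction "dP = (R + R*dP) + dPIO2"
together with the "if x<0, dPI = Pi, else dPI = 0" adjustment.  It is a sketch
under assumptions: the parameter A is a hypothetical stand-in for the
dA01..dA19 table in __svml_datan2_data_internal (values not reproduced here),
and the branches only show the mathematical effect of what the vector code
does branch-free with sign masks.  The substance of the diff is that the
polynomial is now evaluated with this single Horner chain in R2, where the
previous code combined several partial polynomials in higher powers of R2,
which appears to be what brings the worst-case error for the inputs above
under 4 ulps.

#include <math.h>

/* Straight Horner evaluation in R2 = R*R, as in the comment sequence
   "P = A19*R2 + A18", "P = P*R2 + A17", ..., "P = P*R2".  A[1..19]
   stands for dA01..dA19 from __svml_datan2_data_internal; A[0] is
   unused because the coefficient 1.0 of R is supplied by the final
   R + R*P step.  */
double
atan_poly (double r, const double A[20])
{
  double r2 = r * r;
  double p = A[19];
  for (int i = 18; i >= 1; i--)
    p = p * r2 + A[i];
  p = p * r2;                   /* P = P*R2 */
  return r + r * p;             /* dP = R + R*dP */
}

/* Main path only: finite, nonzero y and x whose exponents pass the
   iCHK_WORK_SUB/iCHK_WORK_CMP screening.  */
double
atan2_main_path (double y, double x, const double A[20])
{
  double ay = fabs (y), ax = fabs (x);
  double a, b, pio2;

  /* 1) If |y| <  |x| then a =  |y|, b = |x|, PIO2 = 0
     2) otherwise         a = -|x|, b = |y|, PIO2 = Pi/2  */
  if (ay < ax)
    {
      a = ay;
      b = ax;
      pio2 = 0.0;
    }
  else
    {
      a = -ax;
      b = ay;
      pio2 = M_PI_2;
    }

  /* reduced == atan (|y| / |x|), in [0, Pi/2].  */
  double reduced = atan_poly (a / b, A) + pio2;

  /* Place the result in the right quadrant ("if x<0, dPI = Pi") and
     restore the sign of y; the assembly does both with mask operations
     instead of branches.  */
  double res = x < 0.0 ? M_PI - reduced : reduced;
  return copysign (res, y);
}

Zero arguments are handled separately in L(AUX_BRANCH), and the remaining
special inputs (NaN and the exponents flagged by the iCHK_WORK comparison)
fall back to the scalar calls in L(SCALAR_MATH_CALL); neither path is modeled
above.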