x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]

This patch fixes SSE4.2 libmvec atan2 function accuracy for the following
inputs, bringing the error down to less than 4 ulps (the figures listed are
the errors before the fix; a rough reproduction sketch follows the commit
metadata below).

{0x1.bcab29da0e947p-54,0x1.bc41f4d2294b8p-54}   4.19888 ulps
{0x1.b836ed678be29p-588,0x1.b7be6f5a03a8cp-588} 4.09889 ulps

This fixes BZ #28765.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Author: Sunil K Pandey
Date:   2022-01-12 11:02:19 -08:00
Commit: 49e2bf58d5
Parent: fcfc908681
1 changed file with 172 additions and 147 deletions
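
For reference, the ulp figures above can be reproduced by comparing the
128-bit libmvec entry point against a higher-precision scalar reference.
The program below is only an illustrative sketch: it assumes a glibc whose
libmvec exports _ZGVbN2vv_atan2, that the listed pairs are (y, x) arguments,
that the ifunc dispatch on the test machine selects the SSE4 kernel, and
that x86_64 long double gives enough headroom for a rough ulp estimate
(glibc's own testing uses the libm-test-ulps machinery instead).  Build with
something like "gcc test-atan2-ulp.c -lmvec -lm".

#include <immintrin.h>
#include <math.h>
#include <stdio.h>

/* 128-bit vector atan2 from libmvec (assumed to be exported).  */
extern __m128d _ZGVbN2vv_atan2 (__m128d y, __m128d x);

/* Rough ulp estimate: |got - ref| divided by the spacing of doubles
   around ref.  Assumes a finite, normal reference value.  */
static long double
ulp_error (double got, long double ref)
{
  double r = fabs ((double) ref);
  double ulp = nextafter (r, INFINITY) - r;
  return fabsl ((long double) got - ref) / (long double) ulp;
}

int
main (void)
{
  /* Input pairs from the commit message, assumed to be (y, x).  */
  double y[2] = { 0x1.bcab29da0e947p-54, 0x1.b836ed678be29p-588 };
  double x[2] = { 0x1.bc41f4d2294b8p-54, 0x1.b7be6f5a03a8cp-588 };
  double got[2];

  _mm_storeu_pd (got, _ZGVbN2vv_atan2 (_mm_loadu_pd (y), _mm_loadu_pd (x)));

  for (int i = 0; i < 2; i++)
    {
      long double ref = atan2l ((long double) y[i], (long double) x[i]);
      printf ("atan2 (%a, %a) = %a, about %.5Lf ulps off\n",
              y[i], x[i], got[i], ulp_error (got[i], ref));
    }
  return 0;
}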


@@ -65,7 +65,7 @@
 ENTRY(_ZGVbN2vv_atan2_sse4)
         subq $88, %rsp
         cfi_def_cfa_offset(96)
-        movaps %xmm0, %xmm8
+        movaps %xmm1, %xmm11
 /*
  * #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
  * Cannot be replaced by VQRCP(D, dR0, dB);
  * Argument Absolute values
  */
-        movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
+        movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
+        movaps %xmm0, %xmm10
         movaps %xmm1, %xmm9
-        movaps %xmm4, %xmm1
-        andps %xmm8, %xmm4
-        andps %xmm9, %xmm1
-        movaps %xmm4, %xmm2
-        cmpnltpd %xmm1, %xmm2
+        andps %xmm10, %xmm1
+        andps %xmm11, %xmm9
+        movaps %xmm1, %xmm4
+        cmpnltpd %xmm9, %xmm4
 /* Argument signs */
-        movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
-        movaps %xmm2, %xmm0
-        movups dPIO2+__svml_datan2_data_internal(%rip), %xmm5
-        movaps %xmm3, %xmm7
-        movaps %xmm3, %xmm6
+        movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
+        movaps %xmm4, %xmm0
+        movaps %xmm5, %xmm8
+        movaps %xmm5, %xmm7
 /*
  * 1) If y<x then a= y, b=x, PIO2=0
  * 2) If y>x then a=-x, b=y, PIO2=Pi/2
  */
-        orps %xmm1, %xmm3
-        movaps %xmm2, %xmm10
-        andps %xmm2, %xmm5
-        andnps %xmm4, %xmm0
-        andps %xmm2, %xmm3
-        andnps %xmm1, %xmm10
-        andps %xmm4, %xmm2
-        orps %xmm3, %xmm0
-        orps %xmm2, %xmm10
-        divpd %xmm10, %xmm0
-        movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
-/* if x<0, dPI = Pi, else dPI =0 */
-        movaps %xmm9, %xmm3
+        orps %xmm9, %xmm5
+        andnps %xmm1, %xmm0
+        andps %xmm4, %xmm5
+        andps %xmm11, %xmm8
+        movups dPIO2+__svml_datan2_data_internal(%rip), %xmm6
+        orps %xmm5, %xmm0
+        movaps %xmm4, %xmm5
+        andps %xmm4, %xmm6
+        andnps %xmm9, %xmm5
+        andps %xmm1, %xmm4
+        orps %xmm4, %xmm5
+        andps %xmm10, %xmm7
+        divpd %xmm5, %xmm0
+        movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
+        xorl %edx, %edx
 /* Check if y and x are on main path. */
-        pshufd $221, %xmm1, %xmm12
-        andps %xmm9, %xmm7
-        psubd %xmm11, %xmm12
-        andps %xmm8, %xmm6
-        movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
-        xorl %edx, %edx
-        movups %xmm4, 16(%rsp)
+        pshufd $221, %xmm9, %xmm3
         xorl %eax, %eax
-        pshufd $221, %xmm4, %xmm14
-        movdqa %xmm12, %xmm4
-        pcmpgtd %xmm13, %xmm4
-        pcmpeqd %xmm13, %xmm12
-        por %xmm12, %xmm4
+        pshufd $221, %xmm1, %xmm13
+        psubd %xmm2, %xmm3
+        psubd %xmm2, %xmm13
+        movdqa %xmm3, %xmm4
+        movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
+        movdqa %xmm13, %xmm14
+        pcmpgtd %xmm12, %xmm4
+        pcmpeqd %xmm12, %xmm3
+        pcmpgtd %xmm12, %xmm14
+        pcmpeqd %xmm12, %xmm13
 /* Polynomial. */
         movaps %xmm0, %xmm12
+        por %xmm3, %xmm4
         mulpd %xmm0, %xmm12
-        cmplepd dZERO+__svml_datan2_data_internal(%rip), %xmm3
-        psubd %xmm11, %xmm14
-        movdqa %xmm14, %xmm15
-        pcmpeqd %xmm13, %xmm14
-        pcmpgtd %xmm13, %xmm15
-        por %xmm14, %xmm15
-        movaps %xmm12, %xmm14
-        mulpd %xmm12, %xmm14
-        por %xmm15, %xmm4
-        movaps %xmm14, %xmm15
-        mulpd %xmm14, %xmm15
-        movmskps %xmm4, %ecx
-        movups %xmm10, (%rsp)
-        movups dA19+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd %xmm15, %xmm10
-        movups dA18+__svml_datan2_data_internal(%rip), %xmm13
-        movups dA17+__svml_datan2_data_internal(%rip), %xmm11
-        addpd dA15+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd %xmm15, %xmm13
-        mulpd %xmm15, %xmm11
-        mulpd %xmm15, %xmm10
-        addpd dA14+__svml_datan2_data_internal(%rip), %xmm13
-        addpd dA13+__svml_datan2_data_internal(%rip), %xmm11
-        addpd dA11+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd %xmm15, %xmm13
-        mulpd %xmm15, %xmm11
-        mulpd %xmm15, %xmm10
-        addpd dA10+__svml_datan2_data_internal(%rip), %xmm13
-        addpd dA09+__svml_datan2_data_internal(%rip), %xmm11
-        addpd dA07+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd %xmm15, %xmm13
-        mulpd %xmm15, %xmm11
-        mulpd %xmm15, %xmm10
-        addpd dA06+__svml_datan2_data_internal(%rip), %xmm13
-        addpd dA05+__svml_datan2_data_internal(%rip), %xmm11
-        addpd dA03+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd %xmm15, %xmm13
-        mulpd %xmm15, %xmm11
-        mulpd %xmm12, %xmm10
-        addpd dA02+__svml_datan2_data_internal(%rip), %xmm13
-        addpd dA01+__svml_datan2_data_internal(%rip), %xmm11
-        addpd %xmm10, %xmm13
-        mulpd %xmm11, %xmm12
-        mulpd %xmm13, %xmm14
-        movups dA16+__svml_datan2_data_internal(%rip), %xmm2
-        mulpd %xmm15, %xmm2
-        addpd dA12+__svml_datan2_data_internal(%rip), %xmm2
-        mulpd %xmm15, %xmm2
-        addpd dA08+__svml_datan2_data_internal(%rip), %xmm2
-        mulpd %xmm15, %xmm2
-        addpd dA04+__svml_datan2_data_internal(%rip), %xmm2
-/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
-        mulpd %xmm2, %xmm15
-        addpd %xmm12, %xmm15
-        addpd %xmm14, %xmm15
+/* P = A19*R2 + A18 */
+        movups dA19+__svml_datan2_data_internal(%rip), %xmm15
+        movaps %xmm11, %xmm2
+        mulpd %xmm12, %xmm15
+        addpd dA18+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A17 */
+        mulpd %xmm12, %xmm15
+        addpd dA17+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A16 */
+        mulpd %xmm12, %xmm15
+        addpd dA16+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A15 */
+        mulpd %xmm12, %xmm15
+        addpd dA15+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A14 */
+        mulpd %xmm12, %xmm15
+        addpd dA14+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A13 */
+        mulpd %xmm12, %xmm15
+        addpd dA13+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A12 */
+        mulpd %xmm12, %xmm15
+        addpd dA12+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A11 */
+        mulpd %xmm12, %xmm15
+        addpd dA11+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A10 */
+        mulpd %xmm12, %xmm15
+        addpd dA10+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A09 */
+        mulpd %xmm12, %xmm15
+        addpd dA09+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A08 */
+        mulpd %xmm12, %xmm15
+        addpd dA08+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A07 */
+        mulpd %xmm12, %xmm15
+        addpd dA07+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A06 */
+        mulpd %xmm12, %xmm15
+        addpd dA06+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A05 */
+        mulpd %xmm12, %xmm15
+        addpd dA05+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A04 */
+        mulpd %xmm12, %xmm15
+        addpd dA04+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A03 */
+        mulpd %xmm12, %xmm15
+        addpd dA03+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A02 */
+        mulpd %xmm12, %xmm15
+        addpd dA02+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A01 */
+        mulpd %xmm12, %xmm15
+        addpd dA01+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 */
+        mulpd %xmm15, %xmm12
 /*
  * Reconstruction.
  * dP=(R+R*dP) + dPIO2
  */
-        mulpd %xmm0, %xmm15
-        addpd %xmm15, %xmm0
-        addpd %xmm5, %xmm0
-        andps __svml_datan2_data_internal(%rip), %xmm3
+        mulpd %xmm0, %xmm12
+        addpd %xmm12, %xmm0
+/* if x<0, dPI = Pi, else dPI =0 */
+        movups dZERO+__svml_datan2_data_internal(%rip), %xmm3
+        por %xmm13, %xmm14
+        cmplepd %xmm3, %xmm2
+        addpd %xmm6, %xmm0
+        andps __svml_datan2_data_internal(%rip), %xmm2
+        orps %xmm8, %xmm0
+        addpd %xmm2, %xmm0
+        por %xmm14, %xmm4
         orps %xmm7, %xmm0
-        addpd %xmm3, %xmm0
+        movmskps %xmm4, %ecx
 /* Special branch for fast (vector) processing of zero arguments */
-        movups 16(%rsp), %xmm11
-        orps %xmm6, %xmm0
         testb $3, %cl
 /* Go to auxilary branch */
         jne L(AUX_BRANCH)
-        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
+        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
 /* Return from auxilary branch
  * for out of main path inputs
@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):
 /* Go to special inputs processing branch */
         jne L(SPECIAL_VALUES_BRANCH)
-        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
 /* Restore registers
  * and exit the function
@@ -237,8 +264,8 @@ L(EXIT):
  */
 L(SPECIAL_VALUES_BRANCH):
-        movups %xmm8, 32(%rsp)
-        movups %xmm9, 48(%rsp)
+        movups %xmm10, 32(%rsp)
+        movups %xmm11, 48(%rsp)
         movups %xmm0, 64(%rsp)
         # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
  */
 L(AUX_BRANCH):
-/* Check if at least on of Y or Y is zero: iAXAYZERO */
-        movups dZERO+__svml_datan2_data_internal(%rip), %xmm2
 /* Check if both X & Y are not NaNs: iXYnotNAN */
-        movaps %xmm9, %xmm12
-        movaps %xmm8, %xmm10
-        cmpordpd %xmm9, %xmm12
-        cmpordpd %xmm8, %xmm10
-        cmpeqpd %xmm2, %xmm1
-        cmpeqpd %xmm2, %xmm11
-        andps %xmm10, %xmm12
-        orps %xmm11, %xmm1
-        pshufd $221, %xmm1, %xmm1
-        pshufd $221, %xmm12, %xmm11
-/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
-        pand %xmm11, %xmm1
-/* Exclude from previous callout mask zero (and not NaN) arguments */
-        movdqa %xmm1, %xmm13
-        pandn %xmm4, %xmm13
+        movaps %xmm11, %xmm13
+        movaps %xmm10, %xmm12
+        cmpordpd %xmm11, %xmm13
+        cmpordpd %xmm10, %xmm12
+/* Check if at least on of Y or Y is zero: iAXAYZERO */
+        cmpeqpd %xmm3, %xmm9
+        cmpeqpd %xmm3, %xmm1
 /*
  * Path for zero arguments (at least one of both)
  * Check if both args are zeros (den. is zero)
  */
-        movups (%rsp), %xmm4
-        cmpeqpd %xmm2, %xmm4
-/* Go to callout */
-        movmskps %xmm13, %edx
+        cmpeqpd %xmm3, %xmm5
+        andps %xmm12, %xmm13
+        orps %xmm1, %xmm9
+        pshufd $221, %xmm9, %xmm1
+        pshufd $221, %xmm13, %xmm9
+/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
+        pand %xmm9, %xmm1
+/* Exclude from previous callout mask zero (and not NaN) arguments */
+        movdqa %xmm1, %xmm14
+        pandn %xmm4, %xmm14
 /* Set sPIO2 to zero if den. is zero */
-        movaps %xmm4, %xmm15
-        andps %xmm2, %xmm4
-        andnps %xmm5, %xmm15
-        andl $3, %edx
-        orps %xmm4, %xmm15
-        pshufd $221, %xmm9, %xmm5
-        orps %xmm7, %xmm15
+        movaps %xmm5, %xmm4
+        andnps %xmm6, %xmm4
+        andps %xmm3, %xmm5
 /* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
-        pshufd $221, %xmm2, %xmm7
-        pcmpgtd %xmm5, %xmm7
-        pshufd $80, %xmm7, %xmm14
-        andps %xmm3, %xmm14
-        addpd %xmm14, %xmm15
+        pshufd $221, %xmm3, %xmm3
+        orps %xmm5, %xmm4
+        pshufd $221, %xmm11, %xmm5
+        orps %xmm8, %xmm4
+        pcmpgtd %xmm5, %xmm3
+        pshufd $80, %xmm3, %xmm6
+        andps %xmm2, %xmm6
+        addpd %xmm6, %xmm4
+/* Go to callout */
+        movmskps %xmm14, %edx
 /* Merge results from main and spec path */
-        pshufd $80, %xmm1, %xmm3
-        orps %xmm6, %xmm15
-        movdqa %xmm3, %xmm6
-        andps %xmm3, %xmm15
-        andnps %xmm0, %xmm6
-        movaps %xmm6, %xmm0
-        orps %xmm15, %xmm0
+        pshufd $80, %xmm1, %xmm2
+        orps %xmm7, %xmm4
+        movdqa %xmm2, %xmm7
+        andps %xmm2, %xmm4
+        andnps %xmm0, %xmm7
+        andl $3, %edx
+        movaps %xmm7, %xmm0
+        orps %xmm4, %xmm0
 /* Return to main vector processing path */
         jmp L(AUX_BRANCH_RETURN)
-        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
 END(_ZGVbN2vv_atan2_sse4)
         .section .rodata, "a"
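
The unchanged comments above describe the reduction that both the old and
the new kernel follow: pick a ratio whose magnitude stays within [0, 1],
add pi/2 when the arguments were swapped, fold in pi when x is negative,
and reapply the sign of y.  The new code evaluates the polynomial as a
single Horner chain in R2 instead of the previous split evaluation.  A
rough, non-authoritative scalar outline of that main path is sketched
below; it ignores the kernel's zero-argument and special-value branches,
and libm's atan stands in for the degree-19 polynomial r + r*P(r^2).

#include <math.h>

/* Scalar outline of the vector kernel's main path (a sketch under the
   assumptions above, not the real implementation).  Zeros, infinities
   and NaNs are not handled here; the kernel has dedicated branches.  */
static double
atan2_outline (double y, double x)
{
  double ay = fabs (y), ax = fabs (x);
  double a, b, pio2;

  /* 1) If |y| <  |x|: a =  |y|, b = |x|, PIO2 = 0
     2) If |y| >= |x|: a = -|x|, b = |y|, PIO2 = pi/2  */
  if (ay < ax)
    {
      a = ay;
      b = ax;
      pio2 = 0.0;
    }
  else
    {
      a = -ax;
      b = ay;
      pio2 = M_PI_2;
    }

  /* The kernel computes r + r*P(r^2); atan stands in for that here.  */
  double res = atan (a / b) + pio2;

  /* Angles with x < 0 lie in the second or third quadrant.  */
  if (x < 0.0)
    res = M_PI - res;

  /* Reapply the sign of y.  */
  return copysign (res, y);
}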