x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]

This patch fixes SSE4.2 libmvec atan2 function accuracy for the following
inputs, bringing the error down to less than 4 ulps (the figures listed are
the errors before the fix; a rough reproduction sketch follows the commit
metadata below).

{0x1.bcab29da0e947p-54,0x1.bc41f4d2294b8p-54}   4.19888 ulps
{0x1.b836ed678be29p-588,0x1.b7be6f5a03a8cp-588} 4.09889 ulps

This fixes BZ #28765.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Author: Sunil K Pandey
Date:   2022-01-12 11:02:19 -08:00
Commit: 49e2bf58d5
Parent: fcfc908681
1 changed file with 172 additions and 147 deletions
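
For reference, the ulp figures above can be reproduced by comparing the
128-bit libmvec entry point against a higher-precision scalar reference.
The program below is only an illustrative sketch: it assumes a glibc whose
libmvec exports _ZGVbN2vv_atan2, that the listed pairs are (y, x) arguments,
that the ifunc dispatch on the test machine selects the SSE4 kernel, and
that x86_64 long double gives enough headroom for a rough ulp estimate
(glibc's own testing uses the libm-test-ulps machinery instead).  Build with
something like "gcc test-atan2-ulp.c -lmvec -lm".

#include <immintrin.h>
#include <math.h>
#include <stdio.h>

/* 128-bit vector atan2 from libmvec (assumed to be exported).  */
extern __m128d _ZGVbN2vv_atan2 (__m128d y, __m128d x);

/* Rough ulp estimate: |got - ref| divided by the spacing of doubles
   around ref.  Assumes a finite, normal reference value.  */
static long double
ulp_error (double got, long double ref)
{
  double r = fabs ((double) ref);
  double ulp = nextafter (r, INFINITY) - r;
  return fabsl ((long double) got - ref) / (long double) ulp;
}

int
main (void)
{
  /* Input pairs from the commit message, assumed to be (y, x).  */
  double y[2] = { 0x1.bcab29da0e947p-54, 0x1.b836ed678be29p-588 };
  double x[2] = { 0x1.bc41f4d2294b8p-54, 0x1.b7be6f5a03a8cp-588 };
  double got[2];

  _mm_storeu_pd (got, _ZGVbN2vv_atan2 (_mm_loadu_pd (y), _mm_loadu_pd (x)));

  for (int i = 0; i < 2; i++)
    {
      long double ref = atan2l ((long double) y[i], (long double) x[i]);
      printf ("atan2 (%a, %a) = %a, about %.5Lf ulps off\n",
              y[i], x[i], got[i], ulp_error (got[i], ref));
    }
  return 0;
}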


@@ -65,7 +65,7 @@
 ENTRY(_ZGVbN2vv_atan2_sse4)
         subq $88, %rsp
         cfi_def_cfa_offset(96)
-        movaps %xmm0, %xmm8
+        movaps %xmm1, %xmm11
 /*
  * #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
  * Cannot be replaced by VQRCP(D, dR0, dB);
  * Argument Absolute values
  */
-        movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
+        movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
+        movaps %xmm0, %xmm10
         movaps %xmm1, %xmm9
-        movaps %xmm4, %xmm1
-        andps %xmm8, %xmm4
-        andps %xmm9, %xmm1
-        movaps %xmm4, %xmm2
-        cmpnltpd %xmm1, %xmm2
+        andps %xmm10, %xmm1
+        andps %xmm11, %xmm9
+        movaps %xmm1, %xmm4
+        cmpnltpd %xmm9, %xmm4
 /* Argument signs */
-        movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
-        movaps %xmm2, %xmm0
-        movups dPIO2+__svml_datan2_data_internal(%rip), %xmm5
-        movaps %xmm3, %xmm7
-        movaps %xmm3, %xmm6
+        movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
+        movaps %xmm4, %xmm0
+        movaps %xmm5, %xmm8
+        movaps %xmm5, %xmm7
 /*
  * 1) If y<x then a= y, b=x, PIO2=0
  * 2) If y>x then a=-x, b=y, PIO2=Pi/2
  */
-        orps %xmm1, %xmm3
-        movaps %xmm2, %xmm10
-        andps %xmm2, %xmm5
-        andnps %xmm4, %xmm0
-        andps %xmm2, %xmm3
-        andnps %xmm1, %xmm10
-        andps %xmm4, %xmm2
-        orps %xmm3, %xmm0
-        orps %xmm2, %xmm10
-        divpd %xmm10, %xmm0
-        movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
-/* if x<0, dPI = Pi, else dPI =0 */
-        movaps %xmm9, %xmm3
+        orps %xmm9, %xmm5
+        andnps %xmm1, %xmm0
+        andps %xmm4, %xmm5
+        andps %xmm11, %xmm8
+        movups dPIO2+__svml_datan2_data_internal(%rip), %xmm6
+        orps %xmm5, %xmm0
+        movaps %xmm4, %xmm5
+        andps %xmm4, %xmm6
+        andnps %xmm9, %xmm5
+        andps %xmm1, %xmm4
+        orps %xmm4, %xmm5
+        andps %xmm10, %xmm7
+        divpd %xmm5, %xmm0
+        movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
+        xorl %edx, %edx
 /* Check if y and x are on main path. */
-        pshufd $221, %xmm1, %xmm12
-        andps %xmm9, %xmm7
-        psubd %xmm11, %xmm12
-        andps %xmm8, %xmm6
-        movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
-        xorl %edx, %edx
-        movups %xmm4, 16(%rsp)
+        pshufd $221, %xmm9, %xmm3
         xorl %eax, %eax
-        pshufd $221, %xmm4, %xmm14
-        movdqa %xmm12, %xmm4
-        pcmpgtd %xmm13, %xmm4
-        pcmpeqd %xmm13, %xmm12
-        por %xmm12, %xmm4
+        pshufd $221, %xmm1, %xmm13
+        psubd %xmm2, %xmm3
+        psubd %xmm2, %xmm13
+        movdqa %xmm3, %xmm4
+        movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
+        movdqa %xmm13, %xmm14
+        pcmpgtd %xmm12, %xmm4
+        pcmpeqd %xmm12, %xmm3
+        pcmpgtd %xmm12, %xmm14
+        pcmpeqd %xmm12, %xmm13
 /* Polynomial. */
         movaps %xmm0, %xmm12
+        por %xmm3, %xmm4
         mulpd %xmm0, %xmm12
-        cmplepd dZERO+__svml_datan2_data_internal(%rip), %xmm3
-        psubd %xmm11, %xmm14
-        movdqa %xmm14, %xmm15
-        pcmpeqd %xmm13, %xmm14
-        pcmpgtd %xmm13, %xmm15
-        por %xmm14, %xmm15
-        movaps %xmm12, %xmm14
-        mulpd %xmm12, %xmm14
-        por %xmm15, %xmm4
-        movaps %xmm14, %xmm15
-        mulpd %xmm14, %xmm15
-        movmskps %xmm4, %ecx
-        movups %xmm10, (%rsp)
-        movups dA19+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd %xmm15, %xmm10
-        movups dA18+__svml_datan2_data_internal(%rip), %xmm13
-        movups dA17+__svml_datan2_data_internal(%rip), %xmm11
-        addpd dA15+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd %xmm15, %xmm13
-        mulpd %xmm15, %xmm11
-        mulpd %xmm15, %xmm10
-        addpd dA14+__svml_datan2_data_internal(%rip), %xmm13
-        addpd dA13+__svml_datan2_data_internal(%rip), %xmm11
-        addpd dA11+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd %xmm15, %xmm13
-        mulpd %xmm15, %xmm11
-        mulpd %xmm15, %xmm10
-        addpd dA10+__svml_datan2_data_internal(%rip), %xmm13
-        addpd dA09+__svml_datan2_data_internal(%rip), %xmm11
-        addpd dA07+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd %xmm15, %xmm13
-        mulpd %xmm15, %xmm11
-        mulpd %xmm15, %xmm10
-        addpd dA06+__svml_datan2_data_internal(%rip), %xmm13
-        addpd dA05+__svml_datan2_data_internal(%rip), %xmm11
-        addpd dA03+__svml_datan2_data_internal(%rip), %xmm10
-        mulpd %xmm15, %xmm13
-        mulpd %xmm15, %xmm11
-        mulpd %xmm12, %xmm10
-        addpd dA02+__svml_datan2_data_internal(%rip), %xmm13
-        addpd dA01+__svml_datan2_data_internal(%rip), %xmm11
-        addpd %xmm10, %xmm13
-        mulpd %xmm11, %xmm12
-        mulpd %xmm13, %xmm14
-        movups dA16+__svml_datan2_data_internal(%rip), %xmm2
-        mulpd %xmm15, %xmm2
-        addpd dA12+__svml_datan2_data_internal(%rip), %xmm2
-        mulpd %xmm15, %xmm2
-        addpd dA08+__svml_datan2_data_internal(%rip), %xmm2
-        mulpd %xmm15, %xmm2
-        addpd dA04+__svml_datan2_data_internal(%rip), %xmm2
-/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
-        mulpd %xmm2, %xmm15
-        addpd %xmm12, %xmm15
-        addpd %xmm14, %xmm15
+/* P = A19*R2 + A18 */
+        movups dA19+__svml_datan2_data_internal(%rip), %xmm15
+        movaps %xmm11, %xmm2
+        mulpd %xmm12, %xmm15
+        addpd dA18+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A17 */
+        mulpd %xmm12, %xmm15
+        addpd dA17+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A16 */
+        mulpd %xmm12, %xmm15
+        addpd dA16+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A15 */
+        mulpd %xmm12, %xmm15
+        addpd dA15+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A14 */
+        mulpd %xmm12, %xmm15
+        addpd dA14+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A13 */
+        mulpd %xmm12, %xmm15
+        addpd dA13+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A12 */
+        mulpd %xmm12, %xmm15
+        addpd dA12+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A11 */
+        mulpd %xmm12, %xmm15
+        addpd dA11+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A10 */
+        mulpd %xmm12, %xmm15
+        addpd dA10+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A09 */
+        mulpd %xmm12, %xmm15
+        addpd dA09+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A08 */
+        mulpd %xmm12, %xmm15
+        addpd dA08+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A07 */
+        mulpd %xmm12, %xmm15
+        addpd dA07+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A06 */
+        mulpd %xmm12, %xmm15
+        addpd dA06+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A05 */
+        mulpd %xmm12, %xmm15
+        addpd dA05+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A04 */
+        mulpd %xmm12, %xmm15
+        addpd dA04+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A03 */
+        mulpd %xmm12, %xmm15
+        addpd dA03+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A02 */
+        mulpd %xmm12, %xmm15
+        addpd dA02+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 + A01 */
+        mulpd %xmm12, %xmm15
+        addpd dA01+__svml_datan2_data_internal(%rip), %xmm15
+/* P = P*R2 */
+        mulpd %xmm15, %xmm12
 /*
  * Reconstruction.
  * dP=(R+R*dP) + dPIO2
  */
-        mulpd %xmm0, %xmm15
-        addpd %xmm15, %xmm0
-        addpd %xmm5, %xmm0
-        andps __svml_datan2_data_internal(%rip), %xmm3
+        mulpd %xmm0, %xmm12
+        addpd %xmm12, %xmm0
+/* if x<0, dPI = Pi, else dPI =0 */
+        movups dZERO+__svml_datan2_data_internal(%rip), %xmm3
+        por %xmm13, %xmm14
+        cmplepd %xmm3, %xmm2
+        addpd %xmm6, %xmm0
+        andps __svml_datan2_data_internal(%rip), %xmm2
+        orps %xmm8, %xmm0
+        addpd %xmm2, %xmm0
+        por %xmm14, %xmm4
         orps %xmm7, %xmm0
-        addpd %xmm3, %xmm0
+        movmskps %xmm4, %ecx
 /* Special branch for fast (vector) processing of zero arguments */
-        movups 16(%rsp), %xmm11
-        orps %xmm6, %xmm0
         testb $3, %cl
 /* Go to auxilary branch */
         jne L(AUX_BRANCH)
-        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
+        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
 /* Return from auxilary branch
  * for out of main path inputs
@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):
 /* Go to special inputs processing branch */
         jne L(SPECIAL_VALUES_BRANCH)
-        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
 /* Restore registers
  * and exit the function
@@ -237,8 +264,8 @@ L(EXIT):
  */
 L(SPECIAL_VALUES_BRANCH):
-        movups %xmm8, 32(%rsp)
-        movups %xmm9, 48(%rsp)
+        movups %xmm10, 32(%rsp)
+        movups %xmm11, 48(%rsp)
         movups %xmm0, 64(%rsp)
         # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
  */
 L(AUX_BRANCH):
-/* Check if at least on of Y or Y is zero: iAXAYZERO */
-        movups dZERO+__svml_datan2_data_internal(%rip), %xmm2
 /* Check if both X & Y are not NaNs: iXYnotNAN */
-        movaps %xmm9, %xmm12
-        movaps %xmm8, %xmm10
-        cmpordpd %xmm9, %xmm12
-        cmpordpd %xmm8, %xmm10
-        cmpeqpd %xmm2, %xmm1
-        cmpeqpd %xmm2, %xmm11
-        andps %xmm10, %xmm12
-        orps %xmm11, %xmm1
-        pshufd $221, %xmm1, %xmm1
-        pshufd $221, %xmm12, %xmm11
-/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
-        pand %xmm11, %xmm1
-/* Exclude from previous callout mask zero (and not NaN) arguments */
-        movdqa %xmm1, %xmm13
-        pandn %xmm4, %xmm13
+        movaps %xmm11, %xmm13
+        movaps %xmm10, %xmm12
+        cmpordpd %xmm11, %xmm13
+        cmpordpd %xmm10, %xmm12
+/* Check if at least on of Y or Y is zero: iAXAYZERO */
+        cmpeqpd %xmm3, %xmm9
+        cmpeqpd %xmm3, %xmm1
 /*
  * Path for zero arguments (at least one of both)
  * Check if both args are zeros (den. is zero)
  */
-        movups (%rsp), %xmm4
-        cmpeqpd %xmm2, %xmm4
-/* Go to callout */
-        movmskps %xmm13, %edx
+        cmpeqpd %xmm3, %xmm5
+        andps %xmm12, %xmm13
+        orps %xmm1, %xmm9
+        pshufd $221, %xmm9, %xmm1
+        pshufd $221, %xmm13, %xmm9
+/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
+        pand %xmm9, %xmm1
+/* Exclude from previous callout mask zero (and not NaN) arguments */
+        movdqa %xmm1, %xmm14
+        pandn %xmm4, %xmm14
 /* Set sPIO2 to zero if den. is zero */
-        movaps %xmm4, %xmm15
-        andps %xmm2, %xmm4
-        andnps %xmm5, %xmm15
-        andl $3, %edx
-        orps %xmm4, %xmm15
-        pshufd $221, %xmm9, %xmm5
-        orps %xmm7, %xmm15
+        movaps %xmm5, %xmm4
+        andnps %xmm6, %xmm4
+        andps %xmm3, %xmm5
 /* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
-        pshufd $221, %xmm2, %xmm7
-        pcmpgtd %xmm5, %xmm7
-        pshufd $80, %xmm7, %xmm14
-        andps %xmm3, %xmm14
-        addpd %xmm14, %xmm15
+        pshufd $221, %xmm3, %xmm3
+        orps %xmm5, %xmm4
+        pshufd $221, %xmm11, %xmm5
+        orps %xmm8, %xmm4
+        pcmpgtd %xmm5, %xmm3
+        pshufd $80, %xmm3, %xmm6
+        andps %xmm2, %xmm6
+        addpd %xmm6, %xmm4
+/* Go to callout */
+        movmskps %xmm14, %edx
 /* Merge results from main and spec path */
-        pshufd $80, %xmm1, %xmm3
-        orps %xmm6, %xmm15
-        movdqa %xmm3, %xmm6
-        andps %xmm3, %xmm15
-        andnps %xmm0, %xmm6
-        movaps %xmm6, %xmm0
-        orps %xmm15, %xmm0
+        pshufd $80, %xmm1, %xmm2
+        orps %xmm7, %xmm4
+        movdqa %xmm2, %xmm7
+        andps %xmm2, %xmm4
+        andnps %xmm0, %xmm7
+        andl $3, %edx
+        movaps %xmm7, %xmm0
+        orps %xmm4, %xmm0
 /* Return to main vector processing path */
         jmp L(AUX_BRANCH_RETURN)
-        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
+        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
 END(_ZGVbN2vv_atan2_sse4)
         .section .rodata, "a"
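
The unchanged comments above describe the reduction that both the old and
the new kernel follow: pick a ratio whose magnitude stays within [0, 1],
add pi/2 when the arguments were swapped, fold in pi when x is negative,
and reapply the sign of y.  The new code evaluates the polynomial as a
single Horner chain in R2 instead of the previous split evaluation.  A
rough, non-authoritative scalar outline of that main path is sketched
below; it ignores the kernel's zero-argument and special-value branches,
and libm's atan stands in for the degree-19 polynomial r + r*P(r^2).

#include <math.h>

/* Scalar outline of the vector kernel's main path (a sketch under the
   assumptions above, not the real implementation).  Zeros, infinities
   and NaNs are not handled here; the kernel has dedicated branches.  */
static double
atan2_outline (double y, double x)
{
  double ay = fabs (y), ax = fabs (x);
  double a, b, pio2;

  /* 1) If |y| <  |x|: a =  |y|, b = |x|, PIO2 = 0
     2) If |y| >= |x|: a = -|x|, b = |y|, PIO2 = pi/2  */
  if (ay < ax)
    {
      a = ay;
      b = ax;
      pio2 = 0.0;
    }
  else
    {
      a = -ax;
      b = ay;
      pio2 = M_PI_2;
    }

  /* The kernel computes r + r*P(r^2); atan stands in for that here.  */
  double res = atan (a / b) + pio2;

  /* Angles with x < 0 lie in the second or third quadrant.  */
  if (x < 0.0)
    res = M_PI - res;

  /* Reapply the sign of y.  */
  return copysign (res, y);
}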