x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]

This patch fixes the accuracy of the SSE4.2 libmvec atan2 function for the
following inputs, reducing the error to less than 4 ulps.

{0x1.bcab29da0e947p-54,0x1.bc41f4d2294b8p-54}   4.19888 ulps
{0x1.b836ed678be29p-588,0x1.b7be6f5a03a8cp-588} 4.09889 ulps

This fixes BZ #28765.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Author: Sunil K Pandey
Date:   2022-01-12 11:02:19 -08:00
Parent: fcfc908681
Commit: 49e2bf58d5

1 changed file with 172 additions and 147 deletions
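
The ulp figures quoted above can be sanity-checked with a small standalone
program.  The sketch below is only an illustration under assumptions, not the
MPFR-based harness the glibc testsuite uses: it calls the exported libmvec
symbol _ZGVbN2vv_atan2 (which should ifunc-resolve to the SSE4 code in this
file on SSE4-capable CPUs) and uses atan2l as the reference, so the reported
ulp values may differ slightly from the ones listed.  It assumes glibc 2.35 or
later and linking with -lmvec -lm.

/* test-atan2-ulp.c: rough standalone ulp check for the inputs above.
   Assumptions: glibc 2.35+ with libmvec, built with
   "gcc test-atan2-ulp.c -lmvec -lm"; atan2l stands in for the
   MPFR-based reference of the real testsuite.  */
#include <immintrin.h>
#include <math.h>
#include <stdio.h>

/* Exported 128-bit (xmm) vector variant; the ifunc should pick the
   SSE4 implementation from this file on SSE4-capable CPUs.  */
extern __m128d _ZGVbN2vv_atan2 (__m128d y, __m128d x);

/* Approximate error of GOT in ulps, measured at the reference WANT.  */
static double
ulp_error (double got, long double want)
{
  double w = (double) want;
  double one_ulp = nextafter (w, INFINITY) - w;
  return (double) (fabsl (want - (long double) got) / one_ulp);
}

int
main (void)
{
  double y[2] = { 0x1.bcab29da0e947p-54, 0x1.b836ed678be29p-588 };
  double x[2] = { 0x1.bc41f4d2294b8p-54, 0x1.b7be6f5a03a8cp-588 };
  double r[2];

  _mm_storeu_pd (r, _ZGVbN2vv_atan2 (_mm_loadu_pd (y), _mm_loadu_pd (x)));

  for (int i = 0; i < 2; i++)
    printf ("atan2 (%a, %a) = %a, %.5f ulps\n",
            y[i], x[i], r[i], ulp_error (r[i], atan2l (y[i], x[i])));
  return 0;
}

With this fix applied, both inputs should report errors below 4 ulps.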


@@ -65,7 +65,7 @@
ENTRY(_ZGVbN2vv_atan2_sse4)
subq $88, %rsp
cfi_def_cfa_offset(96)
movaps %xmm0, %xmm8
movaps %xmm1, %xmm11
/*
* #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
* Cannot be replaced by VQRCP(D, dR0, dB);
* Argument Absolute values
*/
movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
movups dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
movaps %xmm0, %xmm10
movaps %xmm1, %xmm9
movaps %xmm4, %xmm1
andps %xmm8, %xmm4
andps %xmm9, %xmm1
movaps %xmm4, %xmm2
cmpnltpd %xmm1, %xmm2
andps %xmm10, %xmm1
andps %xmm11, %xmm9
movaps %xmm1, %xmm4
cmpnltpd %xmm9, %xmm4
/* Argument signs */
movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
movaps %xmm2, %xmm0
movups dPIO2+__svml_datan2_data_internal(%rip), %xmm5
movaps %xmm3, %xmm7
movaps %xmm3, %xmm6
movups dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
movaps %xmm4, %xmm0
movaps %xmm5, %xmm8
movaps %xmm5, %xmm7
/*
* 1) If y<x then a= y, b=x, PIO2=0
* 2) If y>x then a=-x, b=y, PIO2=Pi/2
*/
orps %xmm1, %xmm3
movaps %xmm2, %xmm10
andps %xmm2, %xmm5
andnps %xmm4, %xmm0
andps %xmm2, %xmm3
andnps %xmm1, %xmm10
andps %xmm4, %xmm2
orps %xmm3, %xmm0
orps %xmm2, %xmm10
divpd %xmm10, %xmm0
movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11
/* if x<0, dPI = Pi, else dPI =0 */
movaps %xmm9, %xmm3
orps %xmm9, %xmm5
andnps %xmm1, %xmm0
andps %xmm4, %xmm5
andps %xmm11, %xmm8
movups dPIO2+__svml_datan2_data_internal(%rip), %xmm6
orps %xmm5, %xmm0
movaps %xmm4, %xmm5
andps %xmm4, %xmm6
andnps %xmm9, %xmm5
andps %xmm1, %xmm4
orps %xmm4, %xmm5
andps %xmm10, %xmm7
divpd %xmm5, %xmm0
movq iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
xorl %edx, %edx
/* Check if y and x are on main path. */
pshufd $221, %xmm1, %xmm12
andps %xmm9, %xmm7
psubd %xmm11, %xmm12
andps %xmm8, %xmm6
movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
xorl %edx, %edx
movups %xmm4, 16(%rsp)
pshufd $221, %xmm9, %xmm3
xorl %eax, %eax
pshufd $221, %xmm4, %xmm14
movdqa %xmm12, %xmm4
pcmpgtd %xmm13, %xmm4
pcmpeqd %xmm13, %xmm12
por %xmm12, %xmm4
pshufd $221, %xmm1, %xmm13
psubd %xmm2, %xmm3
psubd %xmm2, %xmm13
movdqa %xmm3, %xmm4
movq iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
movdqa %xmm13, %xmm14
pcmpgtd %xmm12, %xmm4
pcmpeqd %xmm12, %xmm3
pcmpgtd %xmm12, %xmm14
pcmpeqd %xmm12, %xmm13
/* Polynomial. */
movaps %xmm0, %xmm12
por %xmm3, %xmm4
mulpd %xmm0, %xmm12
cmplepd dZERO+__svml_datan2_data_internal(%rip), %xmm3
psubd %xmm11, %xmm14
movdqa %xmm14, %xmm15
pcmpeqd %xmm13, %xmm14
pcmpgtd %xmm13, %xmm15
por %xmm14, %xmm15
movaps %xmm12, %xmm14
mulpd %xmm12, %xmm14
por %xmm15, %xmm4
movaps %xmm14, %xmm15
mulpd %xmm14, %xmm15
movmskps %xmm4, %ecx
movups %xmm10, (%rsp)
movups dA19+__svml_datan2_data_internal(%rip), %xmm10
mulpd %xmm15, %xmm10
movups dA18+__svml_datan2_data_internal(%rip), %xmm13
movups dA17+__svml_datan2_data_internal(%rip), %xmm11
addpd dA15+__svml_datan2_data_internal(%rip), %xmm10
mulpd %xmm15, %xmm13
mulpd %xmm15, %xmm11
mulpd %xmm15, %xmm10
addpd dA14+__svml_datan2_data_internal(%rip), %xmm13
addpd dA13+__svml_datan2_data_internal(%rip), %xmm11
addpd dA11+__svml_datan2_data_internal(%rip), %xmm10
mulpd %xmm15, %xmm13
mulpd %xmm15, %xmm11
mulpd %xmm15, %xmm10
addpd dA10+__svml_datan2_data_internal(%rip), %xmm13
addpd dA09+__svml_datan2_data_internal(%rip), %xmm11
addpd dA07+__svml_datan2_data_internal(%rip), %xmm10
mulpd %xmm15, %xmm13
mulpd %xmm15, %xmm11
mulpd %xmm15, %xmm10
addpd dA06+__svml_datan2_data_internal(%rip), %xmm13
addpd dA05+__svml_datan2_data_internal(%rip), %xmm11
addpd dA03+__svml_datan2_data_internal(%rip), %xmm10
mulpd %xmm15, %xmm13
mulpd %xmm15, %xmm11
mulpd %xmm12, %xmm10
addpd dA02+__svml_datan2_data_internal(%rip), %xmm13
addpd dA01+__svml_datan2_data_internal(%rip), %xmm11
addpd %xmm10, %xmm13
mulpd %xmm11, %xmm12
mulpd %xmm13, %xmm14
movups dA16+__svml_datan2_data_internal(%rip), %xmm2
mulpd %xmm15, %xmm2
addpd dA12+__svml_datan2_data_internal(%rip), %xmm2
mulpd %xmm15, %xmm2
addpd dA08+__svml_datan2_data_internal(%rip), %xmm2
mulpd %xmm15, %xmm2
addpd dA04+__svml_datan2_data_internal(%rip), %xmm2
/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
mulpd %xmm2, %xmm15
addpd %xmm12, %xmm15
addpd %xmm14, %xmm15
/* P = A19*R2 + A18 */
movups dA19+__svml_datan2_data_internal(%rip), %xmm15
movaps %xmm11, %xmm2
mulpd %xmm12, %xmm15
addpd dA18+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A17 */
mulpd %xmm12, %xmm15
addpd dA17+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A16 */
mulpd %xmm12, %xmm15
addpd dA16+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A15 */
mulpd %xmm12, %xmm15
addpd dA15+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A14 */
mulpd %xmm12, %xmm15
addpd dA14+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A13 */
mulpd %xmm12, %xmm15
addpd dA13+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A12 */
mulpd %xmm12, %xmm15
addpd dA12+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A11 */
mulpd %xmm12, %xmm15
addpd dA11+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A10 */
mulpd %xmm12, %xmm15
addpd dA10+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A09 */
mulpd %xmm12, %xmm15
addpd dA09+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A08 */
mulpd %xmm12, %xmm15
addpd dA08+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A07 */
mulpd %xmm12, %xmm15
addpd dA07+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A06 */
mulpd %xmm12, %xmm15
addpd dA06+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A05 */
mulpd %xmm12, %xmm15
addpd dA05+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A04 */
mulpd %xmm12, %xmm15
addpd dA04+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A03 */
mulpd %xmm12, %xmm15
addpd dA03+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A02 */
mulpd %xmm12, %xmm15
addpd dA02+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 + A01 */
mulpd %xmm12, %xmm15
addpd dA01+__svml_datan2_data_internal(%rip), %xmm15
/* P = P*R2 */
mulpd %xmm15, %xmm12
/*
* Reconstruction.
* dP=(R+R*dP) + dPIO2
*/
mulpd %xmm0, %xmm15
addpd %xmm15, %xmm0
addpd %xmm5, %xmm0
andps __svml_datan2_data_internal(%rip), %xmm3
mulpd %xmm0, %xmm12
addpd %xmm12, %xmm0
/* if x<0, dPI = Pi, else dPI =0 */
movups dZERO+__svml_datan2_data_internal(%rip), %xmm3
por %xmm13, %xmm14
cmplepd %xmm3, %xmm2
addpd %xmm6, %xmm0
andps __svml_datan2_data_internal(%rip), %xmm2
orps %xmm8, %xmm0
addpd %xmm2, %xmm0
por %xmm14, %xmm4
orps %xmm7, %xmm0
addpd %xmm3, %xmm0
movmskps %xmm4, %ecx
/* Special branch for fast (vector) processing of zero arguments */
movups 16(%rsp), %xmm11
orps %xmm6, %xmm0
testb $3, %cl
/* Go to auxilary branch */
jne L(AUX_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11
/* Return from auxilary branch
* for out of main path inputs
@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):
/* Go to special inputs processing branch */
jne L(SPECIAL_VALUES_BRANCH)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
/* Restore registers
* and exit the function
@@ -237,8 +264,8 @@ L(EXIT):
*/
L(SPECIAL_VALUES_BRANCH):
movups %xmm8, 32(%rsp)
movups %xmm9, 48(%rsp)
movups %xmm10, 32(%rsp)
movups %xmm11, 48(%rsp)
movups %xmm0, 64(%rsp)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
*/
L(AUX_BRANCH):
/* Check if at least on of Y or Y is zero: iAXAYZERO */
movups dZERO+__svml_datan2_data_internal(%rip), %xmm2
/* Check if both X & Y are not NaNs: iXYnotNAN */
movaps %xmm9, %xmm12
movaps %xmm8, %xmm10
cmpordpd %xmm9, %xmm12
cmpordpd %xmm8, %xmm10
cmpeqpd %xmm2, %xmm1
cmpeqpd %xmm2, %xmm11
andps %xmm10, %xmm12
orps %xmm11, %xmm1
pshufd $221, %xmm1, %xmm1
pshufd $221, %xmm12, %xmm11
movaps %xmm11, %xmm13
movaps %xmm10, %xmm12
cmpordpd %xmm11, %xmm13
cmpordpd %xmm10, %xmm12
/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
pand %xmm11, %xmm1
/* Exclude from previous callout mask zero (and not NaN) arguments */
movdqa %xmm1, %xmm13
pandn %xmm4, %xmm13
/* Check if at least on of Y or Y is zero: iAXAYZERO */
cmpeqpd %xmm3, %xmm9
cmpeqpd %xmm3, %xmm1
/*
* Path for zero arguments (at least one of both)
* Check if both args are zeros (den. is zero)
*/
movups (%rsp), %xmm4
cmpeqpd %xmm2, %xmm4
cmpeqpd %xmm3, %xmm5
andps %xmm12, %xmm13
orps %xmm1, %xmm9
pshufd $221, %xmm9, %xmm1
pshufd $221, %xmm13, %xmm9
/* Go to callout */
movmskps %xmm13, %edx
/* Check if at least on of Y or Y is zero and not NaN: iAXAYZEROnotNAN */
pand %xmm9, %xmm1
/* Exclude from previous callout mask zero (and not NaN) arguments */
movdqa %xmm1, %xmm14
pandn %xmm4, %xmm14
/* Set sPIO2 to zero if den. is zero */
movaps %xmm4, %xmm15
andps %xmm2, %xmm4
andnps %xmm5, %xmm15
andl $3, %edx
orps %xmm4, %xmm15
pshufd $221, %xmm9, %xmm5
orps %xmm7, %xmm15
movaps %xmm5, %xmm4
andnps %xmm6, %xmm4
andps %xmm3, %xmm5
/* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
pshufd $221, %xmm2, %xmm7
pcmpgtd %xmm5, %xmm7
pshufd $80, %xmm7, %xmm14
andps %xmm3, %xmm14
addpd %xmm14, %xmm15
pshufd $221, %xmm3, %xmm3
orps %xmm5, %xmm4
pshufd $221, %xmm11, %xmm5
orps %xmm8, %xmm4
pcmpgtd %xmm5, %xmm3
pshufd $80, %xmm3, %xmm6
andps %xmm2, %xmm6
addpd %xmm6, %xmm4
/* Go to callout */
movmskps %xmm14, %edx
/* Merge results from main and spec path */
pshufd $80, %xmm1, %xmm3
orps %xmm6, %xmm15
movdqa %xmm3, %xmm6
andps %xmm3, %xmm15
andnps %xmm0, %xmm6
movaps %xmm6, %xmm0
orps %xmm15, %xmm0
pshufd $80, %xmm1, %xmm2
orps %xmm7, %xmm4
movdqa %xmm2, %xmm7
andps %xmm2, %xmm4
andnps %xmm0, %xmm7
andl $3, %edx
movaps %xmm7, %xmm0
orps %xmm4, %xmm0
/* Return to main vector processing path */
jmp L(AUX_BRANCH_RETURN)
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
# LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
END(_ZGVbN2vv_atan2_sse4)
.section .rodata, "a"
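
For readers who would rather not trace the xmm register shuffling, the scalar
sketch below outlines the main-path algorithm that the in-code comments
describe: the |y|/|x| swap with its PIO2 correction, the straight Horner
evaluation of the A19..A01 polynomial in R2 ("P = A19*R2 + A18",
"P = P*R2 + A17", and so on), and the reconstruction "dP = (R + R*dP) + dPIO2"
together with the "if x<0, dPI = Pi, else dPI = 0" adjustment.  It is a sketch
under assumptions: the parameter A is a hypothetical stand-in for the
dA01..dA19 table in __svml_datan2_data_internal (values not reproduced here),
and the branches only show the mathematical effect of what the vector code
does branch-free with sign masks.  The substance of the diff is that the
polynomial is now evaluated with this single Horner chain in R2, where the
previous code combined several partial polynomials in higher powers of R2,
which appears to be what brings the worst-case error for the inputs above
under 4 ulps.

#include <math.h>

/* Straight Horner evaluation in R2 = R*R, as in the comment sequence
   "P = A19*R2 + A18", "P = P*R2 + A17", ..., "P = P*R2".  A[1..19]
   stands for dA01..dA19 from __svml_datan2_data_internal; A[0] is
   unused because the coefficient 1.0 of R is supplied by the final
   R + R*P step.  */
double
atan_poly (double r, const double A[20])
{
  double r2 = r * r;
  double p = A[19];
  for (int i = 18; i >= 1; i--)
    p = p * r2 + A[i];
  p = p * r2;                   /* P = P*R2 */
  return r + r * p;             /* dP = R + R*dP */
}

/* Main path only: finite, nonzero y and x whose exponents pass the
   iCHK_WORK_SUB/iCHK_WORK_CMP screening.  */
double
atan2_main_path (double y, double x, const double A[20])
{
  double ay = fabs (y), ax = fabs (x);
  double a, b, pio2;

  /* 1) If |y| <  |x| then a =  |y|, b = |x|, PIO2 = 0
     2) otherwise         a = -|x|, b = |y|, PIO2 = Pi/2  */
  if (ay < ax)
    {
      a = ay;
      b = ax;
      pio2 = 0.0;
    }
  else
    {
      a = -ax;
      b = ay;
      pio2 = M_PI_2;
    }

  /* reduced == atan (|y| / |x|), in [0, Pi/2].  */
  double reduced = atan_poly (a / b, A) + pio2;

  /* Place the result in the right quadrant ("if x<0, dPI = Pi") and
     restore the sign of y; the assembly does both with mask operations
     instead of branches.  */
  double res = x < 0.0 ? M_PI - reduced : reduced;
  return copysign (res, y);
}

Zero arguments are handled separately in L(AUX_BRANCH), and the remaining
special inputs (NaN and the exponents flagged by the iCHK_WORK comparison)
fall back to the scalar calls in L(SCALAR_MATH_CALL); neither path is modeled
above.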