x86_64: Fix SSE4.2 libmvec atan2 function accuracy [BZ #28765]

This patch fixes the SSE4.2 libmvec atan2 function accuracy for the
following inputs, bringing the error below 4 ulps:

  {0x1.bcab29da0e947p-54, 0x1.bc41f4d2294b8p-54}    4.19888 ulps
  {0x1.b836ed678be29p-588, 0x1.b7be6f5a03a8cp-588}  4.09889 ulps

This fixes BZ #28765.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
commit 49e2bf58d5
parent fcfc908681
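The reported worst-case inputs can be reproduced against a high-precision
reference. A rough harness (a sketch: it calls the public libmvec symbol
_ZGVbN2vv_atan2 directly, which dispatches to the SSE4 kernel on capable
CPUs, and uses atan2l as the reference instead of MPFR):

#include <immintrin.h>
#include <math.h>
#include <stdio.h>

/* Test-only declaration of the libmvec symbol (link with -lmvec);
   normally the compiler emits calls to it when vectorizing atan2.  */
extern __m128d _ZGVbN2vv_atan2 (__m128d y, __m128d x);

int main (void)
{
  double y[2] = { 0x1.bcab29da0e947p-54, 0x1.b836ed678be29p-588 };
  double x[2] = { 0x1.bc41f4d2294b8p-54, 0x1.b7be6f5a03a8cp-588 };
  double got[2];
  _mm_storeu_pd (got, _ZGVbN2vv_atan2 (_mm_loadu_pd (y), _mm_loadu_pd (x)));
  for (int i = 0; i < 2; i++)
    {
      long double ref = atan2l ((long double) y[i], (long double) x[i]);
      double r = (double) ref;
      double ulp1 = nextafter (r, INFINITY) - r;   /* 1 ulp at the result */
      printf ("ulp error ~ %.5Lf\n",
              fabsl ((long double) got[i] - ref) / (long double) ulp1);
    }
  return 0;
}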
@@ -65,7 +65,7 @@
ENTRY(_ZGVbN2vv_atan2_sse4)
        subq    $88, %rsp
        cfi_def_cfa_offset(96)
        movaps  %xmm0, %xmm8
        movaps  %xmm1, %xmm11

/*
 * #define NO_VECTOR_ZERO_ATAN2_ARGS
@@ -78,134 +78,161 @@ ENTRY(_ZGVbN2vv_atan2_sse4)
 * Cannot be replaced by VQRCP(D, dR0, dB);
 * Argument Absolute values
 */
        movups  dABS_MASK+__svml_datan2_data_internal(%rip), %xmm4
        movups  dABS_MASK+__svml_datan2_data_internal(%rip), %xmm1
        movaps  %xmm0, %xmm10
        movaps  %xmm1, %xmm9
        movaps  %xmm4, %xmm1
        andps   %xmm8, %xmm4
        andps   %xmm9, %xmm1
        movaps  %xmm4, %xmm2
        cmpnltpd %xmm1, %xmm2
        andps   %xmm10, %xmm1
        andps   %xmm11, %xmm9
        movaps  %xmm1, %xmm4
        cmpnltpd %xmm9, %xmm4

/* Argument signs */
        movups  dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm3
        movaps  %xmm2, %xmm0
        movups  dPIO2+__svml_datan2_data_internal(%rip), %xmm5
        movaps  %xmm3, %xmm7
        movaps  %xmm3, %xmm6
        movups  dSIGN_MASK+__svml_datan2_data_internal(%rip), %xmm5
        movaps  %xmm4, %xmm0
        movaps  %xmm5, %xmm8
        movaps  %xmm5, %xmm7

/*
 * 1) If y<x then a= y, b=x, PIO2=0
 * 2) If y>x then a=-x, b=y, PIO2=Pi/2
 */
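For reference, the case split in this comment corresponds to the following
scalar reduction (a sketch of the idea, not the kernel's exact code; the
kernel works on |x| and |y| and reapplies the signs afterwards):

#include <math.h>

/* Reduce atan2(|y|, |x|) to one atan() call on a ratio of magnitude <= 1. */
static double
atan2_first_quadrant (double ay, double ax)
{
  double a, b, pio2;
  if (ay < ax)            /* 1) a = y,  b = x, PIO2 = 0    */
    { a = ay; b = ax; pio2 = 0.0; }
  else                    /* 2) a = -x, b = y, PIO2 = Pi/2 */
    { a = -ax; b = ay; pio2 = M_PI_2; }
  return atan (a / b) + pio2;   /* the polynomial below plays atan()'s role */
}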
        orps    %xmm1, %xmm3
        movaps  %xmm2, %xmm10
        andps   %xmm2, %xmm5
        andnps  %xmm4, %xmm0
        andps   %xmm2, %xmm3
        andnps  %xmm1, %xmm10
        andps   %xmm4, %xmm2
        orps    %xmm3, %xmm0
        orps    %xmm2, %xmm10
        divpd   %xmm10, %xmm0
        movq    iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm11

/* if x<0, dPI = Pi, else dPI = 0 */
        movaps  %xmm9, %xmm3
        orps    %xmm9, %xmm5
        andnps  %xmm1, %xmm0
        andps   %xmm4, %xmm5
        andps   %xmm11, %xmm8
        movups  dPIO2+__svml_datan2_data_internal(%rip), %xmm6
        orps    %xmm5, %xmm0
        movaps  %xmm4, %xmm5
        andps   %xmm4, %xmm6
        andnps  %xmm9, %xmm5
        andps   %xmm1, %xmm4
        orps    %xmm4, %xmm5
        andps   %xmm10, %xmm7
        divpd   %xmm5, %xmm0
        movq    iCHK_WORK_SUB+__svml_datan2_data_internal(%rip), %xmm2
        xorl    %edx, %edx

/* Check if y and x are on main path. */
        pshufd  $221, %xmm1, %xmm12
        andps   %xmm9, %xmm7
        psubd   %xmm11, %xmm12
        andps   %xmm8, %xmm6
        movq    iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm13
        xorl    %edx, %edx
        movups  %xmm4, 16(%rsp)
        pshufd  $221, %xmm9, %xmm3
        xorl    %eax, %eax
        pshufd  $221, %xmm4, %xmm14
        movdqa  %xmm12, %xmm4
        pcmpgtd %xmm13, %xmm4
        pcmpeqd %xmm13, %xmm12
        por     %xmm12, %xmm4
        pshufd  $221, %xmm1, %xmm13
        psubd   %xmm2, %xmm3
        psubd   %xmm2, %xmm13
        movdqa  %xmm3, %xmm4
        movq    iCHK_WORK_CMP+__svml_datan2_data_internal(%rip), %xmm12
        movdqa  %xmm13, %xmm14
        pcmpgtd %xmm12, %xmm4
        pcmpeqd %xmm12, %xmm3
        pcmpgtd %xmm12, %xmm14
        pcmpeqd %xmm12, %xmm13
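The main-path test works on the high 32 bits of each double (pshufd $221
packs them), shifting by iCHK_WORK_SUB and comparing against iCHK_WORK_CMP
so one signed compare catches exponents that are too small or too large.
As scalar C (a sketch; SUB and CMP stand in for the table constants in
__svml_datan2_data_internal):

#include <stdint.h>
#include <string.h>

/* Nonzero when the input must leave the main path (zero, denormal,
   very large, Inf or NaN operands).  pcmpgtd/pcmpeqd together give a
   signed >=, matching the int32_t casts here.  */
static int
needs_special_path (double v, uint32_t sub, uint32_t cmp)
{
  uint64_t bits;
  memcpy (&bits, &v, sizeof bits);          /* type-pun safely */
  uint32_t hi = (uint32_t) (bits >> 32);    /* sign+exponent+top mantissa */
  return (int32_t) (hi - sub) >= (int32_t) cmp;
}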

/* Polynomial. */
        movaps  %xmm0, %xmm12
        por     %xmm3, %xmm4
        mulpd   %xmm0, %xmm12
        cmplepd dZERO+__svml_datan2_data_internal(%rip), %xmm3
        psubd   %xmm11, %xmm14
        movdqa  %xmm14, %xmm15
        pcmpeqd %xmm13, %xmm14
        pcmpgtd %xmm13, %xmm15
        por     %xmm14, %xmm15
        movaps  %xmm12, %xmm14
        mulpd   %xmm12, %xmm14
        por     %xmm15, %xmm4
        movaps  %xmm14, %xmm15
        mulpd   %xmm14, %xmm15
        movmskps %xmm4, %ecx
        movups  %xmm10, (%rsp)
        movups  dA19+__svml_datan2_data_internal(%rip), %xmm10
        mulpd   %xmm15, %xmm10
        movups  dA18+__svml_datan2_data_internal(%rip), %xmm13
        movups  dA17+__svml_datan2_data_internal(%rip), %xmm11
        addpd   dA15+__svml_datan2_data_internal(%rip), %xmm10
        mulpd   %xmm15, %xmm13
        mulpd   %xmm15, %xmm11
        mulpd   %xmm15, %xmm10
        addpd   dA14+__svml_datan2_data_internal(%rip), %xmm13
        addpd   dA13+__svml_datan2_data_internal(%rip), %xmm11
        addpd   dA11+__svml_datan2_data_internal(%rip), %xmm10
        mulpd   %xmm15, %xmm13
        mulpd   %xmm15, %xmm11
        mulpd   %xmm15, %xmm10
        addpd   dA10+__svml_datan2_data_internal(%rip), %xmm13
        addpd   dA09+__svml_datan2_data_internal(%rip), %xmm11
        addpd   dA07+__svml_datan2_data_internal(%rip), %xmm10
        mulpd   %xmm15, %xmm13
        mulpd   %xmm15, %xmm11
        mulpd   %xmm15, %xmm10
        addpd   dA06+__svml_datan2_data_internal(%rip), %xmm13
        addpd   dA05+__svml_datan2_data_internal(%rip), %xmm11
        addpd   dA03+__svml_datan2_data_internal(%rip), %xmm10
        mulpd   %xmm15, %xmm13
        mulpd   %xmm15, %xmm11
        mulpd   %xmm12, %xmm10
        addpd   dA02+__svml_datan2_data_internal(%rip), %xmm13
        addpd   dA01+__svml_datan2_data_internal(%rip), %xmm11
        addpd   %xmm10, %xmm13
        mulpd   %xmm11, %xmm12
        mulpd   %xmm13, %xmm14
        movups  dA16+__svml_datan2_data_internal(%rip), %xmm2
        mulpd   %xmm15, %xmm2
        addpd   dA12+__svml_datan2_data_internal(%rip), %xmm2
        mulpd   %xmm15, %xmm2
        addpd   dA08+__svml_datan2_data_internal(%rip), %xmm2
        mulpd   %xmm15, %xmm2
        addpd   dA04+__svml_datan2_data_internal(%rip), %xmm2

/* A00=1.0, account for it later VQFMA(D, dP4, dP4, dR8, dA00); */
        mulpd   %xmm2, %xmm15
        addpd   %xmm12, %xmm15
        addpd   %xmm14, %xmm15

/* P = A19*R2 + A18 */
        movups  dA19+__svml_datan2_data_internal(%rip), %xmm15
        movaps  %xmm11, %xmm2
        mulpd   %xmm12, %xmm15
        addpd   dA18+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A17 */
        mulpd   %xmm12, %xmm15
        addpd   dA17+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A16 */
        mulpd   %xmm12, %xmm15
        addpd   dA16+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A15 */
        mulpd   %xmm12, %xmm15
        addpd   dA15+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A14 */
        mulpd   %xmm12, %xmm15
        addpd   dA14+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A13 */
        mulpd   %xmm12, %xmm15
        addpd   dA13+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A12 */
        mulpd   %xmm12, %xmm15
        addpd   dA12+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A11 */
        mulpd   %xmm12, %xmm15
        addpd   dA11+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A10 */
        mulpd   %xmm12, %xmm15
        addpd   dA10+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A09 */
        mulpd   %xmm12, %xmm15
        addpd   dA09+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A08 */
        mulpd   %xmm12, %xmm15
        addpd   dA08+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A07 */
        mulpd   %xmm12, %xmm15
        addpd   dA07+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A06 */
        mulpd   %xmm12, %xmm15
        addpd   dA06+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A05 */
        mulpd   %xmm12, %xmm15
        addpd   dA05+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A04 */
        mulpd   %xmm12, %xmm15
        addpd   dA04+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A03 */
        mulpd   %xmm12, %xmm15
        addpd   dA03+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A02 */
        mulpd   %xmm12, %xmm15
        addpd   dA02+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 + A01 */
        mulpd   %xmm12, %xmm15
        addpd   dA01+__svml_datan2_data_internal(%rip), %xmm15

/* P = P*R2 */
        mulpd   %xmm15, %xmm12
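The rewritten polynomial is a straight Horner recurrence in R2 = (a/b)^2,
replacing the three-accumulator split above. A scalar equivalent (a
sketch; A[] is a hypothetical array holding dA01..dA19, with A00 = 1.0
folded into the reconstruction step):

/* P(r2) = ((A19*r2 + A18)*r2 + ... + A01)*r2, evaluated left to right. */
static double
atan_poly (double r2, const double A[20])
{
  double p = A[19];                /* P = A19 */
  for (int i = 18; i >= 1; i--)
    p = p * r2 + A[i];             /* P = P*R2 + Ai */
  return p * r2;                   /* P = P*R2 */
}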

/*
 * Reconstruction.
 * dP=(R+R*dP) + dPIO2
 */
        mulpd   %xmm0, %xmm15
        addpd   %xmm15, %xmm0
        addpd   %xmm5, %xmm0
        andps   __svml_datan2_data_internal(%rip), %xmm3
        mulpd   %xmm0, %xmm12
        addpd   %xmm12, %xmm0

/* if x<0, dPI = Pi, else dPI = 0 */
        movups  dZERO+__svml_datan2_data_internal(%rip), %xmm3
        por     %xmm13, %xmm14
        cmplepd %xmm3, %xmm2
        addpd   %xmm6, %xmm0
        andps   __svml_datan2_data_internal(%rip), %xmm2
        orps    %xmm8, %xmm0
        addpd   %xmm2, %xmm0
        por     %xmm14, %xmm4
        orps    %xmm7, %xmm0
        addpd   %xmm3, %xmm0
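Up to the sign bookkeeping done with the mask registers, the
reconstruction plus the dPI addend implement the usual quadrant identity
(a sketch in LaTeX, with atan(|y|/|x|) obtained as (R + R*P) + PIO2 from
the reduction above):

\operatorname{atan2}(y, x) \;=\;
\operatorname{sgn}(y)\cdot
\begin{cases}
\arctan\!\left(\dfrac{|y|}{|x|}\right), & x \ge 0,\\[6pt]
\pi - \arctan\!\left(\dfrac{|y|}{|x|}\right), & x < 0.
\end{cases}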
        movmskps %xmm4, %ecx

/* Special branch for fast (vector) processing of zero arguments */
        movups  16(%rsp), %xmm11
        orps    %xmm6, %xmm0
        testb   $3, %cl

/* Go to auxiliary branch */
        jne     L(AUX_BRANCH)
        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm11
        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm1 xmm2 xmm3 xmm4 xmm5 xmm6 xmm7 xmm8 xmm9 xmm10 xmm11

/* Return from auxiliary branch
 * for out of main path inputs

@@ -220,7 +247,7 @@ L(AUX_BRANCH_RETURN):

/* Go to special inputs processing branch */
        jne     L(SPECIAL_VALUES_BRANCH)
        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11

/* Restore registers
 * and exit the function

@@ -237,8 +264,8 @@ L(EXIT):
 */

L(SPECIAL_VALUES_BRANCH):
        movups  %xmm8, 32(%rsp)
        movups  %xmm9, 48(%rsp)
        movups  %xmm10, 32(%rsp)
        movups  %xmm11, 48(%rsp)
        movups  %xmm0, 64(%rsp)
        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0
@@ -315,66 +342,64 @@ L(SCALAR_MATH_CALL):
 */

L(AUX_BRANCH):
/* Check if at least one of X or Y is zero: iAXAYZERO */
        movups  dZERO+__svml_datan2_data_internal(%rip), %xmm2

/* Check if both X & Y are not NaNs: iXYnotNAN */
        movaps  %xmm9, %xmm12
        movaps  %xmm8, %xmm10
        cmpordpd %xmm9, %xmm12
        cmpordpd %xmm8, %xmm10
        cmpeqpd %xmm2, %xmm1
        cmpeqpd %xmm2, %xmm11
        andps   %xmm10, %xmm12
        orps    %xmm11, %xmm1
        pshufd  $221, %xmm1, %xmm1
        pshufd  $221, %xmm12, %xmm11
        movaps  %xmm11, %xmm13
        movaps  %xmm10, %xmm12
        cmpordpd %xmm11, %xmm13
        cmpordpd %xmm10, %xmm12

/* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */
        pand    %xmm11, %xmm1

/* Exclude from previous callout mask zero (and not NaN) arguments */
        movdqa  %xmm1, %xmm13
        pandn   %xmm4, %xmm13
/* Check if at least one of X or Y is zero: iAXAYZERO */
        cmpeqpd %xmm3, %xmm9
        cmpeqpd %xmm3, %xmm1

/*
 * Path for zero arguments (at least one of the two)
 * Check if both args are zeros (den. is zero)
 */
        movups  (%rsp), %xmm4
        cmpeqpd %xmm2, %xmm4
        cmpeqpd %xmm3, %xmm5
        andps   %xmm12, %xmm13
        orps    %xmm1, %xmm9
        pshufd  $221, %xmm9, %xmm1
        pshufd  $221, %xmm13, %xmm9

/* Go to callout */
        movmskps %xmm13, %edx
/* Check if at least one of X or Y is zero and not NaN: iAXAYZEROnotNAN */
        pand    %xmm9, %xmm1

/* Exclude from previous callout mask zero (and not NaN) arguments */
        movdqa  %xmm1, %xmm14
        pandn   %xmm4, %xmm14

/* Set sPIO2 to zero if den. is zero */
        movaps  %xmm4, %xmm15
        andps   %xmm2, %xmm4
        andnps  %xmm5, %xmm15
        andl    $3, %edx
        orps    %xmm4, %xmm15
        pshufd  $221, %xmm9, %xmm5
        orps    %xmm7, %xmm15
        movaps  %xmm5, %xmm4
        andnps  %xmm6, %xmm4
        andps   %xmm3, %xmm5

/* Res = sign(Y)*(X<0)?(PIO2+PI):PIO2 */
        pshufd  $221, %xmm2, %xmm7
        pcmpgtd %xmm5, %xmm7
        pshufd  $80, %xmm7, %xmm14
        andps   %xmm3, %xmm14
        addpd   %xmm14, %xmm15
        pshufd  $221, %xmm3, %xmm3
        orps    %xmm5, %xmm4
        pshufd  $221, %xmm11, %xmm5
        orps    %xmm8, %xmm4
        pcmpgtd %xmm5, %xmm3
        pshufd  $80, %xmm3, %xmm6
        andps   %xmm2, %xmm6
        addpd   %xmm6, %xmm4
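In scalar terms the zero-argument path reduces to the C99 special cases
(a sketch; NaN operands are excluded by the cmpordpd masks above and go
to the scalar callout instead):

#include <math.h>

/* atan2 when at least one argument is zero, following C99 Annex F. */
static double
atan2_zero_case (double y, double x)
{
  if (y == 0.0)                        /* +-0 or +-Pi, by the sign of x */
    return copysign (signbit (x) ? M_PI : 0.0, y);
  return copysign (M_PI_2, y);         /* x == 0, y != 0: +-Pi/2 */
}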

/* Go to callout */
        movmskps %xmm14, %edx

/* Merge results from main and spec path */
        pshufd  $80, %xmm1, %xmm3
        orps    %xmm6, %xmm15
        movdqa  %xmm3, %xmm6
        andps   %xmm3, %xmm15
        andnps  %xmm0, %xmm6
        movaps  %xmm6, %xmm0
        orps    %xmm15, %xmm0
        pshufd  $80, %xmm1, %xmm2
        orps    %xmm7, %xmm4
        movdqa  %xmm2, %xmm7
        andps   %xmm2, %xmm4
        andnps  %xmm0, %xmm7
        andl    $3, %edx
        movaps  %xmm7, %xmm0
        orps    %xmm4, %xmm0

/* Return to main vector processing path */
        jmp     L(AUX_BRANCH_RETURN)
        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm8 xmm9
        # LOE rbx rbp r12 r13 r14 r15 eax edx xmm0 xmm10 xmm11
END(_ZGVbN2vv_atan2_sse4)

        .section .rodata, "a"