Vector cosf for x86_64.

This is an implementation of vectorized cosf with SSE, AVX,
AVX2 and AVX512 versions, following the Vector ABI
<https://groups.google.com/forum/#!topic/x86-64-abi/LmppCfN1rZ4>.

    * sysdeps/x86_64/fpu/Makefile (libmvec-support): Added new files.
    * sysdeps/x86_64/fpu/Versions: New versions added.
    * sysdeps/x86_64/fpu/svml_s_cosf4_core.S: New file.
    * sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S: New file.
    * sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S: New file.
    * sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S: New file.
    * sysdeps/x86_64/fpu/svml_s_cosf8_core.S: New file.
    * sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S: New file.
    * sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S: New file.
    * sysdeps/x86_64/fpu/svml_s_cosf16_core.S: New file.
    * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: New file.
    * sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S: New file.
    * sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: New file.
    * sysdeps/x86_64/fpu/svml_s_cosf_data.S: New file.
    * sysdeps/x86_64/fpu/svml_s_cosf_data.h: New file.
    * sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added
    build of SSE, AVX2 and AVX512 IFUNC versions.
    * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New versions added.
    * sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration for cosf.
    * NEWS: Mention addition of x86_64 vector cosf.
This commit is contained in:
Andrew Senkevich 2015-06-09 18:29:47 +03:00
parent 24a2718f59
commit 04f496d602
20 changed files with 2458 additions and 3 deletions

View File

@ -47,6 +47,27 @@
* sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c: New file.
* sysdeps/x86_64/fpu/test-double-vlen8.c: New file.
* sysdeps/x86_64/fpu/Makefile (libmvec-support): Added new files.
* sysdeps/x86_64/fpu/Versions: New versions added.
* sysdeps/x86_64/fpu/svml_s_cosf4_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core_sse4.S: New file.
* sysdeps/x86_64/fpu/svml_s_cosf8_core_avx.S: New file.
* sysdeps/x86_64/fpu/svml_s_cosf8_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core_avx2.S: New file.
* sysdeps/x86_64/fpu/svml_s_cosf16_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S: New file.
* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: New file.
* sysdeps/x86_64/fpu/svml_s_cosf_data.S: New file.
* sysdeps/x86_64/fpu/svml_s_cosf_data.h: New file.
* sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added
build of SSE, AVX2 and AVX512 IFUNC versions.
* sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New versions added.
* sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration for cosf.
* NEWS: Mention addition of x86_64 vector cosf.
2015-06-09 Marko Myllynen <myllynen@redhat.com>
* locale/C-ctype.c (PREDEFINED_CLASSES): Remove.

2
NEWS
View File

@ -52,7 +52,7 @@ Version 2.22
condition in some applications.
* Added vector math library named libmvec with the following vectorized x86_64
implementations: cos.
implementations: cos, cosf.
The library can be disabled with --disable-mathvec. Use of the functions is
enabled with -fopenmp -ffast-math starting from -O1 for GCC version >= 4.9.0.
The library is linked in as needed when using -lm (no need to specify -lmvec

View File

@ -1,6 +1,10 @@
GLIBC_2.22
GLIBC_2.22 A
_ZGVbN2v_cos F
_ZGVbN4v_cosf F
_ZGVcN4v_cos F
_ZGVcN8v_cosf F
_ZGVdN4v_cos F
_ZGVdN8v_cosf F
_ZGVeN16v_cosf F
_ZGVeN8v_cos F

View File

@ -30,5 +30,7 @@
# define __DECL_SIMD_x86_64 _Pragma ("omp declare simd notinbranch")
# undef __DECL_SIMD_cos
# define __DECL_SIMD_cos __DECL_SIMD_x86_64
# undef __DECL_SIMD_cosf
# define __DECL_SIMD_cosf __DECL_SIMD_x86_64
# endif
#endif

View File

@ -1,7 +1,9 @@
ifeq ($(subdir),mathvec)
libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \
svml_d_cos4_core svml_d_cos8_core \
svml_d_cos_data init-arch
svml_d_cos_data svml_s_cosf4_core svml_s_cosf8_core_avx \
svml_s_cosf8_core svml_s_cosf16_core svml_s_cosf_data \
init-arch
endif
# Variables for libmvec tests.

View File

@ -1,5 +1,6 @@
libmvec {
GLIBC_2.22 {
_ZGVbN2v_cos; _ZGVcN4v_cos; _ZGVdN4v_cos; _ZGVeN8v_cos;
_ZGVbN4v_cosf; _ZGVcN8v_cosf; _ZGVdN8v_cosf; _ZGVeN16v_cosf;
}
}

View File

@ -54,5 +54,6 @@ endif
ifeq ($(subdir),mathvec)
libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \
svml_d_cos8_core_avx512
svml_d_cos8_core_avx512 svml_s_cosf4_core_sse4 \
svml_s_cosf8_core_avx2 svml_s_cosf16_core_avx512
endif

View File

@ -0,0 +1,39 @@
/* Multiple versions of vectorized cosf.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include <init-arch.h>
.text
/* IFUNC resolver for the 16-lane cosf entry point.
   Returns in %rax the address of the implementation to bind to:
     - _ZGVeN16v_cosf_skx          if AVX512DQ is usable,
     - _ZGVeN16v_cosf_knl          else if AVX512F is usable,
     - _ZGVeN16v_cosf_avx2_wrapper otherwise.
   Branch targets are GAS numeric local labels and therefore must be
   written with the forward suffix ("1f", "3f"); a bare "1" or "3"
   would be assembled as an absolute address.  */
ENTRY (_ZGVeN16v_cosf)
.type _ZGVeN16v_cosf, @gnu_indirect_function
/* Run __init_cpu_features once, while the KIND field is still zero.  */
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
/* First choice: SKX variant, gated on AVX512DQ.  */
1: leaq _ZGVeN16v_cosf_skx(%rip), %rax
testl $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
jnz 3f
/* Second choice: KNL variant, gated on AVX512F.  */
2: leaq _ZGVeN16v_cosf_knl(%rip), %rax
testl $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
jnz 3f
/* Last resort: the wrapper variant.  */
leaq _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
3: ret
END (_ZGVeN16v_cosf)
#define _ZGVeN16v_cosf _ZGVeN16v_cosf_avx2_wrapper
#include "../svml_s_cosf16_core.S"

View File

@ -0,0 +1,460 @@
/* Function cosf vectorized with AVX-512. KNL and SKX versions.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_cosf_data.h"
#include "svml_s_wrapper_impl.h"
.text
/* _ZGVeN16v_cosf_knl: cosf on 16 packed floats, KNL variant.
   In:  %zmm0 = arguments.  Out: %zmm0 = results.
   Without HAVE_AVX512_ASM_SUPPORT the body is just the
   WRAPPER_IMPL_AVX512 expansion around _ZGVdN8v_cosf.
   Otherwise every lane is evaluated with the vector polynomial, and
   lanes flagged by the compare against __sRangeReductionVal are then
   recomputed one at a time with scalar cosf.  */
ENTRY (_ZGVeN16v_cosf_knl)
#ifndef HAVE_AVX512_ASM_SUPPORT
WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
#else
/*
ALGORITHM DESCRIPTION:
1) Range reduction to [-Pi/2; +Pi/2] interval
a) We remove sign using AND operation
b) Add Pi/2 value to argument X for Cos to Sin transformation
c) Getting octant Y by 1/Pi multiplication
d) Add "Right Shifter" value
e) Treat obtained value as integer for destination sign setting.
Shift first bit of this value to the last (sign) position
f) Subtract "Right Shifter" value
g) Subtract 0.5 from result for octant correction
h) Subtract Y*PI from X argument, where PI is split into 3 parts
(this FMA variant only uses __sPI1_FMA..__sPI3_FMA):
X = X - Y*PI1 - Y*PI2 - Y*PI3;
2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
a) Calculate X^2 = X * X
b) Calculate polynomial:
R = X + X * X^2 * (A3 + x^2 * (A5 + .....
3) Destination sign setting
a) Set shifted destination sign using XOR operation:
R = XOR( R, S );
*/
/* Frame: %rbp-based, %rsp aligned down to 64 bytes, 1280 bytes of
   scratch space used by the scalar fallback path below.  */
pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $1280, %rsp
/* %rdx = address of the constant table, fetched through the GOT.  */
movq __svml_scos_data@GOTPCREL(%rip), %rdx
/*
h) Subtract Y*PI from X argument, where PI is split into 3 parts:
X = X - Y*PI1 - Y*PI2 - Y*PI3
(%zmm6 starts as a copy of X and accumulates the reduction.)
*/
vmovaps %zmm0, %zmm6
/* %eax = all-ones dword, broadcast below into the flagged lanes.  */
movl $-1, %eax
/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
vaddps __sHalfPI(%rdx), %zmm0, %zmm2
vmovups __sRShifter(%rdx), %zmm3
/*
1) Range reduction to [-Pi/2; +Pi/2] interval
c) Getting octant Y by 1/Pi multiplication
d) Add "Right Shifter" (0x4B000000) value
*/
vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2
vmovups __sPI1_FMA(%rdx), %zmm5
/* f) Subtract "Right Shifter" (0x4B000000) value */
vsubps %zmm3, %zmm2, %zmm4
vmovups __sA9_FMA(%rdx), %zmm9
/* Check for large and special arguments: %zmm1 = X & __sAbsMask */
vpandd __sAbsMask(%rdx), %zmm0, %zmm1
/*
e) Treat obtained value as integer for destination sign setting.
Shift first bit of this value to the last (sign) position (S << 31)
*/
vpslld $31, %zmm2, %zmm8
/* Predicate 22 is NLE_UQ: %k1 is set for lanes where the masked
   argument is greater than __sRangeReductionVal or is unordered.
   Those lanes get all-ones in %zmm12, all other lanes zero.  */
vcmpps $22, __sRangeReductionVal(%rdx), %zmm1, %k1
vpbroadcastd %eax, %zmm12{%k1}{z}
/* g) Subtract 0.5 from result for octant correction */
vsubps __sOneHalf(%rdx), %zmm4, %zmm7
vptestmd %zmm12, %zmm12, %k0
vfnmadd231ps %zmm7, %zmm5, %zmm6
/* %ecx = 16-bit mask of lanes that need the scalar fallback.  */
kmovw %k0, %ecx
vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6
vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7
/* a) Calculate X^2 = X * X */
vmulps %zmm7, %zmm7, %zmm10
/*
3) Destination sign setting
a) Set shifted destination sign using XOR operation:
R = XOR( R, S );
*/
vpxord %zmm8, %zmm7, %zmm11
/*
b) Calculate polynomial:
R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
*/
vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9
vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9
vfmadd213ps __sA3(%rdx), %zmm10, %zmm9
vmulps %zmm10, %zmm9, %zmm1
/* %zmm1 = %zmm11 * %zmm1 + %zmm11, i.e. the signed result R.  */
vfmadd213ps %zmm11, %zmm11, %zmm1
/* Any flagged lane sends us to the scalar fallback.  */
testl %ecx, %ecx
jne .LBL_1_3
/* Common exit: move the result to %zmm0 and tear down the frame.  */
.LBL_1_2:
cfi_remember_state
vmovaps %zmm1, %zmm0
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
/* Scalar fallback.  Spill the arguments to 1152(%rsp) and the vector
   results to 1216(%rsp) so single lanes can be patched in memory.  */
.LBL_1_3:
cfi_restore_state
vmovups %zmm0, 1152(%rsp)
vmovups %zmm1, 1216(%rsp)
/* NOTE(review): this label is reached only through the jne above and
   the two stores do not write flags, so this je looks never taken.  */
je .LBL_1_2
xorb %dl, %dl
kmovw %k4, 1048(%rsp)
xorl %eax, %eax
/* Preserve %k4-%k7, %zmm16-%zmm31, %rsi, %rdi and %r12-%r15 across
   the calls to cosf.  */
kmovw %k5, 1040(%rsp)
kmovw %k6, 1032(%rsp)
kmovw %k7, 1024(%rsp)
vmovups %zmm16, 960(%rsp)
vmovups %zmm17, 896(%rsp)
vmovups %zmm18, 832(%rsp)
vmovups %zmm19, 768(%rsp)
vmovups %zmm20, 704(%rsp)
vmovups %zmm21, 640(%rsp)
vmovups %zmm22, 576(%rsp)
vmovups %zmm23, 512(%rsp)
vmovups %zmm24, 448(%rsp)
vmovups %zmm25, 384(%rsp)
vmovups %zmm26, 320(%rsp)
vmovups %zmm27, 256(%rsp)
vmovups %zmm28, 192(%rsp)
vmovups %zmm29, 128(%rsp)
vmovups %zmm30, 64(%rsp)
vmovups %zmm31, (%rsp)
movq %rsi, 1064(%rsp)
movq %rdi, 1056(%rsp)
movq %r12, 1096(%rsp)
cfi_offset_rel_rsp (12, 1096)
/* Loop state: %r12b = lane-pair counter (0), %r13d = lane mask,
   %r14d = bit index of the even lane of the current pair (0).  */
movb %dl, %r12b
movq %r13, 1088(%rsp)
cfi_offset_rel_rsp (13, 1088)
movl %ecx, %r13d
movq %r14, 1080(%rsp)
cfi_offset_rel_rsp (14, 1080)
movl %eax, %r14d
movq %r15, 1072(%rsp)
cfi_offset_rel_rsp (15, 1072)
cfi_remember_state
/* Even lane of the pair: recompute it if its mask bit is set.  */
.LBL_1_6:
btl %r14d, %r13d
jc .LBL_1_12
/* Odd lane of the pair (bit index %r14d + 1).  */
.LBL_1_7:
lea 1(%r14), %esi
btl %esi, %r13d
jc .LBL_1_10
/* Advance to the next pair.  The loop runs while the pair counter is
   below 16; the mask came from kmovw, so only its low 16 bits can be
   set and later iterations find nothing to do.  */
.LBL_1_8:
addb $1, %r12b
addl $2, %r14d
cmpb $16, %r12b
jb .LBL_1_6
/* Done: restore everything saved above, reload the patched result
   vector into %zmm1 and leave through the common exit.  */
kmovw 1048(%rsp), %k4
movq 1064(%rsp), %rsi
kmovw 1040(%rsp), %k5
movq 1056(%rsp), %rdi
kmovw 1032(%rsp), %k6
movq 1096(%rsp), %r12
cfi_restore (%r12)
movq 1088(%rsp), %r13
cfi_restore (%r13)
kmovw 1024(%rsp), %k7
vmovups 960(%rsp), %zmm16
vmovups 896(%rsp), %zmm17
vmovups 832(%rsp), %zmm18
vmovups 768(%rsp), %zmm19
vmovups 704(%rsp), %zmm20
vmovups 640(%rsp), %zmm21
vmovups 576(%rsp), %zmm22
vmovups 512(%rsp), %zmm23
vmovups 448(%rsp), %zmm24
vmovups 384(%rsp), %zmm25
vmovups 320(%rsp), %zmm26
vmovups 256(%rsp), %zmm27
vmovups 192(%rsp), %zmm28
vmovups 128(%rsp), %zmm29
vmovups 64(%rsp), %zmm30
vmovups (%rsp), %zmm31
movq 1080(%rsp), %r14
cfi_restore (%r14)
movq 1072(%rsp), %r15
cfi_restore (%r15)
vmovups 1216(%rsp), %zmm1
jmp .LBL_1_2
/* Odd lane: argument at 1156 + 8*pair, result slot at 1220 + 8*pair.  */
.LBL_1_10:
cfi_restore_state
movzbl %r12b, %r15d
vmovss 1156(%rsp,%r15,8), %xmm0
call cosf@PLT
vmovss %xmm0, 1220(%rsp,%r15,8)
jmp .LBL_1_8
/* Even lane: argument at 1152 + 8*pair, result slot at 1216 + 8*pair.  */
.LBL_1_12:
movzbl %r12b, %r15d
vmovss 1152(%rsp,%r15,8), %xmm0
call cosf@PLT
vmovss %xmm0, 1216(%rsp,%r15,8)
jmp .LBL_1_7
#endif
END (_ZGVeN16v_cosf_knl)
/* _ZGVeN16v_cosf_skx: cosf on 16 packed floats, SKX variant.
   In:  %zmm0 = arguments.  Out: %zmm0 = results.
   Without HAVE_AVX512_ASM_SUPPORT the body is just the
   WRAPPER_IMPL_AVX512 expansion around _ZGVdN8v_cosf.
   Otherwise every lane is evaluated with the vector polynomial, and
   lanes that fail the compare against __sRangeReductionVal are then
   recomputed one at a time with scalar cosf.  */
ENTRY (_ZGVeN16v_cosf_skx)
#ifndef HAVE_AVX512_ASM_SUPPORT
WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
#else
/*
ALGORITHM DESCRIPTION:
1) Range reduction to [-Pi/2; +Pi/2] interval
a) We remove sign using AND operation
b) Add Pi/2 value to argument X for Cos to Sin transformation
c) Getting octant Y by 1/Pi multiplication
d) Add "Right Shifter" value
e) Treat obtained value as integer for destination sign setting.
Shift first bit of this value to the last (sign) position
f) Subtract "Right Shifter" value
g) Subtract 0.5 from result for octant correction
h) Subtract Y*PI from X argument, where PI is split into 3 parts
(this FMA variant only uses __sPI1_FMA..__sPI3_FMA):
X = X - Y*PI1 - Y*PI2 - Y*PI3;
2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
a) Calculate X^2 = X * X
b) Calculate polynomial:
R = X + X * X^2 * (A3 + x^2 * (A5 + .....
3) Destination sign setting
a) Set shifted destination sign using XOR operation:
R = XOR( R, S );
*/
/* Frame: %rbp-based, %rsp aligned down to 64 bytes, 1280 bytes of
   scratch space used by the scalar fallback path below.  */
pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $1280, %rsp
/* %rax = address of the constant table, fetched through the GOT.  */
movq __svml_scos_data@GOTPCREL(%rip), %rax
/*
h) Subtract Y*PI from X argument, where PI is split into 3 parts:
X = X - Y*PI1 - Y*PI2 - Y*PI3
(%zmm6 starts as a copy of X and accumulates the reduction.)
*/
vmovaps %zmm0, %zmm6
/* %zmm12 = all-ones in every lane; in-range lanes are cleared later.  */
vmovups .L_2il0floatpacket.13(%rip), %zmm12
vmovups __sRShifter(%rax), %zmm3
vmovups __sPI1_FMA(%rax), %zmm5
vmovups __sA9_FMA(%rax), %zmm9
/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
vaddps __sHalfPI(%rax), %zmm0, %zmm2
/* Check for large and special arguments: %zmm1 = X & __sAbsMask */
vandps __sAbsMask(%rax), %zmm0, %zmm1
/*
1) Range reduction to [-Pi/2; +Pi/2] interval
c) Getting octant Y by 1/Pi multiplication
d) Add "Right Shifter" (0x4B000000) value
*/
vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2
/* Predicate 18 is LE_OQ: %k1 is set for lanes whose masked argument
   is ordered and <= __sRangeReductionVal, i.e. the in-range lanes.  */
vcmpps $18, __sRangeReductionVal(%rax), %zmm1, %k1
/*
e) Treat obtained value as integer for destination sign setting.
Shift first bit of this value to the last (sign) position (S << 31)
*/
vpslld $31, %zmm2, %zmm8
/* f) Subtract "Right Shifter" (0x4B000000) value */
vsubps %zmm3, %zmm2, %zmm4
/* g) Subtract 0.5 from result for octant correction */
vsubps __sOneHalf(%rax), %zmm4, %zmm7
vfnmadd231ps %zmm7, %zmm5, %zmm6
vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6
vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7
/* a) Calculate X^2 = X * X */
vmulps %zmm7, %zmm7, %zmm10
/*
3) Destination sign setting
a) Set shifted destination sign using XOR operation:
R = XOR( R, S );
*/
vxorps %zmm8, %zmm7, %zmm11
/*
b) Calculate polynomial:
R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
*/
vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9
vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9
vfmadd213ps __sA3(%rax), %zmm10, %zmm9
/* (~%zmm1 & %zmm1) is zero, merged into the %k1 lanes only: in-range
   lanes of %zmm12 become 0, the remaining lanes keep all-ones.  */
vpandnd %zmm1, %zmm1, %zmm12{%k1}
vmulps %zmm10, %zmm9, %zmm1
vptestmd %zmm12, %zmm12, %k0
/* %zmm1 = %zmm11 * %zmm1 + %zmm11, i.e. the signed result R.  */
vfmadd213ps %zmm11, %zmm11, %zmm1
/* %ecx = 16-bit mask of lanes that need the scalar fallback.  */
kmovw %k0, %ecx
testl %ecx, %ecx
jne .LBL_2_3
/* Common exit: move the result to %zmm0 and tear down the frame.  */
.LBL_2_2:
cfi_remember_state
vmovaps %zmm1, %zmm0
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
/* Scalar fallback.  Spill the arguments to 1152(%rsp) and the vector
   results to 1216(%rsp) so single lanes can be patched in memory.  */
.LBL_2_3:
cfi_restore_state
vmovups %zmm0, 1152(%rsp)
vmovups %zmm1, 1216(%rsp)
/* NOTE(review): this label is reached only through the jne above and
   the two stores do not write flags, so this je looks never taken.  */
je .LBL_2_2
xorb %dl, %dl
xorl %eax, %eax
/* Preserve %k4-%k7, %zmm16-%zmm31, %rsi, %rdi and %r12-%r15 across
   the calls to cosf.  */
kmovw %k4, 1048(%rsp)
kmovw %k5, 1040(%rsp)
kmovw %k6, 1032(%rsp)
kmovw %k7, 1024(%rsp)
vmovups %zmm16, 960(%rsp)
vmovups %zmm17, 896(%rsp)
vmovups %zmm18, 832(%rsp)
vmovups %zmm19, 768(%rsp)
vmovups %zmm20, 704(%rsp)
vmovups %zmm21, 640(%rsp)
vmovups %zmm22, 576(%rsp)
vmovups %zmm23, 512(%rsp)
vmovups %zmm24, 448(%rsp)
vmovups %zmm25, 384(%rsp)
vmovups %zmm26, 320(%rsp)
vmovups %zmm27, 256(%rsp)
vmovups %zmm28, 192(%rsp)
vmovups %zmm29, 128(%rsp)
vmovups %zmm30, 64(%rsp)
vmovups %zmm31, (%rsp)
movq %rsi, 1064(%rsp)
movq %rdi, 1056(%rsp)
movq %r12, 1096(%rsp)
cfi_offset_rel_rsp (12, 1096)
/* Loop state: %r12b = lane-pair counter (0), %r13d = lane mask,
   %r14d = bit index of the even lane of the current pair (0).  */
movb %dl, %r12b
movq %r13, 1088(%rsp)
cfi_offset_rel_rsp (13, 1088)
movl %ecx, %r13d
movq %r14, 1080(%rsp)
cfi_offset_rel_rsp (14, 1080)
movl %eax, %r14d
movq %r15, 1072(%rsp)
cfi_offset_rel_rsp (15, 1072)
cfi_remember_state
/* Even lane of the pair: recompute it if its mask bit is set.  */
.LBL_2_6:
btl %r14d, %r13d
jc .LBL_2_12
/* Odd lane of the pair (bit index %r14d + 1).  */
.LBL_2_7:
lea 1(%r14), %esi
btl %esi, %r13d
jc .LBL_2_10
/* Advance to the next pair.  The loop runs while the pair counter is
   below 16; the mask came from kmovw, so only its low 16 bits can be
   set and later iterations find nothing to do.  */
.LBL_2_8:
incb %r12b
addl $2, %r14d
cmpb $16, %r12b
jb .LBL_2_6
/* Done: restore everything saved above, reload the patched result
   vector into %zmm1 and leave through the common exit.  */
kmovw 1048(%rsp), %k4
kmovw 1040(%rsp), %k5
kmovw 1032(%rsp), %k6
kmovw 1024(%rsp), %k7
vmovups 960(%rsp), %zmm16
vmovups 896(%rsp), %zmm17
vmovups 832(%rsp), %zmm18
vmovups 768(%rsp), %zmm19
vmovups 704(%rsp), %zmm20
vmovups 640(%rsp), %zmm21
vmovups 576(%rsp), %zmm22
vmovups 512(%rsp), %zmm23
vmovups 448(%rsp), %zmm24
vmovups 384(%rsp), %zmm25
vmovups 320(%rsp), %zmm26
vmovups 256(%rsp), %zmm27
vmovups 192(%rsp), %zmm28
vmovups 128(%rsp), %zmm29
vmovups 64(%rsp), %zmm30
vmovups (%rsp), %zmm31
vmovups 1216(%rsp), %zmm1
movq 1064(%rsp), %rsi
movq 1056(%rsp), %rdi
movq 1096(%rsp), %r12
cfi_restore (%r12)
movq 1088(%rsp), %r13
cfi_restore (%r13)
movq 1080(%rsp), %r14
cfi_restore (%r14)
movq 1072(%rsp), %r15
cfi_restore (%r15)
jmp .LBL_2_2
/* Odd lane: argument at 1156 + 8*pair, result slot at 1220 + 8*pair.
   NOTE(review): the argument is loaded both before and after
   vzeroupper; the first load looks redundant - confirm before
   removing.  */
.LBL_2_10:
cfi_restore_state
movzbl %r12b, %r15d
vmovss 1156(%rsp,%r15,8), %xmm0
vzeroupper
vmovss 1156(%rsp,%r15,8), %xmm0
call cosf@PLT
vmovss %xmm0, 1220(%rsp,%r15,8)
jmp .LBL_2_8
/* Even lane: argument at 1152 + 8*pair, result slot at 1216 + 8*pair.
   Same double load around vzeroupper as for the odd lane.  */
.LBL_2_12:
movzbl %r12b, %r15d
vmovss 1152(%rsp,%r15,8), %xmm0
vzeroupper
vmovss 1152(%rsp,%r15,8), %xmm0
call cosf@PLT
vmovss %xmm0, 1216(%rsp,%r15,8)
jmp .LBL_2_7
#endif
END (_ZGVeN16v_cosf_skx)
/* 64-byte constant with every bit set (16 dwords of 0xffffffff).
   _ZGVeN16v_cosf_skx loads it into %zmm12 as the initial "needs
   scalar fallback" marker for all lanes.  */
.section .rodata, "a"
.L_2il0floatpacket.13:
.long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
.type .L_2il0floatpacket.13,@object

View File

@ -0,0 +1,38 @@
/* Multiple versions of vectorized cosf, vector length is 4.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include <init-arch.h>
.text
/* IFUNC resolver for the 4-lane cosf entry point.
   Returns in %rax the address of _ZGVbN4v_cosf_sse4 when the SSE4.1
   CPUID bit is set, and of _ZGVbN4v_cosf_sse2 otherwise.  */
ENTRY (_ZGVbN4v_cosf)
.type _ZGVbN4v_cosf, @gnu_indirect_function
/* Run __init_cpu_features once, while the KIND field is still zero.  */
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
/* Start from the SSE2 fallback and upgrade it if SSE4.1 is present;
   leaq leaves the flags from testl untouched.  */
1: leaq _ZGVbN4v_cosf_sse2(%rip), %rax
testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
jz 2f
leaq _ZGVbN4v_cosf_sse4(%rip), %rax
2: ret
END (_ZGVbN4v_cosf)
libmvec_hidden_def (_ZGVbN4v_cosf)
#define _ZGVbN4v_cosf _ZGVbN4v_cosf_sse2
#include "../svml_s_cosf4_core.S"

View File

@ -0,0 +1,227 @@
/* Function cosf vectorized with SSE4.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_cosf_data.h"
.text
/* _ZGVbN4v_cosf_sse4: cosf on 4 packed floats.
   In:  %xmm0 = arguments.  Out: %xmm0 = results.
   Every lane is evaluated with the vector polynomial; lanes flagged
   by the compare against __sRangeReductionVal are then recomputed one
   at a time with scalar cosf.  */
ENTRY (_ZGVbN4v_cosf_sse4)
/*
ALGORITHM DESCRIPTION:
1) Range reduction to [-Pi/2; +Pi/2] interval
a) We remove sign using AND operation
b) Add Pi/2 value to argument X for Cos to Sin transformation
c) Getting octant Y by 1/Pi multiplication
d) Add "Right Shifter" value
e) Treat obtained value as integer for destination sign setting.
Shift first bit of this value to the last (sign) position
f) Subtract "Right Shifter" value
g) Subtract 0.5 from result for octant correction
h) Subtract Y*PI from X argument, where PI is split into 4 parts:
X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
a) Calculate X^2 = X * X
b) Calculate polynomial:
R = X + X * X^2 * (A3 + x^2 * (A5 + .....
3) Destination sign setting
a) Set shifted destination sign using XOR operation:
R = XOR( R, S );
*/
/* Frame: %rbp-based, %rsp aligned down to 64 bytes, 320 bytes of
   scratch space used by the scalar fallback path below.  */
pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $320, %rsp
/* %xmm4 keeps the untouched arguments; %xmm0 is reduced in place.  */
movaps %xmm0, %xmm4
/* %rax = address of the constant table, fetched through the GOT.  */
movq __svml_scos_data@GOTPCREL(%rip), %rax
movups __sHalfPI(%rax), %xmm1
movups __sRShifter(%rax), %xmm5
/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
addps %xmm4, %xmm1
/*
1) Range reduction to [-Pi/2; +Pi/2] interval
c) Getting octant Y by 1/Pi multiplication
d) Add "Right Shifter" (0x4B000000) value
*/
mulps __sInvPI(%rax), %xmm1
movups __sPI1(%rax), %xmm6
addps %xmm5, %xmm1
/*
e) Treat obtained value as integer for destination sign setting.
Shift first bit of this value to the last (sign) position (S << 31)
*/
movaps %xmm1, %xmm2
/* f) Subtract "Right Shifter" (0x4B000000) value */
subps %xmm5, %xmm1
movups __sPI2(%rax), %xmm7
pslld $31, %xmm2
movups __sPI3(%rax), %xmm5
movups __sAbsMask(%rax), %xmm3
/* Check for large and special arguments: %xmm3 = X & __sAbsMask */
andps %xmm4, %xmm3
/* g) Subtract 0.5 from result for octant correction */
subps __sOneHalf(%rax), %xmm1
/* Lanes of %xmm3 become all-ones where the masked argument is not
   less-or-equal to __sRangeReductionVal.  */
cmpnleps __sRangeReductionVal(%rax), %xmm3
/*
h) Subtract Y*PI from X argument, where PI is split into 4 parts:
X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
*/
mulps %xmm1, %xmm6
mulps %xmm1, %xmm7
mulps %xmm1, %xmm5
subps %xmm6, %xmm0
/* %ecx = 4-bit mask of lanes that need the scalar fallback.  */
movmskps %xmm3, %ecx
movups __sPI4(%rax), %xmm6
subps %xmm7, %xmm0
mulps %xmm6, %xmm1
subps %xmm5, %xmm0
subps %xmm1, %xmm0
/* a) Calculate X^2 = X * X */
movaps %xmm0, %xmm1
mulps %xmm0, %xmm1
/*
3) Destination sign setting
a) Set shifted destination sign using XOR operation:
R = XOR( R, S );
*/
xorps %xmm2, %xmm0
movups __sA9(%rax), %xmm2
/*
b) Calculate polynomial:
R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
*/
mulps %xmm1, %xmm2
addps __sA7(%rax), %xmm2
mulps %xmm1, %xmm2
addps __sA5(%rax), %xmm2
mulps %xmm1, %xmm2
addps __sA3(%rax), %xmm2
mulps %xmm2, %xmm1
mulps %xmm0, %xmm1
addps %xmm1, %xmm0
/* Any flagged lane sends us to the scalar fallback.  */
testl %ecx, %ecx
jne .LBL_1_3
/* Common exit: the result is already in %xmm0; tear down the frame.  */
.LBL_1_2:
cfi_remember_state
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
/* Scalar fallback.  Spill the arguments to 192(%rsp) and the vector
   results to 256(%rsp) so single lanes can be patched in memory.  */
.LBL_1_3:
cfi_restore_state
movups %xmm4, 192(%rsp)
movups %xmm0, 256(%rsp)
/* NOTE(review): this label is reached only through the jne above and
   the two stores do not write flags, so this je looks never taken.  */
je .LBL_1_2
xorb %dl, %dl
xorl %eax, %eax
/* Preserve %xmm8-%xmm15, %rsi, %rdi and %r12-%r15 across the calls
   to cosf.  */
movups %xmm8, 112(%rsp)
movups %xmm9, 96(%rsp)
movups %xmm10, 80(%rsp)
movups %xmm11, 64(%rsp)
movups %xmm12, 48(%rsp)
movups %xmm13, 32(%rsp)
movups %xmm14, 16(%rsp)
movups %xmm15, (%rsp)
movq %rsi, 136(%rsp)
movq %rdi, 128(%rsp)
movq %r12, 168(%rsp)
cfi_offset_rel_rsp (12, 168)
/* Loop state: %r12b = lane-pair counter (0), %r13d = lane mask,
   %r14d = bit index of the even lane of the current pair (0).  */
movb %dl, %r12b
movq %r13, 160(%rsp)
cfi_offset_rel_rsp (13, 160)
movl %ecx, %r13d
movq %r14, 152(%rsp)
cfi_offset_rel_rsp (14, 152)
movl %eax, %r14d
movq %r15, 144(%rsp)
cfi_offset_rel_rsp (15, 144)
cfi_remember_state
/* Even lane of the pair: recompute it if its mask bit is set.  */
.LBL_1_6:
btl %r14d, %r13d
jc .LBL_1_12
/* Odd lane of the pair (bit index %r14d + 1).  */
.LBL_1_7:
lea 1(%r14), %esi
btl %esi, %r13d
jc .LBL_1_10
/* Advance to the next pair.  The loop runs while the pair counter is
   below 16; the mask came from movmskps on an xmm register, so only
   its low 4 bits can be set and later iterations find nothing to do.  */
.LBL_1_8:
incb %r12b
addl $2, %r14d
cmpb $16, %r12b
jb .LBL_1_6
/* Done: restore everything saved above, reload the patched result
   vector into %xmm0 and leave through the common exit.  */
movups 112(%rsp), %xmm8
movups 96(%rsp), %xmm9
movups 80(%rsp), %xmm10
movups 64(%rsp), %xmm11
movups 48(%rsp), %xmm12
movups 32(%rsp), %xmm13
movups 16(%rsp), %xmm14
movups (%rsp), %xmm15
movq 136(%rsp), %rsi
movq 128(%rsp), %rdi
movq 168(%rsp), %r12
cfi_restore (%r12)
movq 160(%rsp), %r13
cfi_restore (%r13)
movq 152(%rsp), %r14
cfi_restore (%r14)
movq 144(%rsp), %r15
cfi_restore (%r15)
movups 256(%rsp), %xmm0
jmp .LBL_1_2
/* Odd lane: argument at 196 + 8*pair, result slot at 260 + 8*pair.  */
.LBL_1_10:
cfi_restore_state
movzbl %r12b, %r15d
movss 196(%rsp,%r15,8), %xmm0
call cosf@PLT
movss %xmm0, 260(%rsp,%r15,8)
jmp .LBL_1_8
/* Even lane: argument at 192 + 8*pair, result slot at 256 + 8*pair.  */
.LBL_1_12:
movzbl %r12b, %r15d
movss 192(%rsp,%r15,8), %xmm0
call cosf@PLT
movss %xmm0, 256(%rsp,%r15,8)
jmp .LBL_1_7
END (_ZGVbN4v_cosf_sse4)

View File

@ -0,0 +1,38 @@
/* Multiple versions of vectorized cosf, vector length is 8.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include <init-arch.h>
.text
/* IFUNC resolver for the 8-lane cosf entry point.
   Returns in %rax the address of _ZGVdN8v_cosf_avx2 when AVX2 is
   usable, and of _ZGVdN8v_cosf_sse_wrapper otherwise.  */
ENTRY (_ZGVdN8v_cosf)
.type _ZGVdN8v_cosf, @gnu_indirect_function
/* Run __init_cpu_features once, while the KIND field is still zero.  */
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
/* Start from the wrapper fallback and upgrade it if AVX2 is usable;
   leaq leaves the flags from testl untouched.  */
1: leaq _ZGVdN8v_cosf_sse_wrapper(%rip), %rax
testl $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
jz 2f
leaq _ZGVdN8v_cosf_avx2(%rip), %rax
2: ret
END (_ZGVdN8v_cosf)
libmvec_hidden_def (_ZGVdN8v_cosf)
#define _ZGVdN8v_cosf _ZGVdN8v_cosf_sse_wrapper
#include "../svml_s_cosf8_core.S"

View File

@ -0,0 +1,215 @@
/* Function cosf vectorized with AVX2.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_cosf_data.h"
.text
/* _ZGVdN8v_cosf_avx2: cosf on 8 packed floats.
   In:  %ymm0 = arguments.  Out: %ymm0 = results.
   Every lane is evaluated with the vector polynomial; lanes flagged
   by the compare against __sRangeReductionVal are then recomputed one
   at a time with scalar cosf.  */
ENTRY (_ZGVdN8v_cosf_avx2)
/*
ALGORITHM DESCRIPTION:
1) Range reduction to [-Pi/2; +Pi/2] interval
a) We remove sign using AND operation
b) Add Pi/2 value to argument X for Cos to Sin transformation
c) Getting octant Y by 1/Pi multiplication
d) Add "Right Shifter" value
e) Treat obtained value as integer for destination sign setting.
Shift first bit of this value to the last (sign) position
f) Subtract "Right Shifter" value
g) Subtract 0.5 from result for octant correction
h) Subtract Y*PI from X argument, where PI is split into 3 parts
(this FMA variant only uses __sPI1_FMA..__sPI3_FMA):
X = X - Y*PI1 - Y*PI2 - Y*PI3;
2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
a) Calculate X^2 = X * X
b) Calculate polynomial:
R = X + X * X^2 * (A3 + x^2 * (A5 + .....
3) Destination sign setting
a) Set shifted destination sign using XOR operation:
R = XOR( R, S );
*/
/* Frame: %rbp-based, %rsp aligned down to 64 bytes, 448 bytes of
   scratch space used by the scalar fallback path below.  */
pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
andq $-64, %rsp
subq $448, %rsp
/* %rax = address of the constant table, fetched through the GOT.  */
movq __svml_scos_data@GOTPCREL(%rip), %rax
/* %ymm2 keeps the untouched arguments for the rest of the function.  */
vmovaps %ymm0, %ymm2
vmovups __sRShifter(%rax), %ymm5
vmovups __sPI1_FMA(%rax), %ymm7
/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
vaddps __sHalfPI(%rax), %ymm2, %ymm4
/*
1) Range reduction to [-Pi/2; +Pi/2] interval
c) Getting octant Y by 1/Pi multiplication
d) Add "Right Shifter" (0x4B000000) value
*/
vfmadd132ps __sInvPI(%rax), %ymm5, %ymm4
/* f) Subtract "Right Shifter" (0x4B000000) value */
vsubps %ymm5, %ymm4, %ymm6
/*
e) Treat obtained value as integer for destination sign setting.
Shift first bit of this value to the last (sign) position (S << 31)
*/
vpslld $31, %ymm4, %ymm0
/* g) Subtract 0.5 from result for octant correction */
vsubps __sOneHalf(%rax), %ymm6, %ymm4
/* Check for large and special arguments: lanes of %ymm1 become
   all-ones where (X & __sAbsMask) is greater than
   __sRangeReductionVal or is unordered.  */
vandps __sAbsMask(%rax), %ymm2, %ymm3
vcmpnle_uqps __sRangeReductionVal(%rax), %ymm3, %ymm1
/*
h) Subtract Y*PI from X argument, where PI is split into 3 parts:
X = X - Y*PI1 - Y*PI2 - Y*PI3
*/
vmovaps %ymm2, %ymm3
vfnmadd231ps %ymm4, %ymm7, %ymm3
vfnmadd231ps __sPI2_FMA(%rax), %ymm4, %ymm3
vfnmadd132ps __sPI3_FMA(%rax), %ymm3, %ymm4
/* a) Calculate X^2 = X * X */
vmulps %ymm4, %ymm4, %ymm5
/*
3) Destination sign setting
a) Set shifted destination sign using XOR operation:
R = XOR( R, S );
*/
vxorps %ymm0, %ymm4, %ymm6
vmovups __sA9_FMA(%rax), %ymm0
/*
b) Calculate polynomial:
R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))))
*/
vfmadd213ps __sA7_FMA(%rax), %ymm5, %ymm0
vfmadd213ps __sA5_FMA(%rax), %ymm5, %ymm0
vfmadd213ps __sA3(%rax), %ymm5, %ymm0
vmulps %ymm5, %ymm0, %ymm0
/* %ecx = 8-bit mask of lanes that need the scalar fallback.  */
vmovmskps %ymm1, %ecx
/* %ymm0 = %ymm6 * %ymm0 + %ymm6, i.e. the signed result R.  */
vfmadd213ps %ymm6, %ymm6, %ymm0
/* Any flagged lane sends us to the scalar fallback.  */
testl %ecx, %ecx
jne .LBL_1_3
/* Common exit: the result is already in %ymm0; tear down the frame.  */
.LBL_1_2:
cfi_remember_state
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
/* Scalar fallback.  Spill the arguments to 320(%rsp) and the vector
   results to 384(%rsp) so single lanes can be patched in memory.  */
.LBL_1_3:
cfi_restore_state
vmovups %ymm2, 320(%rsp)
vmovups %ymm0, 384(%rsp)
/* NOTE(review): this label is reached only through the jne above and
   the two stores do not write flags, so this je looks never taken.  */
je .LBL_1_2
xorb %dl, %dl
xorl %eax, %eax
/* Preserve %ymm8-%ymm15, %rsi, %rdi and %r12-%r15 across the calls
   to cosf.  */
vmovups %ymm8, 224(%rsp)
vmovups %ymm9, 192(%rsp)
vmovups %ymm10, 160(%rsp)
vmovups %ymm11, 128(%rsp)
vmovups %ymm12, 96(%rsp)
vmovups %ymm13, 64(%rsp)
vmovups %ymm14, 32(%rsp)
vmovups %ymm15, (%rsp)
movq %rsi, 264(%rsp)
movq %rdi, 256(%rsp)
movq %r12, 296(%rsp)
cfi_offset_rel_rsp (12, 296)
/* Loop state: %r12b = lane-pair counter (0), %r13d = lane mask,
   %r14d = bit index of the even lane of the current pair (0).  */
movb %dl, %r12b
movq %r13, 288(%rsp)
cfi_offset_rel_rsp (13, 288)
movl %ecx, %r13d
movq %r14, 280(%rsp)
cfi_offset_rel_rsp (14, 280)
movl %eax, %r14d
movq %r15, 272(%rsp)
cfi_offset_rel_rsp (15, 272)
cfi_remember_state
/* Even lane of the pair: recompute it if its mask bit is set.  */
.LBL_1_6:
btl %r14d, %r13d
jc .LBL_1_12
/* Odd lane of the pair (bit index %r14d + 1).  */
.LBL_1_7:
lea 1(%r14), %esi
btl %esi, %r13d
jc .LBL_1_10
/* Advance to the next pair.  The loop runs while the pair counter is
   below 16; the mask came from vmovmskps on a ymm register, so only
   its low 8 bits can be set and later iterations find nothing to do.  */
.LBL_1_8:
incb %r12b
addl $2, %r14d
cmpb $16, %r12b
jb .LBL_1_6
/* Done: restore everything saved above, reload the patched result
   vector into %ymm0 and leave through the common exit.  */
vmovups 224(%rsp), %ymm8
vmovups 192(%rsp), %ymm9
vmovups 160(%rsp), %ymm10
vmovups 128(%rsp), %ymm11
vmovups 96(%rsp), %ymm12
vmovups 64(%rsp), %ymm13
vmovups 32(%rsp), %ymm14
vmovups (%rsp), %ymm15
vmovups 384(%rsp), %ymm0
movq 264(%rsp), %rsi
movq 256(%rsp), %rdi
movq 296(%rsp), %r12
cfi_restore (%r12)
movq 288(%rsp), %r13
cfi_restore (%r13)
movq 280(%rsp), %r14
cfi_restore (%r14)
movq 272(%rsp), %r15
cfi_restore (%r15)
jmp .LBL_1_2
/* Odd lane: argument at 324 + 8*pair, result slot at 388 + 8*pair.
   vzeroupper clears the upper ymm halves before the scalar call.  */
.LBL_1_10:
cfi_restore_state
movzbl %r12b, %r15d
vmovss 324(%rsp,%r15,8), %xmm0
vzeroupper
call cosf@PLT
vmovss %xmm0, 388(%rsp,%r15,8)
jmp .LBL_1_8
/* Even lane: argument at 320 + 8*pair, result slot at 384 + 8*pair.  */
.LBL_1_12:
movzbl %r12b, %r15d
vmovss 320(%rsp,%r15,8), %xmm0
vzeroupper
call cosf@PLT
vmovss %xmm0, 384(%rsp,%r15,8)
jmp .LBL_1_7
END (_ZGVdN8v_cosf_avx2)

View File

@ -0,0 +1,25 @@
/* Function cosf vectorized with AVX-512. Wrapper to AVX2 version.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* 16-lane cosf entry point whose whole body is the WRAPPER_IMPL_AVX512
   macro (presumably provided by svml_s_wrapper_impl.h, included above)
   applied to the 8-lane _ZGVdN8v_cosf.  */
ENTRY (_ZGVeN16v_cosf)
WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
END (_ZGVeN16v_cosf)

View File

@ -0,0 +1,29 @@
/* Function cosf vectorized with SSE2, wrapper version.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* Entry point _ZGVbN4v_cosf.  WRAPPER_IMPL_SSE2 expands to the whole
   function body: it calls scalar cosf through the PLT once for each of
   the four floats in %xmm0 and returns the four results in %xmm0.  */
ENTRY (_ZGVbN4v_cosf)
WRAPPER_IMPL_SSE2 cosf
END (_ZGVbN4v_cosf)
/* The hidden definition is emitted only when USE_MULTIARCH is not
   defined.  NOTE(review): presumably the multiarch build provides it
   from its own version of this file -- confirm.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVbN4v_cosf)
#endif

View File

@ -0,0 +1,29 @@
/* Function cosf vectorized with AVX2, wrapper version.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* Entry point _ZGVdN8v_cosf.  WRAPPER_IMPL_AVX expands to the whole
   function body: it calls _ZGVbN4v_cosf once on each 128-bit half of
   %ymm0 and returns both results joined in %ymm0.  */
ENTRY (_ZGVdN8v_cosf)
WRAPPER_IMPL_AVX _ZGVbN4v_cosf
END (_ZGVdN8v_cosf)
/* The hidden definition is emitted only when USE_MULTIARCH is not
   defined.  NOTE(review): presumably the multiarch build provides it
   from its own version of this file -- confirm.  */
#ifndef USE_MULTIARCH
libmvec_hidden_def (_ZGVdN8v_cosf)
#endif

View File

@ -0,0 +1,25 @@
/* Function cosf vectorized in AVX ISA as wrapper to SSE4 ISA version.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
#include "svml_s_wrapper_impl.h"
.text
/* Entry point _ZGVcN8v_cosf.  WRAPPER_IMPL_AVX expands to the whole
   function body: it calls _ZGVbN4v_cosf once on each 128-bit half of
   %ymm0 and returns both results joined in %ymm0.  No hidden
   definition is emitted for this symbol here.  */
ENTRY (_ZGVcN8v_cosf)
WRAPPER_IMPL_AVX _ZGVbN4v_cosf
END (_ZGVcN8v_cosf)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,58 @@
/* Offsets for data table for vectorized cosf.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
#ifndef S_COSF_DATA_H
#define S_COSF_DATA_H
/* float_vector OFFSET VALUE
   Emit VALUE as sixteen consecutive .long entries (64 bytes in total).
   Before emitting, check that the current location is exactly OFFSET
   bytes past the __svml_scos_data label; if it is not, .err aborts the
   assembly.  This ties the emitted table layout to the offset given by
   the caller.  */
.macro float_vector offset value
.if .-__svml_scos_data != \offset
.err
.endif
.rept 16
.long \value
.endr
.endm
/* Numeric offsets of the data table fields.  NOTE(review): presumably
   byte offsets from __svml_scos_data, the label float_vector checks
   against -- confirm in the data file.
   __dT starts at 0 and the next field starts at 4096.  From __sAbsMask
   onwards consecutive offsets are 64 apart, which is the size of one
   float_vector expansion (16 .long entries).  */
#define __dT 0
#define __sAbsMask 4096
#define __sRangeReductionVal 4160
#define __sRangeVal 4224
#define __sS1 4288
#define __sS2 4352
#define __sC1 4416
#define __sC2 4480
#define __sPI1 4544
#define __sPI2 4608
#define __sPI3 4672
#define __sPI4 4736
#define __sPI1_FMA 4800
#define __sPI2_FMA 4864
#define __sPI3_FMA 4928
#define __sA3 4992
#define __sA5 5056
#define __sA7 5120
#define __sA9 5184
#define __sA5_FMA 5248
#define __sA7_FMA 5312
#define __sA9_FMA 5376
#define __sInvPI 5440
#define __sRShifter 5504
#define __sHalfPI 5568
#define __sOneHalf 5632
#endif

View File

@ -0,0 +1,111 @@
/* Wrapper implementations of vector math functions.
Copyright (C) 2014-2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
/* SSE2 ISA version as wrapper to scalar.

   WRAPPER_IMPL_SSE2 CALLEE expands to a complete function body (it
   ends in ret).  CALLEE is called through the PLT four times, once per
   float of the vector passed in %xmm0, and the four results are
   returned packed in %xmm0 in the same element order.

   Stack scratch area (40 bytes):
     0(%rsp) .. 15(%rsp)   copy of the input vector
     16(%rsp) .. 31(%rsp)  results for elements 0..3
     32(%rsp) .. 39(%rsp)  unused
   NOTE(review): assuming %rsp is 8 mod 16 on entry (return address
   just pushed), subtracting 40 makes it 16-byte aligned, which the
   movaps store below requires -- confirm against the callers' ABI.

   Uses %xmm0-%xmm3 in addition to whatever CALLEE clobbers.  */
.macro WRAPPER_IMPL_SSE2 callee
subq $40, %rsp
cfi_adjust_cfa_offset(40)
/* Save all four inputs; element 0 is already in the low lane of
   %xmm0 for the first call.  */
movaps %xmm0, (%rsp)
call \callee@PLT
/* Result 0 saved; reload input element 1.  */
movss %xmm0, 16(%rsp)
movss 4(%rsp), %xmm0
call \callee@PLT
/* Result 1 saved; reload input element 2.  */
movss %xmm0, 20(%rsp)
movss 8(%rsp), %xmm0
call \callee@PLT
/* Result 2 saved; reload input element 3.  */
movss %xmm0, 24(%rsp)
movss 12(%rsp), %xmm0
call \callee@PLT
/* %xmm3 = r0, %xmm2 = r1, %xmm1 = r2, %xmm0 = r3 (low lanes).  */
movss 16(%rsp), %xmm3
movss 20(%rsp), %xmm2
movss 24(%rsp), %xmm1
/* Result 3 is also written to the scratch area; nothing in this macro
   reads it back.  */
movss %xmm0, 28(%rsp)
/* Interleave: %xmm3 = {r0, r2, ..}, %xmm2 = {r1, r3, ..}, then
   %xmm3 = {r0, r1, r2, r3}.  */
unpcklps %xmm1, %xmm3
unpcklps %xmm0, %xmm2
unpcklps %xmm2, %xmm3
movaps %xmm3, %xmm0
addq $40, %rsp
cfi_adjust_cfa_offset(-40)
ret
.endm
/* AVX/AVX2 ISA version as wrapper to SSE ISA version.

   WRAPPER_IMPL_AVX CALLEE expands to a complete function body (it ends
   in ret).  CALLEE is called twice through HIDDEN_JUMPTARGET: first
   with the low 128 bits of the %ymm0 argument in %xmm0, then with the
   high 128 bits.  The two 128-bit results are returned joined in
   %ymm0 (first result low, second result high).

   A %rbp frame is set up so that %rsp can be realigned to 32 bytes.
   Stack scratch area (32 bytes):
     0(%rsp) .. 15(%rsp)   high half of the input
     16(%rsp) .. 31(%rsp)  result of the first call

   Uses %xmm0/%ymm0 and %xmm1 in addition to whatever CALLEE
   clobbers.  */
.macro WRAPPER_IMPL_AVX callee
pushq %rbp
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbp, 0)
movq %rsp, %rbp
cfi_def_cfa_register (%rbp)
andq $-32, %rsp
subq $32, %rsp
/* Park the high half; the low half stays in %xmm0 for the first
   call.  */
vextractf128 $1, %ymm0, (%rsp)
/* Clear the upper ymm state before calling the SSE-level callee.  */
vzeroupper
call HIDDEN_JUMPTARGET(\callee)
/* Save the low-half result and fetch the high-half input.  */
vmovaps %xmm0, 16(%rsp)
vmovaps (%rsp), %xmm0
call HIDDEN_JUMPTARGET(\callee)
/* %xmm1 = high-half result, %xmm0 = low-half result; join them.  */
vmovaps %xmm0, %xmm1
vmovaps 16(%rsp), %xmm0
vinsertf128 $1, %xmm1, %ymm0, %ymm0
movq %rbp, %rsp
cfi_def_cfa_register (%rsp)
popq %rbp
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbp)
ret
.endm
/* AVX512 ISA version as wrapper to AVX2 ISA version.

   WRAPPER_IMPL_AVX512 CALLEE expands to a complete function body (it
   ends in ret).  CALLEE is called twice through HIDDEN_JUMPTARGET:
   first with the low 256 bits of the %zmm0 argument in %ymm0, then
   with the high 256 bits.  The two 256-bit results are returned joined
   in %zmm0 (first result low, second result high).

   A %rbp frame is set up so that %rsp can be realigned to 64 bytes.
   The 64-byte scratch area at (%rsp) first holds the input vector;
   each 32-byte half is overwritten with the corresponding result once
   that half has been consumed, so the final 64-byte load picks up the
   complete result.

   All vector moves are emitted as raw bytes so that the file can be
   assembled by an assembler without AVX-512 support.

   Uses %zmm0 in addition to whatever CALLEE clobbers.  */
.macro WRAPPER_IMPL_AVX512 callee
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $64, %rsp
/* Below is encoding for vmovaps %zmm0, (%rsp).  */
        .byte     0x62
        .byte     0xf1
        .byte     0x7c
        .byte     0x48
        .byte     0x29
        .byte     0x04
        .byte     0x24
/* Below is encoding for vmovaps (%rsp), %ymm0.  */
        .byte     0xc5
        .byte     0xfc
        .byte     0x28
        .byte     0x04
        .byte     0x24
        call      HIDDEN_JUMPTARGET(\callee)
/* Keep the low-half result: the low-half input at (%rsp) is no longer
   needed, so reuse its slot.
   Below is encoding for vmovaps %ymm0, (%rsp).  */
        .byte     0xc5
        .byte     0xfc
        .byte     0x29
        .byte     0x04
        .byte     0x24
/* Below is encoding for vmovaps 32(%rsp), %ymm0.  */
        .byte     0xc5
        .byte     0xfc
        .byte     0x28
        .byte     0x44
        .byte     0x24
        .byte     0x20
        call      HIDDEN_JUMPTARGET(\callee)
/* Keep the high-half result next to the low-half one.
   Below is encoding for vmovaps %ymm0, 32(%rsp).  */
        .byte     0xc5
        .byte     0xfc
        .byte     0x29
        .byte     0x44
        .byte     0x24
        .byte     0x20
/* Return both halves as one 512-bit vector.
   Below is encoding for vmovaps (%rsp), %zmm0.  */
        .byte     0x62
        .byte     0xf1
        .byte     0x7c
        .byte     0x48
        .byte     0x28
        .byte     0x04
        .byte     0x24
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
.endm