mirror of git://sourceware.org/git/glibc.git
x86-64: Add AVX optimized string/memory functions for RTM
Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX
optimized string/memory functions with
xtest
jz 1f
vzeroall
ret
1:
vzeroupper
ret
at function exit on processors with usable RTM, but without 256-bit EVEX
instructions to avoid VZEROUPPER inside a transactionally executing RTM
region.
(cherry picked from commit 7ebba91361)
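
For context, here is a minimal, self-contained C sketch (not part of the patch) of the selection order the new IFUNC selectors use: prefer the 256-bit EVEX implementation, fall back to the AVX2 *_rtm variant on RTM-capable processors, and otherwise use plain AVX2. The struct and field names below are illustrative stand-ins; glibc reads these bits from its cpu_features data via CPU_FEATURES_CPU_P / CPU_FEATURES_ARCH_P, and the outer AVX2-usable guard and the SSE2 fallback are assumed from the unmodified selector context.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for glibc's cpu_features bits.  */
struct cpu_features
{
  bool avx2_usable;
  bool avx512vl_usable;
  bool avx512bw_usable;
  bool bmi2;
  bool rtm;
  bool prefer_no_vzeroupper;
};

/* Mirrors the order of checks this patch adds to IFUNC_SELECTOR:
   EVEX first, then the RTM-safe AVX2 variant, then plain AVX2.  */
static const char *
select_memchr (const struct cpu_features *cf)
{
  if (cf->avx2_usable)
    {
      if (cf->avx512vl_usable && cf->avx512bw_usable && cf->bmi2)
	return "__memchr_evex";

      /* RTM-capable CPU without usable 256-bit EVEX: pick the variant
	 that ends with the xtest/vzeroall/vzeroupper sequence shown in
	 the commit message.  */
      if (cf->rtm)
	return "__memchr_avx2_rtm";

      if (!cf->prefer_no_vzeroupper)
	return "__memchr_avx2";
    }
  /* Fallback name is assumed here for illustration only.  */
  return "__memchr_sse2";
}

int
main (void)
{
  struct cpu_features cf = { .avx2_usable = true, .rtm = true };
  puts (select_memchr (&cf));	/* prints __memchr_avx2_rtm */
  return 0;
}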
parent d584356fe8
commit 5f59aaddc9
@@ -41,6 +41,19 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
		   memset-sse2-unaligned-erms \
		   memset-avx2-unaligned-erms \
		   memset-avx512-unaligned-erms \
		   memchr-avx2-rtm \
		   memcmp-avx2-movbe-rtm \
		   memmove-avx-unaligned-erms-rtm \
		   memrchr-avx2-rtm \
		   memset-avx2-unaligned-erms-rtm \
		   rawmemchr-avx2-rtm \
		   strchr-avx2-rtm \
		   strcmp-avx2-rtm \
		   strchrnul-avx2-rtm \
		   strlen-avx2-rtm \
		   strncmp-avx2-rtm \
		   strnlen-avx2-rtm \
		   strrchr-avx2-rtm \
		   memchr-evex \
		   memcmp-evex-movbe \
		   memmove-evex-unaligned-erms \
@@ -77,6 +90,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
		   wcsrchr-sse2 wcsrchr-avx2 \
		   wcsnlen-sse4_1 wcsnlen-c \
		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
		   wcschr-avx2-rtm \
		   wcscmp-avx2-rtm \
		   wcslen-avx2-rtm \
		   wcsncmp-avx2-rtm \
		   wcsnlen-avx2-rtm \
		   wcsrchr-avx2-rtm \
		   wmemchr-avx2-rtm \
		   wmemcmp-avx2-movbe-rtm \
		   wcschr-evex \
		   wcscmp-evex \
		   wcslen-evex \
@@ -21,6 +21,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;

static inline void *

@@ -36,6 +37,9 @@ IFUNC_SELECTOR (void)
	  && CPU_FEATURES_CPU_P (cpu_features, BMI2))
	return OPTIMIZE (evex);

      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
	return OPTIMIZE (avx2_rtm);

      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
	return OPTIMIZE (avx2);
    }
@@ -43,6 +43,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
	      IFUNC_IMPL_ADD (array, i, memchr,
			      HAS_ARCH_FEATURE (AVX2_Usable),
			      __memchr_avx2)
	      IFUNC_IMPL_ADD (array, i, memchr,
			      (HAS_ARCH_FEATURE (AVX2_Usable)
			       && HAS_CPU_FEATURE (RTM)),
			      __memchr_avx2_rtm)
	      IFUNC_IMPL_ADD (array, i, memchr,
			      (HAS_ARCH_FEATURE (AVX512VL_Usable)
			       && HAS_ARCH_FEATURE (AVX512BW_Usable)
@ -56,6 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (MOVBE)),
|
||||
__memcmp_avx2_movbe)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (MOVBE)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memcmp_avx2_movbe_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -85,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
HAS_ARCH_FEATURE (AVX_Usable),
|
||||
__memmove_chk_avx_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memmove_chk_avx_unaligned_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memmove_chk_avx_unaligned_erms_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
HAS_ARCH_FEATURE (AVX512VL_Usable),
|
||||
__memmove_chk_evex_unaligned)
|
||||
|
@ -113,6 +130,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
HAS_ARCH_FEATURE (AVX_Usable),
|
||||
__memmove_avx_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memmove_avx_unaligned_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memmove_avx_unaligned_erms_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
HAS_ARCH_FEATURE (AVX512VL_Usable),
|
||||
__memmove_evex_unaligned)
|
||||
|
@ -143,6 +168,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, memrchr,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__memrchr_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, memrchr,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memrchr_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memrchr,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
|
||||
|
@ -165,6 +194,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__memset_chk_avx2_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memset_chk_avx2_unaligned_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memset_chk_avx2_unaligned_erms_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
|
||||
|
@ -198,6 +235,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__memset_avx2_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memset_avx2_unaligned_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memset_avx2_unaligned_erms_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
|
||||
|
@ -222,6 +267,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, rawmemchr,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__rawmemchr_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, rawmemchr,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__rawmemchr_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, rawmemchr,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -234,6 +283,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__strlen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__strlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
|
||||
|
@ -245,6 +298,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__strnlen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__strnlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
|
||||
|
@ -317,6 +374,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, strchr,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__strchr_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strchr,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__strchr_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strchr,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -330,6 +391,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, strchrnul,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__strchrnul_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strchrnul,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__strchrnul_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strchrnul,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -342,6 +407,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, strrchr,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__strrchr_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strrchr,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__strrchr_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strrchr,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
|
||||
|
@ -353,6 +422,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, strcmp,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__strcmp_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strcmp,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__strcmp_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strcmp,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -457,6 +530,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, wcschr,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__wcschr_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcschr,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__wcschr_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcschr,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -469,6 +546,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, wcsrchr,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__wcsrchr_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcsrchr,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__wcsrchr_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcsrchr,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -481,6 +562,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, wcscmp,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__wcscmp_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcscmp,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__wcscmp_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcscmp,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -493,6 +578,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, wcsncmp,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__wcsncmp_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcsncmp,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__wcsncmp_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcsncmp,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -511,6 +600,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, wcslen,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__wcslen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__wcslen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -523,6 +616,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__wcsnlen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__wcsnlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -538,6 +635,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, wmemchr,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__wmemchr_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wmemchr,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__wmemchr_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wmemchr,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -551,6 +652,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (MOVBE)),
|
||||
__wmemcmp_avx2_movbe)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (MOVBE)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__wmemcmp_avx2_movbe_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)
|
||||
|
@ -569,6 +675,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__wmemset_avx2_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__wmemset_avx2_unaligned_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
HAS_ARCH_FEATURE (AVX512VL_Usable),
|
||||
__wmemset_evex_unaligned)
|
||||
|
@ -594,6 +704,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
HAS_ARCH_FEATURE (AVX_Usable),
|
||||
__memcpy_chk_avx_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memcpy_chk_avx_unaligned_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memcpy_chk_avx_unaligned_erms_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
HAS_ARCH_FEATURE (AVX512VL_Usable),
|
||||
__memcpy_chk_evex_unaligned)
|
||||
|
@ -622,6 +740,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
HAS_ARCH_FEATURE (AVX_Usable),
|
||||
__memcpy_avx_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memcpy_avx_unaligned_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__memcpy_avx_unaligned_erms_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
HAS_ARCH_FEATURE (AVX512VL_Usable),
|
||||
__memcpy_evex_unaligned)
|
||||
|
@ -664,6 +790,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
HAS_ARCH_FEATURE (AVX_Usable),
|
||||
__mempcpy_chk_avx_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__mempcpy_chk_avx_unaligned_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__mempcpy_chk_avx_unaligned_erms_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
HAS_ARCH_FEATURE (AVX512VL_Usable),
|
||||
__mempcpy_chk_evex_unaligned)
|
||||
|
@ -701,6 +835,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
HAS_ARCH_FEATURE (AVX_Usable),
|
||||
__mempcpy_avx_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__mempcpy_avx_unaligned_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
(HAS_ARCH_FEATURE (AVX_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__mempcpy_avx_unaligned_erms_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
HAS_ARCH_FEATURE (AVX512VL_Usable),
|
||||
__mempcpy_evex_unaligned)
|
||||
|
@ -722,6 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|||
IFUNC_IMPL_ADD (array, i, strncmp,
|
||||
HAS_ARCH_FEATURE (AVX2_Usable),
|
||||
__strncmp_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strncmp,
|
||||
(HAS_ARCH_FEATURE (AVX2_Usable)
|
||||
&& HAS_CPU_FEATURE (RTM)),
|
||||
__strncmp_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strncmp,
|
||||
(HAS_ARCH_FEATURE (AVX512VL_Usable)
|
||||
&& HAS_ARCH_FEATURE (AVX512BW_Usable)),
|
||||
|
|
|
@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|||
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
|
||||
|
||||
static inline void *
|
||||
|
@ -38,6 +39,9 @@ IFUNC_SELECTOR (void)
|
|||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
|
||||
return OPTIMIZE (evex_movbe);
|
||||
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, RTM))
|
||||
return OPTIMIZE (avx2_movbe_rtm);
|
||||
|
||||
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
return OPTIMIZE (avx2_movbe);
|
||||
}
|
||||
|
|
|
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
  attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm)
  attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm)
  attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
  attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)

@@ -71,6 +75,14 @@ IFUNC_SELECTOR (void)
	  return OPTIMIZE (evex_unaligned);
	}

      if (CPU_FEATURES_CPU_P (cpu_features, RTM))
	{
	  if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
	    return OPTIMIZE (avx_unaligned_erms_rtm);

	  return OPTIMIZE (avx_unaligned_rtm);
	}

      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
	{
	  if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
|
|||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
|
||||
attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
|
||||
attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm)
|
||||
attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
|
||||
attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
|
||||
|
@ -69,6 +73,14 @@ IFUNC_SELECTOR (void)
|
|||
return OPTIMIZE (evex_unaligned);
|
||||
}
|
||||
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, RTM))
|
||||
{
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
|
||||
return OPTIMIZE (avx2_unaligned_erms_rtm);
|
||||
|
||||
return OPTIMIZE (avx2_unaligned_rtm);
|
||||
}
|
||||
|
||||
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
{
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
|
||||
|
|
|
@ -20,6 +20,8 @@
|
|||
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm)
|
||||
attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
|
||||
|
||||
|
@ -39,6 +41,9 @@ IFUNC_SELECTOR (void)
|
|||
if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable))
|
||||
return OPTIMIZE (evex_unaligned);
|
||||
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, RTM))
|
||||
return OPTIMIZE (avx2_unaligned_rtm);
|
||||
|
||||
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
return OPTIMIZE (avx2_unaligned);
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,12 @@
#ifndef MEMCHR
# define MEMCHR __memchr_avx2_rtm
#endif

#define ZERO_UPPER_VEC_REGISTERS_RETURN \
  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST

#define VZEROUPPER_RETURN jmp	 L(return_vzeroupper)

#define SECTION(p) p##.avx.rtm

#include "memchr-avx2.S"
@@ -34,9 +34,13 @@
#  define VZEROUPPER	vzeroupper
# endif

# ifndef SECTION
#  define SECTION(p)	p##.avx
# endif

# define VEC_SIZE 32

	.section .text.avx,"ax",@progbits
	.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
@@ -107,8 +111,8 @@ L(cros_page_boundary):
# endif
	addq	%rdi, %rax
	addq	%rcx, %rax
	VZEROUPPER
	ret
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

	.p2align 4
L(aligned_more):
@ -224,8 +228,7 @@ L(last_4x_vec_or_less):
|
|||
|
||||
jnz L(first_vec_x3_check)
|
||||
xorl %eax, %eax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_2x_vec):
|
||||
|
@ -243,8 +246,7 @@ L(last_2x_vec):
|
|||
testl %eax, %eax
|
||||
jnz L(first_vec_x1_check)
|
||||
xorl %eax, %eax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x0_check):
|
||||
|
@ -253,8 +255,7 @@ L(first_vec_x0_check):
|
|||
cmpq %rax, %rdx
|
||||
jbe L(zero)
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x1_check):
|
||||
|
@ -264,8 +265,7 @@ L(first_vec_x1_check):
|
|||
jbe L(zero)
|
||||
addq $VEC_SIZE, %rax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x2_check):
|
||||
|
@ -275,8 +275,7 @@ L(first_vec_x2_check):
|
|||
jbe L(zero)
|
||||
addq $(VEC_SIZE * 2), %rax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x3_check):
|
||||
|
@ -286,12 +285,14 @@ L(first_vec_x3_check):
|
|||
jbe L(zero)
|
||||
addq $(VEC_SIZE * 3), %rax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(zero):
|
||||
VZEROUPPER
|
||||
xorl %eax, %eax
|
||||
jmp L(return_vzeroupper)
|
||||
|
||||
.p2align 4
|
||||
L(null):
|
||||
xorl %eax, %eax
|
||||
ret
|
||||
|
@ -301,24 +302,21 @@ L(null):
|
|||
L(first_vec_x0):
|
||||
tzcntl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x1):
|
||||
tzcntl %eax, %eax
|
||||
addq $VEC_SIZE, %rax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x2):
|
||||
tzcntl %eax, %eax
|
||||
addq $(VEC_SIZE * 2), %rax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(4x_vec_end):
|
||||
|
@ -337,8 +335,7 @@ L(first_vec_x3):
|
|||
tzcntl %eax, %eax
|
||||
addq $(VEC_SIZE * 3), %rax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
END (MEMCHR)
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef MEMCMP
|
||||
# define MEMCMP __memcmp_avx2_movbe_rtm
|
||||
#endif
|
||||
|
||||
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
||||
|
||||
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
|
||||
|
||||
#define SECTION(p) p##.avx.rtm
|
||||
|
||||
#include "memcmp-avx2-movbe.S"
|
|
@ -47,6 +47,10 @@
|
|||
# define VZEROUPPER vzeroupper
|
||||
# endif
|
||||
|
||||
# ifndef SECTION
|
||||
# define SECTION(p) p##.avx
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
# define VEC_MASK ((1 << VEC_SIZE) - 1)
|
||||
|
||||
|
@ -55,7 +59,7 @@
|
|||
memcmp has to use UNSIGNED comparison for elemnts.
|
||||
*/
|
||||
|
||||
.section .text.avx,"ax",@progbits
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (MEMCMP)
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
shl $2, %RDX_LP
|
||||
|
@ -123,8 +127,8 @@ ENTRY (MEMCMP)
|
|||
vptest %ymm0, %ymm5
|
||||
jnc L(4x_vec_end)
|
||||
xorl %eax, %eax
|
||||
VZEROUPPER
|
||||
ret
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_2x_vec):
|
||||
|
@ -144,8 +148,7 @@ L(last_vec):
|
|||
vpmovmskb %ymm2, %eax
|
||||
subl $VEC_MASK, %eax
|
||||
jnz L(first_vec)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec):
|
||||
|
@ -164,8 +167,7 @@ L(wmemcmp_return):
|
|||
movzbl (%rsi, %rcx), %edx
|
||||
sub %edx, %eax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
.p2align 4
|
||||
|
@ -367,8 +369,7 @@ L(last_4x_vec):
|
|||
vpmovmskb %ymm2, %eax
|
||||
subl $VEC_MASK, %eax
|
||||
jnz L(first_vec)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(4x_vec_end):
|
||||
|
@ -394,8 +395,7 @@ L(4x_vec_end):
|
|||
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
|
||||
sub %edx, %eax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x1):
|
||||
|
@ -410,8 +410,7 @@ L(first_vec_x1):
|
|||
movzbl VEC_SIZE(%rsi, %rcx), %edx
|
||||
sub %edx, %eax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x2):
|
||||
|
@ -426,7 +425,6 @@ L(first_vec_x2):
|
|||
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
|
||||
sub %edx, %eax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
END (MEMCMP)
|
||||
#endif
|
||||
|
|
|
@@ -0,0 +1,17 @@
#if IS_IN (libc)
# define VEC_SIZE	32
# define VEC(i)		ymm##i
# define VMOVNT		vmovntdq
# define VMOVU		vmovdqu
# define VMOVA		vmovdqa

# define ZERO_UPPER_VEC_REGISTERS_RETURN \
  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST

# define VZEROUPPER_RETURN jmp	 L(return)

# define SECTION(p)		p##.avx.rtm
# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s##_rtm

# include "memmove-vec-unaligned-erms.S"
#endif
@ -158,11 +158,12 @@ L(last_2x_vec):
|
|||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
#if !defined USE_MULTIARCH || !IS_IN (libc)
|
||||
L(nop):
|
||||
#endif
|
||||
ret
|
||||
#else
|
||||
VZEROUPPER_RETURN
|
||||
#endif
|
||||
#if defined USE_MULTIARCH && IS_IN (libc)
|
||||
END (MEMMOVE_SYMBOL (__memmove, unaligned))
|
||||
|
||||
|
@ -255,8 +256,11 @@ L(last_2x_vec):
|
|||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
|
||||
L(return):
|
||||
VZEROUPPER
|
||||
#if VEC_SIZE > 16
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
#else
|
||||
ret
|
||||
#endif
|
||||
|
||||
L(movsb):
|
||||
cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
|
||||
|
@ -324,8 +328,7 @@ L(between_32_63):
|
|||
VMOVU -32(%rsi,%rdx), %YMM1
|
||||
VMOVU %YMM0, (%rdi)
|
||||
VMOVU %YMM1, -32(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
#endif
|
||||
#if VEC_SIZE > 16
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
|
@ -334,7 +337,7 @@ L(between_16_31):
|
|||
VMOVU -16(%rsi,%rdx), %XMM1
|
||||
VMOVU %XMM0, (%rdi)
|
||||
VMOVU %XMM1, -16(%rdi,%rdx)
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
#endif
|
||||
L(between_8_15):
|
||||
/* From 8 to 15. No branch when size == 8. */
|
||||
|
@ -387,8 +390,7 @@ L(more_2x_vec):
|
|||
VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
|
||||
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
L(last_4x_vec):
|
||||
/* Copy from 2 * VEC to 4 * VEC. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
|
@ -399,8 +401,7 @@ L(last_4x_vec):
|
|||
VMOVU %VEC(1), VEC_SIZE(%rdi)
|
||||
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
|
||||
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
L(more_8x_vec):
|
||||
cmpq %rsi, %rdi
|
||||
|
@ -456,8 +457,7 @@ L(loop_4x_vec_forward):
|
|||
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
||||
/* Store the first VEC. */
|
||||
VMOVU %VEC(4), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
L(more_8x_vec_backward):
|
||||
/* Load the first 4 * VEC and last VEC to support overlapping
|
||||
|
@ -508,8 +508,7 @@ L(loop_4x_vec_backward):
|
|||
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
||||
/* Store the last VEC. */
|
||||
VMOVU %VEC(8), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||||
L(large_forward):
|
||||
|
@ -544,8 +543,7 @@ L(loop_large_forward):
|
|||
VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
||||
/* Store the first VEC. */
|
||||
VMOVU %VEC(4), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
L(large_backward):
|
||||
/* Don't use non-temporal store if there is overlap between
|
||||
|
@ -579,8 +577,7 @@ L(loop_large_backward):
|
|||
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
||||
/* Store the last VEC. */
|
||||
VMOVU %VEC(8), (%r11)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
#endif
|
||||
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
||||
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef MEMRCHR
|
||||
# define MEMRCHR __memrchr_avx2_rtm
|
||||
#endif
|
||||
|
||||
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
||||
|
||||
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
|
||||
|
||||
#define SECTION(p) p##.avx.rtm
|
||||
|
||||
#include "memrchr-avx2.S"
|
|
@ -20,14 +20,22 @@
|
|||
|
||||
# include <sysdep.h>
|
||||
|
||||
# ifndef MEMRCHR
|
||||
# define MEMRCHR __memrchr_avx2
|
||||
# endif
|
||||
|
||||
# ifndef VZEROUPPER
|
||||
# define VZEROUPPER vzeroupper
|
||||
# endif
|
||||
|
||||
# ifndef SECTION
|
||||
# define SECTION(p) p##.avx
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
|
||||
.section .text.avx,"ax",@progbits
|
||||
ENTRY (__memrchr_avx2)
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (MEMRCHR)
|
||||
/* Broadcast CHAR to YMM0. */
|
||||
vmovd %esi, %xmm0
|
||||
vpbroadcastb %xmm0, %ymm0
|
||||
|
@ -134,8 +142,8 @@ L(loop_4x_vec):
|
|||
vpmovmskb %ymm1, %eax
|
||||
bsrl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_4x_vec_or_less):
|
||||
|
@ -169,8 +177,7 @@ L(last_4x_vec_or_less):
|
|||
addq %rax, %rdx
|
||||
jl L(zero)
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_2x_vec):
|
||||
|
@ -191,31 +198,27 @@ L(last_2x_vec):
|
|||
jl L(zero)
|
||||
addl $(VEC_SIZE * 2), %eax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_vec_x0):
|
||||
bsrl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_vec_x1):
|
||||
bsrl %eax, %eax
|
||||
addl $VEC_SIZE, %eax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_vec_x2):
|
||||
bsrl %eax, %eax
|
||||
addl $(VEC_SIZE * 2), %eax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_vec_x3):
|
||||
|
@ -232,8 +235,7 @@ L(last_vec_x1_check):
|
|||
jl L(zero)
|
||||
addl $VEC_SIZE, %eax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_vec_x3_check):
|
||||
|
@ -243,12 +245,14 @@ L(last_vec_x3_check):
|
|||
jl L(zero)
|
||||
addl $(VEC_SIZE * 3), %eax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(zero):
|
||||
VZEROUPPER
|
||||
xorl %eax, %eax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(null):
|
||||
xorl %eax, %eax
|
||||
ret
|
||||
|
@ -273,8 +277,7 @@ L(last_vec_or_less_aligned):
|
|||
|
||||
bsrl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_vec_or_less):
|
||||
|
@ -315,8 +318,7 @@ L(last_vec_or_less):
|
|||
bsrl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
addq %r8, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_vec_2x_aligned):
|
||||
|
@ -353,7 +355,6 @@ L(last_vec_2x_aligned):
|
|||
bsrl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
addq %r8, %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
END (__memrchr_avx2)
|
||||
VZEROUPPER_RETURN
|
||||
END (MEMRCHR)
|
||||
#endif
|
||||
|
|
|
@@ -0,0 +1,10 @@
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
  ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST

#define VZEROUPPER_RETURN jmp	 L(return)

#define SECTION(p)		p##.avx.rtm
#define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
#define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm

#include "memset-avx2-unaligned-erms.S"
@@ -14,9 +14,15 @@
  movq r, %rax; \
  vpbroadcastd %xmm0, %ymm0

# define SECTION(p)		p##.avx
# define MEMSET_SYMBOL(p,s)	p##_avx2_##s
# define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
# ifndef SECTION
#  define SECTION(p)		p##.avx
# endif
# ifndef MEMSET_SYMBOL
#  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
# endif
# ifndef WMEMSET_SYMBOL
#  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
# endif

# include "memset-vec-unaligned-erms.S"
#endif
@@ -45,17 +45,14 @@
#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
#  define VZEROUPPER_SHORT_RETURN	vzeroupper; ret
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
# define VZEROUPPER_SHORT_RETURN	rep; ret
#endif

#ifndef MOVQ
@ -127,8 +124,7 @@ L(entry_from_bzero):
|
|||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
#if defined USE_MULTIARCH && IS_IN (libc)
|
||||
END (MEMSET_SYMBOL (__memset, unaligned))
|
||||
|
||||
|
@ -151,14 +147,12 @@ ENTRY (__memset_erms)
|
|||
ENTRY (MEMSET_SYMBOL (__memset, erms))
|
||||
# endif
|
||||
L(stosb):
|
||||
/* Issue vzeroupper before rep stosb. */
|
||||
VZEROUPPER
|
||||
mov %RDX_LP, %RCX_LP
|
||||
movzbl %sil, %eax
|
||||
mov %RDI_LP, %RDX_LP
|
||||
rep stosb
|
||||
mov %RDX_LP, %RAX_LP
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
# if VEC_SIZE == 16
|
||||
END (__memset_erms)
|
||||
# else
|
||||
|
@ -185,8 +179,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
|||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
L(stosb_more_2x_vec):
|
||||
cmpq $REP_STOSB_THRESHOLD, %rdx
|
||||
|
@ -200,8 +193,11 @@ L(more_2x_vec):
|
|||
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
L(return):
|
||||
VZEROUPPER
|
||||
#if VEC_SIZE > 16
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
#else
|
||||
ret
|
||||
#endif
|
||||
|
||||
L(loop_start):
|
||||
leaq (VEC_SIZE * 4)(%rdi), %rcx
|
||||
|
@ -227,7 +223,6 @@ L(loop):
|
|||
cmpq %rcx, %rdx
|
||||
jne L(loop)
|
||||
VZEROUPPER_SHORT_RETURN
|
||||
ret
|
||||
L(less_vec):
|
||||
/* Less than 1 VEC. */
|
||||
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||
|
@ -251,40 +246,34 @@ L(less_vec):
|
|||
jb 1f
|
||||
movb %cl, (%rdi)
|
||||
1:
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
# if VEC_SIZE > 32
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
L(between_32_63):
|
||||
VMOVU %YMM0, -32(%rdi,%rdx)
|
||||
VMOVU %YMM0, (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
# endif
|
||||
# if VEC_SIZE > 16
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
L(between_16_31):
|
||||
VMOVU %XMM0, -16(%rdi,%rdx)
|
||||
VMOVU %XMM0, (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
# endif
|
||||
/* From 8 to 15. No branch when size == 8. */
|
||||
L(between_8_15):
|
||||
movq %rcx, -8(%rdi,%rdx)
|
||||
movq %rcx, (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
L(between_4_7):
|
||||
/* From 4 to 7. No branch when size == 4. */
|
||||
movl %ecx, -4(%rdi,%rdx)
|
||||
movl %ecx, (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
L(between_2_3):
|
||||
/* From 2 to 3. No branch when size == 2. */
|
||||
movw %cx, -2(%rdi,%rdx)
|
||||
movw %cx, (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
#define MEMCHR __rawmemchr_avx2_rtm
|
||||
#define USE_AS_RAWMEMCHR 1
|
||||
|
||||
#include "memchr-avx2-rtm.S"
|
|
@ -0,0 +1,4 @@
|
|||
#define USE_AS_STPCPY
|
||||
#define USE_AS_STRNCPY
|
||||
#define STRCPY __stpncpy_avx2_rtm
|
||||
#include "strcpy-avx2-rtm.S"
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef STRCAT
|
||||
# define STRCAT __strcat_avx2_rtm
|
||||
#endif
|
||||
|
||||
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
||||
|
||||
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
|
||||
|
||||
#define SECTION(p) p##.avx.rtm
|
||||
|
||||
#include "strcat-avx2.S"
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef STRCHR
|
||||
# define STRCHR __strchr_avx2_rtm
|
||||
#endif
|
||||
|
||||
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
||||
|
||||
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
|
||||
|
||||
#define SECTION(p) p##.avx.rtm
|
||||
|
||||
#include "strchr-avx2.S"
|
|
@ -38,9 +38,13 @@
|
|||
# define VZEROUPPER vzeroupper
|
||||
# endif
|
||||
|
||||
# ifndef SECTION
|
||||
# define SECTION(p) p##.avx
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
|
||||
.section .text.avx,"ax",@progbits
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRCHR)
|
||||
movl %edi, %ecx
|
||||
/* Broadcast CHAR to YMM0. */
|
||||
|
@ -93,8 +97,8 @@ L(cros_page_boundary):
|
|||
cmp (%rax), %CHAR_REG
|
||||
cmovne %rdx, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(aligned_more):
|
||||
|
@ -190,8 +194,7 @@ L(first_vec_x0):
|
|||
cmp (%rax), %CHAR_REG
|
||||
cmovne %rdx, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x1):
|
||||
|
@ -205,8 +208,7 @@ L(first_vec_x1):
|
|||
cmp (%rax), %CHAR_REG
|
||||
cmovne %rdx, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x2):
|
||||
|
@ -220,8 +222,7 @@ L(first_vec_x2):
|
|||
cmp (%rax), %CHAR_REG
|
||||
cmovne %rdx, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(4x_vec_end):
|
||||
|
@ -247,8 +248,7 @@ L(first_vec_x3):
|
|||
cmp (%rax), %CHAR_REG
|
||||
cmovne %rdx, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
END (STRCHR)
|
||||
#endif
|
||||
|
|
|
@ -29,6 +29,7 @@
|
|||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
||||
|
||||
static inline void *
|
||||
|
@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
|
|||
&& CPU_FEATURES_CPU_P (cpu_features, BMI2))
|
||||
return OPTIMIZE (evex);
|
||||
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, RTM))
|
||||
return OPTIMIZE (avx2_rtm);
|
||||
|
||||
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
return OPTIMIZE (avx2);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
#define STRCHR __strchrnul_avx2_rtm
|
||||
#define USE_AS_STRCHRNUL 1
|
||||
#include "strchr-avx2-rtm.S"
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef STRCMP
|
||||
# define STRCMP __strcmp_avx2_rtm
|
||||
#endif
|
||||
|
||||
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
||||
|
||||
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
|
||||
|
||||
#define SECTION(p) p##.avx.rtm
|
||||
|
||||
#include "strcmp-avx2.S"
|
|
@ -55,6 +55,10 @@
|
|||
# define VZEROUPPER vzeroupper
|
||||
# endif
|
||||
|
||||
# ifndef SECTION
|
||||
# define SECTION(p) p##.avx
|
||||
# endif
|
||||
|
||||
/* Warning!
|
||||
wcscmp/wcsncmp have to use SIGNED comparison for elements.
|
||||
strcmp/strncmp have to use UNSIGNED comparison for elements.
|
||||
|
@ -75,7 +79,7 @@
|
|||
the maximum offset is reached before a difference is found, zero is
|
||||
returned. */
|
||||
|
||||
.section .text.avx,"ax",@progbits
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRCMP)
|
||||
# ifdef USE_AS_STRNCMP
|
||||
/* Check for simple cases (0 or 1) in offset. */
|
||||
|
@ -137,8 +141,8 @@ L(return):
|
|||
movzbl (%rsi, %rdx), %edx
|
||||
subl %edx, %eax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(return_vec_size):
|
||||
|
@ -171,8 +175,7 @@ L(return_vec_size):
|
|||
subl %edx, %eax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(return_2_vec_size):
|
||||
|
@ -205,8 +208,7 @@ L(return_2_vec_size):
|
|||
subl %edx, %eax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(return_3_vec_size):
|
||||
|
@ -239,8 +241,7 @@ L(return_3_vec_size):
|
|||
subl %edx, %eax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(next_3_vectors):
|
||||
|
@ -366,8 +367,7 @@ L(back_to_loop):
|
|||
subl %edx, %eax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(test_vec):
|
||||
|
@ -410,8 +410,7 @@ L(test_vec):
|
|||
subl %edx, %eax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(test_2_vec):
|
||||
|
@ -454,8 +453,7 @@ L(test_2_vec):
|
|||
subl %edx, %eax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(test_3_vec):
|
||||
|
@ -496,8 +494,7 @@ L(test_3_vec):
|
|||
subl %edx, %eax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(loop_cross_page):
|
||||
|
@ -566,8 +563,7 @@ L(loop_cross_page):
|
|||
subl %edx, %eax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(loop_cross_page_2_vec):
|
||||
|
@ -641,8 +637,7 @@ L(loop_cross_page_2_vec):
|
|||
subl %edx, %eax
|
||||
# endif
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
# ifdef USE_AS_STRNCMP
|
||||
L(string_nbyte_offset_check):
|
||||
|
@ -684,8 +679,7 @@ L(cross_page_loop):
|
|||
# ifndef USE_AS_WCSCMP
|
||||
L(different):
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
# ifdef USE_AS_WCSCMP
|
||||
.p2align 4
|
||||
|
@ -695,16 +689,14 @@ L(different):
|
|||
setl %al
|
||||
negl %eax
|
||||
orl $1, %eax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
# endif
|
||||
|
||||
# ifdef USE_AS_STRNCMP
|
||||
.p2align 4
|
||||
L(zero):
|
||||
xorl %eax, %eax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(char0):
|
||||
|
@ -718,8 +710,7 @@ L(char0):
|
|||
movzbl (%rdi), %eax
|
||||
subl %ecx, %eax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
# endif
|
||||
|
||||
.p2align 4
|
||||
|
@ -744,8 +735,7 @@ L(last_vector):
|
|||
movzbl (%rsi, %rdx), %edx
|
||||
subl %edx, %eax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
/* Comparing on page boundary region requires special treatment:
|
||||
It must done one vector at the time, starting with the wider
|
||||
|
@ -866,7 +856,6 @@ L(cross_page_4bytes):
|
|||
testl %eax, %eax
|
||||
jne L(cross_page_loop)
|
||||
subl %ecx, %eax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
END (STRCMP)
|
||||
#endif
|
||||
|
|
|
@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
||||
|
||||
static inline void *
|
||||
|
@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
|
|||
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
|
||||
return OPTIMIZE (evex);
|
||||
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, RTM))
|
||||
return OPTIMIZE (avx2_rtm);
|
||||
|
||||
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
return OPTIMIZE (avx2);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef STRCPY
|
||||
# define STRCPY __strcpy_avx2_rtm
|
||||
#endif
|
||||
|
||||
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
||||
|
||||
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
|
||||
|
||||
#define SECTION(p) p##.avx.rtm
|
||||
|
||||
#include "strcpy-avx2.S"
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef STRLEN
|
||||
# define STRLEN __strlen_avx2_rtm
|
||||
#endif
|
||||
|
||||
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
||||
|
||||
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
|
||||
|
||||
#define SECTION(p) p##.avx.rtm
|
||||
|
||||
#include "strlen-avx2.S"
|
|
@ -36,9 +36,13 @@
|
|||
# define VZEROUPPER vzeroupper
|
||||
# endif
|
||||
|
||||
# ifndef SECTION
|
||||
# define SECTION(p) p##.avx
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
|
||||
.section .text.avx,"ax",@progbits
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRLEN)
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Check for zero length. */
|
||||
|
@ -111,8 +115,8 @@ L(cros_page_boundary):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(aligned_more):
|
||||
|
@ -231,8 +235,7 @@ L(last_4x_vec_or_less):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(last_2x_vec):
|
||||
|
@ -253,8 +256,7 @@ L(last_2x_vec):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x0_check):
|
||||
|
@ -267,8 +269,7 @@ L(first_vec_x0_check):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x1_check):
|
||||
|
@ -282,8 +283,7 @@ L(first_vec_x1_check):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x2_check):
|
||||
|
@ -297,8 +297,7 @@ L(first_vec_x2_check):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x3_check):
|
||||
|
@ -312,8 +311,7 @@ L(first_vec_x3_check):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(max):
|
||||
|
@ -321,8 +319,7 @@ L(max):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(zero):
|
||||
|
@ -338,8 +335,7 @@ L(first_vec_x0):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x1):
|
||||
|
@ -350,8 +346,7 @@ L(first_vec_x1):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x2):
|
||||
|
@ -362,8 +357,7 @@ L(first_vec_x2):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(4x_vec_end):
|
||||
|
@ -389,8 +383,7 @@ L(first_vec_x3):
|
|||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
END (STRLEN)
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
#define USE_AS_STRNCAT
|
||||
#define STRCAT __strncat_avx2_rtm
|
||||
#include "strcat-avx2-rtm.S"
|
|
@ -0,0 +1,3 @@
|
|||
#define STRCMP __strncmp_avx2_rtm
|
||||
#define USE_AS_STRNCMP 1
|
||||
#include "strcmp-avx2-rtm.S"
|
|
@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|||
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
||||
|
||||
static inline void *
|
||||
|
@ -46,6 +47,9 @@ IFUNC_SELECTOR (void)
|
|||
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
|
||||
return OPTIMIZE (evex);
|
||||
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, RTM))
|
||||
return OPTIMIZE (avx2_rtm);
|
||||
|
||||
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
return OPTIMIZE (avx2);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
#define USE_AS_STRNCPY
|
||||
#define STRCPY __strncpy_avx2_rtm
|
||||
#include "strcpy-avx2-rtm.S"
|
|
@ -0,0 +1,4 @@
|
|||
#define STRLEN __strnlen_avx2_rtm
|
||||
#define USE_AS_STRNLEN 1
|
||||
|
||||
#include "strlen-avx2-rtm.S"
|
|
@ -0,0 +1,12 @@
|
|||
#ifndef STRRCHR
|
||||
# define STRRCHR __strrchr_avx2_rtm
|
||||
#endif
|
||||
|
||||
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
||||
|
||||
#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
|
||||
|
||||
#define SECTION(p) p##.avx.rtm
|
||||
|
||||
#include "strrchr-avx2.S"
|
|
@ -36,9 +36,13 @@
|
|||
# define VZEROUPPER vzeroupper
|
||||
# endif
|
||||
|
||||
# ifndef SECTION
|
||||
# define SECTION(p) p##.avx
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
|
||||
.section .text.avx,"ax",@progbits
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRRCHR)
|
||||
movd %esi, %xmm4
|
||||
movl %edi, %ecx
|
||||
|
@ -166,8 +170,8 @@ L(return_value):
|
|||
# endif
|
||||
bsrl %eax, %eax
|
||||
leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(match):
|
||||
|
@ -198,8 +202,7 @@ L(find_nul):
|
|||
jz L(return_value)
|
||||
bsrl %eax, %eax
|
||||
leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(char_and_nul):
|
||||
|
@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec):
|
|||
jz L(return_null)
|
||||
bsrl %eax, %eax
|
||||
leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
L(return_null):
|
||||
xorl %eax, %eax
|
||||
VZEROUPPER
|
||||
ret
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
END (STRRCHR)
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
#define STRCHR __wcschr_avx2_rtm
|
||||
#define USE_AS_WCSCHR 1
|
||||
#include "strchr-avx2-rtm.S"
|
|
@ -0,0 +1,4 @@
|
|||
#define STRCMP __wcscmp_avx2_rtm
|
||||
#define USE_AS_WCSCMP 1
|
||||
|
||||
#include "strcmp-avx2-rtm.S"
|
|
@ -0,0 +1,4 @@
|
|||
#define STRLEN __wcslen_avx2_rtm
|
||||
#define USE_AS_WCSLEN 1
|
||||
|
||||
#include "strlen-avx2-rtm.S"
|
|
@ -0,0 +1,5 @@
|
|||
#define STRCMP __wcsncmp_avx2_rtm
|
||||
#define USE_AS_STRNCMP 1
|
||||
#define USE_AS_WCSCMP 1
|
||||
|
||||
#include "strcmp-avx2-rtm.S"
|
|
@ -0,0 +1,5 @@
|
|||
#define STRLEN __wcsnlen_avx2_rtm
|
||||
#define USE_AS_WCSLEN 1
|
||||
#define USE_AS_STRNLEN 1
|
||||
|
||||
#include "strlen-avx2-rtm.S"
|
|
@ -29,6 +29,7 @@
|
|||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
||||
|
||||
static inline void *
|
||||
|
@ -44,6 +45,9 @@ IFUNC_SELECTOR (void)
|
|||
&& CPU_FEATURES_CPU_P (cpu_features, BMI2))
|
||||
return OPTIMIZE (evex);
|
||||
|
||||
if (CPU_FEATURES_CPU_P (cpu_features, RTM))
|
||||
return OPTIMIZE (avx2_rtm);
|
||||
|
||||
if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
return OPTIMIZE (avx2);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
#define STRRCHR __wcsrchr_avx2_rtm
|
||||
#define USE_AS_WCSRCHR 1
|
||||
#include "strrchr-avx2-rtm.S"
|
|
@ -0,0 +1,4 @@
|
|||
#define MEMCHR __wmemchr_avx2_rtm
|
||||
#define USE_AS_WMEMCHR 1
|
||||
|
||||
#include "memchr-avx2-rtm.S"
|
|
@ -0,0 +1,4 @@
|
|||
#define MEMCMP __wmemcmp_avx2_movbe_rtm
|
||||
#define USE_AS_WMEMCMP 1
|
||||
|
||||
#include "memcmp-avx2-movbe-rtm.S"
|
|
@@ -95,6 +95,28 @@ lose: \
#define R14_LP	r14
#define R15_LP	r15

/* Zero upper vector registers and return with xtest.  NB: Use VZEROALL
   to avoid RTM abort triggered by VZEROUPPER inside transactionally.  */
#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
	xtest;			\
	jz	1f;		\
	vzeroall;		\
	ret;			\
1:				\
	vzeroupper;		\
	ret

/* Zero upper vector registers and return.  */
#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
# define ZERO_UPPER_VEC_REGISTERS_RETURN \
	VZEROUPPER;		\
	ret
#endif

#ifndef VZEROUPPER_RETURN
# define VZEROUPPER_RETURN	VZEROUPPER; ret
#endif

#else	/* __ASSEMBLER__ */

/* Long and pointer size in bytes.  */