X86-64: Prepare memset-vec-unaligned-erms.S

Prepare memset-vec-unaligned-erms.S to make the SSE2 version the
default memset.

	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
	(MEMSET_CHK_SYMBOL): New.  Define if not defined.
	(__bzero): Check VEC_SIZE == 16 instead of USE_MULTIARCH.
	Disabled for now.
	Replace MEMSET_SYMBOL with MEMSET_CHK_SYMBOL on __memset_chk
	symbols.  Properly check USE_MULTIARCH on __memset symbols.
Author: H.J. Lu <hongjiu.lu@intel.com>
Date:   2016-04-06 09:10:18 -07:00
parent a25322f4e8
commit 4af1bb06c5
2 changed files with 28 additions and 13 deletions
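With the MEMSET_CHK_SYMBOL fallback and the USE_MULTIARCH guards introduced below, the SSE2 (VEC_SIZE == 16) flavor of this file can later be built as the default memset by a plain wrapper that defines the vector macros and includes it. That wrapper is not part of this commit; what follows is only a minimal sketch of how a sysdeps/x86_64/memset.S along those lines could look, and every macro body in it is an assumption for illustration, not code from this patch.

/* Hypothetical sysdeps/x86_64/memset.S wrapper (a sketch, not part of
   this commit): build the SSE2 flavor as the default memset.  */
#define VEC_SIZE	16
#define VEC(i)		xmm##i
#define VMOVU		movdqu
#define VMOVA		movdqa

/* Broadcast the byte in d to all 16 bytes of %xmm0 and set the return
   value from r, using SSE2 only.  */
#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
  movd d, %xmm0; \
  movq r, %rax; \
  punpcklbw %xmm0, %xmm0; \
  punpcklwd %xmm0, %xmm0; \
  pshufd $0, %xmm0, %xmm0

#define SECTION(p)		p
/* The generic entries become plain memset ...  */
#define MEMSET_SYMBOL(p,s)	memset
/* ... while __memset_chk keeps its own name; without the new
   MEMSET_CHK_SYMBOL it would also collapse to "memset".  */
#define MEMSET_CHK_SYMBOL(p,s)	p

#include "multiarch/memset-vec-unaligned-erms.S"

This is one reason the separate MEMSET_CHK_SYMBOL macro is needed: in such a wrapper MEMSET_SYMBOL must ignore its arguments, so the __memset_chk entries need their own expansion.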

ChangeLog

@@ -1,3 +1,12 @@
+2016-04-06  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+	(MEMSET_CHK_SYMBOL): New.  Define if not defined.
+	(__bzero): Check VEC_SIZE == 16 instead of USE_MULTIARCH.
+	Disabled for now.
+	Replace MEMSET_SYMBOL with MEMSET_CHK_SYMBOL on __memset_chk
+	symbols.  Properly check USE_MULTIARCH on __memset symbols.
+
 2016-04-06  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* benchtests/Makefile (string-benchset): Add memcpy-large,

sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S

@@ -28,6 +28,10 @@
 
 #include <sysdep.h>
 
+#ifndef MEMSET_CHK_SYMBOL
+# define MEMSET_CHK_SYMBOL(p,s)	MEMSET_SYMBOL(p, s)
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER	vzeroupper
@ -66,8 +70,8 @@
# error SECTION is not defined! # error SECTION is not defined!
#endif #endif
#if !defined USE_MULTIARCH && IS_IN (libc)
.section SECTION(.text),"ax",@progbits .section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc) && 0
ENTRY (__bzero) ENTRY (__bzero)
movq %rdi, %rax /* Set return value. */ movq %rdi, %rax /* Set return value. */
movq %rsi, %rdx /* Set n. */ movq %rsi, %rdx /* Set n. */
@@ -78,10 +82,10 @@ weak_alias (__bzero, bzero)
 #endif
 
 #if defined SHARED && IS_IN (libc)
-ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
@@ -97,15 +101,16 @@ L(entry_from_bzero):
 	VMOVU	%VEC(0), (%rdi)
 	VZEROUPPER
 	ret
+#if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
-#if VEC_SIZE == 16
+# if VEC_SIZE == 16
 /* Only used to measure performance of REP STOSB.  */
 ENTRY (__memset_erms)
-#else
+# else
 /* Provide a symbol to debugger.  */
 ENTRY (MEMSET_SYMBOL (__memset, erms))
-#endif
+# endif
 L(stosb):
 	movq	%rdx, %rcx
 	movzbl	%sil, %eax
@@ -113,18 +118,18 @@ L(stosb):
 	rep stosb
 	movq	%rdx, %rax
 	ret
-#if VEC_SIZE == 16
+# if VEC_SIZE == 16
 END (__memset_erms)
-#else
+# else
 END (MEMSET_SYMBOL (__memset, erms))
-#endif
+# endif
 
-#if defined SHARED && IS_IN (libc)
-ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+# if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
-#endif
+END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+# endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
@@ -144,6 +149,7 @@ L(stosb_more_2x_vec):
 	/* Force 32-bit displacement to avoid long nop between
 	   instructions.  */
 	ja.d32	L(stosb)
+#endif
 	.p2align 4
 L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 4), %rdx
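In a multiarch build, where USE_MULTIARCH is defined, the same source keeps emitting the erms and unaligned_erms entry points, and each variant wrapper gives MEMSET_SYMBOL a suffixed expansion so that this file produces the IFUNC candidates. Below is a rough sketch in the spirit of the existing AVX2 wrapper, memset-avx2-unaligned-erms.S; the exact macro bodies are assumptions for illustration rather than text from this patch.

/* Hypothetical multiarch wrapper modeled on memset-avx2-unaligned-erms.S:
   the suffix in MEMSET_SYMBOL turns the generic entries into
   __memset_avx2_unaligned and __memset_avx2_unaligned_erms, and the
   MEMSET_CHK_SYMBOL fallback yields the matching __memset_chk_avx2_*
   names.  */
#if IS_IN (libc)
# define VEC_SIZE	32
# define VEC(i)		ymm##i
# define VMOVU		vmovdqu
# define VMOVA		vmovdqa

/* AVX2 byte broadcast into %ymm0 plus setting the return value.  */
# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
  vmovd d, %xmm0; \
  movq r, %rax; \
  vpbroadcastb %xmm0, %ymm0

# define SECTION(p)		p##.avx
# define MEMSET_SYMBOL(p,s)	p##_avx2_##s

# include "memset-vec-unaligned-erms.S"
#endif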