x86: Optimize memcmp SSE2 in memcmp.S

The new code saves size (-303 bytes) and has significantly better
performance.

geometric_mean(N=20) of page cross cases New / Original: 0.634
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Noah Goldstein 2022-04-15 12:27:59 -05:00
parent ac0d208b54
commit 8804157ad9
8 changed files with 584 additions and 385 deletions

View File

@ -18,395 +18,557 @@
#include <sysdep.h> #include <sysdep.h>
#ifdef USE_AS_WMEMCMP
# define PCMPEQ pcmpeqd
# define CHAR_SIZE 4
# define SIZE_OFFSET (0)
#else
# define PCMPEQ pcmpeqb
# define CHAR_SIZE 1
#endif
#ifdef USE_AS_MEMCMPEQ
# define SIZE_OFFSET (0)
# define CHECK_CMP(x, y) subl x, y
#else
# ifndef SIZE_OFFSET
# define SIZE_OFFSET (CHAR_PER_VEC * 2)
# endif
# define CHECK_CMP(x, y) cmpl x, y
#endif
#define VEC_SIZE 16
#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
#ifndef MEMCMP
# define MEMCMP memcmp
#endif
.text .text
ENTRY (memcmp) ENTRY(MEMCMP)
#ifdef __ILP32__ #ifdef USE_AS_WMEMCMP
/* Clear the upper 32 bits. */ /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
movl %edx, %edx in ecx for code size. This is preferable to using `incw` as
it avoids partial register stalls on older hardware (pre
SnB). */
movl $0xffff, %ecx
#endif #endif
test %RDX_LP, %RDX_LP cmpq $CHAR_PER_VEC, %rdx
jz L(finz) ja L(more_1x_vec)
cmpq $1, %rdx
jbe L(finr1b) #ifdef USE_AS_WMEMCMP
subq %rdi, %rsi /* saves a byte of code keeping the fall through path n = [2, 4]
movq %rdx, %r10 in the initial cache line. */
cmpq $32, %r10 decl %edx
jae L(gt32) jle L(cmp_0_1)
/* Handle small chunks and last block of less than 32 bytes. */
L(small): movq (%rsi), %xmm0
testq $1, %r10 movq (%rdi), %xmm1
jz L(s2b) PCMPEQ %xmm0, %xmm1
movzbl (%rdi), %eax pmovmskb %xmm1, %eax
movzbl (%rdi, %rsi), %edx subl %ecx, %eax
subq $1, %r10 jnz L(ret_nonzero_vec_start_0)
je L(finz1)
addq $1, %rdi movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
subl %edx, %eax movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
jnz L(exit) PCMPEQ %xmm0, %xmm1
L(s2b): pmovmskb %xmm1, %eax
testq $2, %r10 subl %ecx, %eax
jz L(s4b) jnz L(ret_nonzero_vec_end_0_adj)
movzwl (%rdi), %eax
movzwl (%rdi, %rsi), %edx
subq $2, %r10
#ifdef USE_AS_MEMCMPEQ
je L(finz1)
#else #else
je L(fin2_7) cmpl $8, %edx
#endif ja L(cmp_9_16)
addq $2, %rdi
cmpl %edx, %eax cmpl $4, %edx
#ifdef USE_AS_MEMCMPEQ jb L(cmp_0_3)
jnz L(neq_early)
#else # ifdef USE_AS_MEMCMPEQ
jnz L(fin2_7) movl (%rsi), %eax
#endif subl (%rdi), %eax
L(s4b):
testq $4, %r10 movl -4(%rsi, %rdx), %esi
jz L(s8b) subl -4(%rdi, %rdx), %esi
movl (%rdi), %eax
movl (%rdi, %rsi), %edx orl %esi, %eax
subq $4, %r10
#ifdef USE_AS_MEMCMPEQ
je L(finz1)
#else
je L(fin2_7)
#endif
addq $4, %rdi
cmpl %edx, %eax
#ifdef USE_AS_MEMCMPEQ
jnz L(neq_early)
#else
jnz L(fin2_7)
#endif
L(s8b):
testq $8, %r10
jz L(s16b)
movq (%rdi), %rax
movq (%rdi, %rsi), %rdx
subq $8, %r10
#ifdef USE_AS_MEMCMPEQ
je L(sub_return8)
#else
je L(fin2_7)
#endif
addq $8, %rdi
cmpq %rdx, %rax
#ifdef USE_AS_MEMCMPEQ
jnz L(neq_early)
#else
jnz L(fin2_7)
#endif
L(s16b):
movdqu (%rdi), %xmm1
movdqu (%rdi, %rsi), %xmm0
pcmpeqb %xmm0, %xmm1
#ifdef USE_AS_MEMCMPEQ
pmovmskb %xmm1, %eax
subl $0xffff, %eax
ret ret
#else # else
pmovmskb %xmm1, %edx /* Combine comparisons for lo and hi 4-byte comparisons. */
xorl %eax, %eax movl -4(%rsi, %rdx), %ecx
subl $0xffff, %edx movl -4(%rdi, %rdx), %eax
jz L(finz) shlq $32, %rcx
bsfl %edx, %ecx shlq $32, %rax
leaq (%rdi, %rcx), %rcx movl (%rsi), %esi
movzbl (%rcx), %eax movl (%rdi), %edi
movzbl (%rsi, %rcx), %edx orq %rsi, %rcx
jmp L(finz1) orq %rdi, %rax
#endif /* Only compute proper return if not-equal. */
.p2align 4,, 4 cmpq %rcx, %rax
L(finr1b): jnz L(ret_nonzero)
movzbl (%rdi), %eax
movzbl (%rsi), %edx
L(finz1):
subl %edx, %eax
L(exit):
ret
#ifdef USE_AS_MEMCMPEQ
.p2align 4,, 4
L(sub_return8):
subq %rdx, %rax
movl %eax, %edx
shrq $32, %rax
orl %edx, %eax
ret
#else
.p2align 4,, 4
L(fin2_7):
cmpq %rdx, %rax
jz L(finz)
movq %rax, %r11
subq %rdx, %r11
bsfq %r11, %rcx
sarq $3, %rcx
salq $3, %rcx
sarq %cl, %rax
movzbl %al, %eax
sarq %cl, %rdx
movzbl %dl, %edx
subl %edx, %eax
ret
#endif
.p2align 4,, 4
L(finz):
xorl %eax, %eax xorl %eax, %eax
ret ret
#ifdef USE_AS_MEMCMPEQ # endif
.p2align 4,, 4
L(neq_early): .p2align 4,, 10
movl $1, %eax L(cmp_9_16):
# ifdef USE_AS_MEMCMPEQ
movq (%rsi), %rax
subq (%rdi), %rax
movq -8(%rsi, %rdx), %rcx
subq -8(%rdi, %rdx), %rcx
orq %rcx, %rax
/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
return long). */
setnz %cl
movzbl %cl, %eax
# else
movq (%rsi), %rcx
movq (%rdi), %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
movq -8(%rdi, %rdx, CHAR_SIZE), %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
xorl %eax, %eax
# endif
#endif
ret
.p2align 4,, 8
L(cmp_0_1):
/* Flag set by earlier comparison against 1. */
jne L(cmp_0_0)
#ifdef USE_AS_WMEMCMP
movl (%rdi), %ecx
xorl %edx, %edx
cmpl (%rsi), %ecx
je L(cmp_0_0)
setg %dl
leal -1(%rdx, %rdx), %eax
#else
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
subl %ecx, %eax
#endif
ret
/* Fits in aligning bytes. */
L(cmp_0_0):
xorl %eax, %eax
ret
#ifdef USE_AS_WMEMCMP
.p2align 4
L(ret_nonzero_vec_start_0):
bsfl %eax, %eax
movl (%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
ret
#else
# ifndef USE_AS_MEMCMPEQ
.p2align 4,, 14
L(ret_nonzero):
/* Need to bswap to get proper return without branch. */
bswapq %rcx
bswapq %rax
subq %rcx, %rax
sbbl %eax, %eax
orl $1, %eax
ret
# endif
.p2align 4
L(cmp_0_3):
# ifdef USE_AS_MEMCMPEQ
/* No reason to add to dependency chain on rdx. Saving the
bytes here doesn't change number of fetch blocks. */
cmpl $1, %edx
jbe L(cmp_0_1)
# else
/* We need the code size to prevent taking an extra fetch block.
*/
decl %edx
jle L(cmp_0_1)
# endif
movzwl (%rsi), %ecx
movzwl (%rdi), %eax
# ifdef USE_AS_MEMCMPEQ
subl %ecx, %eax
movzbl -1(%rsi, %rdx), %esi
movzbl -1(%rdi, %rdx), %edi
subl %edi, %esi
orl %esi, %eax
# else
bswapl %ecx
bswapl %eax
/* Implicit right shift by one. We just need to displace the
sign bits. */
shrl %ecx
shrl %eax
/* Eat a partial register stall here. Saves code stopping
L(cmp_0_3) from bleeding into the next fetch block and saves
an ALU. */
movb (%rsi, %rdx), %cl
movzbl (%rdi, %rdx), %edi
orl %edi, %eax
subl %ecx, %eax
# endif
ret ret
#endif #endif
/* For blocks bigger than 32 bytes
1. Advance one of the addr pointer to be 16B aligned.
2. Treat the case of both addr pointers aligned to 16B
separately to avoid movdqu.
3. Handle any blocks of greater than 64 consecutive bytes with
unrolling to reduce branches.
4. At least one addr pointer is 16B aligned, use memory version
of pcmpeqb.
*/
.p2align 4,, 4
L(gt32):
movq %rdx, %r11
addq %rdi, %r11
movq %rdi, %r8
andq $15, %r8 .p2align 5
jz L(16am) L(more_1x_vec):
/* Both pointers may be misaligned. */ #ifndef USE_AS_WMEMCMP
movdqu (%rdi), %xmm1 /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
movdqu (%rdi, %rsi), %xmm0 in ecx for code size. This is preferable to using `incw` as
pcmpeqb %xmm0, %xmm1 it avoids partial register stalls on older hardware (pre
pmovmskb %xmm1, %edx SnB). */
subl $0xffff, %edx movl $0xffff, %ecx
jnz L(neq) #endif
neg %r8 movups (%rsi), %xmm0
leaq 16(%rdi, %r8), %rdi movups (%rdi), %xmm1
L(16am): PCMPEQ %xmm0, %xmm1
/* Handle two 16B aligned pointers separately. */ pmovmskb %xmm1, %eax
testq $15, %rsi subl %ecx, %eax
jz L(ATR) jnz L(ret_nonzero_vec_start_0)
testq $16, %rdi #if SIZE_OFFSET == 0
jz L(A32) cmpq $(CHAR_PER_VEC * 2), %rdx
movdqu (%rdi, %rsi), %xmm0 #else
pcmpeqb (%rdi), %xmm0 /* Offset rdx. Saves just enough code size to keep the
pmovmskb %xmm0, %edx L(last_2x_vec) case and the non-zero return in a single
subl $0xffff, %edx cache line. */
jnz L(neq) subq $(CHAR_PER_VEC * 2), %rdx
addq $16, %rdi #endif
L(A32): ja L(more_2x_vec)
movq %r11, %r10
andq $-32, %r10 movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
cmpq %r10, %rdi movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
jae L(mt16) PCMPEQ %xmm0, %xmm1
/* Pre-unroll to be ready for unrolled 64B loop. */ pmovmskb %xmm1, %eax
testq $32, %rdi subl %ecx, %eax
jz L(A64) #ifndef USE_AS_MEMCMPEQ
movdqu (%rdi,%rsi), %xmm0 /* Don't use `incw ax` as machines this code runs on are liable
pcmpeqb (%rdi), %xmm0 to have partial register stall. */
pmovmskb %xmm0, %edx jnz L(ret_nonzero_vec_end_0)
subl $0xffff, %edx #else
jnz L(neq) /* Various return targets for memcmpeq. Will always be hot in
addq $16, %rdi Icache and get short encoding. */
L(ret_nonzero_vec_start_1):
movdqu (%rdi,%rsi), %xmm0 L(ret_nonzero_vec_start_0):
pcmpeqb (%rdi), %xmm0 L(ret_nonzero_vec_end_0):
pmovmskb %xmm0, %edx #endif
subl $0xffff, %edx ret
jnz L(neq)
addq $16, %rdi #ifndef USE_AS_MEMCMPEQ
# ifdef USE_AS_WMEMCMP
L(A64): .p2align 4
movq %r11, %r10 L(ret_nonzero_vec_end_0_adj):
andq $-64, %r10 addl $3, %edx
cmpq %r10, %rdi # else
jae L(mt32) .p2align 4,, 8
# endif
L(A64main): L(ret_nonzero_vec_end_0):
movdqu (%rdi,%rsi), %xmm0 bsfl %eax, %eax
pcmpeqb (%rdi), %xmm0 # ifdef USE_AS_WMEMCMP
pmovmskb %xmm0, %edx leal (%rax, %rdx, CHAR_SIZE), %eax
subl $0xffff, %edx movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
jnz L(neq) xorl %edx, %edx
addq $16, %rdi cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
movdqu (%rdi,%rsi), %xmm0 above. */
pcmpeqb (%rdi), %xmm0 setg %dl
pmovmskb %xmm0, %edx leal -1(%rdx, %rdx), %eax
subl $0xffff, %edx # else
jnz L(neq) addl %edx, %eax
addq $16, %rdi movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
movdqu (%rdi,%rsi), %xmm0 subl %ecx, %eax
pcmpeqb (%rdi), %xmm0 # endif
pmovmskb %xmm0, %edx ret
subl $0xffff, %edx # ifndef USE_AS_WMEMCMP
jnz L(neq) .p2align 4,, 10
addq $16, %rdi L(ret_nonzero_vec_start_0):
bsfl %eax, %eax
movdqu (%rdi,%rsi), %xmm0 movzbl (%rsi, %rax), %ecx
pcmpeqb (%rdi), %xmm0 movzbl (%rdi, %rax), %eax
pmovmskb %xmm0, %edx subl %ecx, %eax
subl $0xffff, %edx ret
jnz L(neq) # endif
addq $16, %rdi
cmpq %rdi, %r10
jne L(A64main)
L(mt32):
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
L(A32main):
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(A32main)
L(mt16):
subq %rdi, %r11
je L(finz)
movq %r11, %r10
jmp L(small)
.p2align 4,, 4
L(neq):
#ifdef USE_AS_MEMCMPEQ
movl $1, %eax
ret
#else #else
bsfl %edx, %ecx
movzbl (%rdi, %rcx), %eax
addq %rdi, %rsi
movzbl (%rsi,%rcx), %edx
jmp L(finz1)
#endif #endif
.p2align 4,, 4 .p2align 5
L(ATR): L(more_2x_vec):
movq %r11, %r10 movups (VEC_SIZE * 1)(%rsi), %xmm0
andq $-32, %r10 movups (VEC_SIZE * 1)(%rdi), %xmm1
cmpq %r10, %rdi PCMPEQ %xmm0, %xmm1
jae L(mt16) pmovmskb %xmm1, %eax
testq $16, %rdi subl %ecx, %eax
jz L(ATR32) jnz L(ret_nonzero_vec_start_1)
movdqa (%rdi,%rsi), %xmm0 cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
pcmpeqb (%rdi), %xmm0 jbe L(last_2x_vec)
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
je L(mt16)
L(ATR32): cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
movq %r11, %r10 ja L(more_8x_vec)
andq $-64, %r10
testq $32, %rdi
jz L(ATR64)
movdqa (%rdi,%rsi), %xmm0 /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
pcmpeqb (%rdi), %xmm0 This can harm performance if non-zero return in [65, 80] or
pmovmskb %xmm0, %edx [97, 112] but helps performance otherwise. Generally zero-
subl $0xffff, %edx return is hotter. */
jnz L(neq) movups (VEC_SIZE * 2)(%rsi), %xmm0
addq $16, %rdi movups (VEC_SIZE * 2)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 3)(%rsi), %xmm2
movups (VEC_SIZE * 3)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
movdqa (%rdi,%rsi), %xmm0 pmovmskb %xmm3, %eax
pcmpeqb (%rdi), %xmm0 CHECK_CMP (%ecx, %eax)
pmovmskb %xmm0, %edx jnz L(ret_nonzero_vec_start_2_3)
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
L(ATR64): cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
cmpq %rdi, %r10 jbe L(last_2x_vec)
je L(mt32)
L(ATR64main): movups (VEC_SIZE * 4)(%rsi), %xmm0
movdqa (%rdi,%rsi), %xmm0 movups (VEC_SIZE * 4)(%rdi), %xmm1
pcmpeqb (%rdi), %xmm0 PCMPEQ %xmm0, %xmm1
pmovmskb %xmm0, %edx movups (VEC_SIZE * 5)(%rsi), %xmm2
subl $0xffff, %edx movups (VEC_SIZE * 5)(%rdi), %xmm3
jnz L(neq) PCMPEQ %xmm2, %xmm3
addq $16, %rdi pand %xmm1, %xmm3
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(ATR64main)
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
L(ATR32res):
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %r10, %rdi
jne L(ATR32res)
subq %rdi, %r11
je L(finz)
movq %r11, %r10
jmp L(small)
/* Align to 16byte to improve instruction fetch. */
.p2align 4,, 4
END(memcmp)
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
#ifdef USE_AS_MEMCMPEQ #ifdef USE_AS_MEMCMPEQ
libc_hidden_def (memcmp) jz L(last_2x_vec)
ret
#else #else
# undef bcmp jnz L(ret_nonzero_vec_start_4_5)
weak_alias (memcmp, bcmp) #endif
libc_hidden_builtin_def (memcmp) .p2align 4
L(last_2x_vec):
movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
subl %ecx, %eax
#ifdef USE_AS_MEMCMPEQ
/* Various return targets for memcmpeq. Will always be hot in
Icache and get short encoding. */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
ret
#else
jnz L(ret_nonzero_vec_end_1)
ret
.p2align 4,, 8
L(ret_nonzero_vec_end_1):
pmovmskb %xmm1, %ecx
/* High 16 bits of eax guaranteed to be all ones. Rotate them in
so we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
/* Partial register stall. */
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
leal (%rax, %rdx, CHAR_SIZE), %eax
movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
addl %edx, %eax
movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4
L(ret_nonzero_vec_start_4_5):
pmovmskb %xmm1, %edx
sall $16, %eax
leal 1(%rax, %rdx), %eax
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4,, 8
L(ret_nonzero_vec_start_1):
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
#endif
.p2align 4
L(more_8x_vec):
subq %rdi, %rsi
leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
andq $(VEC_SIZE * -1), %rdi
addq %rdi, %rsi
.p2align 4
L(loop_4x):
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 3)(%rsi), %xmm1
PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1
movups (VEC_SIZE * 4)(%rsi), %xmm2
movups (VEC_SIZE * 5)(%rsi), %xmm3
PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3
pand %xmm0, %xmm1
pand %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
subl %ecx, %eax
jnz L(ret_nonzero_loop)
addq $(VEC_SIZE * 4), %rdi
addq $(VEC_SIZE * 4), %rsi
cmpq %rdi, %rdx
ja L(loop_4x)
/* Get remaining length in edx. */
subl %edi, %edx
/* Restore offset so we can reuse L(last_2x_vec). */
addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
#ifdef USE_AS_WMEMCMP
shrl $2, %edx
#endif
cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
jbe L(last_2x_vec)
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 2)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 3)(%rsi), %xmm2
movups (VEC_SIZE * 3)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
jz L(last_2x_vec)
#ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
ret
#else
.p2align 4
L(ret_nonzero_vec_start_2_3):
pmovmskb %xmm1, %edx
sall $16, %eax
leal 1(%rax, %rdx), %eax
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4
L(ret_nonzero_loop):
pmovmskb %xmm0, %ecx
pmovmskb %xmm1, %edx
sall $(VEC_SIZE * 1), %edx
leal 1(%rcx, %rdx), %edx
pmovmskb %xmm2, %ecx
/* High 16 bits of eax guaranteed to be all ones. Rotate them in
so we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
salq $32, %rax
orq %rdx, %rax
bsfq %rax, %rax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
#endif
END(MEMCMP)
#ifndef USE_AS_WMEMCMP
# ifdef USE_AS_MEMCMPEQ
libc_hidden_def (MEMCMP)
# else
# undef bcmp
weak_alias (MEMCMP, bcmp)
libc_hidden_builtin_def (MEMCMP)
# endif
#endif #endif

View File

@ -16,6 +16,6 @@
License along with the GNU C Library; if not, see License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */ <https://www.gnu.org/licenses/>. */
#define memcmp __memcmpeq #define MEMCMP __memcmpeq
#define USE_AS_MEMCMPEQ 1 #define USE_AS_MEMCMPEQ 1
#include "multiarch/memcmp-sse2.S" #include "multiarch/memcmp-sse2.S"

View File

@ -162,8 +162,8 @@ sysdep_routines += \
wmemchr-sse2 \ wmemchr-sse2 \
wmemcmp-avx2-movbe \ wmemcmp-avx2-movbe \
wmemcmp-avx2-movbe-rtm \ wmemcmp-avx2-movbe-rtm \
wmemcmp-c \
wmemcmp-evex-movbe \ wmemcmp-evex-movbe \
wmemcmp-sse2 \
wmemcmp-sse4 \ wmemcmp-sse4 \
# sysdep_routines # sysdep_routines
endif endif

View File

@ -17,8 +17,8 @@
<https://www.gnu.org/licenses/>. */ <https://www.gnu.org/licenses/>. */
#if IS_IN (libc) #if IS_IN (libc)
# ifndef memcmp # ifndef MEMCMP
# define memcmp __memcmp_sse2 # define MEMCMP __memcmp_sse2
# endif # endif
# ifdef SHARED # ifdef SHARED

View File

@ -17,9 +17,9 @@
<https://www.gnu.org/licenses/>. */ <https://www.gnu.org/licenses/>. */
#if IS_IN (libc) #if IS_IN (libc)
# define memcmp __memcmpeq_sse2 # define MEMCMP __memcmpeq_sse2
#else #else
# define memcmp __memcmpeq # define MEMCMP __memcmpeq
#endif #endif
#define USE_AS_MEMCMPEQ 1 #define USE_AS_MEMCMPEQ 1
#include "memcmp-sse2.S" #include "memcmp-sse2.S"

View File

@ -1,9 +0,0 @@
#if IS_IN (libc)
# include <wchar.h>
# define WMEMCMP __wmemcmp_sse2
extern __typeof (wmemcmp) __wmemcmp_sse2;
#endif
#include "wcsmbs/wmemcmp.c"

View File

@ -0,0 +1,25 @@
/* wmemcmp optimized with SSE2.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
# define MEMCMP __wmemcmp_sse2
#else
# define MEMCMP wmemcmp
#endif
#define USE_AS_WMEMCMP 1
#include "memcmp-sse2.S"

21
sysdeps/x86_64/wmemcmp.S Normal file
View File

@ -0,0 +1,21 @@
/* wmemcmp optimized with SSE2.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define MEMCMP wmemcmp
#define USE_AS_WMEMCMP 1
#include "multiarch/memcmp-sse2.S"