x86: Optimize memcmp SSE2 in memcmp.S

The new code saves size (-303 bytes) and has significantly better
performance.

geometric_mean(N=20) of page cross cases New / Original: 0.634
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
This commit is contained in:
Noah Goldstein 2022-04-15 12:27:59 -05:00
parent ac0d208b54
commit 8804157ad9
8 changed files with 584 additions and 385 deletions

View File

@ -18,395 +18,557 @@
#include <sysdep.h> #include <sysdep.h>
#ifdef USE_AS_WMEMCMP
# define PCMPEQ pcmpeqd
# define CHAR_SIZE 4
# define SIZE_OFFSET (0)
#else
# define PCMPEQ pcmpeqb
# define CHAR_SIZE 1
#endif
#ifdef USE_AS_MEMCMPEQ
# define SIZE_OFFSET (0)
# define CHECK_CMP(x, y) subl x, y
#else
# ifndef SIZE_OFFSET
# define SIZE_OFFSET (CHAR_PER_VEC * 2)
# endif
# define CHECK_CMP(x, y) cmpl x, y
#endif
#define VEC_SIZE 16
#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
#ifndef MEMCMP
# define MEMCMP memcmp
#endif
.text .text
ENTRY (memcmp) ENTRY(MEMCMP)
#ifdef __ILP32__ #ifdef USE_AS_WMEMCMP
/* Clear the upper 32 bits. */ /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
movl %edx, %edx in ecx for code size. This is preferable to using `incw` as
it avoids partial register stalls on older hardware (pre
SnB). */
movl $0xffff, %ecx
#endif #endif
test %RDX_LP, %RDX_LP cmpq $CHAR_PER_VEC, %rdx
jz L(finz) ja L(more_1x_vec)
cmpq $1, %rdx
jbe L(finr1b) #ifdef USE_AS_WMEMCMP
subq %rdi, %rsi /* saves a byte of code keeping the fall through path n = [2, 4]
movq %rdx, %r10 in the initial cache line. */
cmpq $32, %r10 decl %edx
jae L(gt32) jle L(cmp_0_1)
/* Handle small chunks and last block of less than 32 bytes. */
L(small): movq (%rsi), %xmm0
testq $1, %r10 movq (%rdi), %xmm1
jz L(s2b) PCMPEQ %xmm0, %xmm1
movzbl (%rdi), %eax pmovmskb %xmm1, %eax
movzbl (%rdi, %rsi), %edx subl %ecx, %eax
subq $1, %r10 jnz L(ret_nonzero_vec_start_0)
je L(finz1)
addq $1, %rdi movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
subl %edx, %eax movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
jnz L(exit) PCMPEQ %xmm0, %xmm1
L(s2b): pmovmskb %xmm1, %eax
testq $2, %r10 subl %ecx, %eax
jz L(s4b) jnz L(ret_nonzero_vec_end_0_adj)
movzwl (%rdi), %eax
movzwl (%rdi, %rsi), %edx
subq $2, %r10
#ifdef USE_AS_MEMCMPEQ
je L(finz1)
#else #else
je L(fin2_7) cmpl $8, %edx
#endif ja L(cmp_9_16)
addq $2, %rdi
cmpl %edx, %eax cmpl $4, %edx
#ifdef USE_AS_MEMCMPEQ jb L(cmp_0_3)
jnz L(neq_early)
#else # ifdef USE_AS_MEMCMPEQ
jnz L(fin2_7) movl (%rsi), %eax
#endif subl (%rdi), %eax
L(s4b):
testq $4, %r10 movl -4(%rsi, %rdx), %esi
jz L(s8b) subl -4(%rdi, %rdx), %esi
movl (%rdi), %eax
movl (%rdi, %rsi), %edx orl %esi, %eax
subq $4, %r10
#ifdef USE_AS_MEMCMPEQ
je L(finz1)
#else
je L(fin2_7)
#endif
addq $4, %rdi
cmpl %edx, %eax
#ifdef USE_AS_MEMCMPEQ
jnz L(neq_early)
#else
jnz L(fin2_7)
#endif
L(s8b):
testq $8, %r10
jz L(s16b)
movq (%rdi), %rax
movq (%rdi, %rsi), %rdx
subq $8, %r10
#ifdef USE_AS_MEMCMPEQ
je L(sub_return8)
#else
je L(fin2_7)
#endif
addq $8, %rdi
cmpq %rdx, %rax
#ifdef USE_AS_MEMCMPEQ
jnz L(neq_early)
#else
jnz L(fin2_7)
#endif
L(s16b):
movdqu (%rdi), %xmm1
movdqu (%rdi, %rsi), %xmm0
pcmpeqb %xmm0, %xmm1
#ifdef USE_AS_MEMCMPEQ
pmovmskb %xmm1, %eax
subl $0xffff, %eax
ret ret
#else # else
pmovmskb %xmm1, %edx /* Combine comparisons for lo and hi 4-byte comparisons. */
xorl %eax, %eax movl -4(%rsi, %rdx), %ecx
subl $0xffff, %edx movl -4(%rdi, %rdx), %eax
jz L(finz) shlq $32, %rcx
bsfl %edx, %ecx shlq $32, %rax
leaq (%rdi, %rcx), %rcx movl (%rsi), %esi
movzbl (%rcx), %eax movl (%rdi), %edi
movzbl (%rsi, %rcx), %edx orq %rsi, %rcx
jmp L(finz1) orq %rdi, %rax
#endif /* Only compute proper return if not-equal. */
.p2align 4,, 4 cmpq %rcx, %rax
L(finr1b): jnz L(ret_nonzero)
movzbl (%rdi), %eax
movzbl (%rsi), %edx
L(finz1):
subl %edx, %eax
L(exit):
ret
#ifdef USE_AS_MEMCMPEQ
.p2align 4,, 4
L(sub_return8):
subq %rdx, %rax
movl %eax, %edx
shrq $32, %rax
orl %edx, %eax
ret
#else
.p2align 4,, 4
L(fin2_7):
cmpq %rdx, %rax
jz L(finz)
movq %rax, %r11
subq %rdx, %r11
bsfq %r11, %rcx
sarq $3, %rcx
salq $3, %rcx
sarq %cl, %rax
movzbl %al, %eax
sarq %cl, %rdx
movzbl %dl, %edx
subl %edx, %eax
ret
#endif
.p2align 4,, 4
L(finz):
xorl %eax, %eax xorl %eax, %eax
ret ret
#ifdef USE_AS_MEMCMPEQ # endif
.p2align 4,, 4
L(neq_early): .p2align 4,, 10
movl $1, %eax L(cmp_9_16):
# ifdef USE_AS_MEMCMPEQ
movq (%rsi), %rax
subq (%rdi), %rax
movq -8(%rsi, %rdx), %rcx
subq -8(%rdi, %rdx), %rcx
orq %rcx, %rax
/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
return long). */
setnz %cl
movzbl %cl, %eax
# else
movq (%rsi), %rcx
movq (%rdi), %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
movq -8(%rdi, %rdx, CHAR_SIZE), %rax
/* Only compute proper return if not-equal. */
cmpq %rcx, %rax
jnz L(ret_nonzero)
xorl %eax, %eax
# endif
#endif
ret
.p2align 4,, 8
L(cmp_0_1):
/* Flag set by earlier comparison against 1. */
jne L(cmp_0_0)
#ifdef USE_AS_WMEMCMP
movl (%rdi), %ecx
xorl %edx, %edx
cmpl (%rsi), %ecx
je L(cmp_0_0)
setg %dl
leal -1(%rdx, %rdx), %eax
#else
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
subl %ecx, %eax
#endif
ret
/* Fits in aligning bytes. */
L(cmp_0_0):
xorl %eax, %eax
ret
#ifdef USE_AS_WMEMCMP
.p2align 4
L(ret_nonzero_vec_start_0):
bsfl %eax, %eax
movl (%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
ret
#else
# ifndef USE_AS_MEMCMPEQ
.p2align 4,, 14
L(ret_nonzero):
/* Need to bswap to get proper return without branch. */
bswapq %rcx
bswapq %rax
subq %rcx, %rax
sbbl %eax, %eax
orl $1, %eax
ret
# endif
.p2align 4
L(cmp_0_3):
# ifdef USE_AS_MEMCMPEQ
/* No reason to add to dependency chain on rdx. Saving the
bytes here doesn't change number of fetch blocks. */
cmpl $1, %edx
jbe L(cmp_0_1)
# else
/* We need the code size to prevent taking an extra fetch block.
*/
decl %edx
jle L(cmp_0_1)
# endif
movzwl (%rsi), %ecx
movzwl (%rdi), %eax
# ifdef USE_AS_MEMCMPEQ
subl %ecx, %eax
movzbl -1(%rsi, %rdx), %esi
movzbl -1(%rdi, %rdx), %edi
subl %edi, %esi
orl %esi, %eax
# else
bswapl %ecx
bswapl %eax
/* Implicit right shift by one. We just need to displace the
sign bits. */
shrl %ecx
shrl %eax
/* Eat a partial register stall here. Saves code stopping
L(cmp_0_3) from bleeding into the next fetch block and saves
an ALU. */
movb (%rsi, %rdx), %cl
movzbl (%rdi, %rdx), %edi
orl %edi, %eax
subl %ecx, %eax
# endif
ret ret
#endif #endif
/* For blocks bigger than 32 bytes
1. Advance one of the addr pointer to be 16B aligned.
2. Treat the case of both addr pointers aligned to 16B
separately to avoid movdqu.
3. Handle any blocks of greater than 64 consecutive bytes with
unrolling to reduce branches.
4. At least one addr pointer is 16B aligned, use memory version
of pcmpeqb.
*/
.p2align 4,, 4
L(gt32):
movq %rdx, %r11
addq %rdi, %r11
movq %rdi, %r8
andq $15, %r8 .p2align 5
jz L(16am) L(more_1x_vec):
/* Both pointers may be misaligned. */ #ifndef USE_AS_WMEMCMP
movdqu (%rdi), %xmm1 /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
movdqu (%rdi, %rsi), %xmm0 in ecx for code size. This is preferable to using `incw` as
pcmpeqb %xmm0, %xmm1 it avoids partial register stalls on older hardware (pre
pmovmskb %xmm1, %edx SnB). */
subl $0xffff, %edx movl $0xffff, %ecx
jnz L(neq) #endif
neg %r8 movups (%rsi), %xmm0
leaq 16(%rdi, %r8), %rdi movups (%rdi), %xmm1
L(16am): PCMPEQ %xmm0, %xmm1
/* Handle two 16B aligned pointers separately. */ pmovmskb %xmm1, %eax
testq $15, %rsi subl %ecx, %eax
jz L(ATR) jnz L(ret_nonzero_vec_start_0)
testq $16, %rdi #if SIZE_OFFSET == 0
jz L(A32) cmpq $(CHAR_PER_VEC * 2), %rdx
movdqu (%rdi, %rsi), %xmm0 #else
pcmpeqb (%rdi), %xmm0 /* Offset rdx. Saves just enough code size to keep the
pmovmskb %xmm0, %edx L(last_2x_vec) case and the non-zero return in a single
subl $0xffff, %edx cache line. */
jnz L(neq) subq $(CHAR_PER_VEC * 2), %rdx
addq $16, %rdi #endif
L(A32): ja L(more_2x_vec)
movq %r11, %r10
andq $-32, %r10 movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
cmpq %r10, %rdi movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
jae L(mt16) PCMPEQ %xmm0, %xmm1
/* Pre-unroll to be ready for unrolled 64B loop. */ pmovmskb %xmm1, %eax
testq $32, %rdi subl %ecx, %eax
jz L(A64) #ifndef USE_AS_MEMCMPEQ
movdqu (%rdi,%rsi), %xmm0 /* Don't use `incw ax` as machines this code runs on are liable
pcmpeqb (%rdi), %xmm0 to have partial register stall. */
pmovmskb %xmm0, %edx jnz L(ret_nonzero_vec_end_0)
subl $0xffff, %edx #else
jnz L(neq) /* Various return targets for memcmpeq. Will always be hot in
addq $16, %rdi Icache and get short encoding. */
L(ret_nonzero_vec_start_1):
movdqu (%rdi,%rsi), %xmm0 L(ret_nonzero_vec_start_0):
pcmpeqb (%rdi), %xmm0 L(ret_nonzero_vec_end_0):
pmovmskb %xmm0, %edx #endif
subl $0xffff, %edx ret
jnz L(neq)
addq $16, %rdi #ifndef USE_AS_MEMCMPEQ
# ifdef USE_AS_WMEMCMP
L(A64): .p2align 4
movq %r11, %r10 L(ret_nonzero_vec_end_0_adj):
andq $-64, %r10 addl $3, %edx
cmpq %r10, %rdi # else
jae L(mt32) .p2align 4,, 8
# endif
L(A64main): L(ret_nonzero_vec_end_0):
movdqu (%rdi,%rsi), %xmm0 bsfl %eax, %eax
pcmpeqb (%rdi), %xmm0 # ifdef USE_AS_WMEMCMP
pmovmskb %xmm0, %edx leal (%rax, %rdx, CHAR_SIZE), %eax
subl $0xffff, %edx movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
jnz L(neq) xorl %edx, %edx
addq $16, %rdi cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
movdqu (%rdi,%rsi), %xmm0 above. */
pcmpeqb (%rdi), %xmm0 setg %dl
pmovmskb %xmm0, %edx leal -1(%rdx, %rdx), %eax
subl $0xffff, %edx # else
jnz L(neq) addl %edx, %eax
addq $16, %rdi movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
movdqu (%rdi,%rsi), %xmm0 subl %ecx, %eax
pcmpeqb (%rdi), %xmm0 # endif
pmovmskb %xmm0, %edx ret
subl $0xffff, %edx # ifndef USE_AS_WMEMCMP
jnz L(neq) .p2align 4,, 10
addq $16, %rdi L(ret_nonzero_vec_start_0):
bsfl %eax, %eax
movdqu (%rdi,%rsi), %xmm0 movzbl (%rsi, %rax), %ecx
pcmpeqb (%rdi), %xmm0 movzbl (%rdi, %rax), %eax
pmovmskb %xmm0, %edx subl %ecx, %eax
subl $0xffff, %edx ret
jnz L(neq) # endif
addq $16, %rdi
cmpq %rdi, %r10
jne L(A64main)
L(mt32):
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
L(A32main):
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqu (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(A32main)
L(mt16):
subq %rdi, %r11
je L(finz)
movq %r11, %r10
jmp L(small)
.p2align 4,, 4
L(neq):
#ifdef USE_AS_MEMCMPEQ
movl $1, %eax
ret
#else #else
bsfl %edx, %ecx
movzbl (%rdi, %rcx), %eax
addq %rdi, %rsi
movzbl (%rsi,%rcx), %edx
jmp L(finz1)
#endif #endif
.p2align 4,, 4 .p2align 5
L(ATR): L(more_2x_vec):
movq %r11, %r10 movups (VEC_SIZE * 1)(%rsi), %xmm0
andq $-32, %r10 movups (VEC_SIZE * 1)(%rdi), %xmm1
cmpq %r10, %rdi PCMPEQ %xmm0, %xmm1
jae L(mt16) pmovmskb %xmm1, %eax
testq $16, %rdi subl %ecx, %eax
jz L(ATR32) jnz L(ret_nonzero_vec_start_1)
movdqa (%rdi,%rsi), %xmm0 cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
pcmpeqb (%rdi), %xmm0 jbe L(last_2x_vec)
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
je L(mt16)
L(ATR32): cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
movq %r11, %r10 ja L(more_8x_vec)
andq $-64, %r10
testq $32, %rdi
jz L(ATR64)
movdqa (%rdi,%rsi), %xmm0 /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
pcmpeqb (%rdi), %xmm0 This can harm performance if non-zero return in [65, 80] or
pmovmskb %xmm0, %edx [97, 112] but helps performance otherwise. Generally zero-
subl $0xffff, %edx return is hotter. */
jnz L(neq) movups (VEC_SIZE * 2)(%rsi), %xmm0
addq $16, %rdi movups (VEC_SIZE * 2)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 3)(%rsi), %xmm2
movups (VEC_SIZE * 3)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
movdqa (%rdi,%rsi), %xmm0 pmovmskb %xmm3, %eax
pcmpeqb (%rdi), %xmm0 CHECK_CMP (%ecx, %eax)
pmovmskb %xmm0, %edx jnz L(ret_nonzero_vec_start_2_3)
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
L(ATR64): cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
cmpq %rdi, %r10 jbe L(last_2x_vec)
je L(mt32)
L(ATR64main): movups (VEC_SIZE * 4)(%rsi), %xmm0
movdqa (%rdi,%rsi), %xmm0 movups (VEC_SIZE * 4)(%rdi), %xmm1
pcmpeqb (%rdi), %xmm0 PCMPEQ %xmm0, %xmm1
pmovmskb %xmm0, %edx movups (VEC_SIZE * 5)(%rsi), %xmm2
subl $0xffff, %edx movups (VEC_SIZE * 5)(%rdi), %xmm3
jnz L(neq) PCMPEQ %xmm2, %xmm3
addq $16, %rdi pand %xmm1, %xmm3
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %rdi, %r10
jne L(ATR64main)
movq %r11, %r10
andq $-32, %r10
cmpq %r10, %rdi
jae L(mt16)
L(ATR32res):
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
movdqa (%rdi,%rsi), %xmm0
pcmpeqb (%rdi), %xmm0
pmovmskb %xmm0, %edx
subl $0xffff, %edx
jnz L(neq)
addq $16, %rdi
cmpq %r10, %rdi
jne L(ATR32res)
subq %rdi, %r11
je L(finz)
movq %r11, %r10
jmp L(small)
/* Align to 16byte to improve instruction fetch. */
.p2align 4,, 4
END(memcmp)
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
#ifdef USE_AS_MEMCMPEQ #ifdef USE_AS_MEMCMPEQ
libc_hidden_def (memcmp) jz L(last_2x_vec)
ret
#else #else
# undef bcmp jnz L(ret_nonzero_vec_start_4_5)
weak_alias (memcmp, bcmp) #endif
libc_hidden_builtin_def (memcmp) .p2align 4
L(last_2x_vec):
movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
subl %ecx, %eax
#ifdef USE_AS_MEMCMPEQ
/* Various return targets for memcmpeq. Will always be hot in
Icache and get short encoding. */
L(ret_nonzero_vec_start_2_3):
L(ret_nonzero_vec_start_4_5):
ret
#else
jnz L(ret_nonzero_vec_end_1)
ret
.p2align 4,, 8
L(ret_nonzero_vec_end_1):
pmovmskb %xmm1, %ecx
/* High 16 bits of eax guaranteed to be all ones. Rotate them in
so we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
/* Partial register stall. */
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
leal (%rax, %rdx, CHAR_SIZE), %eax
movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
addl %edx, %eax
movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4
L(ret_nonzero_vec_start_4_5):
pmovmskb %xmm1, %edx
sall $16, %eax
leal 1(%rax, %rdx), %eax
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4,, 8
L(ret_nonzero_vec_start_1):
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
#endif
.p2align 4
L(more_8x_vec):
subq %rdi, %rsi
leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
andq $(VEC_SIZE * -1), %rdi
addq %rdi, %rsi
.p2align 4
L(loop_4x):
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 3)(%rsi), %xmm1
PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1
movups (VEC_SIZE * 4)(%rsi), %xmm2
movups (VEC_SIZE * 5)(%rsi), %xmm3
PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3
pand %xmm0, %xmm1
pand %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
subl %ecx, %eax
jnz L(ret_nonzero_loop)
addq $(VEC_SIZE * 4), %rdi
addq $(VEC_SIZE * 4), %rsi
cmpq %rdi, %rdx
ja L(loop_4x)
/* Get remaining length in edx. */
subl %edi, %edx
/* Restore offset so we can reuse L(last_2x_vec). */
addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
#ifdef USE_AS_WMEMCMP
shrl $2, %edx
#endif
cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
jbe L(last_2x_vec)
movups (VEC_SIZE * 2)(%rsi), %xmm0
movups (VEC_SIZE * 2)(%rdi), %xmm1
PCMPEQ %xmm0, %xmm1
movups (VEC_SIZE * 3)(%rsi), %xmm2
movups (VEC_SIZE * 3)(%rdi), %xmm3
PCMPEQ %xmm2, %xmm3
pand %xmm1, %xmm3
pmovmskb %xmm3, %eax
CHECK_CMP (%ecx, %eax)
jz L(last_2x_vec)
#ifdef USE_AS_MEMCMPEQ
L(ret_nonzero_loop):
ret
#else
.p2align 4
L(ret_nonzero_vec_start_2_3):
pmovmskb %xmm1, %edx
sall $16, %eax
leal 1(%rax, %rdx), %eax
bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
.p2align 4
L(ret_nonzero_loop):
pmovmskb %xmm0, %ecx
pmovmskb %xmm1, %edx
sall $(VEC_SIZE * 1), %edx
leal 1(%rcx, %rdx), %edx
pmovmskb %xmm2, %ecx
/* High 16 bits of eax guaranteed to be all ones. Rotate them in
so we can do `or + not` with just `xor`. */
rorl $16, %eax
xorl %ecx, %eax
salq $32, %rax
orq %rdx, %rax
bsfq %rax, %rax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
xorl %edx, %edx
cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
/* NB: no partial register stall here because xorl zero idiom
above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
#endif
END(MEMCMP)
#ifndef USE_AS_WMEMCMP
# ifdef USE_AS_MEMCMPEQ
libc_hidden_def (MEMCMP)
# else
# undef bcmp
weak_alias (MEMCMP, bcmp)
libc_hidden_builtin_def (MEMCMP)
# endif
#endif #endif

View File

@ -16,6 +16,6 @@
License along with the GNU C Library; if not, see License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */ <https://www.gnu.org/licenses/>. */
#define memcmp __memcmpeq #define MEMCMP __memcmpeq
#define USE_AS_MEMCMPEQ 1 #define USE_AS_MEMCMPEQ 1
#include "multiarch/memcmp-sse2.S" #include "multiarch/memcmp-sse2.S"

View File

@ -162,8 +162,8 @@ sysdep_routines += \
wmemchr-sse2 \ wmemchr-sse2 \
wmemcmp-avx2-movbe \ wmemcmp-avx2-movbe \
wmemcmp-avx2-movbe-rtm \ wmemcmp-avx2-movbe-rtm \
wmemcmp-c \
wmemcmp-evex-movbe \ wmemcmp-evex-movbe \
wmemcmp-sse2 \
wmemcmp-sse4 \ wmemcmp-sse4 \
# sysdep_routines # sysdep_routines
endif endif

View File

@ -17,8 +17,8 @@
<https://www.gnu.org/licenses/>. */ <https://www.gnu.org/licenses/>. */
#if IS_IN (libc) #if IS_IN (libc)
# ifndef memcmp # ifndef MEMCMP
# define memcmp __memcmp_sse2 # define MEMCMP __memcmp_sse2
# endif # endif
# ifdef SHARED # ifdef SHARED

View File

@ -17,9 +17,9 @@
<https://www.gnu.org/licenses/>. */ <https://www.gnu.org/licenses/>. */
#if IS_IN (libc) #if IS_IN (libc)
# define memcmp __memcmpeq_sse2 # define MEMCMP __memcmpeq_sse2
#else #else
# define memcmp __memcmpeq # define MEMCMP __memcmpeq
#endif #endif
#define USE_AS_MEMCMPEQ 1 #define USE_AS_MEMCMPEQ 1
#include "memcmp-sse2.S" #include "memcmp-sse2.S"

View File

@ -1,9 +0,0 @@
#if IS_IN (libc)
# include <wchar.h>
# define WMEMCMP __wmemcmp_sse2
extern __typeof (wmemcmp) __wmemcmp_sse2;
#endif
#include "wcsmbs/wmemcmp.c"

View File

@ -0,0 +1,25 @@
/* wmemcmp optimized with SSE2.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
# define MEMCMP __wmemcmp_sse2
#else
# define MEMCMP wmemcmp
#endif
#define USE_AS_WMEMCMP 1
#include "memcmp-sse2.S"

21
sysdeps/x86_64/wmemcmp.S Normal file
View File

@ -0,0 +1,21 @@
/* wmemcmp optimized with SSE2.
Copyright (C) 2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#define MEMCMP wmemcmp
#define USE_AS_WMEMCMP 1
#include "multiarch/memcmp-sse2.S"