x86-64: Improve evex512 version of strlen functions

This patch improves the following functionality:
- Replace VPCMP with VPCMPEQ.
- Replace the page cross check logic with sall (see the C sketch before
  the diff).
- Remove the extra lea from align_more.
- Remove the unconditional loop jump.
- Use bsf to check the max length in the first vector (see the C sketch
  after the diff).

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
Author: Sunil K Pandey
Date:   2022-10-03 12:00:53 -07:00
Parent: 361d6454c0
Commit: e96971482d
1 changed file with 57 additions and 34 deletions
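
Of the bullets in the commit message, the page cross change benefits most from a worked example. Below is a minimal C sketch, not glibc code: PAGE_SIZE and VEC_SIZE are assumed to be 4096 and 64 as in the evex512 build, and the function names are invented. It shows why the sall form is equivalent to the old check: shifting the low 32 bits of the address left by 20 discards everything above the 12-bit page offset, so the compare no longer needs the offset isolated by an and-mask first, and the unsigned compare (ja in the assembly) orders offsets exactly as before.

/* Minimal model, not glibc code: PAGE_SIZE/VEC_SIZE values and the
   function names are assumptions for illustration.  */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096   /* assumed x86-64 page size */
#define VEC_SIZE  64     /* assumed evex512 vector width */

/* Old check: mask the page offset out of the address, then compare.  */
static int
page_cross_and (uint32_t addr)
{
  return (addr & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* New check: "sall $20" pushes the 12-bit page offset into bits 20..31
   and drops everything else, so one shift does the isolating while the
   unsigned compare against the equally shifted bound is unchanged.  */
static int
page_cross_sal (uint32_t addr)
{
  return (addr << 20) > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
}

int
main (void)
{
  /* The two predicates must agree for every address; the odd stride
     still visits every offset within a page.  */
  for (uint64_t addr = 0; addr <= UINT32_MAX; addr += 257)
    assert (page_cross_and ((uint32_t) addr)
            == page_cross_sal ((uint32_t) addr));
  puts ("page-cross checks agree");
  return 0;
}

Compiled with asserts enabled, the loop never trips; that equivalence is the property the ja in the rewritten prologue depends on.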

@@ -25,12 +25,12 @@
# include <sysdep.h>
# ifdef USE_AS_WCSLEN
# define VPCMP vpcmpd
# define VPCMPEQ vpcmpeqd
# define VPTESTN vptestnmd
# define VPMINU vpminud
# define CHAR_SIZE 4
# else
# define VPCMP vpcmpb
# define VPCMPEQ vpcmpeqb
# define VPTESTN vptestnmb
# define VPMINU vpminub
# define CHAR_SIZE 1
@@ -55,20 +55,29 @@ ENTRY_P2ALIGN (STRLEN, 6)
movl %edi, %eax
vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
sall $20, %eax
cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
ja L(page_cross)
/* Compare [w]char for null, mask bit will be set for match. */
VPCMP $0, (%rdi), %VMM(0), %k0
VPCMPEQ (%rdi), %VMM(0), %k0
# ifdef USE_AS_STRNLEN
KMOV %k0, %VRCX
/* Store max length in rax. */
mov %rsi, %rax
/* If rcx is 0, rax will have max length. We can not use VRCX
and VRAX here for evex256 because, upper 32 bits may be
undefined for ecx and eax. */
bsfq %rcx, %rax
cmp $CHAR_PER_VEC, %rax
ja L(align_more)
cmpq %rax, %rsi
cmovb %esi, %eax
# else
KMOV %k0, %VRAX
test %VRAX, %VRAX
jz L(align_more)
bsf %VRAX, %VRAX
# ifdef USE_AS_STRNLEN
cmpq %rsi, %rax
cmovnb %rsi, %rax
# endif
ret
@@ -81,25 +90,24 @@ L(ret_max):
# endif
L(align_more):
leaq VEC_SIZE(%rdi), %rax
mov %rdi, %rax
/* Align rax to VEC_SIZE. */
andq $-VEC_SIZE, %rax
# ifdef USE_AS_STRNLEN
movq %rax, %rdx
subq %rdi, %rdx
movq %rdi, %rdx
subq %rax, %rdx
# ifdef USE_AS_WCSLEN
shr $2, %VRDX
# endif
/* At this point rdx contains [w]chars already compared. */
subq %rsi, %rdx
jae L(ret_max)
negq %rdx
leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
/* At this point rdx contains number of w[char] needs to go.
Now onwards rdx will keep decrementing with each compare. */
# endif
/* Loop unroll 4 times for 4 vector loop. */
VPCMP $0, (%rax), %VMM(0), %k0
VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
subq $-VEC_SIZE, %rax
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x1)
@@ -109,7 +117,7 @@ L(align_more):
jbe L(ret_max)
# endif
VPCMP $0, VEC_SIZE(%rax), %VMM(0), %k0
VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x2)
@@ -119,7 +127,7 @@ L(align_more):
jbe L(ret_max)
# endif
VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x3)
@@ -129,7 +137,7 @@ L(align_more):
jbe L(ret_max)
# endif
VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
KMOV %k0, %VRCX
test %VRCX, %VRCX
jnz L(ret_vec_x4)
@@ -155,16 +163,10 @@ L(align_more):
addq %rcx, %rdx
/* Need jump as we don't want to add/subtract rdx for first
iteration of 4 x VEC_SIZE aligned loop. */
jmp L(loop_entry)
# endif
.p2align 4,,11
L(loop):
# ifdef USE_AS_STRNLEN
subq $(CHAR_PER_VEC * 4), %rdx
jbe L(ret_max)
L(loop_entry):
# endif
/* VPMINU and VPCMP combination provide better performance as
compared to alternative combinations. */
VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
@@ -177,7 +179,18 @@ L(loop_entry):
subq $-(VEC_SIZE * 4), %rax
KORTEST %k0, %k1
jz L(loop)
# ifndef USE_AS_STRNLEN
jz L(loop)
# else
jnz L(loopend)
subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop)
mov %rsi, %rax
ret
# endif
L(loopend):
VPTESTN %VMM(1), %VMM(1), %k2
KMOV %k2, %VRCX
@@ -249,24 +262,34 @@ L(ret_vec_x1):
ret
L(page_cross):
movl %eax, %ecx
# ifdef USE_AS_WCSLEN
mov %rdi, %rax
movl %edi, %ecx
andl $(VEC_SIZE - 1), %ecx
# ifdef USE_AS_WCSLEN
sarl $2, %ecx
# endif
/* ecx contains number of w[char] to be skipped as a result
of address alignment. */
xorq %rdi, %rax
VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
KMOV %k0, %VRAX
andq $-VEC_SIZE, %rax
VPCMPEQ (%rax), %VMM(0), %k0
KMOV %k0, %VRDX
/* Ignore number of character for alignment adjustment. */
shr %cl, %VRAX
shr %cl, %VRDX
# ifdef USE_AS_STRNLEN
jnz L(page_cross_end)
movl $CHAR_PER_VEC, %eax
sub %ecx, %eax
cmp %rax, %rsi
ja L(align_more)
# else
jz L(align_more)
# endif
bsf %VRAX, %VRAX
L(page_cross_end):
bsf %VRDX, %VRAX
# ifdef USE_AS_STRNLEN
cmpq %rsi, %rax
cmovnb %rsi, %rax
cmovnb %esi, %eax
# endif
ret
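
For the strnlen variant, the header-path hunk near the top is the densest part of the change: preload rax with the length limit, let bsf overwrite it only when the first vector contains a NUL, then clamp. The following is a rough C model of that decision, a sketch under stated assumptions rather than glibc's implementation: CHAR_PER_VEC is taken as 64 (evex512, byte characters), the helper and function names are made up, __builtin_ctzll stands in for bsf, and the page-cross entry that the real code branches to first is ignored.

/* Minimal C model, not glibc code.  */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

#define CHAR_PER_VEC 64   /* assumed: evex512 with byte characters */

/* Model of VPCMPEQ + KMOV on the first vector: bit i is set when
   s[i] is a NUL byte.  */
static uint64_t
nul_mask_first_vec (const char *s)
{
  uint64_t mask = 0;
  for (int i = 0; i < CHAR_PER_VEC; i++)
    if (s[i] == '\0')
      mask |= (uint64_t) 1 << i;
  return mask;
}

static size_t
my_strnlen (const char *s, size_t maxlen)
{
  uint64_t mask = nul_mask_first_vec (s);

  /* "mov %rsi, %rax; bsfq %rcx, %rax": preload the result with maxlen
     and overwrite it with the first NUL index only when the mask is
     non-zero (the assembly relies on bsf leaving its destination
     untouched for a zero source).  */
  size_t len = maxlen;
  if (mask != 0)
    len = (size_t) __builtin_ctzll (mask);

  /* "cmp $CHAR_PER_VEC, %rax; ja L(align_more)": neither a NUL nor the
     length limit lies in the first vector, so the real code enters the
     aligned 4x-vector loop; a plain scalar scan stands in for it.  */
  if (len > CHAR_PER_VEC)
    {
      size_t i = CHAR_PER_VEC;
      while (i < maxlen && s[i] != '\0')
        i++;
      return i;
    }

  /* "cmpq %rax, %rsi; cmovb %esi, %eax": clamp the result to maxlen.  */
  return maxlen < len ? maxlen : len;
}

int
main (void)
{
  char buf[256];
  memset (buf, 'x', sizeof buf);
  buf[10] = '\0';
  printf ("%zu %zu %zu\n",
          my_strnlen (buf, 100),        /* 10: NUL in the first vector  */
          my_strnlen (buf, 5),          /* 5: length limit wins         */
          my_strnlen (buf + 11, 200));  /* 200: decided past first vec  */
  return 0;
}

Running this prints "10 5 200": a NUL inside the first vector, the length limit winning, and the fall-through case that the real code hands to the aligned loop.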