2021-03-05 14:24:52 +00:00
|
|
|
/* memchr/wmemchr optimized with 256-bit EVEX instructions.
|
2024-01-01 18:12:26 +00:00
|
|
|
Copyright (C) 2021-2024 Free Software Foundation, Inc.
|
2021-03-05 14:24:52 +00:00
|
|
|
This file is part of the GNU C Library.
|
|
|
|
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
|
|
modify it under the terms of the GNU Lesser General Public
|
|
|
|
License as published by the Free Software Foundation; either
|
|
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
Lesser General Public License for more details.
|
|
|
|
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
|
|
License along with the GNU C Library; if not, see
|
|
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
|
2022-06-22 23:51:20 +00:00
|
|
|
#include <isa-level.h>
|
|
|
|
#include <sysdep.h>
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-06-22 23:51:20 +00:00
|
|
|
#if ISA_SHOULD_BUILD (4)
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# ifndef VEC_SIZE
|
|
|
|
# include "x86-evex256-vecs.h"
|
|
|
|
# endif
|
|
|
|
|
2021-03-05 14:24:52 +00:00
|
|
|
# ifndef MEMCHR
|
|
|
|
# define MEMCHR __memchr_evex
|
|
|
|
# endif
|
|
|
|
|
|
|
|
# ifdef USE_AS_WMEMCHR
|
2022-10-19 00:44:03 +00:00
|
|
|
# define PC_SHIFT_GPR rcx
|
|
|
|
# define VPTESTN vptestnmd
|
2021-03-05 14:24:52 +00:00
|
|
|
# define VPBROADCAST vpbroadcastd
|
2021-05-03 07:03:19 +00:00
|
|
|
# define VPMINU vpminud
|
|
|
|
# define VPCMP vpcmpd
|
|
|
|
# define VPCMPEQ vpcmpeqd
|
|
|
|
# define CHAR_SIZE 4
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
# define USE_WIDE_CHAR
|
2021-03-05 14:24:52 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
# define PC_SHIFT_GPR rdi
|
|
|
|
# define VPTESTN vptestnmb
|
2021-03-05 14:24:52 +00:00
|
|
|
# define VPBROADCAST vpbroadcastb
|
2021-05-03 07:03:19 +00:00
|
|
|
# define VPMINU vpminub
|
|
|
|
# define VPCMP vpcmpb
|
|
|
|
# define VPCMPEQ vpcmpeqb
|
|
|
|
# define CHAR_SIZE 1
|
2021-03-05 14:24:52 +00:00
|
|
|
# endif
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# include "reg-macros.h"
|
|
|
|
|
|
|
|
|
|
|
|
/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64
|
|
|
|
doesn't have VEX encoding), use VEX encoding in loop so we
|
|
|
|
can use vpcmpeqb + vptern which is more efficient than the
|
|
|
|
EVEX alternative. */
|
|
|
|
# if defined USE_IN_RTM || VEC_SIZE == 64
|
|
|
|
# undef COND_VZEROUPPER
|
|
|
|
# undef VZEROUPPER_RETURN
|
|
|
|
# undef VZEROUPPER
|
|
|
|
|
|
|
|
# define COND_VZEROUPPER
|
|
|
|
# define VZEROUPPER_RETURN ret
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# define VZEROUPPER
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
# define USE_TERN_IN_LOOP 0
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
# define USE_TERN_IN_LOOP 1
|
|
|
|
# undef VZEROUPPER
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# define VZEROUPPER vzeroupper
|
|
|
|
# endif
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# if USE_TERN_IN_LOOP
|
|
|
|
/* Resulting bitmask for vpmovmskb has 4-bits set for each wchar
|
|
|
|
so we don't want to multiply resulting index. */
|
|
|
|
# define TERN_CHAR_MULT 1
|
|
|
|
|
|
|
|
# ifdef USE_AS_WMEMCHR
|
|
|
|
# define TEST_END() inc %VRCX
|
|
|
|
# else
|
|
|
|
# define TEST_END() add %rdx, %rcx
|
|
|
|
# endif
|
2021-05-03 07:03:19 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
# define TERN_CHAR_MULT CHAR_SIZE
|
|
|
|
# define TEST_END() KORTEST %k2, %k3
|
2021-05-03 07:03:19 +00:00
|
|
|
# endif
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
|
|
|
|
# ifndef USE_AS_WMEMCHR
|
|
|
|
# define GPR_X0_IS_RET 1
|
|
|
|
# else
|
|
|
|
# define GPR_X0_IS_RET 0
|
|
|
|
# endif
|
|
|
|
# define GPR_X0 rax
|
|
|
|
# else
|
|
|
|
# define GPR_X0_IS_RET 0
|
|
|
|
# define GPR_X0 rdx
|
|
|
|
# endif
|
|
|
|
|
|
|
|
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# if CHAR_PER_VEC == 64
|
|
|
|
# define LAST_VEC_OFFSET (VEC_SIZE * 3)
|
|
|
|
# else
|
|
|
|
# define LAST_VEC_OFFSET (VEC_SIZE * 2)
|
|
|
|
# endif
|
|
|
|
# if CHAR_PER_VEC >= 32
|
|
|
|
# define MASK_GPR(...) VGPR(__VA_ARGS__)
|
|
|
|
# elif CHAR_PER_VEC == 16
|
|
|
|
# define MASK_GPR(reg) VGPR_SZ(reg, 16)
|
|
|
|
# else
|
|
|
|
# define MASK_GPR(reg) VGPR_SZ(reg, 8)
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# endif
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# define VMATCH VMM(0)
|
|
|
|
# define VMATCH_LO VMM_lo(0)
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# define PAGE_SIZE 4096
|
|
|
|
|
|
|
|
|
|
|
|
.section SECTION(.text), "ax", @progbits
|
2022-06-07 04:11:34 +00:00
|
|
|
ENTRY_P2ALIGN (MEMCHR, 6)
|
2021-03-05 14:24:52 +00:00
|
|
|
/* Check for zero length. */
|
|
|
|
test %RDX_LP, %RDX_LP
|
2022-10-19 00:44:03 +00:00
|
|
|
jz L(zero_0)
|
2021-05-03 07:03:19 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# ifdef __ILP32__
|
2021-03-05 14:24:52 +00:00
|
|
|
/* Clear the upper 32 bits. */
|
|
|
|
movl %edx, %edx
|
|
|
|
# endif
|
2022-10-19 00:44:03 +00:00
|
|
|
VPBROADCAST %esi, %VMATCH
|
2021-03-05 14:24:52 +00:00
|
|
|
/* Check if we may cross page boundary with one vector load. */
|
2021-05-03 07:03:19 +00:00
|
|
|
movl %edi, %eax
|
|
|
|
andl $(PAGE_SIZE - 1), %eax
|
|
|
|
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
2022-10-19 00:44:03 +00:00
|
|
|
ja L(page_cross)
|
|
|
|
|
|
|
|
VPCMPEQ (%rdi), %VMATCH, %k0
|
|
|
|
KMOV %k0, %VRAX
|
|
|
|
# ifndef USE_AS_WMEMCHR
|
|
|
|
/* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
   already a dependency between rax and rsi so no worries about
   false-dep here.  */
|
|
|
|
tzcnt %VRAX, %VRSI
|
|
|
|
/* If rdx <= rsi then either 1) rax was non-zero (there was a
   match) but it was out of bounds or 2) rax was zero and rdx
   was <= VEC_SIZE so we are done scanning.  */
|
|
|
|
cmpq %rsi, %rdx
|
|
|
|
/* NB: Use branch to return zero/non-zero. Common usage will
|
|
|
|
branch on result of function (if return is null/non-null).
|
|
|
|
This branch can be used to predict the ensuing one so there
|
|
|
|
is no reason to extend the data-dependency with cmovcc. */
|
|
|
|
jbe L(zero_0)
|
|
|
|
|
|
|
|
/* If rax is zero then len must be > RDX, otherwise since we
   already tested len vs tzcnt(rax) (in rsi) we are good to
   return this match.  */
|
|
|
|
test %VRAX, %VRAX
|
|
|
|
jz L(more_1x_vec)
|
|
|
|
leaq (%rdi, %rsi), %rax
|
|
|
|
# else
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
   > 1, so when rax is zero, tzcnt(rax) == VEC_SIZE which is not
   equal to CHAR_PER_VEC.  */
|
2021-05-03 07:03:19 +00:00
|
|
|
cmpq $CHAR_PER_VEC, %rdx
|
2022-10-19 00:44:03 +00:00
|
|
|
ja L(more_1x_vec)
|
|
|
|
tzcnt %VRAX, %VRAX
|
|
|
|
cmpl %eax, %edx
|
|
|
|
jbe L(zero_0)
|
|
|
|
L(first_vec_x0_ret):
|
2021-05-03 07:03:19 +00:00
|
|
|
leaq (%rdi, %rax, CHAR_SIZE), %rax
|
2021-03-05 14:24:52 +00:00
|
|
|
# endif
|
2021-05-03 07:03:19 +00:00
|
|
|
ret
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* Only fits in first cache line for VEC_SIZE == 32. */
|
|
|
|
# if VEC_SIZE == 32
|
|
|
|
.p2align 4,, 2
|
|
|
|
L(zero_0):
|
2022-06-07 04:11:34 +00:00
|
|
|
xorl %eax, %eax
|
2021-05-03 07:03:19 +00:00
|
|
|
ret
|
|
|
|
# endif
|
2022-06-07 04:11:34 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
.p2align 4,, 9
|
|
|
|
L(more_1x_vec):
|
2021-03-05 14:24:52 +00:00
|
|
|
# ifdef USE_AS_WMEMCHR
|
2022-10-19 00:44:03 +00:00
|
|
|
/* For wmemchr we still need to test if there was a match in
   the first VEC.  Use bsf to test here so we can reuse
   L(first_vec_x0_ret).  */
|
|
|
|
bsf %VRAX, %VRAX
|
|
|
|
jnz L(first_vec_x0_ret)
|
2021-03-05 14:24:52 +00:00
|
|
|
# endif
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
L(page_cross_continue):
|
2021-03-05 14:24:52 +00:00
|
|
|
# ifdef USE_AS_WMEMCHR
|
2022-10-19 00:44:03 +00:00
|
|
|
/* We can't use end of the buffer to re-calculate length for
|
|
|
|
wmemchr as len * CHAR_SIZE may overflow. */
|
|
|
|
leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
|
|
|
|
andq $(VEC_SIZE * -1), %rdi
|
|
|
|
subq %rdi, %rax
|
|
|
|
sarq $2, %rax
|
|
|
|
addq %rdx, %rax
|
|
|
|
# else
|
|
|
|
leaq -(VEC_SIZE + 1)(%rdx, %rdi), %rax
|
|
|
|
andq $(VEC_SIZE * -1), %rdi
|
|
|
|
subq %rdi, %rax
|
2021-05-03 07:03:19 +00:00
|
|
|
# endif
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
/* rax contains remaining length - 1. -1 so we can get imm8
|
|
|
|
encoding in a few additional places saving code size. */
|
|
|
|
|
|
|
|
/* Needed regardless of remaining length. */
|
|
|
|
VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
|
|
|
|
KMOV %k0, %VRDX
|
|
|
|
|
|
|
|
/* We cannot fold the above `sub %rdi, %rax` with the `cmp
   $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
   large length to overflow and cause the subtract to carry
   despite length being above CHAR_PER_VEC * 2.  */
|
|
|
|
cmpq $(CHAR_PER_VEC * 2 - 1), %rax
|
|
|
|
ja L(more_2x_vec)
|
|
|
|
L(last_2x_vec):
|
|
|
|
|
|
|
|
test %VRDX, %VRDX
|
|
|
|
jnz L(first_vec_x1_check)
|
|
|
|
|
|
|
|
/* Check the end of data. NB: use 8-bit operations to save code
|
|
|
|
size. We no longer need the full-width of eax and will
|
|
|
|
perform a write-only operation over eax so there will be no
|
|
|
|
partial-register stalls. */
|
|
|
|
subb $(CHAR_PER_VEC * 1 - 1), %al
|
|
|
|
jle L(zero_0)
|
|
|
|
|
|
|
|
VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
|
|
|
|
KMOV %k0, %VRCX
|
2021-05-03 07:03:19 +00:00
|
|
|
# ifdef USE_AS_WMEMCHR
|
2022-10-19 00:44:03 +00:00
|
|
|
/* For wmemchr, again we can't take advantage of tzcnt(0) ==
   VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
|
|
|
|
test %VRCX, %VRCX
|
|
|
|
jz L(zero_0)
|
|
|
|
# endif
|
|
|
|
tzcnt %VRCX, %VRCX
|
|
|
|
cmp %cl, %al
|
|
|
|
|
|
|
|
/* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32. We give
|
|
|
|
fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
|
|
|
|
not enough space before the next cache line to fit the `lea`
|
|
|
|
for return. */
|
|
|
|
# if VEC_SIZE == 64
|
|
|
|
ja L(first_vec_x2_ret)
|
|
|
|
L(zero_0):
|
|
|
|
xorl %eax, %eax
|
|
|
|
ret
|
2021-05-03 07:03:19 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
jbe L(zero_0)
|
|
|
|
leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
|
|
|
|
ret
|
2021-03-05 14:24:52 +00:00
|
|
|
# endif
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
.p2align 4,, 5
|
|
|
|
L(first_vec_x1_check):
|
|
|
|
bsf %VRDX, %VRDX
|
|
|
|
cmpb %dl, %al
|
|
|
|
jb L(zero_4)
|
|
|
|
leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
|
2021-03-05 14:24:52 +00:00
|
|
|
ret
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* Fits at the end of the cache line here for VEC_SIZE == 32.
|
|
|
|
*/
|
|
|
|
# if VEC_SIZE == 32
|
|
|
|
L(zero_4):
|
|
|
|
xorl %eax, %eax
|
2021-05-03 07:03:19 +00:00
|
|
|
ret
|
2022-10-19 00:44:03 +00:00
|
|
|
# endif
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
.p2align 4,, 4
|
2021-05-03 07:03:19 +00:00
|
|
|
L(first_vec_x2):
|
2022-10-19 00:44:03 +00:00
|
|
|
bsf %VRCX, %VRCX
|
|
|
|
L(first_vec_x2_ret):
|
|
|
|
leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
|
2021-05-03 07:03:19 +00:00
|
|
|
ret
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* Fits at the end of the cache line here for VEC_SIZE == 64.
|
|
|
|
*/
|
|
|
|
# if VEC_SIZE == 64
|
|
|
|
L(zero_4):
|
|
|
|
xorl %eax, %eax
|
2021-05-03 07:03:19 +00:00
|
|
|
ret
|
2022-10-19 00:44:03 +00:00
|
|
|
# endif
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
.p2align 4,, 4
|
|
|
|
L(first_vec_x1):
|
|
|
|
bsf %VRDX, %VRDX
|
|
|
|
leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
|
2021-05-03 07:03:19 +00:00
|
|
|
ret
|
2021-03-05 14:24:52 +00:00
|
|
|
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
.p2align 4,, 5
|
|
|
|
L(more_2x_vec):
|
|
|
|
/* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
|
|
|
|
length. */
|
|
|
|
|
|
|
|
|
|
|
|
/* Already computed matches for first VEC in rdx. */
|
|
|
|
test %VRDX, %VRDX
|
2021-03-05 14:24:52 +00:00
|
|
|
jnz L(first_vec_x1)
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
|
|
|
|
KMOV %k0, %VRCX
|
|
|
|
test %VRCX, %VRCX
|
2021-03-05 14:24:52 +00:00
|
|
|
jnz L(first_vec_x2)
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* Needed regardless of next length check. */
|
|
|
|
VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
|
|
|
|
KMOV %k0, %VRCX
|
|
|
|
|
|
|
|
/* Check if we are near the end. */
|
|
|
|
cmpq $(CHAR_PER_VEC * 4 - 1), %rax
|
|
|
|
ja L(more_4x_vec)
|
|
|
|
|
|
|
|
test %VRCX, %VRCX
|
|
|
|
jnz L(first_vec_x3_check)
|
|
|
|
|
|
|
|
/* Use 8-bit instructions to save code size. We won't use full-
|
|
|
|
width eax again and will perform a write-only operation to
|
|
|
|
eax so no worries about partial-register stalls. */
|
|
|
|
subb $(CHAR_PER_VEC * 3), %al
|
|
|
|
jb L(zero_2)
|
|
|
|
L(last_vec_check):
|
|
|
|
VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
|
|
|
|
KMOV %k0, %VRCX
|
|
|
|
# ifdef USE_AS_WMEMCHR
|
|
|
|
/* For wmemchr, again we can't take advantage of tzcnt(0) ==
   VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
|
|
|
|
test %VRCX, %VRCX
|
|
|
|
jz L(zero_2)
|
|
|
|
# endif
|
|
|
|
tzcnt %VRCX, %VRCX
|
|
|
|
cmp %cl, %al
|
|
|
|
jae L(first_vec_x4_ret)
|
|
|
|
L(zero_2):
|
|
|
|
xorl %eax, %eax
|
|
|
|
ret
|
|
|
|
|
|
|
|
/* Fits at the end of the cache line here for VEC_SIZE == 64.
|
|
|
|
For VEC_SIZE == 32 we put the return label at the end of
|
|
|
|
L(first_vec_x4). */
|
|
|
|
# if VEC_SIZE == 64
|
|
|
|
L(first_vec_x4_ret):
|
|
|
|
leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
|
|
|
|
ret
|
|
|
|
# endif
|
|
|
|
|
|
|
|
.p2align 4,, 6
|
|
|
|
L(first_vec_x4):
|
|
|
|
bsf %VRCX, %VRCX
|
|
|
|
# if VEC_SIZE == 32
|
|
|
|
/* Place L(first_vec_x4_ret) here as we can't fit it in the same
|
|
|
|
cache line as where it is called from so we might as well
|
|
|
|
save code size by reusing return of L(first_vec_x4). */
|
|
|
|
L(first_vec_x4_ret):
|
|
|
|
# endif
|
|
|
|
leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
|
|
|
|
ret
|
|
|
|
|
|
|
|
.p2align 4,, 6
|
|
|
|
L(first_vec_x3_check):
|
|
|
|
/* Need to adjust remaining length before checking. */
|
|
|
|
addb $-(CHAR_PER_VEC * 2), %al
|
|
|
|
bsf %VRCX, %VRCX
|
|
|
|
cmpb %cl, %al
|
|
|
|
jb L(zero_2)
|
|
|
|
leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
|
|
|
|
ret
|
|
|
|
|
|
|
|
.p2align 4,, 6
|
|
|
|
L(first_vec_x3):
|
|
|
|
bsf %VRCX, %VRCX
|
|
|
|
leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
|
|
|
|
ret
|
|
|
|
|
|
|
|
.p2align 4,, 3
|
|
|
|
# if !USE_TERN_IN_LOOP
|
|
|
|
.p2align 4,, 10
|
|
|
|
# endif
|
|
|
|
L(more_4x_vec):
|
|
|
|
test %VRCX, %VRCX
|
2021-03-05 14:24:52 +00:00
|
|
|
jnz L(first_vec_x3)
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
|
|
|
|
KMOV %k0, %VRCX
|
|
|
|
test %VRCX, %VRCX
|
2021-05-03 07:03:19 +00:00
|
|
|
jnz L(first_vec_x4)
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
subq $-(VEC_SIZE * 5), %rdi
|
|
|
|
subq $(CHAR_PER_VEC * 8), %rax
|
|
|
|
jb L(last_4x_vec)
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# ifdef USE_AS_WMEMCHR
|
2021-05-03 07:03:19 +00:00
|
|
|
movl %edi, %ecx
|
2022-10-19 00:44:03 +00:00
|
|
|
# else
|
|
|
|
addq %rdi, %rax
|
|
|
|
# endif
|
|
|
|
|
|
|
|
|
|
|
|
# if VEC_SIZE == 64
|
|
|
|
/* use xorb to do `andq $-(VEC_SIZE * 4), %rdi`. No evex
|
|
|
|
processor has partial register stalls (all have merging
|
|
|
|
uop). If that changes this can be removed. */
|
|
|
|
xorb %dil, %dil
|
|
|
|
# else
|
|
|
|
andq $-(VEC_SIZE * 4), %rdi
|
|
|
|
# endif
|
|
|
|
|
|
|
|
# ifdef USE_AS_WMEMCHR
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
subl %edi, %ecx
|
2021-05-03 07:03:19 +00:00
|
|
|
sarl $2, %ecx
|
2022-10-19 00:44:03 +00:00
|
|
|
addq %rcx, %rax
|
2021-05-03 07:03:19 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
subq %rdi, %rax
|
2021-03-05 14:24:52 +00:00
|
|
|
# endif
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# if USE_TERN_IN_LOOP
|
|
|
|
/* copy VMATCH to low ymm so we can use vpcmpeq which is not
|
|
|
|
encodable with EVEX registers. NB: this is VEC_SIZE == 32
|
|
|
|
only as there is no way to encode vpcmpeq with zmm0-15. */
|
|
|
|
vmovdqa64 %VMATCH, %VMATCH_LO
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# endif
|
2021-05-03 07:03:19 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
.p2align 4,, 11
|
2021-03-05 14:24:52 +00:00
|
|
|
L(loop_4x_vec):
|
2022-10-19 00:44:03 +00:00
|
|
|
/* Two versions of the loop.  One that does not require
   vzeroupper by not using ymm0-15 and another that does
   require vzeroupper because it uses ymm0-15.  The reason why
   ymm0-15 is used at all is because there is no EVEX encoding
   of vpcmpeq, and with vpcmpeq this loop can be performed more
   efficiently.  The non-vzeroupper version is safe for RTM
   while the vzeroupper version should be preferred if RTM is
   not supported.  Which loop version we use is determined by
   USE_TERN_IN_LOOP.  */
|
|
|
|
|
|
|
|
# if USE_TERN_IN_LOOP
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
/* Since vptern can only take 3x vectors, it is fastest to do
   1 vec separately with EVEX vpcmp.  */
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# ifdef USE_AS_WMEMCHR
|
|
|
|
/* vptern can only accept masks for epi32/epi64, so we can only
   save an instruction by using a not-equals mask on vptern
   with wmemchr.  */
|
|
|
|
VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# endif
|
|
|
|
/* Compare 3x with vpcmpeq and or them all together with vptern.
|
|
|
|
*/
|
2022-10-19 00:44:03 +00:00
|
|
|
VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
|
|
|
|
VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
|
|
|
|
VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# ifdef USE_AS_WMEMCHR
|
2022-10-19 00:44:03 +00:00
|
|
|
/* This takes the not of or between VEC_lo(2), VEC_lo(3),
|
|
|
|
VEC_lo(4) as well as combines result from VEC(0) with zero
|
|
|
|
mask. */
|
|
|
|
vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
|
|
|
|
vpmovmskb %VMM_lo(4), %VRCX
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
/* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
|
|
|
|
VEC_lo(4). */
|
|
|
|
vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
|
|
|
|
vpmovmskb %VMM_lo(4), %VRCX
|
|
|
|
KMOV %k1, %edx
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# endif
|
|
|
|
|
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
/* Loop version that uses EVEX encoding. */
|
|
|
|
VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
|
|
|
|
vpxorq (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
|
|
|
|
vpxorq (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
|
|
|
|
VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3
|
|
|
|
VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z}
|
|
|
|
VPTESTN %VMM(3), %VMM(3), %k2
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# endif
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
|
|
|
|
TEST_END ()
|
|
|
|
jnz L(loop_vec_ret)
|
2021-05-03 07:03:19 +00:00
|
|
|
|
|
|
|
subq $-(VEC_SIZE * 4), %rdi
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
subq $(CHAR_PER_VEC * 4), %rax
|
|
|
|
jae L(loop_4x_vec)
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
|
2021-05-03 07:03:19 +00:00
|
|
|
*/
|
2022-10-19 00:44:03 +00:00
|
|
|
COND_VZEROUPPER
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
.p2align 4,, 10
|
|
|
|
L(last_4x_vec):
|
|
|
|
/* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
|
|
|
|
instructions on eax from here on out. */
|
|
|
|
# if CHAR_PER_VEC != 64
|
|
|
|
andl $(CHAR_PER_VEC * 4 - 1), %eax
|
|
|
|
# endif
|
|
|
|
VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0
|
|
|
|
subq $(VEC_SIZE * 1), %rdi
|
|
|
|
KMOV %k0, %VRDX
|
|
|
|
cmpb $(CHAR_PER_VEC * 2 - 1), %al
|
|
|
|
jbe L(last_2x_vec)
|
|
|
|
test %VRDX, %VRDX
|
|
|
|
jnz L(last_vec_x1_novzero)
|
|
|
|
|
|
|
|
VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
|
|
|
|
KMOV %k0, %VRDX
|
|
|
|
test %VRDX, %VRDX
|
|
|
|
jnz L(last_vec_x2_novzero)
|
|
|
|
|
|
|
|
VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
|
|
|
|
KMOV %k0, %VRCX
|
|
|
|
test %VRCX, %VRCX
|
|
|
|
jnz L(first_vec_x3_check)
|
|
|
|
|
|
|
|
subb $(CHAR_PER_VEC * 3), %al
|
|
|
|
jae L(last_vec_check)
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-06-07 04:11:34 +00:00
|
|
|
xorl %eax, %eax
|
|
|
|
ret
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
|
|
|
|
L(last_vec_x2_novzero):
|
|
|
|
addq $VEC_SIZE, %rdi
|
|
|
|
L(last_vec_x1_novzero):
|
|
|
|
bsf %VRDX, %VRDX
|
|
|
|
leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
|
2021-05-03 07:03:19 +00:00
|
|
|
ret
|
2022-10-19 00:44:03 +00:00
|
|
|
# endif
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# if CHAR_PER_VEC == 64
|
|
|
|
/* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
|
2023-05-23 03:57:01 +00:00
|
|
|
64 it needs a separate return label. */
|
2022-10-19 00:44:03 +00:00
|
|
|
.p2align 4,, 4
|
|
|
|
L(last_vec_x2):
|
|
|
|
L(last_vec_x2_novzero):
|
|
|
|
bsf %VRDX, %VRDX
|
|
|
|
leaq (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
|
|
|
|
ret
|
2021-05-03 07:03:19 +00:00
|
|
|
# endif
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
.p2align 4,, 4
|
|
|
|
L(loop_vec_ret):
|
|
|
|
# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
|
|
|
|
KMOV %k1, %VRAX
|
|
|
|
inc %MASK_GPR(rax)
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
test %VRDX, %VRDX
|
2021-05-03 07:03:19 +00:00
|
|
|
# endif
|
2022-10-19 00:44:03 +00:00
|
|
|
jnz L(last_vec_x0)
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
# if USE_TERN_IN_LOOP
|
|
|
|
vpmovmskb %VMM_lo(2), %VRDX
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
VPTESTN %VMM(2), %VMM(2), %k1
|
|
|
|
KMOV %k1, %VRDX
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# endif
|
2022-10-19 00:44:03 +00:00
|
|
|
test %VRDX, %VRDX
|
|
|
|
jnz L(last_vec_x1)
|
2021-03-05 14:24:52 +00:00
|
|
|
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# if USE_TERN_IN_LOOP
|
|
|
|
vpmovmskb %VMM_lo(3), %VRDX
|
2021-05-03 07:03:19 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
KMOV %k2, %VRDX
|
2021-03-05 14:24:52 +00:00
|
|
|
# endif
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
|
|
|
|
(only if used VEX encoded loop). */
|
|
|
|
COND_VZEROUPPER
|
|
|
|
|
2023-05-23 03:57:01 +00:00
|
|
|
/* Separate logic for CHAR_PER_VEC == 64 vs the rest. For
|
|
|
|
CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
|
2022-10-19 00:44:03 +00:00
|
|
|
CHAR_PER_VEC <= 32 we can combine the results from the 2x
|
|
|
|
VEC in a single GPR. */
|
|
|
|
# if CHAR_PER_VEC == 64
|
|
|
|
# if USE_TERN_IN_LOOP
|
|
|
|
# error "Unsupported"
|
|
|
|
# endif
|
|
|
|
|
|
|
|
|
|
|
|
/* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */
|
|
|
|
test %VRDX, %VRDX
|
|
|
|
jnz L(last_vec_x2)
|
|
|
|
KMOV %k3, %VRDX
|
2021-05-03 07:03:19 +00:00
|
|
|
# else
|
2022-10-19 00:44:03 +00:00
|
|
|
/* CHAR_PER_VEC <= 32 so we can combine the results from the
|
|
|
|
last 2x VEC. */
|
|
|
|
|
|
|
|
# if !USE_TERN_IN_LOOP
|
|
|
|
KMOV %k3, %VRCX
|
|
|
|
# endif
|
|
|
|
salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx
|
|
|
|
addq %rcx, %rdx
|
|
|
|
# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
|
|
|
|
L(last_vec_x2_novzero):
|
|
|
|
# endif
|
2021-03-05 14:24:52 +00:00
|
|
|
# endif
|
2022-10-19 00:44:03 +00:00
|
|
|
bsf %rdx, %rdx
|
|
|
|
leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
|
2021-03-05 14:24:52 +00:00
|
|
|
ret
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
.p2align 4,, 8
|
|
|
|
L(last_vec_x1):
|
|
|
|
COND_VZEROUPPER
|
|
|
|
# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
|
|
|
|
L(last_vec_x1_novzero):
|
|
|
|
# endif
|
|
|
|
bsf %VRDX, %VRDX
|
|
|
|
leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
|
2021-03-05 14:24:52 +00:00
|
|
|
ret
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
.p2align 4,, 4
|
|
|
|
L(last_vec_x0):
|
|
|
|
COND_VZEROUPPER
|
|
|
|
bsf %VGPR(GPR_X0), %VGPR(GPR_X0)
|
|
|
|
# if GPR_X0_IS_RET
|
|
|
|
addq %rdi, %rax
|
|
|
|
# else
|
|
|
|
leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax
|
|
|
|
# endif
|
2021-03-05 14:24:52 +00:00
|
|
|
ret
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
.p2align 4,, 6
|
|
|
|
L(page_cross):
|
|
|
|
/* Need to preserve eax to compute inbound bytes we are
|
|
|
|
checking. */
|
|
|
|
# ifdef USE_AS_WMEMCHR
|
|
|
|
movl %eax, %ecx
|
|
|
|
# else
|
|
|
|
xorl %ecx, %ecx
|
|
|
|
subl %eax, %ecx
|
x86: Add EVEX optimized memchr family not safe for RTM
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
2021-05-04 23:02:40 +00:00
|
|
|
# endif
|
2021-05-03 07:03:19 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
xorq %rdi, %rax
|
|
|
|
VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
|
|
|
|
KMOV %k0, %VRAX
|
2021-05-03 07:03:19 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# ifdef USE_AS_WMEMCHR
|
|
|
|
/* NB: Divide by CHAR_SIZE to shift out out-of-bounds bytes. */
|
|
|
|
shrl $2, %ecx
|
|
|
|
andl $(CHAR_PER_VEC - 1), %ecx
|
|
|
|
# endif
|
2021-05-03 07:03:19 +00:00
|
|
|
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
|
2021-05-03 07:03:19 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
# ifdef USE_AS_WMEMCHR
|
|
|
|
negl %ecx
|
|
|
|
# endif
|
2021-05-03 07:03:19 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* mask lower bits from ecx (negative eax) to get bytes till
|
|
|
|
next VEC. */
|
|
|
|
andl $(CHAR_PER_VEC - 1), %ecx
|
2021-05-03 07:03:19 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* Check if VEC is entirely contained in the remainder of the
|
|
|
|
page. */
|
|
|
|
cmpq %rcx, %rdx
|
|
|
|
jbe L(page_cross_ret)
|
2021-05-03 07:03:19 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* Length crosses the page so if rax is zero (no matches)
|
|
|
|
continue. */
|
|
|
|
test %VRAX, %VRAX
|
|
|
|
jz L(page_cross_continue)
|
2021-03-05 14:24:52 +00:00
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
/* if rdx > rcx then any match here must be in [buf:buf + len].
|
|
|
|
*/
|
|
|
|
tzcnt %VRAX, %VRAX
|
|
|
|
# ifdef USE_AS_WMEMCHR
|
|
|
|
leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
|
|
# else
|
|
|
|
addq %rdi, %rax
|
|
|
|
# endif
|
2021-03-05 14:24:52 +00:00
|
|
|
ret
|
|
|
|
|
2022-10-19 00:44:03 +00:00
|
|
|
.p2align 4,, 2
|
|
|
|
L(page_cross_zero):
|
|
|
|
xorl %eax, %eax
|
2021-03-05 14:24:52 +00:00
|
|
|
ret
|
2022-10-19 00:44:03 +00:00
|
|
|
|
|
|
|
.p2align 4,, 4
|
|
|
|
L(page_cross_ret):
|
|
|
|
/* Search is entirely contained in page cross case. */
|
|
|
|
# ifdef USE_AS_WMEMCHR
|
|
|
|
test %VRAX, %VRAX
|
|
|
|
jz L(page_cross_zero)
|
|
|
|
# endif
|
|
|
|
tzcnt %VRAX, %VRAX
|
|
|
|
cmpl %eax, %edx
|
|
|
|
jbe L(page_cross_zero)
|
|
|
|
# ifdef USE_AS_WMEMCHR
|
|
|
|
leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
|
|
# else
|
|
|
|
addq %rdi, %rax
|
2021-05-03 07:03:19 +00:00
|
|
|
# endif
|
2022-10-19 00:44:03 +00:00
|
|
|
ret
|
2021-03-05 14:24:52 +00:00
|
|
|
END (MEMCHR)
|
|
|
|
#endif
|