mirror of git://sourceware.org/git/glibc.git
aarch64: Optimized strlen for strlen_asimd
Optimize the strlen implementation by using vector operations and loop unrolling in the main loop. Compared to __strlen_generic, it reduces latency of cases in bench-strlen by 7%~18% when the length of src is greater than 128 bytes, with gains throughout the benchmark. Checked on aarch64-linux-gnu. Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
parent
0db8e7b366
commit
c2150769d0
|
@ -34,7 +34,9 @@ extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
|
||||||
extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
|
extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
|
||||||
|
|
||||||
libc_ifunc (__strlen,
|
libc_ifunc (__strlen,
|
||||||
(USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
|
(USE_ASIMD_STRLEN () || IS_KUNPENG(midr)
|
||||||
|
? __strlen_asimd
|
||||||
|
:__strlen_generic));
|
||||||
|
|
||||||
# undef strlen
|
# undef strlen
|
||||||
strong_alias (__strlen, strlen);
|
strong_alias (__strlen, strlen);
|
||||||
|
|
|
@ -48,6 +48,9 @@
|
||||||
#define dataq2 q3
|
#define dataq2 q3
|
||||||
#define datav2 v3
|
#define datav2 v3
|
||||||
|
|
||||||
|
#define REP8_01 0x0101010101010101
|
||||||
|
#define REP8_7f 0x7f7f7f7f7f7f7f7f
|
||||||
|
|
||||||
#ifdef TEST_PAGE_CROSS
|
#ifdef TEST_PAGE_CROSS
|
||||||
# define MIN_PAGE_SIZE 16
|
# define MIN_PAGE_SIZE 16
|
||||||
#else
|
#else
|
||||||
|
@ -82,40 +85,47 @@ ENTRY_ALIGN (__strlen_asimd, 6)
|
||||||
DELOUSE (0)
|
DELOUSE (0)
|
||||||
DELOUSE (1)
|
DELOUSE (1)
|
||||||
and tmp1, srcin, MIN_PAGE_SIZE - 1
|
and tmp1, srcin, MIN_PAGE_SIZE - 1
|
||||||
|
mov zeroones, REP8_01
|
||||||
cmp tmp1, MIN_PAGE_SIZE - 16
|
cmp tmp1, MIN_PAGE_SIZE - 16
|
||||||
b.gt L(page_cross)
|
b.gt L(page_cross)
|
||||||
ldr dataq, [srcin]
|
ldp data1, data2, [srcin]
|
||||||
#ifdef __AARCH64EB__
|
#ifdef __AARCH64EB__
|
||||||
rev64 datav.16b, datav.16b
|
rev data1, data1
|
||||||
|
rev data2, data2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/* Get the minimum value and keep going if it is not zero. */
|
sub tmp1, data1, zeroones
|
||||||
uminv datab2, datav.16b
|
orr tmp2, data1, REP8_7f
|
||||||
mov tmp1, datav2.d[0]
|
sub tmp3, data2, zeroones
|
||||||
cbnz tmp1, L(main_loop_entry)
|
orr tmp4, data2, REP8_7f
|
||||||
|
bics has_nul1, tmp1, tmp2
|
||||||
cmeq datav.16b, datav.16b, #0
|
bic has_nul2, tmp3, tmp4
|
||||||
mov data1, datav.d[0]
|
ccmp has_nul2, 0, 0, eq
|
||||||
mov data2, datav.d[1]
|
beq L(main_loop_entry)
|
||||||
cmp data1, 0
|
csel has_nul1, has_nul1, has_nul2, cc
|
||||||
csel data1, data1, data2, ne
|
|
||||||
mov len, 8
|
mov len, 8
|
||||||
rev data1, data1
|
rev has_nul1, has_nul1
|
||||||
clz tmp1, data1
|
clz tmp1, has_nul1
|
||||||
csel len, xzr, len, ne
|
csel len, xzr, len, cc
|
||||||
add len, len, tmp1, lsr 3
|
add len, len, tmp1, lsr 3
|
||||||
ret
|
ret
|
||||||
|
|
||||||
L(main_loop_entry):
|
L(main_loop_entry):
|
||||||
bic src, srcin, 15
|
bic src, srcin, 15
|
||||||
|
sub src, src, 16
|
||||||
|
|
||||||
L(main_loop):
|
L(main_loop):
|
||||||
ldr dataq, [src, 16]!
|
ldr dataq, [src, 32]!
|
||||||
L(page_cross_entry):
|
L(page_cross_entry):
|
||||||
/* Get the minimum value and keep going if it is not zero. */
|
/* Get the minimum value and keep going if it is not zero. */
|
||||||
uminv datab2, datav.16b
|
uminv datab2, datav.16b
|
||||||
mov tmp1, datav2.d[0]
|
mov tmp1, datav2.d[0]
|
||||||
|
cbz tmp1, L(tail)
|
||||||
|
ldr dataq, [src, 16]
|
||||||
|
uminv datab2, datav.16b
|
||||||
|
mov tmp1, datav2.d[0]
|
||||||
cbnz tmp1, L(main_loop)
|
cbnz tmp1, L(main_loop)
|
||||||
|
add src, src, 16
|
||||||
|
|
||||||
L(tail):
|
L(tail):
|
||||||
#ifdef __AARCH64EB__
|
#ifdef __AARCH64EB__
|
||||||
|
|
Loading…
Reference in New Issue