aarch64: Optimized strlen for strlen_asimd

Optimize the strlen implementation by using vector operations and
loop unrolling in the main loop. Compared to __strlen_generic, it
reduces the latency of cases in bench-strlen by 7%~18% when the length
of src is greater than 128 bytes, with gains throughout the benchmark.

Checked on aarch64-linux-gnu.

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
This commit is contained in:
Xuelei Zhang 2019-12-19 13:41:40 +00:00 committed by Adhemerval Zanella
parent 0db8e7b366
commit c2150769d0
2 changed files with 29 additions and 17 deletions

View File

@ -34,7 +34,9 @@ extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
libc_ifunc (__strlen,
(USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
(USE_ASIMD_STRLEN () || IS_KUNPENG(midr)
? __strlen_asimd
:__strlen_generic));
# undef strlen
strong_alias (__strlen, strlen);

View File

@ -48,6 +48,9 @@
#define dataq2 q3
#define datav2 v3
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#ifdef TEST_PAGE_CROSS
# define MIN_PAGE_SIZE 16
#else
@ -82,40 +85,47 @@ ENTRY_ALIGN (__strlen_asimd, 6)
DELOUSE (0)
DELOUSE (1)
and tmp1, srcin, MIN_PAGE_SIZE - 1
mov zeroones, REP8_01
cmp tmp1, MIN_PAGE_SIZE - 16
b.gt L(page_cross)
ldr dataq, [srcin]
ldp data1, data2, [srcin]
#ifdef __AARCH64EB__
rev64 datav.16b, datav.16b
rev data1, data1
rev data2, data2
#endif
/* Get the minimum value and keep going if it is not zero. */
uminv datab2, datav.16b
mov tmp1, datav2.d[0]
cbnz tmp1, L(main_loop_entry)
cmeq datav.16b, datav.16b, #0
mov data1, datav.d[0]
mov data2, datav.d[1]
cmp data1, 0
csel data1, data1, data2, ne
sub tmp1, data1, zeroones
orr tmp2, data1, REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, REP8_7f
bics has_nul1, tmp1, tmp2
bic has_nul2, tmp3, tmp4
ccmp has_nul2, 0, 0, eq
beq L(main_loop_entry)
csel has_nul1, has_nul1, has_nul2, cc
mov len, 8
rev data1, data1
clz tmp1, data1
csel len, xzr, len, ne
rev has_nul1, has_nul1
clz tmp1, has_nul1
csel len, xzr, len, cc
add len, len, tmp1, lsr 3
ret
L(main_loop_entry):
bic src, srcin, 15
sub src, src, 16
L(main_loop):
ldr dataq, [src, 16]!
ldr dataq, [src, 32]!
L(page_cross_entry):
/* Get the minimum value and keep going if it is not zero. */
uminv datab2, datav.16b
mov tmp1, datav2.d[0]
cbz tmp1, L(tail)
ldr dataq, [src, 16]
uminv datab2, datav.16b
mov tmp1, datav2.d[0]
cbnz tmp1, L(main_loop)
add src, src, 16
L(tail):
#ifdef __AARCH64EB__