aarch64: Optimized strlen for strlen_asimd

Optimize the strlen implementation by using vector operations and
loop unrolling in the main loop. Compared to __strlen_generic, it
reduces the latency of bench-strlen cases by 7% to 18% when the
length of src is greater than 128 bytes, with gains throughout the
benchmark.

Checked on aarch64-linux-gnu.
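
Note on the approach: the new main loop walks the string 32 bytes per
iteration, checking two 16-byte vectors back to back so the second
load can issue while the first check is still in flight. A minimal C
sketch of that unrolling idea (nul_in_16 and strlen_unrolled_sketch
are illustrative names, not the glibc code; the real entry code also
handles unaligned and page-crossing heads, which this sketch assumes
away):

    #include <stddef.h>

    /* Index of the first nul in the 16-byte chunk at p, or -1 if the
       chunk has none.  Stands in for the vector check in the .S file.  */
    static int
    nul_in_16 (const char *p)
    {
      for (int i = 0; i < 16; i++)
        if (p[i] == '\0')
          return i;
      return -1;
    }

    /* Assumes s is 16-byte aligned.  A chunk is only read up to its
       first nul, so the sketch never reads out of bounds.  */
    size_t
    strlen_unrolled_sketch (const char *s)
    {
      for (size_t i = 0; ; i += 32)
        {
          int k = nul_in_16 (s + i);
          if (k >= 0)
            return i + k;
          k = nul_in_16 (s + i + 16);
          if (k >= 0)
            return i + 16 + k;
        }
    }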

Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
Xuelei Zhang 2019-12-19 13:41:40 +00:00 committed by Adhemerval Zanella
parent 0db8e7b366
commit c2150769d0
2 changed files with 29 additions and 17 deletions

sysdeps/aarch64/multiarch/strlen.c

@@ -34,7 +34,9 @@ extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
 
 libc_ifunc (__strlen,
-	    (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+	    (USE_ASIMD_STRLEN () || IS_KUNPENG(midr)
+	     ? __strlen_asimd
+	     :__strlen_generic));
 
 # undef strlen
 strong_alias (__strlen, strlen);
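
This hunk widens the ifunc choice: __strlen_asimd is now also selected
on HiSilicon Kunpeng cores, recognized from the MIDR_EL1 id register,
even when the ASIMD heuristic alone would not pick it. A rough C model
of the selection (select_strlen and is_kunpeng_sketch are illustrative
stand-ins for glibc's libc_ifunc/IS_KUNPENG machinery):

    #include <stddef.h>

    size_t __strlen_asimd (const char *);
    size_t __strlen_generic (const char *);

    /* MIDR bits [31:24] hold the implementer id; 0x48 ('H') is
       HiSilicon.  The real macro also checks the part number.  */
    static int
    is_kunpeng_sketch (unsigned long midr)
    {
      return ((midr >> 24) & 0xff) == 0x48;
    }

    typedef size_t (*strlen_fn) (const char *);

    static strlen_fn
    select_strlen (unsigned long midr, int use_asimd)
    {
      return (use_asimd || is_kunpeng_sketch (midr))
             ? __strlen_asimd : __strlen_generic;
    }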

sysdeps/aarch64/multiarch/strlen_asimd.S

@@ -48,6 +48,9 @@
 #define dataq2		q3
 #define datav2		v3
 
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
 #ifdef TEST_PAGE_CROSS
 # define MIN_PAGE_SIZE 16
 #else
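
These two constants drive the standard word-at-a-time nul test that
the next hunk applies to the first 16 bytes (the sub/orr/bics
sequence). In C, the predicate they implement looks like this
(has_zero_byte is an illustrative name):

    #include <stdint.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* (x - 0x01..01) & ~(x | 0x7f..7f) sets bit 7 of every byte of x
       that is zero, so the result is nonzero iff x contains a nul;
       after a byte-reverse, clz then locates the first one.  */
    static inline uint64_t
    has_zero_byte (uint64_t x)
    {
      return (x - REP8_01) & ~(x | REP8_7f);
    }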
@@ -82,40 +85,47 @@ ENTRY_ALIGN (__strlen_asimd, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
 	cmp	tmp1, MIN_PAGE_SIZE - 16
 	b.gt	L(page_cross)
-	ldr	dataq, [srcin]
+	ldp	data1, data2, [srcin]
 #ifdef __AARCH64EB__
-	rev64	datav.16b, datav.16b
+	rev	data1, data1
+	rev	data2, data2
 #endif
-	/* Get the minimum value and keep going if it is not zero.  */
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	cbnz	tmp1, L(main_loop_entry)
-
-	cmeq	datav.16b, datav.16b, #0
-	mov	data1, datav.d[0]
-	mov	data2, datav.d[1]
-	cmp	data1, 0
-	csel	data1, data1, data2, ne
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
-	rev	data1, data1
-	clz	tmp1, data1
-	csel	len, xzr, len, ne
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
 L(main_loop_entry):
 	bic	src, srcin, 15
+	sub	src, src, 16
 
 L(main_loop):
-	ldr	dataq, [src, 16]!
+	ldr	dataq, [src, 32]!
 L(page_cross_entry):
 	/* Get the minimum value and keep going if it is not zero.  */
 	uminv	datab2, datav.16b
 	mov	tmp1, datav2.d[0]
+	cbz	tmp1, L(tail)
+	ldr	dataq, [src, 16]
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
 	cbnz	tmp1, L(main_loop)
+	add	src, src, 16
 
 L(tail):
 #ifdef __AARCH64EB__
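
For the loop itself: uminv reduces a 16-byte vector to its minimum
byte, and that minimum is zero exactly when the chunk contains the
terminator, so one scalar compare tests all 16 bytes. With src biased
back by 16 before the loop, the pre-indexed ldr dataq, [src, 32]!
advances a full 32-byte iteration while the plain ldr at [src, 16]
covers the second half. A scalar model of the vector check
(chunk_min_is_zero is an illustrative name):

    /* Models uminv.16b + cbz: the minimum of 16 unsigned bytes is
       zero iff one of them is the nul terminator.  */
    static int
    chunk_min_is_zero (const unsigned char chunk[16])
    {
      unsigned char min = chunk[0];
      for (int i = 1; i < 16; i++)
        if (chunk[i] < min)
          min = chunk[i];
      return min == 0;
    }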