[4/5] AArch64: Improve A64FX memset by removing unroll32

Remove unroll32 code since it doesn't improve performance.

Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
This commit is contained in:
Wilco Dijkstra 2021-08-10 13:44:27 +01:00
parent 186092c6ba
commit e69d9981f8
1 changed file with 1 addition and 17 deletions

View File

@@ -102,22 +102,6 @@ L(vl_agnostic): // VL Agnostic
ccmp vector_length, tmp1, 0, cs
b.eq L(L1_prefetch)
- L(unroll32):
- lsl tmp1, vector_length, 3 // vector_length * 8
- lsl tmp2, vector_length, 5 // vector_length * 32
- .p2align 3
- 1: cmp rest, tmp2
- b.cc L(unroll8)
- st1b_unroll
- add dst, dst, tmp1
- st1b_unroll
- add dst, dst, tmp1
- st1b_unroll
- add dst, dst, tmp1
- st1b_unroll
- add dst, dst, tmp1
- sub rest, rest, tmp2
- b 1b
L(unroll8):
lsl tmp1, vector_length, 3
@@ -155,7 +139,7 @@ L(L1_prefetch): // if rest >= L1_SIZE
sub rest, rest, CACHE_LINE_SIZE * 2
cmp rest, L1_SIZE
b.ge 1b
- cbnz rest, L(unroll32)
+ cbnz rest, L(unroll8)
ret
// count >= L2_SIZE