aarch64: Optimize string functions with shrn instruction

We found that the string functions were using an AND+ADDP sequence to
compute the nibble/syndrome mask, but the same mask can be produced more
simply with `SHRN dst.8b, src.8h, 4` (shift each 16-bit lane right by 4
and narrow to 1 byte), which has the same latency as ADDP on all SIMD
ARMv8 targets. There are similar opportunities in memcmp, but those are
left for another patch.

We see 10-20% savings for small- to mid-size cases (<= 128 bytes), which
are the primary cases in general workloads.
Author: Danila Kutenin, 2022-06-27 16:12:13 +00:00 (committed by Szabolcs Nagy)
Parent: bd0b58837c
Commit: 3c99806989
6 changed files with 59 additions and 102 deletions
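
As an aside (not part of the patch), the sketch below shows both ways of building the 64-bit nibble mask in C with ACLE NEON intrinsics, assuming a little-endian AArch64 target and a compiler providing arm_neon.h; the function names are ours:

```c
#include <arm_neon.h>
#include <stdint.h>

/* Old scheme: keep the low nibble of even bytes and the high nibble of odd
   bytes (mask 0xf00f per 16-bit lane), then pairwise-add adjacent bytes so
   the low 64 bits hold 4 bits per input byte.  */
static inline uint64_t
nibble_mask_and_addp (uint8x16_t cmp)
{
    uint8x16_t mask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
    uint8x16_t bits = vandq_u8 (cmp, mask);
    uint8x16_t merged = vpaddq_u8 (bits, bits);   /* ADDP: 128->64 */
    return vgetq_lane_u64 (vreinterpretq_u64_u8 (merged), 0);
}

/* New scheme: shift each 16-bit lane right by 4 and narrow it to a byte,
   which yields the same 4-bits-per-byte mask in a single instruction.  */
static inline uint64_t
nibble_mask_shrn (uint8x16_t cmp)
{
    uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);  /* SHRN */
    return vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
}
```

Both functions map byte i of the comparison result to mask bits 4*i..4*i+3, so the downstream rbit/clz (or ctz) logic is unchanged.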


@@ -41,24 +41,21 @@
 #define synd        x5
 #define shift       x6
 #define tmp         x7
-#define wtmp        w7
 
 #define vrepchr     v0
 #define qdata       q1
 #define vdata       v1
 #define vhas_chr    v2
-#define vrepmask    v3
-#define vend        v4
-#define dend        d4
+#define vend        v3
+#define dend        d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (MEMCHR)
     PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (MEMCHR)
     cbz     cntin, L(nomatch)
     ld1     {vdata.16b}, [src]
     dup     vrepchr.16b, chrin
-    mov     wtmp, 0xf00f
-    dup     vrepmask.8h, wtmp
     cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
     lsl     shift, srcin, 2
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    synd, dend
     lsr     synd, synd, shift
     cbz     synd, L(start_loop)
@@ -111,8 +105,7 @@ L(loop32_2):
     fmov    synd, dend
     cbz     synd, L(loop32)
 L(end):
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    synd, dend
     add     tmp, srcin, cntin
     sub     cntrem, tmp, src
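
For context (again ours, not from the patch): this is roughly how memchr consumes the mask for the first, possibly unaligned chunk. The `lsl shift, srcin, 2` / `lsr synd, synd, shift` pair drops the nibbles of the padding bytes before the requested start, and a trailing-zero count divided by 4 then gives the byte offset. A hypothetical C equivalent:

```c
/* Hypothetical helper: byte index of the first match within an aligned
   16-byte chunk, given the 64-bit nibble mask and the number of padding
   bytes before the real start of the buffer.  Returns 16 if there is no
   match among the remaining bytes.  */
static inline unsigned
first_match_index (uint64_t syndrome, unsigned skip_bytes)
{
    syndrome >>= skip_bytes * 4;    /* discard nibbles of the padding bytes */
    if (syndrome == 0)
        return 16;
    /* Each byte owns 4 mask bits, so ctz/4 is the offset from the first
       byte actually examined (the assembly uses rbit + clz instead).  */
    return skip_bytes + (unsigned) __builtin_ctzll (syndrome) / 4;
}
```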


@@ -37,7 +37,6 @@
 #define synd        x5
 #define shift       x6
 #define tmp         x7
-#define wtmp        w7
 
 #define end         x8
 #define endm1       x9
@@ -45,18 +44,16 @@
 #define qdata       q1
 #define vdata       v1
 #define vhas_chr    v2
-#define vrepmask    v3
-#define vend        v4
-#define dend        d4
+#define vend        v3
+#define dend        d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__memrchr)
     PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (__memrchr)
     cbz     cntin, L(nomatch)
     ld1     {vdata.16b}, [src]
     dup     vrepchr.16b, chrin
-    mov     wtmp, 0xf00f
-    dup     vrepmask.8h, wtmp
     cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
     neg     shift, end, lsl 2
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    synd, dend
     lsl     synd, synd, shift
     cbz     synd, L(start_loop)
@@ -109,8 +103,7 @@ L(loop32_2):
     fmov    synd, dend
     cbz     synd, L(loop32)
 L(end):
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    synd, dend
     add     tmp, src, 15
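
memrchr searches backwards, so it discards the nibbles above the last valid byte with `lsl synd, synd, shift` and then counts leading zeros. A hypothetical C equivalent (valid_bytes is assumed to be in 1..16):

```c
/* Hypothetical helper: byte index of the last match within a 16-byte chunk,
   ignoring bytes at or beyond valid_bytes (1 <= valid_bytes <= 16).
   Returns -1 if there is no match.  */
static inline int
last_match_index (uint64_t syndrome, unsigned valid_bytes)
{
    syndrome <<= (16 - valid_bytes) * 4;    /* drop nibbles past the end */
    if (syndrome == 0)
        return -1;
    /* clz counts 4 bits per absent byte from the top of the valid region.  */
    return (int) valid_bytes - 1 - (int) (__builtin_clzll (syndrome) / 4);
}
```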


@@ -33,38 +33,32 @@
 #define src         x2
 #define tmp1        x1
 #define tmp2        x3
-#define tmp2w       w3
 
 #define vrepchr     v0
 #define vdata       v1
 #define qdata       q1
 #define vhas_nul    v2
 #define vhas_chr    v3
-#define vrepmask    v4
-#define vend        v5
-#define dend        d5
+#define vend        v4
+#define dend        d4
 
-/* Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strchrnul)
     PTR_ARG (0)
     bic     src, srcin, 15
     dup     vrepchr.16b, chrin
     ld1     {vdata.16b}, [src]
-    mov     tmp2w, 0xf00f
-    dup     vrepmask.8h, tmp2w
     cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
     cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b
     lsl     tmp2, srcin, 2
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    tmp1, dend
     lsr     tmp1, tmp1, tmp2    /* Mask padding bits.  */
     cbz     tmp1, L(loop)
@@ -83,8 +77,7 @@ L(loop):
     fmov    tmp1, dend
     cbz     tmp1, L(loop)
 
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    tmp1, dend
 #ifndef __AARCH64EB__
     rbit    tmp1, tmp1
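
Unrelated to the SHRN change but visible in the hunk above: strchrnul folds the "character or NUL" test into a CMEQ followed by a CMHS. A small sketch of the idea with NEON intrinsics (our naming):

```c
/* A byte "hits" if it equals the target character or is NUL.  CMEQ yields
   0xff on an exact match; the CMHS (unsigned >=) against the original data
   keeps those hits (0xff >= anything) and additionally fires where the data
   byte is 0, because 0 >= 0 is true while 0 >= nonzero is false.  */
static inline uint8x16_t
match_char_or_nul (uint8x16_t data, uint8_t c)
{
    uint8x16_t eq = vceqq_u8 (data, vdupq_n_u8 (c));
    return vcgeq_u8 (eq, data);
}
```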


@@ -40,7 +40,6 @@
 #define len         x4
 #define synd        x4
 #define tmp         x5
-#define wtmp        w5
 #define shift       x5
 #define data1       x6
 #define dataw1      w6
@@ -50,9 +49,8 @@
 #define dataq       q0
 #define vdata       v0
 #define vhas_nul    v1
-#define vrepmask    v2
-#define vend        v3
-#define dend        d3
+#define vend        v2
+#define dend        d2
 #define dataq2      q1
 
 #ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@
 # define IFSTPCPY(X,...)
 #endif
 
-/* Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
     PTR_ARG (0)
     PTR_ARG (1)
     bic     src, srcin, 15
-    mov     wtmp, 0xf00f
     ld1     {vdata.16b}, [src]
-    dup     vrepmask.8h, wtmp
     cmeq    vhas_nul.16b, vdata.16b, 0
     lsl     shift, srcin, 2
-    and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-    addp    vend.16b, vhas_nul.16b, vhas_nul.16b
+    shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
     fmov    synd, dend
     lsr     synd, synd, shift
     cbnz    synd, L(tail)
 
     ldr     dataq, [src, 16]!
     cmeq    vhas_nul.16b, vdata.16b, 0
-    and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-    addp    vend.16b, vhas_nul.16b, vhas_nul.16b
+    shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
     fmov    synd, dend
     cbz     synd, L(start_loop)
@@ -162,8 +155,7 @@ L(loop):
     fmov    synd, dend
     cbz     synd, L(loop)
 
-    and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-    addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
+    shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
     fmov    synd, dend
 #ifndef __AARCH64EB__
     rbit    synd, synd


@@ -34,35 +34,29 @@
 #define src         x1
 #define synd        x2
 #define tmp         x3
-#define wtmp        w3
 #define shift       x4
 
 #define data        q0
 #define vdata       v0
 #define vhas_nul    v1
-#define vrepmask    v2
-#define vend        v3
-#define dend        d3
+#define vend        v2
+#define dend        d2
 
 /* Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRLEN)
     PTR_ARG (0)
     bic     src, srcin, 15
-    mov     wtmp, 0xf00f
     ld1     {vdata.16b}, [src]
-    dup     vrepmask.8h, wtmp
     cmeq    vhas_nul.16b, vdata.16b, 0
     lsl     shift, srcin, 2
-    and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-    addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
+    shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
     fmov    synd, dend
     lsr     synd, synd, shift
     cbz     synd, L(loop)
@@ -80,8 +74,7 @@ L(loop):
     fmov    synd, dend
     cbz     synd, L(loop)
 
-    and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-    addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
+    shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
     sub     result, src, srcin
     fmov    synd, dend
 #ifndef __AARCH64EB__


@@ -33,39 +33,33 @@
 #define src         x2
 #define synd        x3
 #define shift       x4
-#define wtmp        w4
 #define tmp         x4
 #define cntrem      x5
 
 #define qdata       q0
 #define vdata       v0
 #define vhas_chr    v1
-#define vrepmask    v2
-#define vend        v3
-#define dend        d3
+#define vend        v2
+#define dend        d2
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strnlen)
     PTR_ARG (0)
     SIZE_ARG (1)
     bic     src, srcin, 15
-    mov     wtmp, 0xf00f
     cbz     cntin, L(nomatch)
     ld1     {vdata.16b}, [src], 16
-    dup     vrepmask.8h, wtmp
     cmeq    vhas_chr.16b, vdata.16b, 0
     lsl     shift, srcin, 2
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    synd, dend
     lsr     synd, synd, shift
     cbz     synd, L(start_loop)
@@ -103,8 +97,7 @@ L(loop32_2):
     cbz     synd, L(loop32)
 L(end):
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     sub     src, src, 16
     mov     synd, vend.d[0]
     sub     result, src, srcin