aarch64: Optimize string functions with shrn instruction

We found that the string functions were using an AND+ADDP sequence to
compute the nibble/syndrome mask, but the same mask can be produced more
simply with `SHRN dst.8b, src.8h, 4` (shift each 16-bit lane right by 4
and narrow to 1 byte), which has the same latency as ADDP on all SIMD
ARMv8 targets. There are similar opportunities in memcmp, but those are
left for another patch.

We see 10-20% savings for small- to mid-size cases (<= 128 bytes), which
are the primary cases in general workloads.
Author: Danila Kutenin, 2022-06-27 16:12:13 +00:00 (committed by Szabolcs Nagy)
Parent: bd0b58837c
Commit: 3c99806989
6 changed files with 59 additions and 102 deletions
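
As an aside (not part of the patch), the sketch below shows both ways of building the 64-bit nibble mask in C with ACLE NEON intrinsics, assuming a little-endian AArch64 target and a compiler providing arm_neon.h; the function names are ours:

```c
#include <arm_neon.h>
#include <stdint.h>

/* Old scheme: keep the low nibble of even bytes and the high nibble of odd
   bytes (mask 0xf00f per 16-bit lane), then pairwise-add adjacent bytes so
   the low 64 bits hold 4 bits per input byte.  */
static inline uint64_t
nibble_mask_and_addp (uint8x16_t cmp)
{
    uint8x16_t mask = vreinterpretq_u8_u16 (vdupq_n_u16 (0xf00f));
    uint8x16_t bits = vandq_u8 (cmp, mask);
    uint8x16_t merged = vpaddq_u8 (bits, bits);   /* ADDP: 128->64 */
    return vgetq_lane_u64 (vreinterpretq_u64_u8 (merged), 0);
}

/* New scheme: shift each 16-bit lane right by 4 and narrow it to a byte,
   which yields the same 4-bits-per-byte mask in a single instruction.  */
static inline uint64_t
nibble_mask_shrn (uint8x16_t cmp)
{
    uint8x8_t nibbles = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);  /* SHRN */
    return vget_lane_u64 (vreinterpret_u64_u8 (nibbles), 0);
}
```

Both functions map byte i of the comparison result to mask bits 4*i..4*i+3, so the downstream rbit/clz (or ctz) logic is unchanged.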


@@ -41,24 +41,21 @@
 #define synd        x5
 #define shift       x6
 #define tmp         x7
-#define wtmp        w7
 
 #define vrepchr     v0
 #define qdata       q1
 #define vdata       v1
 #define vhas_chr    v2
-#define vrepmask    v3
-#define vend        v4
-#define dend        d4
+#define vend        v3
+#define dend        d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (MEMCHR)
     PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (MEMCHR)
     cbz     cntin, L(nomatch)
     ld1     {vdata.16b}, [src]
     dup     vrepchr.16b, chrin
-    mov     wtmp, 0xf00f
-    dup     vrepmask.8h, wtmp
     cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
     lsl     shift, srcin, 2
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    synd, dend
     lsr     synd, synd, shift
     cbz     synd, L(start_loop)
@@ -111,8 +105,7 @@ L(loop32_2):
     fmov    synd, dend
     cbz     synd, L(loop32)
 L(end):
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    synd, dend
     add     tmp, srcin, cntin
     sub     cntrem, tmp, src
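
For context (again ours, not from the patch): this is roughly how memchr consumes the mask for the first, possibly unaligned chunk. The `lsl shift, srcin, 2` / `lsr synd, synd, shift` pair drops the nibbles of the padding bytes before the requested start, and a trailing-zero count divided by 4 then gives the byte offset. A hypothetical C equivalent:

```c
/* Hypothetical helper: byte index of the first match within an aligned
   16-byte chunk, given the 64-bit nibble mask and the number of padding
   bytes before the real start of the buffer.  Returns 16 if there is no
   match among the remaining bytes.  */
static inline unsigned
first_match_index (uint64_t syndrome, unsigned skip_bytes)
{
    syndrome >>= skip_bytes * 4;    /* discard nibbles of the padding bytes */
    if (syndrome == 0)
        return 16;
    /* Each byte owns 4 mask bits, so ctz/4 is the offset from the first
       byte actually examined (the assembly uses rbit + clz instead).  */
    return skip_bytes + (unsigned) __builtin_ctzll (syndrome) / 4;
}
```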


@@ -37,7 +37,6 @@
 #define synd        x5
 #define shift       x6
 #define tmp         x7
-#define wtmp        w7
 
 #define end         x8
 #define endm1       x9
@@ -45,18 +44,16 @@
 #define qdata       q1
 #define vdata       v1
 #define vhas_chr    v2
-#define vrepmask    v3
-#define vend        v4
-#define dend        d4
+#define vend        v3
+#define dend        d3
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__memrchr)
     PTR_ARG (0)
@@ -67,12 +64,9 @@ ENTRY (__memrchr)
     cbz     cntin, L(nomatch)
     ld1     {vdata.16b}, [src]
     dup     vrepchr.16b, chrin
-    mov     wtmp, 0xf00f
-    dup     vrepmask.8h, wtmp
     cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
     neg     shift, end, lsl 2
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    synd, dend
     lsl     synd, synd, shift
     cbz     synd, L(start_loop)
@@ -109,8 +103,7 @@ L(loop32_2):
     fmov    synd, dend
     cbz     synd, L(loop32)
 L(end):
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    synd, dend
     add     tmp, src, 15
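
memrchr searches backwards, so it discards the nibbles above the last valid byte with `lsl synd, synd, shift` and then counts leading zeros. A hypothetical C equivalent (valid_bytes is assumed to be in 1..16):

```c
/* Hypothetical helper: byte index of the last match within a 16-byte chunk,
   ignoring bytes at or beyond valid_bytes (1 <= valid_bytes <= 16).
   Returns -1 if there is no match.  */
static inline int
last_match_index (uint64_t syndrome, unsigned valid_bytes)
{
    syndrome <<= (16 - valid_bytes) * 4;    /* drop nibbles past the end */
    if (syndrome == 0)
        return -1;
    /* clz counts 4 bits per absent byte from the top of the valid region.  */
    return (int) valid_bytes - 1 - (int) (__builtin_clzll (syndrome) / 4);
}
```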


@@ -33,38 +33,32 @@
 #define src         x2
 #define tmp1        x1
 #define tmp2        x3
-#define tmp2w       w3
 
 #define vrepchr     v0
 #define vdata       v1
 #define qdata       q1
 #define vhas_nul    v2
 #define vhas_chr    v3
-#define vrepmask    v4
-#define vend        v5
-#define dend        d5
+#define vend        v4
+#define dend        d4
 
-/* Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strchrnul)
     PTR_ARG (0)
     bic     src, srcin, 15
     dup     vrepchr.16b, chrin
     ld1     {vdata.16b}, [src]
-    mov     tmp2w, 0xf00f
-    dup     vrepmask.8h, tmp2w
     cmeq    vhas_chr.16b, vdata.16b, vrepchr.16b
     cmhs    vhas_chr.16b, vhas_chr.16b, vdata.16b
     lsl     tmp2, srcin, 2
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    tmp1, dend
     lsr     tmp1, tmp1, tmp2    /* Mask padding bits.  */
     cbz     tmp1, L(loop)
@@ -83,8 +77,7 @@ L(loop):
     fmov    tmp1, dend
     cbz     tmp1, L(loop)
 
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    tmp1, dend
 #ifndef __AARCH64EB__
     rbit    tmp1, tmp1
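
Unrelated to the SHRN change but visible in the hunk above: strchrnul folds the "character or NUL" test into a CMEQ followed by a CMHS. A small sketch of the idea with NEON intrinsics (our naming):

```c
/* A byte "hits" if it equals the target character or is NUL.  CMEQ yields
   0xff on an exact match; the CMHS (unsigned >=) against the original data
   keeps those hits (0xff >= anything) and additionally fires where the data
   byte is 0, because 0 >= 0 is true while 0 >= nonzero is false.  */
static inline uint8x16_t
match_char_or_nul (uint8x16_t data, uint8_t c)
{
    uint8x16_t eq = vceqq_u8 (data, vdupq_n_u8 (c));
    return vcgeq_u8 (eq, data);
}
```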


@@ -40,7 +40,6 @@
 #define len         x4
 #define synd        x4
 #define tmp         x5
-#define wtmp        w5
 #define shift       x5
 #define data1       x6
 #define dataw1      w6
@@ -50,9 +49,8 @@
 #define dataq       q0
 #define vdata       v0
 #define vhas_nul    v1
-#define vrepmask    v2
-#define vend        v3
-#define dend        d3
+#define vend        v2
+#define dend        d2
 #define dataq2      q1
 
 #ifdef BUILD_STPCPY
@@ -63,34 +61,29 @@
 # define IFSTPCPY(X,...)
 #endif
 
-/* Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+/*
+   Core algorithm:
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting leading zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRCPY)
     PTR_ARG (0)
     PTR_ARG (1)
     bic     src, srcin, 15
-    mov     wtmp, 0xf00f
     ld1     {vdata.16b}, [src]
-    dup     vrepmask.8h, wtmp
     cmeq    vhas_nul.16b, vdata.16b, 0
     lsl     shift, srcin, 2
-    and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-    addp    vend.16b, vhas_nul.16b, vhas_nul.16b
+    shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
     fmov    synd, dend
     lsr     synd, synd, shift
     cbnz    synd, L(tail)
 
     ldr     dataq, [src, 16]!
     cmeq    vhas_nul.16b, vdata.16b, 0
-    and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-    addp    vend.16b, vhas_nul.16b, vhas_nul.16b
+    shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
     fmov    synd, dend
     cbz     synd, L(start_loop)
@@ -162,8 +155,7 @@ L(loop):
     fmov    synd, dend
     cbz     synd, L(loop)
 
-    and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-    addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
+    shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
     fmov    synd, dend
 #ifndef __AARCH64EB__
     rbit    synd, synd


@@ -34,35 +34,29 @@
 #define src         x1
 #define synd        x2
 #define tmp         x3
-#define wtmp        w3
 #define shift       x4
 
 #define data        q0
 #define vdata       v0
 #define vhas_nul    v1
-#define vrepmask    v2
-#define vend        v3
-#define dend        d3
+#define vend        v2
+#define dend        d2
 
 /* Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (STRLEN)
     PTR_ARG (0)
     bic     src, srcin, 15
-    mov     wtmp, 0xf00f
     ld1     {vdata.16b}, [src]
-    dup     vrepmask.8h, wtmp
     cmeq    vhas_nul.16b, vdata.16b, 0
     lsl     shift, srcin, 2
-    and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-    addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
+    shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
     fmov    synd, dend
     lsr     synd, synd, shift
     cbz     synd, L(loop)
@@ -80,8 +74,7 @@ L(loop):
     fmov    synd, dend
     cbz     synd, L(loop)
 
-    and     vhas_nul.16b, vhas_nul.16b, vrepmask.16b
-    addp    vend.16b, vhas_nul.16b, vhas_nul.16b    /* 128->64 */
+    shrn    vend.8b, vhas_nul.8h, 4    /* 128->64 */
     sub     result, src, srcin
     fmov    synd, dend
 #ifndef __AARCH64EB__


@@ -33,39 +33,33 @@
 #define src         x2
 #define synd        x3
 #define shift       x4
-#define wtmp        w4
 #define tmp         x4
 #define cntrem      x5
 
 #define qdata       q0
 #define vdata       v0
 #define vhas_chr    v1
-#define vrepmask    v2
-#define vend        v3
-#define dend        d3
+#define vend        v2
+#define dend        d2
 
 /*
    Core algorithm:
-   For each 16-byte chunk we calculate a 64-bit syndrome value with four bits
-   per byte. For even bytes, bits 0-3 are set if the relevant byte matched the
-   requested character or the byte is NUL. Bits 4-7 must be zero. Bits 4-7 are
-   set likewise for odd bytes so that adjacent bytes can be merged. Since the
-   bits in the syndrome reflect the order in which things occur in the original
-   string, counting trailing zeros identifies exactly which byte matched.  */
+   For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
+   per byte. We take 4 bits of every comparison byte with shift right and narrow
+   by 4 instruction. Since the bits in the nibble mask reflect the order in
+   which things occur in the original string, counting trailing zeros identifies
+   exactly which byte matched.  */
 
 ENTRY (__strnlen)
     PTR_ARG (0)
     SIZE_ARG (1)
     bic     src, srcin, 15
-    mov     wtmp, 0xf00f
     cbz     cntin, L(nomatch)
     ld1     {vdata.16b}, [src], 16
-    dup     vrepmask.8h, wtmp
     cmeq    vhas_chr.16b, vdata.16b, 0
     lsl     shift, srcin, 2
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     fmov    synd, dend
     lsr     synd, synd, shift
     cbz     synd, L(start_loop)
@@ -103,8 +97,7 @@ L(loop32_2):
     cbz     synd, L(loop32)
 L(end):
-    and     vhas_chr.16b, vhas_chr.16b, vrepmask.16b
-    addp    vend.16b, vhas_chr.16b, vhas_chr.16b    /* 128->64 */
+    shrn    vend.8b, vhas_chr.8h, 4    /* 128->64 */
     sub     src, src, 16
     mov     synd, vend.d[0]
     sub     result, src, srcin