[AArch64] Optimized memcmp.

This is an optimized memcmp for AArch64.  It is a complete rewrite
using a different algorithm.  The previous version split into cases
where both inputs were aligned, where the inputs were mutually
aligned, and where they were unaligned (handled with a byte loop).
The new version combines all these cases, while small inputs of less
than 8 bytes are handled separately.

This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared.  After the first 8 bytes,
align the first input.  This ensures each iteration does at most one
unaligned access and mutually aligned inputs behave as aligned.
After the main loop, process the last 8 bytes using unaligned accesses.
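
As a rough illustration only (not part of the commit), the structure
described above corresponds to the C sketch below.  The names
memcmp_model and load8 are invented for this example, and it assumes a
little-endian host with GCC/Clang builtins so that a byte swap makes
64-bit comparisons match memcmp's byte order:

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: one unaligned 64-bit load, byte-swapped so that
   comparing the results matches memcmp's byte order (assumes a
   little-endian host and __builtin_bswap64 from GCC/Clang).  */
static inline uint64_t
load8 (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, 8);
  return __builtin_bswap64 (v);
}

int
memcmp_model (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;

  /* Small inputs of less than 8 bytes are handled separately.  */
  if (n < 8)
    {
      for (size_t i = 0; i < n; i++)
	if (p1[i] != p2[i])
	  return p1[i] - p2[i];
      return 0;
    }

  /* At least 8 bytes, so the first 8 can be compared with unaligned loads.  */
  uint64_t d1 = load8 (p1), d2 = load8 (p2);
  if (d1 != d2)
    return d1 > d2 ? 1 : -1;

  /* Align p1; p2 keeps the same relative misalignment, so each
     iteration below does at most one unaligned access.  */
  size_t skip = 8 - ((uintptr_t) p1 & 7);
  p1 += skip; p2 += skip; n -= skip;

  /* Main loop: 8 bytes per iteration while more than 8 bytes remain.  */
  while (n > 8)
    {
      d1 = load8 (p1); d2 = load8 (p2);
      if (d1 != d2)
	return d1 > d2 ? 1 : -1;
      p1 += 8; p2 += 8; n -= 8;
    }

  /* Last 1-8 bytes: reload the final 8 bytes of each input with
     unaligned loads (possibly overlapping bytes already compared).  */
  d1 = load8 (p1 + n - 8); d2 = load8 (p2 + n - 8);
  if (d1 != d2)
    return d1 > d2 ? 1 : -1;
  return 0;
}

For instance, memcmp_model ("abcdefghX", "abcdefghY", 9) is negative,
as memcmp requires.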

This improves performance of (mutually) aligned cases by 25% and of
unaligned cases by more than 500% (over 6 times faster) on large
inputs.

	* sysdeps/aarch64/memcmp.S (memcmp):
	Rewrite of optimized memcmp.
Author: Wilco Dijkstra
Date:   2017-08-10 17:00:38 +01:00
Parent: 2449ae7b2d
Commit: 922369032c
2 changed files with 70 additions and 99 deletions

ChangeLog
@@ -1,3 +1,8 @@
+2017-08-10  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* sysdeps/aarch64/memcmp.S (memcmp):
+	Rewrite of optimized memcmp.
+
 2017-08-10  Florian Weimer  <fweimer@redhat.com>
 
 	Introduce ld.so exceptions.

sysdeps/aarch64/memcmp.S
@@ -22,132 +22,98 @@
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses.
  */

 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
 #define limit		x2
-#define result		x0
+#define result		w0

 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
 #define data2		x4
 #define data2w		w4
-#define has_nul		x5
-#define diff		x6
-#define endloop		x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define pos		x11
-#define limit_wd	x12
-#define mask		x13
+#define tmp1		x5

 ENTRY_ALIGN (memcmp, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	DELOUSE (2)
-	cbz	limit, L(ret0)
-	eor	tmp1, src1, src2
-	tst	tmp1, #7
-	b.ne	L(misaligned8)
-	ands	tmp1, src1, #7
-	b.ne	L(mutual_align)
-	add	limit_wd, limit, #7
-	lsr	limit_wd, limit_wd, #3
-	/* Start of performance-critical section  -- one 64B cache line.  */
-L(loop_aligned):
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-L(start_realigned):
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
-	cbz	endloop, L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
-
-	/* Not reached the limit, must have found a diff.  */
-	cbnz	limit_wd, L(not_limit)
-
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	L(not_limit)
-
-	lsl	limit, limit, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-#ifdef __AARCH64EB__
-	lsr	mask, mask, limit
-#else
-	lsl	mask, mask, limit
-#endif
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	orr	diff, diff, mask
-L(not_limit):
-
-#ifndef __AARCH64EB__
-	rev	diff, diff
+	subs	limit, limit, 8
+	b.lo	.Lless8
+
+	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	and	tmp1, src1, 7
+	add	limit, limit, tmp1
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+
+	subs	limit, limit, 8
+	b.ls	.Llast_bytes
+
+	/* Loop performing 8 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 8 and must be larger than zero.
+	   Exit if <= 8 bytes left to do or if the data is not equal.  */
+	.p2align 4
+.Lloop8:
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	subs	limit, limit, 8
+	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
+	b.eq	.Lloop8
+
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Compare last 1-8 bytes using unaligned access.  */
+.Llast_bytes:
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+.Lreturn:
+#ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
-	/* The MS-non-zero bit of DIFF marks either the first bit
-	   that is different, or the end of the significant data.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	clz	pos, diff
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	RET
-
-L(mutual_align):
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	ldr	data1, [src1], #8
-	neg	tmp1, tmp1		/* Bits to alignment -64.  */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	add	limit_wd, limit, #7
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	lsr	limit_wd, limit_wd, #3
-	b	L(start_realigned)
+	cmp	data1, data2
+.Lret_eq:
+	cset	result, ne
+	cneg	result, result, lo
+	ret

-L(ret0):
-	mov	result, #0
-	RET
-
-	.p2align 6
-L(misaligned8):
-	sub	limit, limit, #1
-1:
-	/* Perhaps we can do better than this.  */
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
-	sub	result, data1, data2
-	RET
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+.Lless8:
+	adds	limit, limit, 4
+	b.lo	.Lless4
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
+	cmp	data1w, data2w
+	b.ne	.Lreturn
+	sub	limit, limit, 4
+.Lless4:
+	adds	limit, limit, 4
+	beq	.Lret_eq
+.Lbyte_loop:
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	.Lbyte_loop
+	sub	result, data1w, data2w
+	ret
 END (memcmp)
 #undef bcmp
 weak_alias (memcmp, bcmp)