[AArch64] Optimized memcmp.

This is an optimized memcmp for AArch64.  It is a complete rewrite
using a different algorithm.  The previous version split into cases
where both inputs were aligned, where the inputs were mutually
aligned, and where they were unaligned (handled with a byte loop).
The new version combines all these cases, while small inputs of less
than 8 bytes are handled separately.

This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared.  After the first 8 bytes,
align the first input.  This ensures each iteration does at most one
unaligned access and mutually aligned inputs behave as aligned.
After the main loop, process the last 8 bytes using unaligned accesses.
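
As a rough illustration only (not part of the commit), the structure
described above corresponds to the C sketch below.  The names
memcmp_model and load8 are invented for this example, and it assumes a
little-endian host with GCC/Clang builtins so that a byte swap makes
64-bit comparisons match memcmp's byte order:

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: one unaligned 64-bit load, byte-swapped so that
   comparing the results matches memcmp's byte order (assumes a
   little-endian host and __builtin_bswap64 from GCC/Clang).  */
static inline uint64_t
load8 (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, 8);
  return __builtin_bswap64 (v);
}

int
memcmp_model (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1, *p2 = s2;

  /* Small inputs of less than 8 bytes are handled separately.  */
  if (n < 8)
    {
      for (size_t i = 0; i < n; i++)
	if (p1[i] != p2[i])
	  return p1[i] - p2[i];
      return 0;
    }

  /* At least 8 bytes, so the first 8 can be compared with unaligned loads.  */
  uint64_t d1 = load8 (p1), d2 = load8 (p2);
  if (d1 != d2)
    return d1 > d2 ? 1 : -1;

  /* Align p1; p2 keeps the same relative misalignment, so each
     iteration below does at most one unaligned access.  */
  size_t skip = 8 - ((uintptr_t) p1 & 7);
  p1 += skip; p2 += skip; n -= skip;

  /* Main loop: 8 bytes per iteration while more than 8 bytes remain.  */
  while (n > 8)
    {
      d1 = load8 (p1); d2 = load8 (p2);
      if (d1 != d2)
	return d1 > d2 ? 1 : -1;
      p1 += 8; p2 += 8; n -= 8;
    }

  /* Last 1-8 bytes: reload the final 8 bytes of each input with
     unaligned loads (possibly overlapping bytes already compared).  */
  d1 = load8 (p1 + n - 8); d2 = load8 (p2 + n - 8);
  if (d1 != d2)
    return d1 > d2 ? 1 : -1;
  return 0;
}

For instance, memcmp_model ("abcdefghX", "abcdefghY", 9) is negative,
as memcmp requires.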

This improves performance of (mutually) aligned cases by 25% and of
unaligned cases by more than 500% (over 6 times faster) on large
inputs.

	* sysdeps/aarch64/memcmp.S (memcmp):
	Rewrite of optimized memcmp.
Author: Wilco Dijkstra
Date:   2017-08-10 17:00:38 +01:00
Parent: 2449ae7b2d
Commit: 922369032c
2 changed files with 70 additions and 99 deletions

ChangeLog
@@ -1,3 +1,8 @@
+2017-08-10  Wilco Dijkstra  <wdijkstr@arm.com>
+
+	* sysdeps/aarch64/memcmp.S (memcmp):
+	Rewrite of optimized memcmp.
+
 2017-08-10  Florian Weimer  <fweimer@redhat.com>
 
 	Introduce ld.so exceptions.

sysdeps/aarch64/memcmp.S
@@ -22,132 +22,98 @@
 /* Assumptions:
  *
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses.
  */

 /* Parameters and result.  */
 #define src1		x0
 #define src2		x1
 #define limit		x2
-#define result		x0
+#define result		w0

 /* Internal variables.  */
 #define data1		x3
 #define data1w		w3
 #define data2		x4
 #define data2w		w4
-#define has_nul		x5
-#define diff		x6
-#define endloop		x7
-#define tmp1		x8
-#define tmp2		x9
-#define tmp3		x10
-#define pos		x11
-#define limit_wd	x12
-#define mask		x13
+#define tmp1		x5

 ENTRY_ALIGN (memcmp, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	DELOUSE (2)
-	cbz	limit, L(ret0)
-	eor	tmp1, src1, src2
-	tst	tmp1, #7
-	b.ne	L(misaligned8)
-	ands	tmp1, src1, #7
-	b.ne	L(mutual_align)
-	add	limit_wd, limit, #7
-	lsr	limit_wd, limit_wd, #3
-	/* Start of performance-critical section  -- one 64B cache line.  */
-L(loop_aligned):
-	ldr	data1, [src1], #8
-	ldr	data2, [src2], #8
-L(start_realigned):
-	subs	limit_wd, limit_wd, #1
-	eor	diff, data1, data2	/* Non-zero if differences found.  */
-	csinv	endloop, diff, xzr, ne	/* Last Dword or differences.  */
-	cbz	endloop, L(loop_aligned)
-	/* End of performance-critical section  -- one 64B cache line.  */
-
-	/* Not reached the limit, must have found a diff.  */
-	cbnz	limit_wd, L(not_limit)
-
-	/* Limit % 8 == 0 => all bytes significant.  */
-	ands	limit, limit, #7
-	b.eq	L(not_limit)
-
-	lsl	limit, limit, #3	/* Bits -> bytes.  */
-	mov	mask, #~0
-#ifdef __AARCH64EB__
-	lsr	mask, mask, limit
-#else
-	lsl	mask, mask, limit
-#endif
-	bic	data1, data1, mask
-	bic	data2, data2, mask
-
-	orr	diff, diff, mask
-L(not_limit):
-
-#ifndef __AARCH64EB__
-	rev	diff, diff
+	subs	limit, limit, 8
+	b.lo	.Lless8
+
+	/* Limit >= 8, so check first 8 bytes using unaligned loads.  */
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	and	tmp1, src1, 7
+	add	limit, limit, tmp1
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Align src1 and adjust src2 with bytes not yet done.  */
+	sub	src1, src1, tmp1
+	sub	src2, src2, tmp1
+
+	subs	limit, limit, 8
+	b.ls	.Llast_bytes
+
+	/* Loop performing 8 bytes per iteration using aligned src1.
+	   Limit is pre-decremented by 8 and must be larger than zero.
+	   Exit if <= 8 bytes left to do or if the data is not equal.  */
+	.p2align 4
+.Lloop8:
+	ldr	data1, [src1], 8
+	ldr	data2, [src2], 8
+	subs	limit, limit, 8
+	ccmp	data1, data2, 0, hi  /* NZCV = 0b0000.  */
+	b.eq	.Lloop8
+
+	cmp	data1, data2
+	bne	.Lreturn
+
+	/* Compare last 1-8 bytes using unaligned access.  */
+.Llast_bytes:
+	ldr	data1, [src1, limit]
+	ldr	data2, [src2, limit]
+
+	/* Compare data bytes and set return value to 0, -1 or 1.  */
+.Lreturn:
+#ifndef __AARCH64EB__
 	rev	data1, data1
 	rev	data2, data2
 #endif
-	/* The MS-non-zero bit of DIFF marks either the first bit
-	   that is different, or the end of the significant data.
-	   Shifting left now will bring the critical information into the
-	   top bits.  */
-	clz	pos, diff
-	lsl	data1, data1, pos
-	lsl	data2, data2, pos
-	/* But we need to zero-extend (char is unsigned) the value and then
-	   perform a signed 32-bit subtraction.  */
-	lsr	data1, data1, #56
-	sub	result, data1, data2, lsr #56
-	RET
-
-L(mutual_align):
-	/* Sources are mutually aligned, but are not currently at an
-	   alignment boundary.  Round down the addresses and then mask off
-	   the bytes that precede the start point.  */
-	bic	src1, src1, #7
-	bic	src2, src2, #7
-	add	limit, limit, tmp1	/* Adjust the limit for the extra.  */
-	lsl	tmp1, tmp1, #3		/* Bytes beyond alignment -> bits.  */
-	ldr	data1, [src1], #8
-	neg	tmp1, tmp1		/* Bits to alignment -64.  */
-	ldr	data2, [src2], #8
-	mov	tmp2, #~0
-#ifdef __AARCH64EB__
-	/* Big-endian.  Early bytes are at MSB.  */
-	lsl	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#else
-	/* Little-endian.  Early bytes are at LSB.  */
-	lsr	tmp2, tmp2, tmp1	/* Shift (tmp1 & 63).  */
-#endif
-	add	limit_wd, limit, #7
-	orr	data1, data1, tmp2
-	orr	data2, data2, tmp2
-	lsr	limit_wd, limit_wd, #3
-	b	L(start_realigned)
+	cmp	data1, data2
+.Lret_eq:
+	cset	result, ne
+	cneg	result, result, lo
+	ret

-L(ret0):
-	mov	result, #0
-	RET
-
-	.p2align 6
-L(misaligned8):
-	sub	limit, limit, #1
-1:
-	/* Perhaps we can do better than this.  */
-	ldrb	data1w, [src1], #1
-	ldrb	data2w, [src2], #1
-	subs	limit, limit, #1
-	ccmp	data1w, data2w, #0, cs	/* NZCV = 0b0000.  */
-	b.eq	1b
-	sub	result, data1, data2
-	RET
+	.p2align 4
+	/* Compare up to 8 bytes.  Limit is [-8..-1].  */
+.Lless8:
+	adds	limit, limit, 4
+	b.lo	.Lless4
+	ldr	data1w, [src1], 4
+	ldr	data2w, [src2], 4
+	cmp	data1w, data2w
+	b.ne	.Lreturn
+	sub	limit, limit, 4
+.Lless4:
+	adds	limit, limit, 4
+	beq	.Lret_eq
+.Lbyte_loop:
+	ldrb	data1w, [src1], 1
+	ldrb	data2w, [src2], 1
+	subs	limit, limit, 1
+	ccmp	data1w, data2w, 0, ne	/* NZCV = 0b0000.  */
+	b.eq	.Lbyte_loop
+	sub	result, data1w, data2w
+	ret
 END (memcmp)
 #undef bcmp
 weak_alias (memcmp, bcmp)