/* Retrieved from a mirror of git://sourceware.org/git/glibc.git
   (206 lines, 4.3 KiB; the hosting page labels the file "ArmAsm").  */
/* Optimized strcpy implementation for PowerPC64/POWER9.
   Copyright (C) 2020-2023 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Choose the symbol this file defines.  With USE_AS_STPCPY the name is
   STPCPY when that macro is preset, otherwise __stpcpy.  Without
   USE_AS_STPCPY the name is STRCPY when that macro is preset,
   otherwise strcpy.  */
#if defined USE_AS_STPCPY && defined STPCPY
# define FUNC_NAME STPCPY
#elif defined USE_AS_STPCPY
# define FUNC_NAME __stpcpy
#elif defined STRCPY
# define FUNC_NAME STRCPY
#else
# define FUNC_NAME strcpy
#endif

/* Implements the function

   char * [r3] strcpy (char *dest [r3], const char *src [r4])

   or

   char * [r3] stpcpy (char *dest [r3], const char *src [r4])

   if USE_AS_STPCPY is defined.

   The implementation can load bytes past a null terminator, but only
   up to the next 16B boundary, so it never crosses a page.  */

/* CHECK16 (vreg, offset, addr, label)

   Load the quadword at offset(addr) into vreg, compare every byte of
   it against v18 (result left in v6), and branch to L(label) when the
   compare reports at least one matching byte.  The macro expects v18
   to be set up by its user before the first invocation.  */
#define CHECK16(vr,disp,base,target)	\
	lxv	32+vr,disp(base);	\
	vcmpequb. v6,vr,v18;		\
	bne	cr6,L(target);

/* Register use throughout FUNC_NAME:

     r3		dest as passed in; rewritten only for the stpcpy result
     r4		src as passed in
     r5		16B-aligned source cursor read by the main loop
     r11	destination cursor kept in step with r5
     r8-r10	byte counts and stxvl length operands
     v0-v5	quadwords in flight
     v6		result of the most recent compare against zero
     v18	sixteen 0x00 bytes
     v19	sixteen 0xFF bytes  */

#ifdef USE_AS_STPCPY
/* stpcpy returns the address of the copied null: BASE plus the r8
   bytes that precede it.  */
# define SET_STPCPY_RESULT(base)	add	r3,base,r8
#else
/* strcpy returns dest, which is still untouched in r3.  */
# define SET_STPCPY_RESULT(base)
#endif

/* Shared ending of the paths where the null sits in VREG, the quadword
   that belongs DISP bytes past r11: count the bytes in front of the
   null (r8), move r11 to that quadword, and store r8 + 1 bytes so the
   null itself is written too.  stxvl takes its length from the top
   8 bits of the register, hence the shift by 56.  */
#define STORE_THROUGH_NULL(vreg,disp)	\
	vctzlsbb r8,v6;			\
	addi	r9,r8,1;		\
	sldi	r9,r9,56;		\
	addi	r11,r11,disp;		\
	stxvl	32+vreg,r11,r9;

	.machine power9
ENTRY_TOCLESS (FUNC_NAME, 4)
	CALL_MCOUNT 2

	/* Constants: v18 is the compare operand, v19 the padding.  */
	vspltisb v18,0
	vspltisb v19,-1

	/* r5 = first 16B boundary strictly above src.
	   r8 = distance from src to that boundary (1..16).
	   r11 = dest advanced by the same distance.
	   L(copy96) picks up from r5/r11.  */
	addi	r5,r4,16
	clrrdi	r5,r5,4
	subf	r8,r4,r5
	add	r11,r3,r8

	/* Fetch the aligned quadword containing src[0] and permute it so
	   the string bytes are lined up for the store.  Positions that
	   carry no loaded string data are taken from v19, so they can
	   never compare equal to zero.  */
	lvx	v0,0,r4
	lvsr	v1,0,r4
	vperm	v0,v19,v0,v1

	/* v6 gets 0xff for each zero byte, 0x00 elsewhere.  */
	vcmpequb. v6,v0,v18
	beq	cr6,L(head_clean)

	/* The string ends inside the head chunk: store the r8 bytes in
	   front of the null plus the null, then return.  */
	vctzlsbb r8,v6
	addi	r9,r8,1
	sldi	r10,r9,56	/* stxvl length lives in the top 8 bits.  */
	stxvl	32+v0,r3,r10
	SET_STPCPY_RESULT(r3)
	blr

L(head_clean):
	/* No null yet: store the r8 bytes that lie below the boundary.  */
	sldi	r10,r8,56	/* stxvl length lives in the top 8 bits.  */
	stxvl	32+v0,r3,r10

	/* Main loop: examine six quadwords (96 bytes) before storing any
	   of them.  The first one holding a null diverts to the matching
	   L(nul_in_qN) path, which stores the earlier quadwords in full
	   and the last one partially.  */
	.p2align 4
L(copy96):
	CHECK16(v0,0,r5,nul_in_q1)
	CHECK16(v1,16,r5,nul_in_q2)
	CHECK16(v2,32,r5,nul_in_q3)
	CHECK16(v3,48,r5,nul_in_q4)
	CHECK16(v4,64,r5,nul_in_q5)
	CHECK16(v5,80,r5,nul_in_q6)

	/* All six are free of nulls: store them and advance.  */
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	stxv	32+v5,80(r11)

	addi	r5,r5,96
	addi	r11,r11,96

	b	L(copy96)

	/* Null in the 1st quadword of the group: r11 already points at
	   its destination, so no cursor adjustment is needed.  */
	.p2align 4
L(nul_in_q1):
	vctzlsbb r8,v6		/* r8 = bytes in front of the null.  */
	addi	r9,r8,1		/* Include the null itself.  */
	sldi	r9,r9,56	/* stxvl length lives in the top 8 bits.  */
	stxvl	32+v0,r11,r9
	SET_STPCPY_RESULT(r11)
	blr

	/* Null in the 2nd quadword.  */
	.p2align 4
L(nul_in_q2):
	stxv	32+v0,0(r11)
	STORE_THROUGH_NULL(v1,16)
	SET_STPCPY_RESULT(r11)
	blr

	/* Null in the 3rd quadword.  */
	.p2align 4
L(nul_in_q3):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	STORE_THROUGH_NULL(v2,32)
	SET_STPCPY_RESULT(r11)
	blr

	/* Null in the 4th quadword.  */
	.p2align 4
L(nul_in_q4):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	STORE_THROUGH_NULL(v3,48)
	SET_STPCPY_RESULT(r11)
	blr

	/* Null in the 5th quadword.  */
	.p2align 4
L(nul_in_q5):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	STORE_THROUGH_NULL(v4,64)
	SET_STPCPY_RESULT(r11)
	blr

	/* Null in the 6th quadword.  */
	.p2align 4
L(nul_in_q6):
	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)
	stxv	32+v4,64(r11)
	STORE_THROUGH_NULL(v5,80)
	SET_STPCPY_RESULT(r11)
	blr

END (FUNC_NAME)
#if !defined USE_AS_STPCPY
/* libc_hidden_builtin_def is invoked for strcpy only; the stpcpy build
   of this file skips it.  */
libc_hidden_builtin_def (strcpy)
#endif