mirror of git://sourceware.org/git/glibc.git
				
				
				
			x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with a single vector compare instruction. It is as fast as SSE2 versions for size <= 16 bytes and up to 1X faster for size > 16 bytes on Haswell. Select AVX2 version on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast. NB: It uses TZCNT instead of BSF since TZCNT produces the same result as BSF for non-zero input. TZCNT is faster than BSF and is executed as BSF if machine doesn't support TZCNT. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2, wcslen-sse2, wcslen-avx2 and wcsnlen-avx2. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add tests for __strlen_avx2, __strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2, __wcslen_sse2 and __wcsnlen_avx2. * sysdeps/x86_64/multiarch/strlen-avx2.S: New file. * sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/strlen.c: Likewise. * sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/strnlen.c: Likewise. * sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise. * sysdeps/x86_64/multiarch/wcslen.c: Likewise. * sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise. * sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New. (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast.
This commit is contained in:
		
							parent
							
								
									2f5d20ac99
								
							
						
					
					
						commit
						dc485ceb2a
					
				
							
								
								
									
										23
									
								
								ChangeLog
								
								
								
								
							
							
						
						
									
										23
									
								
								ChangeLog
								
								
								
								
							|  | @ -1,3 +1,26 @@ | |||
| 2017-06-09  H.J. Lu  <hongjiu.lu@intel.com> | ||||
| 
 | ||||
| 	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add | ||||
| 	strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2, | ||||
| 	wcslen-sse2, wcslen-avx2 and wcsnlen-avx2. | ||||
| 	* sysdeps/x86_64/multiarch/ifunc-impl-list.c | ||||
| 	(__libc_ifunc_impl_list): Add tests for __strlen_avx2, | ||||
| 	__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2, | ||||
| 	__wcslen_sse2 and __wcsnlen_avx2. | ||||
| 	* sysdeps/x86_64/multiarch/strlen-avx2.S: New file. | ||||
| 	* sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise. | ||||
| 	* sysdeps/x86_64/multiarch/strlen.c: Likewise. | ||||
| 	* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise. | ||||
| 	* sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise. | ||||
| 	* sysdeps/x86_64/multiarch/strnlen.c: Likewise. | ||||
| 	* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise. | ||||
| 	* sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise. | ||||
| 	* sysdeps/x86_64/multiarch/wcslen.c: Likewise. | ||||
| 	* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise. | ||||
| 	* sysdeps/x86_64/multiarch/wcsnlen.c (OPTIMIZE (avx2)): New. | ||||
| 	(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines where | ||||
| 	vzeroupper is preferred and AVX unaligned load is fast. | ||||
| 
 | ||||
| 2017-06-09  H.J. Lu  <hongjiu.lu@intel.com> | ||||
| 
 | ||||
| 	* sysdeps/x86_64/memchr.S (MEMCHR): New.  Depending on if | ||||
|  |  | |||
|  | @ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ | |||
| 		   memcpy-ssse3-back \
 | ||||
| 		   memmove-ssse3-back \
 | ||||
| 		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
 | ||||
| 		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
 | ||||
| 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 | ||||
| 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 | ||||
| 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 | ||||
|  | @ -36,7 +37,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ | |||
| 		   wmemcmp-avx2-movbe \
 | ||||
| 		   wmemchr-sse2 wmemchr-avx2 \
 | ||||
| 		   wcscpy-ssse3 wcscpy-c \
 | ||||
| 		   wcsnlen-sse4_1 wcsnlen-c | ||||
| 		   wcsnlen-sse4_1 wcsnlen-c \
 | ||||
| 		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 | ||||
| endif | ||||
| 
 | ||||
| ifeq ($(subdir),debug) | ||||
|  |  | |||
|  | @ -166,6 +166,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, | |||
| 			      __rawmemchr_avx2) | ||||
| 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) | ||||
| 
 | ||||
|   /* Support sysdeps/x86_64/multiarch/strlen.c.  */ | ||||
|   IFUNC_IMPL (i, name, strlen, | ||||
| 	      IFUNC_IMPL_ADD (array, i, strlen, | ||||
| 			      HAS_ARCH_FEATURE (AVX2_Usable), | ||||
| 			      __strlen_avx2) | ||||
| 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) | ||||
| 
 | ||||
|   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */ | ||||
|   IFUNC_IMPL (i, name, strnlen, | ||||
| 	      IFUNC_IMPL_ADD (array, i, strnlen, | ||||
| 			      HAS_ARCH_FEATURE (AVX2_Usable), | ||||
| 			      __strnlen_avx2) | ||||
| 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) | ||||
| 
 | ||||
|   /* Support sysdeps/x86_64/multiarch/stpncpy.S.  */ | ||||
|   IFUNC_IMPL (i, name, stpncpy, | ||||
| 	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3), | ||||
|  | @ -310,8 +324,18 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, | |||
| 			      __wcscpy_ssse3) | ||||
| 	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2)) | ||||
| 
 | ||||
|   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */ | ||||
|   IFUNC_IMPL (i, name, wcslen, | ||||
| 	      IFUNC_IMPL_ADD (array, i, wcslen, | ||||
| 			      HAS_ARCH_FEATURE (AVX2_Usable), | ||||
| 			      __wcslen_avx2) | ||||
| 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) | ||||
| 
 | ||||
|   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */ | ||||
|   IFUNC_IMPL (i, name, wcsnlen, | ||||
| 	      IFUNC_IMPL_ADD (array, i, wcsnlen, | ||||
| 			      HAS_ARCH_FEATURE (AVX2_Usable), | ||||
| 			      __wcsnlen_avx2) | ||||
| 	      IFUNC_IMPL_ADD (array, i, wcsnlen, | ||||
| 			      HAS_CPU_FEATURE (SSE4_1), | ||||
| 			      __wcsnlen_sse4_1) | ||||
|  |  | |||
|  | @ -0,0 +1,394 @@ | |||
| /* strlen/strnlen/wcslen/wcsnlen optimized with AVX2. | ||||
|    Copyright (C) 2017 Free Software Foundation, Inc. | ||||
|    This file is part of the GNU C Library. | ||||
| 
 | ||||
|    The GNU C Library is free software; you can redistribute it and/or
 | ||||
|    modify it under the terms of the GNU Lesser General Public | ||||
|    License as published by the Free Software Foundation; either
 | ||||
|    version 2.1 of the License, or (at your option) any later version. | ||||
| 
 | ||||
|    The GNU C Library is distributed in the hope that it will be useful, | ||||
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|    Lesser General Public License for more details. | ||||
| 
 | ||||
|    You should have received a copy of the GNU Lesser General Public | ||||
|    License along with the GNU C Library; if not, see
 | ||||
|    <http://www.gnu.org/licenses/>.  */ | ||||
| 
 | ||||
| #if IS_IN (libc) | ||||
| 
 | ||||
| # include <sysdep.h> | ||||
| 
 | ||||
| # ifndef STRLEN | ||||
| #  define STRLEN	__strlen_avx2 | ||||
| # endif | ||||
| 
 | ||||
| # ifdef USE_AS_WCSLEN | ||||
| #  define VPCMPEQ	vpcmpeqd | ||||
| #  define VPMINU	vpminud | ||||
| # else | ||||
| #  define VPCMPEQ	vpcmpeqb | ||||
| #  define VPMINU	vpminub | ||||
| # endif | ||||
| 
 | ||||
| # ifndef VZEROUPPER | ||||
| #  define VZEROUPPER	vzeroupper | ||||
| # endif | ||||
| 
 | ||||
| # define VEC_SIZE 32 | ||||
| 
 | ||||
| 	.section .text.avx,"ax",@progbits
 | ||||
| ENTRY (STRLEN) | ||||
| # ifdef USE_AS_STRNLEN | ||||
| 	/* Check for zero length.  */ | ||||
| 	testq	%rsi, %rsi | ||||
| 	jz	L(zero) | ||||
| #  ifdef USE_AS_WCSLEN | ||||
| 	shl	$2, %rsi | ||||
| #  endif | ||||
| 	movq	%rsi, %r8 | ||||
| # endif | ||||
| 	movl	%edi, %ecx | ||||
| 	movq	%rdi, %rdx | ||||
| 	vpxor	%xmm0, %xmm0, %xmm0 | ||||
| 
 | ||||
| 	/* Check if we may cross page boundary with one vector load.  */ | ||||
| 	andl	$(2 * VEC_SIZE - 1), %ecx | ||||
| 	cmpl	$VEC_SIZE, %ecx | ||||
| 	ja	L(cros_page_boundary) | ||||
| 
 | ||||
| 	/* Check the first VEC_SIZE bytes.  */ | ||||
| 	VPCMPEQ (%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 
 | ||||
| # ifdef USE_AS_STRNLEN | ||||
| 	jnz	L(first_vec_x0_check) | ||||
| 	/* Adjust length and check the end of data.  */ | ||||
| 	subq	$VEC_SIZE, %rsi | ||||
| 	jbe	L(max) | ||||
| # else | ||||
| 	jnz	L(first_vec_x0) | ||||
| # endif | ||||
| 
 | ||||
| 	/* Align data for aligned loads in the loop.  */ | ||||
| 	addq	$VEC_SIZE, %rdi | ||||
| 	andl	$(VEC_SIZE - 1), %ecx | ||||
| 	andq	$-VEC_SIZE, %rdi | ||||
| 
 | ||||
| # ifdef USE_AS_STRNLEN | ||||
| 	/* Adjust length.  */ | ||||
| 	addq	%rcx, %rsi | ||||
| 
 | ||||
| 	subq	$(VEC_SIZE * 4), %rsi | ||||
| 	jbe	L(last_4x_vec_or_less) | ||||
| # endif | ||||
| 	jmp	L(more_4x_vec) | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(cros_page_boundary): | ||||
| 	andl	$(VEC_SIZE - 1), %ecx | ||||
| 	andq	$-VEC_SIZE, %rdi | ||||
| 	VPCMPEQ (%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	/* Remove the leading bytes.  */ | ||||
| 	sarl	%cl, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jz	L(aligned_more) | ||||
| 	tzcntl	%eax, %eax | ||||
| # ifdef USE_AS_STRNLEN | ||||
| 	/* Check the end of data.  */ | ||||
| 	cmpq	%rax, %rsi | ||||
| 	jbe	L(max) | ||||
| # endif | ||||
| 	addq	%rdi, %rax | ||||
| 	addq	%rcx, %rax | ||||
| 	subq	%rdx, %rax | ||||
| # ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| # endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(aligned_more): | ||||
| # ifdef USE_AS_STRNLEN | ||||
|         /* "rcx" is less than VEC_SIZE.  Calculate "rsi + rcx - VEC_SIZE" | ||||
| 	    with "rsi - (VEC_SIZE - rcx)" instead of "(rsi + rcx) - VEC_SIZE" | ||||
| 	    to avoid possible addition overflow.  */ | ||||
| 	negq	%rcx | ||||
| 	addq	$VEC_SIZE, %rcx | ||||
| 
 | ||||
| 	/* Check the end of data.  */ | ||||
| 	subq	%rcx, %rsi | ||||
| 	jbe	L(max) | ||||
| # endif | ||||
| 
 | ||||
| 	addq	$VEC_SIZE, %rdi | ||||
| 
 | ||||
| # ifdef USE_AS_STRNLEN | ||||
| 	subq	$(VEC_SIZE * 4), %rsi | ||||
| 	jbe	L(last_4x_vec_or_less) | ||||
| # endif | ||||
| 
 | ||||
| L(more_4x_vec): | ||||
| 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time | ||||
| 	   since data is only aligned to VEC_SIZE.  */ | ||||
| 	VPCMPEQ (%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(first_vec_x0) | ||||
| 
 | ||||
| 	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(first_vec_x1) | ||||
| 
 | ||||
| 	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(first_vec_x2) | ||||
| 
 | ||||
| 	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(first_vec_x3) | ||||
| 
 | ||||
| 	addq	$(VEC_SIZE * 4), %rdi | ||||
| 
 | ||||
| # ifdef USE_AS_STRNLEN | ||||
| 	subq	$(VEC_SIZE * 4), %rsi | ||||
| 	jbe	L(last_4x_vec_or_less) | ||||
| # endif | ||||
| 
 | ||||
| 	/* Align data to 4 * VEC_SIZE.  */ | ||||
| 	movq	%rdi, %rcx | ||||
| 	andl	$(4 * VEC_SIZE - 1), %ecx | ||||
| 	andq	$-(4 * VEC_SIZE), %rdi | ||||
| 
 | ||||
| # ifdef USE_AS_STRNLEN | ||||
| 	/* Adjust length.  */ | ||||
| 	addq	%rcx, %rsi | ||||
| # endif | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(loop_4x_vec): | ||||
| 	/* Compare 4 * VEC at a time forward.  */ | ||||
| 	vmovdqa (%rdi), %ymm1 | ||||
| 	vmovdqa	VEC_SIZE(%rdi), %ymm2 | ||||
| 	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3 | ||||
| 	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4 | ||||
| 	VPMINU	%ymm1, %ymm2, %ymm5 | ||||
| 	VPMINU	%ymm3, %ymm4, %ymm6 | ||||
| 	VPMINU	%ymm5, %ymm6, %ymm5 | ||||
| 
 | ||||
| 	VPCMPEQ	%ymm5, %ymm0, %ymm5 | ||||
| 	vpmovmskb %ymm5, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(4x_vec_end) | ||||
| 
 | ||||
| 	addq	$(VEC_SIZE * 4), %rdi | ||||
| 
 | ||||
| # ifndef USE_AS_STRNLEN | ||||
| 	jmp	L(loop_4x_vec) | ||||
| # else | ||||
| 	subq	$(VEC_SIZE * 4), %rsi | ||||
| 	ja	L(loop_4x_vec) | ||||
| 
 | ||||
| L(last_4x_vec_or_less): | ||||
| 	/* Less than 4 * VEC and aligned to VEC_SIZE.  */ | ||||
| 	addl	$(VEC_SIZE * 2), %esi | ||||
| 	jle	L(last_2x_vec) | ||||
| 
 | ||||
| 	VPCMPEQ (%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(first_vec_x0) | ||||
| 
 | ||||
| 	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(first_vec_x1) | ||||
| 
 | ||||
| 	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 
 | ||||
| 	jnz	L(first_vec_x2_check) | ||||
| 	subl	$VEC_SIZE, %esi | ||||
| 	jle	L(max) | ||||
| 
 | ||||
| 	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 
 | ||||
| 	jnz	L(first_vec_x3_check) | ||||
| 	movq	%r8, %rax | ||||
| #  ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| #  endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(last_2x_vec): | ||||
| 	addl	$(VEC_SIZE * 2), %esi | ||||
| 	VPCMPEQ (%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 
 | ||||
| 	jnz	L(first_vec_x0_check) | ||||
| 	subl	$VEC_SIZE, %esi | ||||
| 	jle	L(max) | ||||
| 
 | ||||
| 	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(first_vec_x1_check) | ||||
| 	movq	%r8, %rax | ||||
| #  ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| #  endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(first_vec_x0_check): | ||||
| 	tzcntl	%eax, %eax | ||||
| 	/* Check the end of data.  */ | ||||
| 	cmpq	%rax, %rsi | ||||
| 	jbe	L(max) | ||||
| 	addq	%rdi, %rax | ||||
| 	subq	%rdx, %rax | ||||
| #  ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| #  endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(first_vec_x1_check): | ||||
| 	tzcntl	%eax, %eax | ||||
| 	/* Check the end of data.  */ | ||||
| 	cmpq	%rax, %rsi | ||||
| 	jbe	L(max) | ||||
| 	addq	$VEC_SIZE, %rax | ||||
| 	addq	%rdi, %rax | ||||
| 	subq	%rdx, %rax | ||||
| #  ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| #  endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(first_vec_x2_check): | ||||
| 	tzcntl	%eax, %eax | ||||
| 	/* Check the end of data.  */ | ||||
| 	cmpq	%rax, %rsi | ||||
| 	jbe	L(max) | ||||
| 	addq	$(VEC_SIZE * 2), %rax | ||||
| 	addq	%rdi, %rax | ||||
| 	subq	%rdx, %rax | ||||
| #  ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| #  endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(first_vec_x3_check): | ||||
| 	tzcntl	%eax, %eax | ||||
| 	/* Check the end of data.  */ | ||||
| 	cmpq	%rax, %rsi | ||||
| 	jbe	L(max) | ||||
| 	addq	$(VEC_SIZE * 3), %rax | ||||
| 	addq	%rdi, %rax | ||||
| 	subq	%rdx, %rax | ||||
| #  ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| #  endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(max): | ||||
| 	movq	%r8, %rax | ||||
| #  ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| #  endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(zero): | ||||
| 	xorl	%eax, %eax | ||||
| 	ret | ||||
| # endif | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(first_vec_x0): | ||||
| 	tzcntl	%eax, %eax | ||||
| 	addq	%rdi, %rax | ||||
| 	subq	%rdx, %rax | ||||
| # ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| # endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(first_vec_x1): | ||||
| 	tzcntl	%eax, %eax | ||||
| 	addq	$VEC_SIZE, %rax | ||||
| 	addq	%rdi, %rax | ||||
| 	subq	%rdx, %rax | ||||
| # ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| # endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(first_vec_x2): | ||||
| 	tzcntl	%eax, %eax | ||||
| 	addq	$(VEC_SIZE * 2), %rax | ||||
| 	addq	%rdi, %rax | ||||
| 	subq	%rdx, %rax | ||||
| # ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| # endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| 	.p2align 4
 | ||||
| L(4x_vec_end): | ||||
| 	VPCMPEQ	%ymm1, %ymm0, %ymm1 | ||||
| 	vpmovmskb %ymm1, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(first_vec_x0) | ||||
| 	VPCMPEQ %ymm2, %ymm0, %ymm2 | ||||
| 	vpmovmskb %ymm2, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(first_vec_x1) | ||||
| 	VPCMPEQ %ymm3, %ymm0, %ymm3 | ||||
| 	vpmovmskb %ymm3, %eax | ||||
| 	testl	%eax, %eax | ||||
| 	jnz	L(first_vec_x2) | ||||
| 	VPCMPEQ %ymm4, %ymm0, %ymm4 | ||||
| 	vpmovmskb %ymm4, %eax | ||||
| 	testl	%eax, %eax | ||||
| L(first_vec_x3): | ||||
| 	tzcntl	%eax, %eax | ||||
| 	addq	$(VEC_SIZE * 3), %rax | ||||
| 	addq	%rdi, %rax | ||||
| 	subq	%rdx, %rax | ||||
| # ifdef USE_AS_WCSLEN | ||||
| 	shrq	$2, %rax | ||||
| # endif | ||||
| 	VZEROUPPER | ||||
| 	ret | ||||
| 
 | ||||
| END (STRLEN) | ||||
| #endif | ||||
|  | @ -0,0 +1,23 @@ | |||
| /* strlen optimized with SSE2. | ||||
|    Copyright (C) 2017 Free Software Foundation, Inc. | ||||
|    This file is part of the GNU C Library. | ||||
| 
 | ||||
|    The GNU C Library is free software; you can redistribute it and/or
 | ||||
|    modify it under the terms of the GNU Lesser General Public | ||||
|    License as published by the Free Software Foundation; either
 | ||||
|    version 2.1 of the License, or (at your option) any later version. | ||||
| 
 | ||||
|    The GNU C Library is distributed in the hope that it will be useful, | ||||
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|    Lesser General Public License for more details. | ||||
| 
 | ||||
|    You should have received a copy of the GNU Lesser General Public | ||||
|    License along with the GNU C Library; if not, see
 | ||||
|    <http://www.gnu.org/licenses/>.  */ | ||||
| 
 | ||||
| #if IS_IN (libc) | ||||
| # define strlen __strlen_sse2 | ||||
| #endif | ||||
| 
 | ||||
| #include "../strlen.S" | ||||
|  | @ -0,0 +1,34 @@ | |||
| /* Multiple versions of strlen.
 | ||||
|    All versions must be listed in ifunc-impl-list.c. | ||||
|    Copyright (C) 2017 Free Software Foundation, Inc. | ||||
|    This file is part of the GNU C Library. | ||||
| 
 | ||||
|    The GNU C Library is free software; you can redistribute it and/or | ||||
|    modify it under the terms of the GNU Lesser General Public | ||||
|    License as published by the Free Software Foundation; either | ||||
|    version 2.1 of the License, or (at your option) any later version. | ||||
| 
 | ||||
|    The GNU C Library is distributed in the hope that it will be useful, | ||||
|    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|    Lesser General Public License for more details. | ||||
| 
 | ||||
|    You should have received a copy of the GNU Lesser General Public | ||||
|    License along with the GNU C Library; if not, see | ||||
|    <http://www.gnu.org/licenses/>.  */
 | ||||
| 
 | ||||
| /* Define multiple versions only for the definition in libc. */ | ||||
| #if IS_IN (libc) | ||||
| # define strlen __redirect_strlen | ||||
| # include <string.h> | ||||
| # undef strlen | ||||
| 
 | ||||
| # define SYMBOL_NAME strlen | ||||
| # include "ifunc-avx2.h" | ||||
| 
 | ||||
| libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ()); | ||||
| # ifdef SHARED | ||||
| __hidden_ver1 (strlen, __GI_strlen, __redirect_strlen) | ||||
|   __attribute__((visibility ("hidden"))); | ||||
| # endif | ||||
| #endif | ||||
|  | @ -0,0 +1,4 @@ | |||
| #define STRLEN __strnlen_avx2 | ||||
| #define USE_AS_STRNLEN 1 | ||||
| 
 | ||||
| #include "strlen-avx2.S" | ||||
|  | @ -0,0 +1,28 @@ | |||
| /* strnlen optimized with SSE2. | ||||
|    Copyright (C) 2017 Free Software Foundation, Inc. | ||||
|    This file is part of the GNU C Library. | ||||
| 
 | ||||
|    The GNU C Library is free software; you can redistribute it and/or
 | ||||
|    modify it under the terms of the GNU Lesser General Public | ||||
|    License as published by the Free Software Foundation; either
 | ||||
|    version 2.1 of the License, or (at your option) any later version. | ||||
| 
 | ||||
|    The GNU C Library is distributed in the hope that it will be useful, | ||||
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|    Lesser General Public License for more details. | ||||
| 
 | ||||
|    You should have received a copy of the GNU Lesser General Public | ||||
|    License along with the GNU C Library; if not, see
 | ||||
|    <http://www.gnu.org/licenses/>.  */ | ||||
| 
 | ||||
| #if IS_IN (libc) | ||||
| # define __strnlen __strnlen_sse2 | ||||
| 
 | ||||
| # undef weak_alias | ||||
| # define weak_alias(__strnlen, strnlen) | ||||
| # undef libc_hidden_builtin_def | ||||
| # define libc_hidden_builtin_def(strnlen) | ||||
| #endif | ||||
| 
 | ||||
| #include "../strnlen.S" | ||||
|  | @ -0,0 +1,39 @@ | |||
| /* Multiple versions of strnlen.
 | ||||
|    All versions must be listed in ifunc-impl-list.c. | ||||
|    Copyright (C) 2017 Free Software Foundation, Inc. | ||||
|    This file is part of the GNU C Library. | ||||
| 
 | ||||
|    The GNU C Library is free software; you can redistribute it and/or | ||||
|    modify it under the terms of the GNU Lesser General Public | ||||
|    License as published by the Free Software Foundation; either | ||||
|    version 2.1 of the License, or (at your option) any later version. | ||||
| 
 | ||||
|    The GNU C Library is distributed in the hope that it will be useful, | ||||
|    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|    Lesser General Public License for more details. | ||||
| 
 | ||||
|    You should have received a copy of the GNU Lesser General Public | ||||
|    License along with the GNU C Library; if not, see | ||||
|    <http://www.gnu.org/licenses/>.  */
 | ||||
| 
 | ||||
| /* Define multiple versions only for the definition in libc. */ | ||||
| #if IS_IN (libc) | ||||
| # define strnlen __redirect_strnlen | ||||
| # define __strnlen __redirect___strnlen | ||||
| # include <string.h> | ||||
| # undef __strnlen | ||||
| # undef strnlen | ||||
| 
 | ||||
| # define SYMBOL_NAME strnlen | ||||
| # include "ifunc-avx2.h" | ||||
| 
 | ||||
| libc_ifunc_redirected (__redirect_strnlen, __strnlen, IFUNC_SELECTOR ()); | ||||
| weak_alias (__strnlen, strnlen); | ||||
| # ifdef SHARED | ||||
| __hidden_ver1 (__strnlen, __GI___strnlen, __redirect___strnlen) | ||||
|   __attribute__((visibility ("hidden"))); | ||||
| __hidden_ver1 (strnlen, __GI_strnlen, __redirect_strnlen) | ||||
|   __attribute__((weak, visibility ("hidden"))); | ||||
| # endif | ||||
| #endif | ||||
|  | @ -0,0 +1,4 @@ | |||
| #define STRLEN __wcslen_avx2 | ||||
| #define USE_AS_WCSLEN 1 | ||||
| 
 | ||||
| #include "strlen-avx2.S" | ||||
|  | @ -0,0 +1,26 @@ | |||
| /* wcslen optimized with SSE2. | ||||
|    Copyright (C) 2017 Free Software Foundation, Inc. | ||||
|    This file is part of the GNU C Library. | ||||
| 
 | ||||
|    The GNU C Library is free software; you can redistribute it and/or
 | ||||
|    modify it under the terms of the GNU Lesser General Public | ||||
|    License as published by the Free Software Foundation; either
 | ||||
|    version 2.1 of the License, or (at your option) any later version. | ||||
| 
 | ||||
|    The GNU C Library is distributed in the hope that it will be useful, | ||||
|    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|    Lesser General Public License for more details. | ||||
| 
 | ||||
|    You should have received a copy of the GNU Lesser General Public | ||||
|    License along with the GNU C Library; if not, see
 | ||||
|    <http://www.gnu.org/licenses/>.  */ | ||||
| 
 | ||||
| #if IS_IN (libc) | ||||
| # define __wcslen __wcslen_sse2 | ||||
| 
 | ||||
| # undef weak_alias | ||||
| # define weak_alias(__wcslen, wcslen) | ||||
| #endif | ||||
| 
 | ||||
| #include "../wcslen.S" | ||||
|  | @ -0,0 +1,31 @@ | |||
| /* Multiple versions of wcslen.
 | ||||
|    All versions must be listed in ifunc-impl-list.c. | ||||
|    Copyright (C) 2017 Free Software Foundation, Inc. | ||||
|    This file is part of the GNU C Library. | ||||
| 
 | ||||
|    The GNU C Library is free software; you can redistribute it and/or | ||||
|    modify it under the terms of the GNU Lesser General Public | ||||
|    License as published by the Free Software Foundation; either | ||||
|    version 2.1 of the License, or (at your option) any later version. | ||||
| 
 | ||||
|    The GNU C Library is distributed in the hope that it will be useful, | ||||
|    but WITHOUT ANY WARRANTY; without even the implied warranty of | ||||
|    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU | ||||
|    Lesser General Public License for more details. | ||||
| 
 | ||||
|    You should have received a copy of the GNU Lesser General Public | ||||
|    License along with the GNU C Library; if not, see | ||||
|    <http://www.gnu.org/licenses/>.  */
 | ||||
| 
 | ||||
| /* Define multiple versions only for the definition in libc. */ | ||||
| #if IS_IN (libc) | ||||
| # define __wcslen __redirect_wcslen | ||||
| # include <wchar.h> | ||||
| # undef __wcslen | ||||
| 
 | ||||
| # define SYMBOL_NAME wcslen | ||||
| # include "ifunc-avx2.h" | ||||
| 
 | ||||
| libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ()); | ||||
| weak_alias (__wcslen, wcslen); | ||||
| #endif | ||||
|  | @ -0,0 +1,5 @@ | |||
| #define STRLEN __wcsnlen_avx2 | ||||
| #define USE_AS_WCSLEN 1 | ||||
| #define USE_AS_STRNLEN 1 | ||||
| 
 | ||||
| #include "strlen-avx2.S" | ||||
|  | @ -28,12 +28,18 @@ | |||
| 
 | ||||
| extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; | ||||
| extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; | ||||
| extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; | ||||
| 
 | ||||
| static inline void * | ||||
| IFUNC_SELECTOR (void) | ||||
| { | ||||
|   const struct cpu_features* cpu_features = __get_cpu_features (); | ||||
| 
 | ||||
|   if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) | ||||
|       && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable) | ||||
|       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) | ||||
|     return OPTIMIZE (avx2); | ||||
| 
 | ||||
|   if (CPU_FEATURES_CPU_P (cpu_features, SSE4_1)) | ||||
|     return OPTIMIZE (sse4_1); | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue