This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PING] [PATCH v2] Fix strrchr regression.


Ping.

On Mon, Aug 26, 2013 at 03:46:49PM +0200, Ondřej Bílka wrote:
> Ping,
> 
> Data for Atom and Silvermont are now available, and the patch is an
> improvement there.
> 
> http://kam.mff.cuni.cz/~ondra/benchmark_string/atom/strrchr_profile/results_gcc/result.html
> 
> 
> On Fri, Aug 16, 2013 at 02:14:57PM +0200, Ondřej Bílka wrote:
> > 
> > Here is the correct version.
> > 
> > > On Mon, Aug 05, 2013 at 07:33:46PM +0200, Ondřej Bílka wrote:
> > > > Hi, 
> > > >
> > > ... 
> > > > To get reliable results I added strrchr to my profiler. You can clearly
> > > > see asymptotic behavior of strrchr here.
> > > > 
> > Hi, based on profiling feedback I tuned the strrchr implementation into
> > a much faster one.
> > 
> > The results at:
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/strrchr_profile.html
> > have been updated, and the benchmark program is here:
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/strrchr_profile160813.tar.bz2
> > 
> > This patch does not cover adding an AVX2 implementation, which will be
> > posted separately. A considerable speedup is possible there.
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/haswell/strrchr_profile/results_rand/result.html
> > 
> > I need to retest it on Atom and Silvermont to verify how much it
> > improved.
> > 
> > Passes tests, OK to commit?
> > 
> > Ondra
> > 
> > 	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update.
> > 	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Remove strrchr ifunc.
> > 	* sysdeps/x86_64/multiarch/strend-sse4.S: Remove.
> > 	* sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S: Likewise.
> > 	* sysdeps/x86_64/multiarch/strrchr.S: Likewise.
> > 	* sysdeps/x86_64/strrchr.S (strrchr): Use optimized implementation.
> > 
> > ---
> >  sysdeps/x86_64/multiarch/Makefile              |   4 +-
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c     |   6 -
> >  sysdeps/x86_64/multiarch/strend-sse4.S         |  48 ---
> >  sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S | 555 -------------------------
> >  sysdeps/x86_64/multiarch/strrchr.S             | 288 -------------
> >  sysdeps/x86_64/strrchr.S                       | 247 ++++++++---
> >  6 files changed, 202 insertions(+), 946 deletions(-)
> >  delete mode 100644 sysdeps/x86_64/multiarch/strend-sse4.S
> >  delete mode 100644 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
> >  delete mode 100644 sysdeps/x86_64/multiarch/strrchr.S
> > 
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 203d16e..b99e8d1 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -7,7 +7,7 @@ endif
> >  ifeq ($(subdir),string)
> >  
> >  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> > -		   strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
> > +		   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
> >  		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> >  		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> >  		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
> > @@ -15,7 +15,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> >  		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
> >  		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
> >  		   strcat-sse2-unaligned strncat-sse2-unaligned \
> > -		   strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
> > +		   strchr-sse2-no-bsf memcmp-ssse3
> >  ifeq (yes,$(config-cflags-sse4))
> >  sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
> >  CFLAGS-varshift.c += -msse4
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 8486294..dfef9e7 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -181,12 +181,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >  			      __strpbrk_sse42)
> >  	      IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
> >  
> > -  /* Support sysdeps/x86_64/multiarch/strrchr.S.  */
> > -  IFUNC_IMPL (i, name, strrchr,
> > -	      IFUNC_IMPL_ADD (array, i, strrchr, HAS_SSE4_2,
> > -			      __strrchr_sse42)
> > -	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2_no_bsf)
> > -	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
> >  
> >    /* Support sysdeps/x86_64/multiarch/strspn.S.  */
> >    IFUNC_IMPL (i, name, strspn,
> > diff --git a/sysdeps/x86_64/multiarch/strend-sse4.S b/sysdeps/x86_64/multiarch/strend-sse4.S
> > deleted file mode 100644
> > index c5a7ae2..0000000
> > --- a/sysdeps/x86_64/multiarch/strend-sse4.S
> > +++ /dev/null
> > @@ -1,48 +0,0 @@
> > -/* Return the pointer to the end of string, using SSE4.2
> > -   Copyright (C) 2009-2013 Free Software Foundation, Inc.
> > -   Contributed by Intel Corporation.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#include <sysdep.h>
> > -#include "asm-syntax.h"
> > -
> > -	.section .text.sse4.2,"ax",@progbits
> > -ENTRY (__strend_sse4)
> > -	pxor	%xmm2, %xmm2
> > -	movq	%rdi, %rcx
> > -	andq	$~15, %rdi
> > -	movdqa	%xmm2, %xmm1
> > -	pcmpeqb	(%rdi), %xmm2
> > -	orl	$0xffffffff, %esi
> > -	subq	%rdi, %rcx
> > -	shll	%cl, %esi
> > -	pmovmskb %xmm2, %edx
> > -	andl	%esi, %edx
> > -	jnz	1f
> > -
> > -2:	pcmpistri $0x08, 16(%rdi), %xmm1
> > -	leaq	16(%rdi), %rdi
> > -	jnz	2b
> > -
> > -	leaq	(%rdi,%rcx), %rax
> > -	ret
> > -
> > -1:	bsfl	%edx, %eax
> > -	addq	%rdi, %rax
> > -	ret
> > -
> > -END (__strend_sse4)
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
> > deleted file mode 100644
> > index fcef610..0000000
> > --- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
> > +++ /dev/null
> > @@ -1,555 +0,0 @@
> > -/* strrchr with SSE2 without bsf and bsr
> > -   Copyright (C) 2011-2013 Free Software Foundation, Inc.
> > -   Contributed by Intel Corporation.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#if defined SHARED && !defined NOT_IN_libc
> > -
> > -# include <sysdep.h>
> > -# include "asm-syntax.h"
> > -
> > -	atom_text_section
> > -ENTRY (__strrchr_sse2_no_bsf)
> > -
> > -	movd	%rsi, %xmm1
> > -	pxor	%xmm2, %xmm2
> > -	mov	%rdi, %rcx
> > -	punpcklbw %xmm1, %xmm1
> > -	punpcklbw %xmm1, %xmm1
> > -	/* ECX has OFFSET. */
> > -	and	$63, %rcx
> > -	cmp	$48, %rcx
> > -	pshufd	$0, %xmm1, %xmm1
> > -	ja	L(crosscache)
> > -
> > -/* unaligned string. */
> > -	movdqu	(%rdi), %xmm0
> > -	pcmpeqb	%xmm0, %xmm2
> > -	pcmpeqb	%xmm1, %xmm0
> > -	/* Find where NULL is.  */
> > -	pmovmskb %xmm2, %rcx
> > -	/* Check if there is a match.  */
> > -	pmovmskb %xmm0, %rax
> > -	add	$16, %rdi
> > -
> > -	test	%rax, %rax
> > -	jnz	L(unaligned_match1)
> > -
> > -	test	%rcx, %rcx
> > -	jnz	L(return_null)
> > -
> > -	and	$-16, %rdi
> > -	xor	%r8, %r8
> > -	jmp	L(loop)
> > -
> > -	.p2align 4
> > -L(unaligned_match1):
> > -	test	%rcx, %rcx
> > -	jnz	L(prolog_find_zero_1)
> > -
> > -	mov	%rax, %r8
> > -	mov	%rdi, %rsi
> > -	and	$-16, %rdi
> > -	jmp	L(loop)
> > -
> > -	.p2align 4
> > -L(crosscache):
> > -/* Hancle unaligned string.  */
> > -	and	$15, %rcx
> > -	and	$-16, %rdi
> > -	pxor	%xmm3, %xmm3
> > -	movdqa	(%rdi), %xmm0
> > -	pcmpeqb	%xmm0, %xmm3
> > -	pcmpeqb	%xmm1, %xmm0
> > -	/* Find where NULL is.  */
> > -	pmovmskb %xmm3, %rdx
> > -	/* Check if there is a match.  */
> > -	pmovmskb %xmm0, %rax
> > -	/* Remove the leading bytes.  */
> > -	shr	%cl, %rdx
> > -	shr	%cl, %rax
> > -	add	$16, %rdi
> > -
> > -	test	%rax, %rax
> > -	jnz	L(unaligned_match)
> > -
> > -	test	%rdx, %rdx
> > -	jnz	L(return_null)
> > -
> > -	xor	%r8, %r8
> > -	jmp	L(loop)
> > -
> > -	.p2align 4
> > -L(unaligned_match):
> > -	test	%rdx, %rdx
> > -	jnz	L(prolog_find_zero)
> > -
> > -	mov	%rax, %r8
> > -	lea	(%rdi, %rcx), %rsi
> > -
> > -/* Loop start on aligned string.  */
> > -	.p2align 4
> > -L(loop):
> > -	movdqa	(%rdi), %xmm0
> > -	pcmpeqb	%xmm0, %xmm2
> > -	add	$16, %rdi
> > -	pcmpeqb	%xmm1, %xmm0
> > -	pmovmskb %xmm2, %rcx
> > -	pmovmskb %xmm0, %rax
> > -	or	%rax, %rcx
> > -	jnz	L(matches)
> > -
> > -	movdqa	(%rdi), %xmm0
> > -	pcmpeqb	%xmm0, %xmm2
> > -	add	$16, %rdi
> > -	pcmpeqb	%xmm1, %xmm0
> > -	pmovmskb %xmm2, %rcx
> > -	pmovmskb %xmm0, %rax
> > -	or	%rax, %rcx
> > -	jnz	L(matches)
> > -
> > -	movdqa	(%rdi), %xmm0
> > -	pcmpeqb	%xmm0, %xmm2
> > -	add	$16, %rdi
> > -	pcmpeqb	%xmm1, %xmm0
> > -	pmovmskb %xmm2, %rcx
> > -	pmovmskb %xmm0, %rax
> > -	or	%rax, %rcx
> > -	jnz	L(matches)
> > -
> > -	movdqa	(%rdi), %xmm0
> > -	pcmpeqb	%xmm0, %xmm2
> > -	add	$16, %rdi
> > -	pcmpeqb	%xmm1, %xmm0
> > -	pmovmskb %xmm2, %rcx
> > -	pmovmskb %xmm0, %rax
> > -	or	%rax, %rcx
> > -	jz	L(loop)
> > -
> > -L(matches):
> > -	test	%rax, %rax
> > -	jnz	L(match)
> > -L(return_value):
> > -	test	%r8, %r8
> > -	jz	L(return_null)
> > -	mov	%r8, %rax
> > -	mov	%rsi, %rdi
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(match):
> > -	pmovmskb %xmm2, %rcx
> > -	test	%rcx, %rcx
> > -	jnz	L(find_zero)
> > -	mov	%rax, %r8
> > -	mov	%rdi, %rsi
> > -	jmp	L(loop)
> > -
> > -	.p2align 4
> > -L(find_zero):
> > -	test	%cl, %cl
> > -	jz	L(find_zero_high)
> > -	mov	%cl, %dl
> > -	and	$15, %dl
> > -	jz	L(find_zero_8)
> > -	test	$0x01, %cl
> > -	jnz	L(FindZeroExit1)
> > -	test	$0x02, %cl
> > -	jnz	L(FindZeroExit2)
> > -	test	$0x04, %cl
> > -	jnz	L(FindZeroExit3)
> > -	and	$1 << 4 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(find_zero_8):
> > -	test	$0x10, %cl
> > -	jnz	L(FindZeroExit5)
> > -	test	$0x20, %cl
> > -	jnz	L(FindZeroExit6)
> > -	test	$0x40, %cl
> > -	jnz	L(FindZeroExit7)
> > -	and	$1 << 8 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(find_zero_high):
> > -	mov	%ch, %dh
> > -	and	$15, %dh
> > -	jz	L(find_zero_high_8)
> > -	test	$0x01, %ch
> > -	jnz	L(FindZeroExit9)
> > -	test	$0x02, %ch
> > -	jnz	L(FindZeroExit10)
> > -	test	$0x04, %ch
> > -	jnz	L(FindZeroExit11)
> > -	and	$1 << 12 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(find_zero_high_8):
> > -	test	$0x10, %ch
> > -	jnz	L(FindZeroExit13)
> > -	test	$0x20, %ch
> > -	jnz	L(FindZeroExit14)
> > -	test	$0x40, %ch
> > -	jnz	L(FindZeroExit15)
> > -	and	$1 << 16 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit1):
> > -	and	$1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit2):
> > -	and	$1 << 2 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit3):
> > -	and	$1 << 3 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit5):
> > -	and	$1 << 5 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit6):
> > -	and	$1 << 6 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit7):
> > -	and	$1 << 7 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit9):
> > -	and	$1 << 9 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit10):
> > -	and	$1 << 10 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit11):
> > -	and	$1 << 11 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit13):
> > -	and	$1 << 13 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit14):
> > -	and	$1 << 14 - 1, %rax
> > -	jz	L(return_value)
> > -	jmp	L(match_exit)
> > -
> > -	.p2align 4
> > -L(FindZeroExit15):
> > -	and	$1 << 15 - 1, %rax
> > -	jz	L(return_value)
> > -
> > -	.p2align 4
> > -L(match_exit):
> > -	test	%ah, %ah
> > -	jnz	L(match_exit_high)
> > -	mov	%al, %dl
> > -	and	$15 << 4, %dl
> > -	jnz	L(match_exit_8)
> > -	test	$0x08, %al
> > -	jnz	L(Exit4)
> > -	test	$0x04, %al
> > -	jnz	L(Exit3)
> > -	test	$0x02, %al
> > -	jnz	L(Exit2)
> > -	lea	-16(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(match_exit_8):
> > -	test	$0x80, %al
> > -	jnz	L(Exit8)
> > -	test	$0x40, %al
> > -	jnz	L(Exit7)
> > -	test	$0x20, %al
> > -	jnz	L(Exit6)
> > -	lea	-12(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(match_exit_high):
> > -	mov	%ah, %dh
> > -	and	$15 << 4, %dh
> > -	jnz	L(match_exit_high_8)
> > -	test	$0x08, %ah
> > -	jnz	L(Exit12)
> > -	test	$0x04, %ah
> > -	jnz	L(Exit11)
> > -	test	$0x02, %ah
> > -	jnz	L(Exit10)
> > -	lea	-8(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(match_exit_high_8):
> > -	test	$0x80, %ah
> > -	jnz	L(Exit16)
> > -	test	$0x40, %ah
> > -	jnz	L(Exit15)
> > -	test	$0x20, %ah
> > -	jnz	L(Exit14)
> > -	lea	-4(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit2):
> > -	lea	-15(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit3):
> > -	lea	-14(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit4):
> > -	lea	-13(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit6):
> > -	lea	-11(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit7):
> > -	lea	-10(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit8):
> > -	lea	-9(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit10):
> > -	lea	-7(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit11):
> > -	lea	-6(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit12):
> > -	lea	-5(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit14):
> > -	lea	-3(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit15):
> > -	lea	-2(%rdi), %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(Exit16):
> > -	lea	-1(%rdi), %rax
> > -	ret
> > -
> > -/* Return NULL.  */
> > -	.p2align 4
> > -L(return_null):
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(prolog_find_zero):
> > -	add	%rcx, %rdi
> > -	mov     %rdx, %rcx
> > -L(prolog_find_zero_1):
> > -	test	%cl, %cl
> > -	jz	L(prolog_find_zero_high)
> > -	mov	%cl, %dl
> > -	and	$15, %dl
> > -	jz	L(prolog_find_zero_8)
> > -	test	$0x01, %cl
> > -	jnz	L(PrologFindZeroExit1)
> > -	test	$0x02, %cl
> > -	jnz	L(PrologFindZeroExit2)
> > -	test	$0x04, %cl
> > -	jnz	L(PrologFindZeroExit3)
> > -	and	$1 << 4 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(prolog_find_zero_8):
> > -	test	$0x10, %cl
> > -	jnz	L(PrologFindZeroExit5)
> > -	test	$0x20, %cl
> > -	jnz	L(PrologFindZeroExit6)
> > -	test	$0x40, %cl
> > -	jnz	L(PrologFindZeroExit7)
> > -	and	$1 << 8 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(prolog_find_zero_high):
> > -	mov	%ch, %dh
> > -	and	$15, %dh
> > -	jz	L(prolog_find_zero_high_8)
> > -	test	$0x01, %ch
> > -	jnz	L(PrologFindZeroExit9)
> > -	test	$0x02, %ch
> > -	jnz	L(PrologFindZeroExit10)
> > -	test	$0x04, %ch
> > -	jnz	L(PrologFindZeroExit11)
> > -	and	$1 << 12 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(prolog_find_zero_high_8):
> > -	test	$0x10, %ch
> > -	jnz	L(PrologFindZeroExit13)
> > -	test	$0x20, %ch
> > -	jnz	L(PrologFindZeroExit14)
> > -	test	$0x40, %ch
> > -	jnz	L(PrologFindZeroExit15)
> > -	and	$1 << 16 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit1):
> > -	and	$1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit2):
> > -	and	$1 << 2 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit3):
> > -	and	$1 << 3 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit5):
> > -	and	$1 << 5 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit6):
> > -	and	$1 << 6 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit7):
> > -	and	$1 << 7 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit9):
> > -	and	$1 << 9 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit10):
> > -	and	$1 << 10 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit11):
> > -	and	$1 << 11 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit13):
> > -	and	$1 << 13 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit14):
> > -	and	$1 << 14 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -	.p2align 4
> > -L(PrologFindZeroExit15):
> > -	and	$1 << 15 - 1, %rax
> > -	jnz	L(match_exit)
> > -	xor	%rax, %rax
> > -	ret
> > -
> > -END (__strrchr_sse2_no_bsf)
> > -#endif
> > diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
> > deleted file mode 100644
> > index 3f92a41..0000000
> > --- a/sysdeps/x86_64/multiarch/strrchr.S
> > +++ /dev/null
> > @@ -1,288 +0,0 @@
> > -/* Multiple versions of strrchr
> > -   All versions must be listed in ifunc-impl-list.c.
> > -   Copyright (C) 2009-2013 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <http://www.gnu.org/licenses/>.  */
> > -
> > -#include <sysdep.h>
> > -#include <init-arch.h>
> > -
> > -
> > -/* Define multiple versions only for the definition in libc and for
> > -   the DSO.  In static binaries we need strrchr before the initialization
> > -   happened.  */
> > -#if defined SHARED && !defined NOT_IN_libc
> > -	.text
> > -ENTRY(strrchr)
> > -	.type	strrchr, @gnu_indirect_function
> > -	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
> > -	jne	1f
> > -	call	__init_cpu_features
> > -1:	leaq	__strrchr_sse2(%rip), %rax
> > -	testl	$bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
> > -	jnz	2f
> > -	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> > -	jz	2f
> > -	leaq	__strrchr_sse42(%rip), %rax
> > -	ret
> > -2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
> > -	jz	3f
> > -	leaq    __strrchr_sse2_no_bsf(%rip), %rax
> > -3:	ret
> > -END(strrchr)
> > -
> > -/*
> > -   This implementation uses SSE4 instructions to compare up to 16 bytes
> > -   at a time looking for the last occurrence of the character c in the
> > -   string s:
> > -
> > -   char *strrchr (const char *s, int c);
> > -
> > -   We use 0x4a:
> > -	_SIDD_SBYTE_OPS
> > -	| _SIDD_CMP_EQUAL_EACH
> > -	| _SIDD_MOST_SIGNIFICANT
> > -   on pcmpistri to compare xmm/mem128
> > -
> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > -   X X X X X X X X X X X X X X X X
> > -
> > -   against xmm
> > -
> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
> > -   C C C C C C C C C C C C C C C C
> > -
> > -   to find out if the first 16byte data element has a byte C and the
> > -   last offset.  There are 4 cases:
> > -
> > -   1. The first 16byte data element has EOS and has the byte C at the
> > -      last offset X.
> > -   2. The first 16byte data element is valid and has the byte C at the
> > -      last offset X.
> > -   3. The first 16byte data element has EOS and doesn't have the byte C.
> > -   4. The first 16byte data element is valid and doesn't have the byte C.
> > -
> > -   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
> > -
> > -   case		ECX	CFlag	ZFlag	SFlag
> > -    1		 X	  1	  1	  0
> > -    2		 X	  1	  0	  0
> > -    3		16	  0	  1	  0
> > -    4		16	  0	  0	  0
> > -
> > -   We exit from the loop for cases 1 and 3 with jz which branches
> > -   when ZFlag is 1.  If CFlag == 1, ECX has the offset X for case 1.  */
> > -
> > -
> > -	.section .text.sse4.2,"ax",@progbits
> > -	.align	16
> > -	.type	__strrchr_sse42, @function
> > -	.globl	__strrchr_sse42
> > -	.hidden	__strrchr_sse42
> > -__strrchr_sse42:
> > -	cfi_startproc
> > -	CALL_MCOUNT
> > -	testb	%sil, %sil
> > -	je	__strend_sse4
> > -	xor	%eax,%eax	/* RAX has the last occurrence of s.  */
> > -	movd	%esi, %xmm1
> > -	punpcklbw	%xmm1, %xmm1
> > -	movl	%edi, %esi
> > -	punpcklbw	%xmm1, %xmm1
> > -	andl	$15, %esi
> > -	pshufd	$0, %xmm1, %xmm1
> > -	movq	%rdi, %r8
> > -	je	L(loop)
> > -
> > -/* Handle unaligned string using psrldq.  */
> > -	leaq	L(psrldq_table)(%rip), %rdx
> > -	andq	$-16, %r8
> > -	movslq	(%rdx,%rsi,4),%r9
> > -	movdqa	(%r8), %xmm0
> > -	addq	%rdx, %r9
> > -	jmp	*%r9
> > -
> > -/* Handle unaligned string with offset 1 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_1):
> > -	psrldq	$1, %xmm0
> > -
> > -	.p2align 4
> > -L(unaligned_pcmpistri):
> > -	pcmpistri	$0x4a, %xmm1, %xmm0
> > -	jnc	L(unaligned_no_byte)
> > -	leaq	(%rdi,%rcx), %rax
> > -L(unaligned_no_byte):
> > -	/* Find the length of the unaligned string.  */
> > -	pcmpistri	$0x3a, %xmm0, %xmm0
> > -	movl	$16, %edx
> > -	subl	%esi, %edx
> > -	cmpl	%ecx, %edx
> > -	/* Return RAX if the unaligned fragment to next 16B already
> > -	   contain the NULL terminator.  */
> > -	jg	L(exit)
> > -	addq	$16, %r8
> > -
> > -/* Loop start on aligned string.  */
> > -	.p2align 4
> > -L(loop):
> > -	pcmpistri	$0x4a, (%r8), %xmm1
> > -	jbe	L(match_or_eos)
> > -	addq	$16, %r8
> > -	jmp	L(loop)
> > -	.p2align 4
> > -L(match_or_eos):
> > -	je	L(had_eos)
> > -L(match_no_eos):
> > -	leaq	(%r8,%rcx), %rax
> > -	addq	$16, %r8
> > -	jmp     L(loop)
> > -	.p2align 4
> > -L(had_eos):
> > -	jnc     L(exit)
> > -	leaq	(%r8,%rcx), %rax
> > -	.p2align 4
> > -L(exit):
> > -	ret
> > -
> > -/* Handle unaligned string with offset 15 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_15):
> > -	psrldq	$15, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 14 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_14):
> > -	psrldq	$14, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 13 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_13):
> > -	psrldq	$13, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 12 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_12):
> > -	psrldq	$12, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 11 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_11):
> > -	psrldq	$11, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 10 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_10):
> > -	psrldq	$10, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 9 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_9):
> > -	psrldq	$9, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 8 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_8):
> > -	psrldq	$8, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 7 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_7):
> > -	psrldq	$7, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 6 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_6):
> > -	psrldq	$6, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 5 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_5):
> > -	psrldq	$5, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 4 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_4):
> > -	psrldq	$4, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 3 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_3):
> > -	psrldq	$3, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -/* Handle unaligned string with offset 2 using psrldq.  */
> > -	.p2align 4
> > -L(psrldq_2):
> > -	psrldq	$2, %xmm0
> > -	jmp	L(unaligned_pcmpistri)
> > -
> > -	cfi_endproc
> > -	.size	__strrchr_sse42, .-__strrchr_sse42
> > -
> > -	.section .rodata.sse4.2,"a",@progbits
> > -	.p2align 4
> > -L(psrldq_table):
> > -	.int	L(loop) - L(psrldq_table)
> > -	.int	L(psrldq_1) - L(psrldq_table)
> > -	.int	L(psrldq_2) - L(psrldq_table)
> > -	.int	L(psrldq_3) - L(psrldq_table)
> > -	.int	L(psrldq_4) - L(psrldq_table)
> > -	.int	L(psrldq_5) - L(psrldq_table)
> > -	.int	L(psrldq_6) - L(psrldq_table)
> > -	.int	L(psrldq_7) - L(psrldq_table)
> > -	.int	L(psrldq_8) - L(psrldq_table)
> > -	.int	L(psrldq_9) - L(psrldq_table)
> > -	.int	L(psrldq_10) - L(psrldq_table)
> > -	.int	L(psrldq_11) - L(psrldq_table)
> > -	.int	L(psrldq_12) - L(psrldq_table)
> > -	.int	L(psrldq_13) - L(psrldq_table)
> > -	.int	L(psrldq_14) - L(psrldq_table)
> > -	.int	L(psrldq_15) - L(psrldq_table)
> > -
> > -
> > -# undef ENTRY
> > -# define ENTRY(name) \
> > -	.type __strrchr_sse2, @function; \
> > -	.align 16; \
> > -	.globl __strrchr_sse2; \
> > -	.hidden __strrchr_sse2; \
> > -	__strrchr_sse2: cfi_startproc; \
> > -	CALL_MCOUNT
> > -# undef END
> > -# define END(name) \
> > -	cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
> > -# undef libc_hidden_builtin_def
> > -/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
> > -   The speedup we get from using SSE4.2 instruction is likely eaten away
> > -   by the indirect call in the PLT.  */
> > -# define libc_hidden_builtin_def(name) \
> > -	.globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
> > -#endif
> > -
> > -#include "../strrchr.S"
> > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> > index e413b07..22c83bc 100644
> > --- a/sysdeps/x86_64/strrchr.S
> > +++ b/sysdeps/x86_64/strrchr.S
> > @@ -1,6 +1,6 @@
> > -/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
> > -   For AMD x86-64.
> > -   Copyright (C) 2009-2013 Free Software Foundation, Inc.
> > +/* strrchr with SSE2 without bsf and bsr
> > +   Copyright (C) 2011-2013 Free Software Foundation, Inc.
> > +   Contributed by Intel Corporation.
> >     This file is part of the GNU C Library.
> >  
> >     The GNU C Library is free software; you can redistribute it and/or
> > @@ -17,63 +17,216 @@
> >     License along with the GNU C Library; if not, see
> >     <http://www.gnu.org/licenses/>.  */
> >  
> > +
> >  #include <sysdep.h>
> > +#include "asm-syntax.h"
> > +
> > +# ifndef ALIGN
> > +#  define ALIGN(n)	.p2align n
> > +# endif
> > +
> >  
> >  
> > -	.text
> >  ENTRY (strrchr)
> > +
> >  	movd	%esi, %xmm1
> > -	movq	%rdi, %rcx
> > -	punpcklbw %xmm1, %xmm1
> > -	andq	$~15, %rdi
> > -	pxor	%xmm2, %xmm2
> > -	punpcklbw %xmm1, %xmm1
> > -	orl	$0xffffffff, %esi
> > -	movdqa	(%rdi), %xmm0
> > +	movq	%rdi, %rax
> > +	andl	$4095, %eax
> > +	punpcklbw	%xmm1, %xmm1
> > +	cmpq	$4032, %rax
> > +	punpcklwd	%xmm1, %xmm1
> >  	pshufd	$0, %xmm1, %xmm1
> > -	subq	%rdi, %rcx
> > +	ja	L(cross_page)
> > +	movdqu	(%rdi), %xmm0
> > +	pxor	%xmm2, %xmm2
> >  	movdqa	%xmm0, %xmm3
> > -	leaq	16(%rdi), %rdi
> >  	pcmpeqb	%xmm1, %xmm0
> >  	pcmpeqb	%xmm2, %xmm3
> > -	shl	%cl, %esi
> > -	pmovmskb %xmm0, %edx
> > -	pmovmskb %xmm3, %ecx
> > -	andl	%esi, %edx
> > -	andl	%esi, %ecx
> > -	xorl	%eax, %eax
> > -	movl	%edx, %esi
> > -	orl	%ecx, %esi
> > -	jnz	1f
> > -
> > -2:	movdqa	(%rdi), %xmm0
> > -	leaq	16(%rdi), %rdi
> > -	movdqa	%xmm0, %xmm3
> > +	pmovmskb	%xmm0, %ecx
> > +	pmovmskb	%xmm3, %edx
> > +	testq	%rdx, %rdx
> > +	je	L(next_48_bytes)
> > +	leaq	-1(%rdx), %rax
> > +	xorq	%rdx, %rax
> > +	andq	%rcx, %rax
> > +	je	L(exit)
> > +	bsrq	%rax, %rax
> > +	addq	%rdi, %rax
> > +	ret
> > +	ALIGN(4)
> > +L(next_48_bytes):
> > +	movdqu	16(%rdi), %xmm4
> > +	movdqa	%xmm4, %xmm5
> > +	movdqu	32(%rdi), %xmm3
> > +	pcmpeqb	%xmm1, %xmm4
> > +	pcmpeqb	%xmm2, %xmm5
> > +	movdqu	48(%rdi), %xmm0
> > +	pmovmskb	%xmm5, %edx
> > +	movdqa	%xmm3, %xmm5
> > +	pcmpeqb	%xmm1, %xmm3
> > +	pcmpeqb	%xmm2, %xmm5
> > +	pcmpeqb	%xmm0, %xmm2
> > +	salq	$16, %rdx
> > +	pmovmskb	%xmm3, %r8d
> > +	pmovmskb	%xmm5, %eax
> > +	pmovmskb	%xmm2, %esi
> > +	salq	$32, %r8
> > +	salq	$32, %rax
> >  	pcmpeqb	%xmm1, %xmm0
> > -	pcmpeqb	%xmm2, %xmm3
> > -	pmovmskb %xmm0, %edx
> > -	pmovmskb %xmm3, %ecx
> > -	movl	%edx, %esi
> > -	orl	%ecx, %esi
> > -	jz	2b
> > +	orq	%rdx, %rax
> > +	movq	%rsi, %rdx
> > +	pmovmskb	%xmm4, %esi
> > +	salq	$48, %rdx
> > +	salq	$16, %rsi
> > +	orq	%r8, %rsi
> > +	orq	%rcx, %rsi
> > +	pmovmskb	%xmm0, %ecx
> > +	salq	$48, %rcx
> > +	orq	%rcx, %rsi
> > +	orq	%rdx, %rax
> > +	je	L(loop_header2)
> > +	leaq	-1(%rax), %rcx
> > +	xorq	%rax, %rcx
> > +	andq	%rcx, %rsi
> > +	je	L(exit)
> > +	bsrq	%rsi, %rsi
> > +	leaq	(%rdi,%rsi), %rax
> > +	ret
> >  
> > -1:	bsfl	%ecx, %r9d
> > -	movl	$0xffffffff, %r8d
> > -	movl	$31, %ecx
> > -	jnz	5f
> > +	ALIGN(4)
> > +L(loop_header2):
> > +	testq	%rsi, %rsi
> > +	movq	%rdi, %rcx
> > +	je	L(no_c_found)
> > +L(loop_header):
> > +	addq	$64, %rdi
> > +	pxor	%xmm7, %xmm7
> > +	andq	$-64, %rdi
> > +	jmp	L(loop_entry)
> > +	ALIGN(4)
> > +L(loop64):
> > +	testq	%rdx, %rdx
> > +	cmovne	%rdx, %rsi
> > +	cmovne	%rdi, %rcx
> > +	addq	$64, %rdi
> > +L(loop_entry):
> > +	movdqa	32(%rdi), %xmm3
> > +	pxor	%xmm6, %xmm6
> > +	movdqa	48(%rdi), %xmm2
> > +	movdqa	%xmm3, %xmm0
> > +	movdqa	16(%rdi), %xmm4
> > +	pminub	%xmm2, %xmm0
> > +	movdqa	(%rdi), %xmm5
> > +	pminub	%xmm4, %xmm0
> > +	pminub	%xmm5, %xmm0
> > +	pcmpeqb	%xmm7, %xmm0
> > +	pmovmskb	%xmm0, %eax
> > +	movdqa	%xmm5, %xmm0
> > +	pcmpeqb	%xmm1, %xmm0
> > +	pmovmskb	%xmm0, %r9d
> > +	movdqa	%xmm4, %xmm0
> > +	pcmpeqb	%xmm1, %xmm0
> > +	pmovmskb	%xmm0, %edx
> > +	movdqa	%xmm3, %xmm0
> > +	pcmpeqb	%xmm1, %xmm0
> > +	salq	$16, %rdx
> > +	pmovmskb	%xmm0, %r10d
> > +	movdqa	%xmm2, %xmm0
> > +	pcmpeqb	%xmm1, %xmm0
> > +	salq	$32, %r10
> > +	orq	%r10, %rdx
> > +	pmovmskb	%xmm0, %r8d
> > +	orq	%r9, %rdx
> > +	salq	$48, %r8
> > +	orq	%r8, %rdx
> > +	testl	%eax, %eax
> > +	je	L(loop64)
> > +	pcmpeqb	%xmm6, %xmm4
> > +	pcmpeqb	%xmm6, %xmm3
> > +	pcmpeqb	%xmm6, %xmm5
> > +	pmovmskb	%xmm4, %eax
> > +	pmovmskb	%xmm3, %r10d
> > +	pcmpeqb	%xmm6, %xmm2
> > +	pmovmskb	%xmm5, %r9d
> > +	salq	$32, %r10
> > +	salq	$16, %rax
> > +	pmovmskb	%xmm2, %r8d
> > +	orq	%r10, %rax
> > +	orq	%r9, %rax
> > +	salq	$48, %r8
> > +	orq	%r8, %rax
> > +	leaq	-1(%rax), %r8
> > +	xorq	%rax, %r8
> > +	andq	%r8, %rdx
> > +	cmovne	%rdi, %rcx
> > +	cmovne	%rdx, %rsi
> > +	bsrq	%rsi, %rsi
> > +	leaq	(%rcx,%rsi), %rax
> > +	ret
> > +	ALIGN(4)
> > +L(no_c_found):
> > +	movl	$1, %esi
> > +	xorl	%ecx, %ecx
> > +	jmp	L(loop_header)
> > +	ALIGN(4)
> > +L(exit):
> > +	xorl	%eax, %eax
> > +	ret
> > +	ALIGN(4)
> >  
> > -	bsrl	%edx, %edx
> > -	jz	2b
> > -	leaq	-16(%rdi,%rdx), %rax
> > -	jmp	2b
> > +L(cross_page):
> > +	movq	%rdi, %rax
> > +	pxor	%xmm0, %xmm0
> > +	andq	$-64, %rax
> > +	movdqu	(%rax), %xmm5
> > +	movdqa	%xmm5, %xmm6
> > +	movdqu	16(%rax), %xmm4
> > +	pcmpeqb	%xmm1, %xmm5
> > +	pcmpeqb	%xmm0, %xmm6
> > +	movdqu	32(%rax), %xmm3
> > +	pmovmskb	%xmm6, %esi
> > +	movdqa	%xmm4, %xmm6
> > +	movdqu	48(%rax), %xmm2
> > +	pcmpeqb	%xmm1, %xmm4
> > +	pcmpeqb	%xmm0, %xmm6
> > +	pmovmskb	%xmm6, %edx
> > +	movdqa	%xmm3, %xmm6
> > +	pcmpeqb	%xmm1, %xmm3
> > +	pcmpeqb	%xmm0, %xmm6
> > +	pcmpeqb	%xmm2, %xmm0
> > +	salq	$16, %rdx
> > +	pmovmskb	%xmm3, %r9d
> > +	pmovmskb	%xmm6, %r8d
> > +	pmovmskb	%xmm0, %ecx
> > +	salq	$32, %r9
> > +	salq	$32, %r8
> > +	pcmpeqb	%xmm1, %xmm2
> > +	orq	%r8, %rdx
> > +	salq	$48, %rcx
> > +	pmovmskb	%xmm5, %r8d
> > +	orq	%rsi, %rdx
> > +	pmovmskb	%xmm4, %esi
> > +	orq	%rcx, %rdx
> > +	pmovmskb	%xmm2, %ecx
> > +	salq	$16, %rsi
> > +	salq	$48, %rcx
> > +	orq	%r9, %rsi
> > +	orq	%r8, %rsi
> > +	orq	%rcx, %rsi
> > +	movl	%edi, %ecx
> > +	subl	%eax, %ecx
> > +	shrq	%cl, %rdx
> > +	shrq	%cl, %rsi
> > +	testq	%rdx, %rdx
> > +	je	L(loop_header2)
> > +	leaq	-1(%rdx), %rax
> > +	xorq	%rdx, %rax
> > +	andq	%rax, %rsi
> > +	je	L(exit)
> > +	bsrq	%rsi, %rax
> > +	addq	%rdi, %rax
> > +	ret
> >  
> > -5:	subl	%r9d, %ecx
> > -	shrl	%cl, %r8d
> > -	andl	%r8d, %edx
> > -	bsrl	%edx, %edx
> > -	jz	4f
> > -	leaq	-16(%rdi,%rdx), %rax
> > -4:	ret
> >  END (strrchr)
> >  
> >  weak_alias (strrchr, rindex)
> > -- 
> > 1.8.3.2
> > 
> 
> -- 
> 
> the AA battery in the wallclock sends magnetic interference

-- 

terrorist activities


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]