This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PING] [PATCH v2] Fix strrchr regression.


It looks good to me except for several small issues.

You mentioned Intel Corporation. :)

> +/* strrchr with SSE2 without bsf and bsr
> +   Copyright (C) 2011-2013 Free Software Foundation, Inc.
> +   Contributed by Intel Corporation.

And what was the purpose of removing .text?
> -     .text


I don't see what the purpose of including "asm-syntax.h" is.
> +#include "asm-syntax.h"

--
Liubov
Intel Corporation

On Mon, Sep 2, 2013 at 1:20 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> Ping.
>
> On Mon, Aug 26, 2013 at 03:46:49PM +0200, Ondřej Bílka wrote:
>> Ping,
>>
>> Data for atom and silvermont are now available, and it is an improvement
>> there.
>>
>> http://kam.mff.cuni.cz/~ondra/benchmark_string/atom/strrchr_profile/results_gcc/result.html
>>
>>
>> On Fri, Aug 16, 2013 at 02:14:57PM +0200, Ondřej Bílka wrote:
>> >
>> > Here is the correct version
>> >
>> > > On Mon, Aug 05, 2013 at 07:33:46PM +0200, Ondřej Bílka wrote:
>> > > > Hi,
>> > > >
>> > > ...
>> > > > To get reliable results I added strrchr to my profiler. You can clearly
>> > > > see asymptotic behavior of strrchr here.
>> > > >
>> > Hi, according to profiling feedback I tuned the strrchr implementation
>> > into a much faster one.
>> >
>> > The results at:
>> > http://kam.mff.cuni.cz/~ondra/benchmark_string/strrchr_profile.html
>> > are updated, and the benchmark program is here:
>> > http://kam.mff.cuni.cz/~ondra/benchmark_string/strrchr_profile160813.tar.bz2
>> >
>> > I do not cover adding an avx2 implementation, which will be posted
>> > separately. A considerable speedup is possible there.
>> > http://kam.mff.cuni.cz/~ondra/benchmark_string/haswell/strrchr_profile/results_rand/result.html
>> >
>> > I need to retest it on atom and silvermont to verify how much I
>> > improved.
>> >
>> > Passes tests, OK to commit?
>> >
>> > Ondra
>> >
>> >     * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update.
>> >     * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Remove strrchr ifunc.
>> >     * sysdeps/x86_64/multiarch/strend-sse4.S: Remove.
>> >     * sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S Likewise.
>> >     * sysdeps/x86_64/multiarch/strrchr.S: Likewise.
>> >     * sysdeps/x86_64/strrchr.S (strrchr): Use optimized implementation.
>> >
>> > ---
>> >  sysdeps/x86_64/multiarch/Makefile              |   4 +-
>> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c     |   6 -
>> >  sysdeps/x86_64/multiarch/strend-sse4.S         |  48 ---
>> >  sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S | 555 -------------------------
>> >  sysdeps/x86_64/multiarch/strrchr.S             | 288 -------------
>> >  sysdeps/x86_64/strrchr.S                       | 247 ++++++++---
>> >  6 files changed, 202 insertions(+), 946 deletions(-)
>> >  delete mode 100644 sysdeps/x86_64/multiarch/strend-sse4.S
>> >  delete mode 100644 sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
>> >  delete mode 100644 sysdeps/x86_64/multiarch/strrchr.S
>> >
>> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
>> > index 203d16e..b99e8d1 100644
>> > --- a/sysdeps/x86_64/multiarch/Makefile
>> > +++ b/sysdeps/x86_64/multiarch/Makefile
>> > @@ -7,7 +7,7 @@ endif
>> >  ifeq ($(subdir),string)
>> >
>> >  sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
>> > -              strend-sse4 memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
>> > +              memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned mempcpy-ssse3 \
>> >                memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>> >                memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
>> >                strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>> > @@ -15,7 +15,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
>> >                strcpy-sse2-unaligned strncpy-sse2-unaligned \
>> >                stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
>> >                strcat-sse2-unaligned strncat-sse2-unaligned \
>> > -              strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
>> > +              strchr-sse2-no-bsf memcmp-ssse3
>> >  ifeq (yes,$(config-cflags-sse4))
>> >  sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
>> >  CFLAGS-varshift.c += -msse4
>> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> > index 8486294..dfef9e7 100644
>> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> > @@ -181,12 +181,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>> >                           __strpbrk_sse42)
>> >           IFUNC_IMPL_ADD (array, i, strpbrk, 1, __strpbrk_sse2))
>> >
>> > -  /* Support sysdeps/x86_64/multiarch/strrchr.S.  */
>> > -  IFUNC_IMPL (i, name, strrchr,
>> > -         IFUNC_IMPL_ADD (array, i, strrchr, HAS_SSE4_2,
>> > -                         __strrchr_sse42)
>> > -         IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2_no_bsf)
>> > -         IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
>> >
>> >    /* Support sysdeps/x86_64/multiarch/strspn.S.  */
>> >    IFUNC_IMPL (i, name, strspn,
>> > diff --git a/sysdeps/x86_64/multiarch/strend-sse4.S b/sysdeps/x86_64/multiarch/strend-sse4.S
>> > deleted file mode 100644
>> > index c5a7ae2..0000000
>> > --- a/sysdeps/x86_64/multiarch/strend-sse4.S
>> > +++ /dev/null
>> > @@ -1,48 +0,0 @@
>> > -/* Return the pointer to the end of string, using SSE4.2
>> > -   Copyright (C) 2009-2013 Free Software Foundation, Inc.
>> > -   Contributed by Intel Corporation.
>> > -   This file is part of the GNU C Library.
>> > -
>> > -   The GNU C Library is free software; you can redistribute it and/or
>> > -   modify it under the terms of the GNU Lesser General Public
>> > -   License as published by the Free Software Foundation; either
>> > -   version 2.1 of the License, or (at your option) any later version.
>> > -
>> > -   The GNU C Library is distributed in the hope that it will be useful,
>> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> > -   Lesser General Public License for more details.
>> > -
>> > -   You should have received a copy of the GNU Lesser General Public
>> > -   License along with the GNU C Library; if not, see
>> > -   <http://www.gnu.org/licenses/>.  */
>> > -
>> > -#include <sysdep.h>
>> > -#include "asm-syntax.h"
>> > -
>> > -   .section .text.sse4.2,"ax",@progbits
>> > -ENTRY (__strend_sse4)
>> > -   pxor    %xmm2, %xmm2
>> > -   movq    %rdi, %rcx
>> > -   andq    $~15, %rdi
>> > -   movdqa  %xmm2, %xmm1
>> > -   pcmpeqb (%rdi), %xmm2
>> > -   orl     $0xffffffff, %esi
>> > -   subq    %rdi, %rcx
>> > -   shll    %cl, %esi
>> > -   pmovmskb %xmm2, %edx
>> > -   andl    %esi, %edx
>> > -   jnz     1f
>> > -
>> > -2: pcmpistri $0x08, 16(%rdi), %xmm1
>> > -   leaq    16(%rdi), %rdi
>> > -   jnz     2b
>> > -
>> > -   leaq    (%rdi,%rcx), %rax
>> > -   ret
>> > -
>> > -1: bsfl    %edx, %eax
>> > -   addq    %rdi, %rax
>> > -   ret
>> > -
>> > -END (__strend_sse4)
>> > diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
>> > deleted file mode 100644
>> > index fcef610..0000000
>> > --- a/sysdeps/x86_64/multiarch/strrchr-sse2-no-bsf.S
>> > +++ /dev/null
>> > @@ -1,555 +0,0 @@
>> > -/* strrchr with SSE2 without bsf and bsr
>> > -   Copyright (C) 2011-2013 Free Software Foundation, Inc.
>> > -   Contributed by Intel Corporation.
>> > -   This file is part of the GNU C Library.
>> > -
>> > -   The GNU C Library is free software; you can redistribute it and/or
>> > -   modify it under the terms of the GNU Lesser General Public
>> > -   License as published by the Free Software Foundation; either
>> > -   version 2.1 of the License, or (at your option) any later version.
>> > -
>> > -   The GNU C Library is distributed in the hope that it will be useful,
>> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> > -   Lesser General Public License for more details.
>> > -
>> > -   You should have received a copy of the GNU Lesser General Public
>> > -   License along with the GNU C Library; if not, see
>> > -   <http://www.gnu.org/licenses/>.  */
>> > -
>> > -#if defined SHARED && !defined NOT_IN_libc
>> > -
>> > -# include <sysdep.h>
>> > -# include "asm-syntax.h"
>> > -
>> > -   atom_text_section
>> > -ENTRY (__strrchr_sse2_no_bsf)
>> > -
>> > -   movd    %rsi, %xmm1
>> > -   pxor    %xmm2, %xmm2
>> > -   mov     %rdi, %rcx
>> > -   punpcklbw %xmm1, %xmm1
>> > -   punpcklbw %xmm1, %xmm1
>> > -   /* ECX has OFFSET. */
>> > -   and     $63, %rcx
>> > -   cmp     $48, %rcx
>> > -   pshufd  $0, %xmm1, %xmm1
>> > -   ja      L(crosscache)
>> > -
>> > -/* unaligned string. */
>> > -   movdqu  (%rdi), %xmm0
>> > -   pcmpeqb %xmm0, %xmm2
>> > -   pcmpeqb %xmm1, %xmm0
>> > -   /* Find where NULL is.  */
>> > -   pmovmskb %xmm2, %rcx
>> > -   /* Check if there is a match.  */
>> > -   pmovmskb %xmm0, %rax
>> > -   add     $16, %rdi
>> > -
>> > -   test    %rax, %rax
>> > -   jnz     L(unaligned_match1)
>> > -
>> > -   test    %rcx, %rcx
>> > -   jnz     L(return_null)
>> > -
>> > -   and     $-16, %rdi
>> > -   xor     %r8, %r8
>> > -   jmp     L(loop)
>> > -
>> > -   .p2align 4
>> > -L(unaligned_match1):
>> > -   test    %rcx, %rcx
>> > -   jnz     L(prolog_find_zero_1)
>> > -
>> > -   mov     %rax, %r8
>> > -   mov     %rdi, %rsi
>> > -   and     $-16, %rdi
>> > -   jmp     L(loop)
>> > -
>> > -   .p2align 4
>> > -L(crosscache):
>> > -/* Hancle unaligned string.  */
>> > -   and     $15, %rcx
>> > -   and     $-16, %rdi
>> > -   pxor    %xmm3, %xmm3
>> > -   movdqa  (%rdi), %xmm0
>> > -   pcmpeqb %xmm0, %xmm3
>> > -   pcmpeqb %xmm1, %xmm0
>> > -   /* Find where NULL is.  */
>> > -   pmovmskb %xmm3, %rdx
>> > -   /* Check if there is a match.  */
>> > -   pmovmskb %xmm0, %rax
>> > -   /* Remove the leading bytes.  */
>> > -   shr     %cl, %rdx
>> > -   shr     %cl, %rax
>> > -   add     $16, %rdi
>> > -
>> > -   test    %rax, %rax
>> > -   jnz     L(unaligned_match)
>> > -
>> > -   test    %rdx, %rdx
>> > -   jnz     L(return_null)
>> > -
>> > -   xor     %r8, %r8
>> > -   jmp     L(loop)
>> > -
>> > -   .p2align 4
>> > -L(unaligned_match):
>> > -   test    %rdx, %rdx
>> > -   jnz     L(prolog_find_zero)
>> > -
>> > -   mov     %rax, %r8
>> > -   lea     (%rdi, %rcx), %rsi
>> > -
>> > -/* Loop start on aligned string.  */
>> > -   .p2align 4
>> > -L(loop):
>> > -   movdqa  (%rdi), %xmm0
>> > -   pcmpeqb %xmm0, %xmm2
>> > -   add     $16, %rdi
>> > -   pcmpeqb %xmm1, %xmm0
>> > -   pmovmskb %xmm2, %rcx
>> > -   pmovmskb %xmm0, %rax
>> > -   or      %rax, %rcx
>> > -   jnz     L(matches)
>> > -
>> > -   movdqa  (%rdi), %xmm0
>> > -   pcmpeqb %xmm0, %xmm2
>> > -   add     $16, %rdi
>> > -   pcmpeqb %xmm1, %xmm0
>> > -   pmovmskb %xmm2, %rcx
>> > -   pmovmskb %xmm0, %rax
>> > -   or      %rax, %rcx
>> > -   jnz     L(matches)
>> > -
>> > -   movdqa  (%rdi), %xmm0
>> > -   pcmpeqb %xmm0, %xmm2
>> > -   add     $16, %rdi
>> > -   pcmpeqb %xmm1, %xmm0
>> > -   pmovmskb %xmm2, %rcx
>> > -   pmovmskb %xmm0, %rax
>> > -   or      %rax, %rcx
>> > -   jnz     L(matches)
>> > -
>> > -   movdqa  (%rdi), %xmm0
>> > -   pcmpeqb %xmm0, %xmm2
>> > -   add     $16, %rdi
>> > -   pcmpeqb %xmm1, %xmm0
>> > -   pmovmskb %xmm2, %rcx
>> > -   pmovmskb %xmm0, %rax
>> > -   or      %rax, %rcx
>> > -   jz      L(loop)
>> > -
>> > -L(matches):
>> > -   test    %rax, %rax
>> > -   jnz     L(match)
>> > -L(return_value):
>> > -   test    %r8, %r8
>> > -   jz      L(return_null)
>> > -   mov     %r8, %rax
>> > -   mov     %rsi, %rdi
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(match):
>> > -   pmovmskb %xmm2, %rcx
>> > -   test    %rcx, %rcx
>> > -   jnz     L(find_zero)
>> > -   mov     %rax, %r8
>> > -   mov     %rdi, %rsi
>> > -   jmp     L(loop)
>> > -
>> > -   .p2align 4
>> > -L(find_zero):
>> > -   test    %cl, %cl
>> > -   jz      L(find_zero_high)
>> > -   mov     %cl, %dl
>> > -   and     $15, %dl
>> > -   jz      L(find_zero_8)
>> > -   test    $0x01, %cl
>> > -   jnz     L(FindZeroExit1)
>> > -   test    $0x02, %cl
>> > -   jnz     L(FindZeroExit2)
>> > -   test    $0x04, %cl
>> > -   jnz     L(FindZeroExit3)
>> > -   and     $1 << 4 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(find_zero_8):
>> > -   test    $0x10, %cl
>> > -   jnz     L(FindZeroExit5)
>> > -   test    $0x20, %cl
>> > -   jnz     L(FindZeroExit6)
>> > -   test    $0x40, %cl
>> > -   jnz     L(FindZeroExit7)
>> > -   and     $1 << 8 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(find_zero_high):
>> > -   mov     %ch, %dh
>> > -   and     $15, %dh
>> > -   jz      L(find_zero_high_8)
>> > -   test    $0x01, %ch
>> > -   jnz     L(FindZeroExit9)
>> > -   test    $0x02, %ch
>> > -   jnz     L(FindZeroExit10)
>> > -   test    $0x04, %ch
>> > -   jnz     L(FindZeroExit11)
>> > -   and     $1 << 12 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(find_zero_high_8):
>> > -   test    $0x10, %ch
>> > -   jnz     L(FindZeroExit13)
>> > -   test    $0x20, %ch
>> > -   jnz     L(FindZeroExit14)
>> > -   test    $0x40, %ch
>> > -   jnz     L(FindZeroExit15)
>> > -   and     $1 << 16 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit1):
>> > -   and     $1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit2):
>> > -   and     $1 << 2 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit3):
>> > -   and     $1 << 3 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit5):
>> > -   and     $1 << 5 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit6):
>> > -   and     $1 << 6 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit7):
>> > -   and     $1 << 7 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit9):
>> > -   and     $1 << 9 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit10):
>> > -   and     $1 << 10 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit11):
>> > -   and     $1 << 11 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit13):
>> > -   and     $1 << 13 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit14):
>> > -   and     $1 << 14 - 1, %rax
>> > -   jz      L(return_value)
>> > -   jmp     L(match_exit)
>> > -
>> > -   .p2align 4
>> > -L(FindZeroExit15):
>> > -   and     $1 << 15 - 1, %rax
>> > -   jz      L(return_value)
>> > -
>> > -   .p2align 4
>> > -L(match_exit):
>> > -   test    %ah, %ah
>> > -   jnz     L(match_exit_high)
>> > -   mov     %al, %dl
>> > -   and     $15 << 4, %dl
>> > -   jnz     L(match_exit_8)
>> > -   test    $0x08, %al
>> > -   jnz     L(Exit4)
>> > -   test    $0x04, %al
>> > -   jnz     L(Exit3)
>> > -   test    $0x02, %al
>> > -   jnz     L(Exit2)
>> > -   lea     -16(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(match_exit_8):
>> > -   test    $0x80, %al
>> > -   jnz     L(Exit8)
>> > -   test    $0x40, %al
>> > -   jnz     L(Exit7)
>> > -   test    $0x20, %al
>> > -   jnz     L(Exit6)
>> > -   lea     -12(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(match_exit_high):
>> > -   mov     %ah, %dh
>> > -   and     $15 << 4, %dh
>> > -   jnz     L(match_exit_high_8)
>> > -   test    $0x08, %ah
>> > -   jnz     L(Exit12)
>> > -   test    $0x04, %ah
>> > -   jnz     L(Exit11)
>> > -   test    $0x02, %ah
>> > -   jnz     L(Exit10)
>> > -   lea     -8(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(match_exit_high_8):
>> > -   test    $0x80, %ah
>> > -   jnz     L(Exit16)
>> > -   test    $0x40, %ah
>> > -   jnz     L(Exit15)
>> > -   test    $0x20, %ah
>> > -   jnz     L(Exit14)
>> > -   lea     -4(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit2):
>> > -   lea     -15(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit3):
>> > -   lea     -14(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit4):
>> > -   lea     -13(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit6):
>> > -   lea     -11(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit7):
>> > -   lea     -10(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit8):
>> > -   lea     -9(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit10):
>> > -   lea     -7(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit11):
>> > -   lea     -6(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit12):
>> > -   lea     -5(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit14):
>> > -   lea     -3(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit15):
>> > -   lea     -2(%rdi), %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(Exit16):
>> > -   lea     -1(%rdi), %rax
>> > -   ret
>> > -
>> > -/* Return NULL.  */
>> > -   .p2align 4
>> > -L(return_null):
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(prolog_find_zero):
>> > -   add     %rcx, %rdi
>> > -   mov     %rdx, %rcx
>> > -L(prolog_find_zero_1):
>> > -   test    %cl, %cl
>> > -   jz      L(prolog_find_zero_high)
>> > -   mov     %cl, %dl
>> > -   and     $15, %dl
>> > -   jz      L(prolog_find_zero_8)
>> > -   test    $0x01, %cl
>> > -   jnz     L(PrologFindZeroExit1)
>> > -   test    $0x02, %cl
>> > -   jnz     L(PrologFindZeroExit2)
>> > -   test    $0x04, %cl
>> > -   jnz     L(PrologFindZeroExit3)
>> > -   and     $1 << 4 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(prolog_find_zero_8):
>> > -   test    $0x10, %cl
>> > -   jnz     L(PrologFindZeroExit5)
>> > -   test    $0x20, %cl
>> > -   jnz     L(PrologFindZeroExit6)
>> > -   test    $0x40, %cl
>> > -   jnz     L(PrologFindZeroExit7)
>> > -   and     $1 << 8 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(prolog_find_zero_high):
>> > -   mov     %ch, %dh
>> > -   and     $15, %dh
>> > -   jz      L(prolog_find_zero_high_8)
>> > -   test    $0x01, %ch
>> > -   jnz     L(PrologFindZeroExit9)
>> > -   test    $0x02, %ch
>> > -   jnz     L(PrologFindZeroExit10)
>> > -   test    $0x04, %ch
>> > -   jnz     L(PrologFindZeroExit11)
>> > -   and     $1 << 12 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(prolog_find_zero_high_8):
>> > -   test    $0x10, %ch
>> > -   jnz     L(PrologFindZeroExit13)
>> > -   test    $0x20, %ch
>> > -   jnz     L(PrologFindZeroExit14)
>> > -   test    $0x40, %ch
>> > -   jnz     L(PrologFindZeroExit15)
>> > -   and     $1 << 16 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit1):
>> > -   and     $1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit2):
>> > -   and     $1 << 2 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit3):
>> > -   and     $1 << 3 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit5):
>> > -   and     $1 << 5 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit6):
>> > -   and     $1 << 6 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit7):
>> > -   and     $1 << 7 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit9):
>> > -   and     $1 << 9 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit10):
>> > -   and     $1 << 10 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit11):
>> > -   and     $1 << 11 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit13):
>> > -   and     $1 << 13 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit14):
>> > -   and     $1 << 14 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -   .p2align 4
>> > -L(PrologFindZeroExit15):
>> > -   and     $1 << 15 - 1, %rax
>> > -   jnz     L(match_exit)
>> > -   xor     %rax, %rax
>> > -   ret
>> > -
>> > -END (__strrchr_sse2_no_bsf)
>> > -#endif
>> > diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
>> > deleted file mode 100644
>> > index 3f92a41..0000000
>> > --- a/sysdeps/x86_64/multiarch/strrchr.S
>> > +++ /dev/null
>> > @@ -1,288 +0,0 @@
>> > -/* Multiple versions of strrchr
>> > -   All versions must be listed in ifunc-impl-list.c.
>> > -   Copyright (C) 2009-2013 Free Software Foundation, Inc.
>> > -   This file is part of the GNU C Library.
>> > -
>> > -   The GNU C Library is free software; you can redistribute it and/or
>> > -   modify it under the terms of the GNU Lesser General Public
>> > -   License as published by the Free Software Foundation; either
>> > -   version 2.1 of the License, or (at your option) any later version.
>> > -
>> > -   The GNU C Library is distributed in the hope that it will be useful,
>> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> > -   Lesser General Public License for more details.
>> > -
>> > -   You should have received a copy of the GNU Lesser General Public
>> > -   License along with the GNU C Library; if not, see
>> > -   <http://www.gnu.org/licenses/>.  */
>> > -
>> > -#include <sysdep.h>
>> > -#include <init-arch.h>
>> > -
>> > -
>> > -/* Define multiple versions only for the definition in libc and for
>> > -   the DSO.  In static binaries we need strrchr before the initialization
>> > -   happened.  */
>> > -#if defined SHARED && !defined NOT_IN_libc
>> > -   .text
>> > -ENTRY(strrchr)
>> > -   .type   strrchr, @gnu_indirect_function
>> > -   cmpl    $0, __cpu_features+KIND_OFFSET(%rip)
>> > -   jne     1f
>> > -   call    __init_cpu_features
>> > -1: leaq    __strrchr_sse2(%rip), %rax
>> > -   testl   $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
>> > -   jnz     2f
>> > -   testl   $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
>> > -   jz      2f
>> > -   leaq    __strrchr_sse42(%rip), %rax
>> > -   ret
>> > -2: testl   $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
>> > -   jz      3f
>> > -   leaq    __strrchr_sse2_no_bsf(%rip), %rax
>> > -3: ret
>> > -END(strrchr)
>> > -
>> > -/*
>> > -   This implementation uses SSE4 instructions to compare up to 16 bytes
>> > -   at a time looking for the last occurrence of the character c in the
>> > -   string s:
>> > -
>> > -   char *strrchr (const char *s, int c);
>> > -
>> > -   We use 0x4a:
>> > -   _SIDD_SBYTE_OPS
>> > -   | _SIDD_CMP_EQUAL_EACH
>> > -   | _SIDD_MOST_SIGNIFICANT
>> > -   on pcmpistri to compare xmm/mem128
>> > -
>> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
>> > -   X X X X X X X X X X X X X X X X
>> > -
>> > -   against xmm
>> > -
>> > -   0 1 2 3 4 5 6 7 8 9 A B C D E F
>> > -   C C C C C C C C C C C C C C C C
>> > -
>> > -   to find out if the first 16byte data element has a byte C and the
>> > -   last offset.  There are 4 cases:
>> > -
>> > -   1. The first 16byte data element has EOS and has the byte C at the
>> > -      last offset X.
>> > -   2. The first 16byte data element is valid and has the byte C at the
>> > -      last offset X.
>> > -   3. The first 16byte data element has EOS and doesn't have the byte C.
>> > -   4. The first 16byte data element is valid and doesn't have the byte C.
>> > -
>> > -   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
>> > -
>> > -   case            ECX     CFlag   ZFlag   SFlag
>> > -    1               X        1       1       0
>> > -    2               X        1       0       0
>> > -    3              16        0       1       0
>> > -    4              16        0       0       0
>> > -
>> > -   We exit from the loop for cases 1 and 3 with jz which branches
>> > -   when ZFlag is 1.  If CFlag == 1, ECX has the offset X for case 1.  */
>> > -
>> > -
>> > -   .section .text.sse4.2,"ax",@progbits
>> > -   .align  16
>> > -   .type   __strrchr_sse42, @function
>> > -   .globl  __strrchr_sse42
>> > -   .hidden __strrchr_sse42
>> > -__strrchr_sse42:
>> > -   cfi_startproc
>> > -   CALL_MCOUNT
>> > -   testb   %sil, %sil
>> > -   je      __strend_sse4
>> > -   xor     %eax,%eax       /* RAX has the last occurrence of s.  */
>> > -   movd    %esi, %xmm1
>> > -   punpcklbw       %xmm1, %xmm1
>> > -   movl    %edi, %esi
>> > -   punpcklbw       %xmm1, %xmm1
>> > -   andl    $15, %esi
>> > -   pshufd  $0, %xmm1, %xmm1
>> > -   movq    %rdi, %r8
>> > -   je      L(loop)
>> > -
>> > -/* Handle unaligned string using psrldq.  */
>> > -   leaq    L(psrldq_table)(%rip), %rdx
>> > -   andq    $-16, %r8
>> > -   movslq  (%rdx,%rsi,4),%r9
>> > -   movdqa  (%r8), %xmm0
>> > -   addq    %rdx, %r9
>> > -   jmp     *%r9
>> > -
>> > -/* Handle unaligned string with offset 1 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_1):
>> > -   psrldq  $1, %xmm0
>> > -
>> > -   .p2align 4
>> > -L(unaligned_pcmpistri):
>> > -   pcmpistri       $0x4a, %xmm1, %xmm0
>> > -   jnc     L(unaligned_no_byte)
>> > -   leaq    (%rdi,%rcx), %rax
>> > -L(unaligned_no_byte):
>> > -   /* Find the length of the unaligned string.  */
>> > -   pcmpistri       $0x3a, %xmm0, %xmm0
>> > -   movl    $16, %edx
>> > -   subl    %esi, %edx
>> > -   cmpl    %ecx, %edx
>> > -   /* Return RAX if the unaligned fragment to next 16B already
>> > -      contain the NULL terminator.  */
>> > -   jg      L(exit)
>> > -   addq    $16, %r8
>> > -
>> > -/* Loop start on aligned string.  */
>> > -   .p2align 4
>> > -L(loop):
>> > -   pcmpistri       $0x4a, (%r8), %xmm1
>> > -   jbe     L(match_or_eos)
>> > -   addq    $16, %r8
>> > -   jmp     L(loop)
>> > -   .p2align 4
>> > -L(match_or_eos):
>> > -   je      L(had_eos)
>> > -L(match_no_eos):
>> > -   leaq    (%r8,%rcx), %rax
>> > -   addq    $16, %r8
>> > -   jmp     L(loop)
>> > -   .p2align 4
>> > -L(had_eos):
>> > -   jnc     L(exit)
>> > -   leaq    (%r8,%rcx), %rax
>> > -   .p2align 4
>> > -L(exit):
>> > -   ret
>> > -
>> > -/* Handle unaligned string with offset 15 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_15):
>> > -   psrldq  $15, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 14 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_14):
>> > -   psrldq  $14, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 13 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_13):
>> > -   psrldq  $13, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 12 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_12):
>> > -   psrldq  $12, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 11 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_11):
>> > -   psrldq  $11, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 10 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_10):
>> > -   psrldq  $10, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 9 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_9):
>> > -   psrldq  $9, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 8 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_8):
>> > -   psrldq  $8, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 7 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_7):
>> > -   psrldq  $7, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 6 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_6):
>> > -   psrldq  $6, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 5 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_5):
>> > -   psrldq  $5, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 4 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_4):
>> > -   psrldq  $4, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 3 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_3):
>> > -   psrldq  $3, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -/* Handle unaligned string with offset 2 using psrldq.  */
>> > -   .p2align 4
>> > -L(psrldq_2):
>> > -   psrldq  $2, %xmm0
>> > -   jmp     L(unaligned_pcmpistri)
>> > -
>> > -   cfi_endproc
>> > -   .size   __strrchr_sse42, .-__strrchr_sse42
>> > -
>> > -   .section .rodata.sse4.2,"a",@progbits
>> > -   .p2align 4
>> > -L(psrldq_table):
>> > -   .int    L(loop) - L(psrldq_table)
>> > -   .int    L(psrldq_1) - L(psrldq_table)
>> > -   .int    L(psrldq_2) - L(psrldq_table)
>> > -   .int    L(psrldq_3) - L(psrldq_table)
>> > -   .int    L(psrldq_4) - L(psrldq_table)
>> > -   .int    L(psrldq_5) - L(psrldq_table)
>> > -   .int    L(psrldq_6) - L(psrldq_table)
>> > -   .int    L(psrldq_7) - L(psrldq_table)
>> > -   .int    L(psrldq_8) - L(psrldq_table)
>> > -   .int    L(psrldq_9) - L(psrldq_table)
>> > -   .int    L(psrldq_10) - L(psrldq_table)
>> > -   .int    L(psrldq_11) - L(psrldq_table)
>> > -   .int    L(psrldq_12) - L(psrldq_table)
>> > -   .int    L(psrldq_13) - L(psrldq_table)
>> > -   .int    L(psrldq_14) - L(psrldq_table)
>> > -   .int    L(psrldq_15) - L(psrldq_table)
>> > -
>> > -
>> > -# undef ENTRY
>> > -# define ENTRY(name) \
>> > -   .type __strrchr_sse2, @function; \
>> > -   .align 16; \
>> > -   .globl __strrchr_sse2; \
>> > -   .hidden __strrchr_sse2; \
>> > -   __strrchr_sse2: cfi_startproc; \
>> > -   CALL_MCOUNT
>> > -# undef END
>> > -# define END(name) \
>> > -   cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
>> > -# undef libc_hidden_builtin_def
>> > -/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
>> > -   The speedup we get from using SSE4.2 instruction is likely eaten away
>> > -   by the indirect call in the PLT.  */
>> > -# define libc_hidden_builtin_def(name) \
>> > -   .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
>> > -#endif
>> > -
>> > -#include "../strrchr.S"
>> > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
>> > index e413b07..22c83bc 100644
>> > --- a/sysdeps/x86_64/strrchr.S
>> > +++ b/sysdeps/x86_64/strrchr.S
>> > @@ -1,6 +1,6 @@
>> > -/* strrchr (str, ch) -- Return pointer to last occurrence of CH in STR.
>> > -   For AMD x86-64.
>> > -   Copyright (C) 2009-2013 Free Software Foundation, Inc.
>> > +/* strrchr with SSE2 without bsf and bsr
>> > +   Copyright (C) 2011-2013 Free Software Foundation, Inc.
>> > +   Contributed by Intel Corporation.
>> >     This file is part of the GNU C Library.
>> >
>> >     The GNU C Library is free software; you can redistribute it and/or
>> > @@ -17,63 +17,216 @@
>> >     License along with the GNU C Library; if not, see
>> >     <http://www.gnu.org/licenses/>.  */
>> >
>> > +
>> >  #include <sysdep.h>
>> > +#include "asm-syntax.h"
>> > +
>> > +# ifndef ALIGN
>> > +#  define ALIGN(n) .p2align n
>> > +# endif
>> > +
>> >
>> >
>> > -   .text
>> >  ENTRY (strrchr)
>> > +
>> >     movd    %esi, %xmm1
>> > -   movq    %rdi, %rcx
>> > -   punpcklbw %xmm1, %xmm1
>> > -   andq    $~15, %rdi
>> > -   pxor    %xmm2, %xmm2
>> > -   punpcklbw %xmm1, %xmm1
>> > -   orl     $0xffffffff, %esi
>> > -   movdqa  (%rdi), %xmm0
>> > +   movq    %rdi, %rax
>> > +   andl    $4095, %eax
>> > +   punpcklbw       %xmm1, %xmm1
>> > +   cmpq    $4032, %rax
>> > +   punpcklwd       %xmm1, %xmm1
>> >     pshufd  $0, %xmm1, %xmm1
>> > -   subq    %rdi, %rcx
>> > +   ja      L(cross_page)
>> > +   movdqu  (%rdi), %xmm0
>> > +   pxor    %xmm2, %xmm2
>> >     movdqa  %xmm0, %xmm3
>> > -   leaq    16(%rdi), %rdi
>> >     pcmpeqb %xmm1, %xmm0
>> >     pcmpeqb %xmm2, %xmm3
>> > -   shl     %cl, %esi
>> > -   pmovmskb %xmm0, %edx
>> > -   pmovmskb %xmm3, %ecx
>> > -   andl    %esi, %edx
>> > -   andl    %esi, %ecx
>> > -   xorl    %eax, %eax
>> > -   movl    %edx, %esi
>> > -   orl     %ecx, %esi
>> > -   jnz     1f
>> > -
>> > -2: movdqa  (%rdi), %xmm0
>> > -   leaq    16(%rdi), %rdi
>> > -   movdqa  %xmm0, %xmm3
>> > +   pmovmskb        %xmm0, %ecx
>> > +   pmovmskb        %xmm3, %edx
>> > +   testq   %rdx, %rdx
>> > +   je      L(next_48_bytes)
>> > +   leaq    -1(%rdx), %rax
>> > +   xorq    %rdx, %rax
>> > +   andq    %rcx, %rax
>> > +   je      L(exit)
>> > +   bsrq    %rax, %rax
>> > +   addq    %rdi, %rax
>> > +   ret
>> > +   ALIGN(4)
>> > +L(next_48_bytes):
>> > +   movdqu  16(%rdi), %xmm4
>> > +   movdqa  %xmm4, %xmm5
>> > +   movdqu  32(%rdi), %xmm3
>> > +   pcmpeqb %xmm1, %xmm4
>> > +   pcmpeqb %xmm2, %xmm5
>> > +   movdqu  48(%rdi), %xmm0
>> > +   pmovmskb        %xmm5, %edx
>> > +   movdqa  %xmm3, %xmm5
>> > +   pcmpeqb %xmm1, %xmm3
>> > +   pcmpeqb %xmm2, %xmm5
>> > +   pcmpeqb %xmm0, %xmm2
>> > +   salq    $16, %rdx
>> > +   pmovmskb        %xmm3, %r8d
>> > +   pmovmskb        %xmm5, %eax
>> > +   pmovmskb        %xmm2, %esi
>> > +   salq    $32, %r8
>> > +   salq    $32, %rax
>> >     pcmpeqb %xmm1, %xmm0
>> > -   pcmpeqb %xmm2, %xmm3
>> > -   pmovmskb %xmm0, %edx
>> > -   pmovmskb %xmm3, %ecx
>> > -   movl    %edx, %esi
>> > -   orl     %ecx, %esi
>> > -   jz      2b
>> > +   orq     %rdx, %rax
>> > +   movq    %rsi, %rdx
>> > +   pmovmskb        %xmm4, %esi
>> > +   salq    $48, %rdx
>> > +   salq    $16, %rsi
>> > +   orq     %r8, %rsi
>> > +   orq     %rcx, %rsi
>> > +   pmovmskb        %xmm0, %ecx
>> > +   salq    $48, %rcx
>> > +   orq     %rcx, %rsi
>> > +   orq     %rdx, %rax
>> > +   je      L(loop_header2)
>> > +   leaq    -1(%rax), %rcx
>> > +   xorq    %rax, %rcx
>> > +   andq    %rcx, %rsi
>> > +   je      L(exit)
>> > +   bsrq    %rsi, %rsi
>> > +   leaq    (%rdi,%rsi), %rax
>> > +   ret
>> >
>> > -1: bsfl    %ecx, %r9d
>> > -   movl    $0xffffffff, %r8d
>> > -   movl    $31, %ecx
>> > -   jnz     5f
>> > +   ALIGN(4)
>> > +L(loop_header2):
>> > +   testq   %rsi, %rsi
>> > +   movq    %rdi, %rcx
>> > +   je      L(no_c_found)
>> > +L(loop_header):
>> > +   addq    $64, %rdi
>> > +   pxor    %xmm7, %xmm7
>> > +   andq    $-64, %rdi
>> > +   jmp     L(loop_entry)
>> > +   ALIGN(4)
>> > +L(loop64):
>> > +   testq   %rdx, %rdx
>> > +   cmovne  %rdx, %rsi
>> > +   cmovne  %rdi, %rcx
>> > +   addq    $64, %rdi
>> > +L(loop_entry):
>> > +   movdqa  32(%rdi), %xmm3
>> > +   pxor    %xmm6, %xmm6
>> > +   movdqa  48(%rdi), %xmm2
>> > +   movdqa  %xmm3, %xmm0
>> > +   movdqa  16(%rdi), %xmm4
>> > +   pminub  %xmm2, %xmm0
>> > +   movdqa  (%rdi), %xmm5
>> > +   pminub  %xmm4, %xmm0
>> > +   pminub  %xmm5, %xmm0
>> > +   pcmpeqb %xmm7, %xmm0
>> > +   pmovmskb        %xmm0, %eax
>> > +   movdqa  %xmm5, %xmm0
>> > +   pcmpeqb %xmm1, %xmm0
>> > +   pmovmskb        %xmm0, %r9d
>> > +   movdqa  %xmm4, %xmm0
>> > +   pcmpeqb %xmm1, %xmm0
>> > +   pmovmskb        %xmm0, %edx
>> > +   movdqa  %xmm3, %xmm0
>> > +   pcmpeqb %xmm1, %xmm0
>> > +   salq    $16, %rdx
>> > +   pmovmskb        %xmm0, %r10d
>> > +   movdqa  %xmm2, %xmm0
>> > +   pcmpeqb %xmm1, %xmm0
>> > +   salq    $32, %r10
>> > +   orq     %r10, %rdx
>> > +   pmovmskb        %xmm0, %r8d
>> > +   orq     %r9, %rdx
>> > +   salq    $48, %r8
>> > +   orq     %r8, %rdx
>> > +   testl   %eax, %eax
>> > +   je      L(loop64)
>> > +   pcmpeqb %xmm6, %xmm4
>> > +   pcmpeqb %xmm6, %xmm3
>> > +   pcmpeqb %xmm6, %xmm5
>> > +   pmovmskb        %xmm4, %eax
>> > +   pmovmskb        %xmm3, %r10d
>> > +   pcmpeqb %xmm6, %xmm2
>> > +   pmovmskb        %xmm5, %r9d
>> > +   salq    $32, %r10
>> > +   salq    $16, %rax
>> > +   pmovmskb        %xmm2, %r8d
>> > +   orq     %r10, %rax
>> > +   orq     %r9, %rax
>> > +   salq    $48, %r8
>> > +   orq     %r8, %rax
>> > +   leaq    -1(%rax), %r8
>> > +   xorq    %rax, %r8
>> > +   andq    %r8, %rdx
>> > +   cmovne  %rdi, %rcx
>> > +   cmovne  %rdx, %rsi
>> > +   bsrq    %rsi, %rsi
>> > +   leaq    (%rcx,%rsi), %rax
>> > +   ret
>> > +   ALIGN(4)
>> > +L(no_c_found):
>> > +   movl    $1, %esi
>> > +   xorl    %ecx, %ecx
>> > +   jmp     L(loop_header)
>> > +   ALIGN(4)
>> > +L(exit):
>> > +   xorl    %eax, %eax
>> > +   ret
>> > +   ALIGN(4)
>> >
>> > -   bsrl    %edx, %edx
>> > -   jz      2b
>> > -   leaq    -16(%rdi,%rdx), %rax
>> > -   jmp     2b
>> > +L(cross_page):
>> > +   movq    %rdi, %rax
>> > +   pxor    %xmm0, %xmm0
>> > +   andq    $-64, %rax
>> > +   movdqu  (%rax), %xmm5
>> > +   movdqa  %xmm5, %xmm6
>> > +   movdqu  16(%rax), %xmm4
>> > +   pcmpeqb %xmm1, %xmm5
>> > +   pcmpeqb %xmm0, %xmm6
>> > +   movdqu  32(%rax), %xmm3
>> > +   pmovmskb        %xmm6, %esi
>> > +   movdqa  %xmm4, %xmm6
>> > +   movdqu  48(%rax), %xmm2
>> > +   pcmpeqb %xmm1, %xmm4
>> > +   pcmpeqb %xmm0, %xmm6
>> > +   pmovmskb        %xmm6, %edx
>> > +   movdqa  %xmm3, %xmm6
>> > +   pcmpeqb %xmm1, %xmm3
>> > +   pcmpeqb %xmm0, %xmm6
>> > +   pcmpeqb %xmm2, %xmm0
>> > +   salq    $16, %rdx
>> > +   pmovmskb        %xmm3, %r9d
>> > +   pmovmskb        %xmm6, %r8d
>> > +   pmovmskb        %xmm0, %ecx
>> > +   salq    $32, %r9
>> > +   salq    $32, %r8
>> > +   pcmpeqb %xmm1, %xmm2
>> > +   orq     %r8, %rdx
>> > +   salq    $48, %rcx
>> > +   pmovmskb        %xmm5, %r8d
>> > +   orq     %rsi, %rdx
>> > +   pmovmskb        %xmm4, %esi
>> > +   orq     %rcx, %rdx
>> > +   pmovmskb        %xmm2, %ecx
>> > +   salq    $16, %rsi
>> > +   salq    $48, %rcx
>> > +   orq     %r9, %rsi
>> > +   orq     %r8, %rsi
>> > +   orq     %rcx, %rsi
>> > +   movl    %edi, %ecx
>> > +   subl    %eax, %ecx
>> > +   shrq    %cl, %rdx
>> > +   shrq    %cl, %rsi
>> > +   testq   %rdx, %rdx
>> > +   je      L(loop_header2)
>> > +   leaq    -1(%rdx), %rax
>> > +   xorq    %rdx, %rax
>> > +   andq    %rax, %rsi
>> > +   je      L(exit)
>> > +   bsrq    %rsi, %rax
>> > +   addq    %rdi, %rax
>> > +   ret
>> >
>> > -5: subl    %r9d, %ecx
>> > -   shrl    %cl, %r8d
>> > -   andl    %r8d, %edx
>> > -   bsrl    %edx, %edx
>> > -   jz      4f
>> > -   leaq    -16(%rdi,%rdx), %rax
>> > -4: ret
>> >  END (strrchr)
>> >
>> >  weak_alias (strrchr, rindex)
>> > --
>> > 1.8.3.2
>> >
>>
>> --
>>
>> the AA battery in the wallclock sends magnetic interference
>
> --
>
> terrorist activities


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]