Re: [RFC] Improve strcat


Is it possible to reduce the drawdowns in your plots?
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/strcat_profile/results_rand_L3/result.html
Especially #4.

--
Liubov

On Wed, Sep 11, 2013 at 2:23 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Tue, Sep 10, 2013 at 10:28:44PM +0200, Ondřej Bílka wrote:
>> Hi Carlos,
>>
>> Here is strcpy with comments. To get the structure in place I decided to
>> include the ssse3 loop in this patch. If you are OK with splitting it, the
>> loop header and the ssse3 loop could be reviewed separately.
>>
>> I omitted the actual strcat calls, as I have a patch that uses them ready
>> and it needs a bit of code movement.
>>
> For strcat there was one optimization opportunity left - finding the
> terminating zeros in the source and destination in parallel. This patch
> does exactly that.
>
> This allows us to jump directly to code that copies a given number of
> bytes, so I put the strcat implementation into strcpy-sse2-unaligned-v2.S.
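>
> In C-like terms the fast path is roughly the sketch below (zero_mask64 and
> strcat_sketch are illustrative stand-ins, not the actual routines; the real
> code computes the masks with pcmpeqb/pmovmskb and takes a separate path when
> either pointer is within 64 bytes of a page boundary):
>
>   #include <stdint.h>
>   #include <string.h>
>
>   /* Bitmask of zero bytes among the first 64 bytes at P: bit i is set
>      iff p[i] == 0.  Stopping at the first zero is enough here.  */
>   static uint64_t
>   zero_mask64 (const char *p)
>   {
>     for (int i = 0; i < 64; i++)
>       if (p[i] == '\0')
>         return 1ull << i;
>     return 0;
>   }
>
>   char *
>   strcat_sketch (char *dest, const char *src)
>   {
>     uint64_t d = zero_mask64 (dest);  /* terminating zero of dest  */
>     uint64_t s = zero_mask64 (src);   /* terminating zero of src   */
>     if (d)
>       {
>         char *end = dest + __builtin_ctzll (d);
>         if (s)
>           {
>             /* Both positions known: one fixed-size copy, NUL included.  */
>             memcpy (end, src, (size_t) __builtin_ctzll (s) + 1);
>             return dest;
>           }
>         strcpy (end, src);            /* src longer than 64 bytes   */
>         return dest;
>       }
>     return strcat (dest, src);        /* dest longer than 64 bytes  */
>   }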
>
> I do not handle strncat yet, so I copied the old strcat*.S files to strncat*.S.
>
> I have not optimized the instruction scheduling yet, to keep the code
> easier to read.
>
> Results of the benchmark are here:
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strcat_profile.html
>
> Comments?
>
> ---
>  sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S   | 280 +------
>  sysdeps/x86_64/multiarch/strcat-ssse3.S            | 868 +-------------------
>  .../x86_64/multiarch/strcpy-sse2-unaligned-v2.S    | 217 ++++-
>  sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S         |   2 +-
>  sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S  | 285 ++++++-
>  sysdeps/x86_64/multiarch/strncat-ssse3.S           | 869 ++++++++++++++++++++-
>  6 files changed, 1364 insertions(+), 1157 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> index 028c6d3..03c1f18 100644
> --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> @@ -1,279 +1 @@
> -/* strcat with SSE2
> -   Copyright (C) 2011-2013 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#ifndef NOT_IN_libc
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_sse2_unaligned
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> -       mov     %rdi, %r9
> -# ifdef USE_AS_STRNCAT
> -       mov     %rdx, %r8
> -# endif
> -
> -/* Inline corresponding strlen file, temporary until new strcpy
> -   implementation gets merged.  */
> -
> -       xor     %rax, %rax
> -       mov     %edi, %ecx
> -       and     $0x3f, %ecx
> -       pxor    %xmm0, %xmm0
> -       cmp     $0x30, %ecx
> -       ja      L(next)
> -       movdqu  (%rdi), %xmm1
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit_less16)
> -       mov     %rdi, %rax
> -       and     $-16, %rax
> -       jmp     L(align16_start)
> -L(next):
> -       mov     %rdi, %rax
> -       and     $-16, %rax
> -       pcmpeqb (%rax), %xmm0
> -       mov     $-1, %r10d
> -       sub     %rax, %rcx
> -       shl     %cl, %r10d
> -       pmovmskb %xmm0, %edx
> -       and     %r10d, %edx
> -       jnz     L(exit)
> -
> -L(align16_start):
> -       pxor    %xmm0, %xmm0
> -       pxor    %xmm1, %xmm1
> -       pxor    %xmm2, %xmm2
> -       pxor    %xmm3, %xmm3
> -       pcmpeqb 16(%rax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit16)
> -
> -       pcmpeqb 32(%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit32)
> -
> -       pcmpeqb 48(%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit48)
> -
> -       pcmpeqb 64(%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit64)
> -
> -       pcmpeqb 80(%rax), %xmm0
> -       add     $64, %rax
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit16)
> -
> -       pcmpeqb 32(%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit32)
> -
> -       pcmpeqb 48(%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit48)
> -
> -       pcmpeqb 64(%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit64)
> -
> -       pcmpeqb 80(%rax), %xmm0
> -       add     $64, %rax
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit16)
> -
> -       pcmpeqb 32(%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit32)
> -
> -       pcmpeqb 48(%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit48)
> -
> -       pcmpeqb 64(%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit64)
> -
> -       pcmpeqb 80(%rax), %xmm0
> -       add     $64, %rax
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit16)
> -
> -       pcmpeqb 32(%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit32)
> -
> -       pcmpeqb 48(%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit48)
> -
> -       pcmpeqb 64(%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit64)
> -
> -       test    $0x3f, %rax
> -       jz      L(align64_loop)
> -
> -       pcmpeqb 80(%rax), %xmm0
> -       add     $80, %rax
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $0x3f, %rax
> -       jz      L(align64_loop)
> -
> -       pcmpeqb 16(%rax), %xmm1
> -       add     $16, %rax
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $0x3f, %rax
> -       jz      L(align64_loop)
> -
> -       pcmpeqb 16(%rax), %xmm2
> -       add     $16, %rax
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       test    $0x3f, %rax
> -       jz      L(align64_loop)
> -
> -       pcmpeqb 16(%rax), %xmm3
> -       add     $16, %rax
> -       pmovmskb %xmm3, %edx
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       add     $16, %rax
> -       .p2align 4
> -       L(align64_loop):
> -       movaps  (%rax), %xmm4
> -       pminub  16(%rax),       %xmm4
> -       movaps  32(%rax),       %xmm5
> -       pminub  48(%rax),       %xmm5
> -       add     $64,    %rax
> -       pminub  %xmm4,  %xmm5
> -       pcmpeqb %xmm0,  %xmm5
> -       pmovmskb %xmm5, %edx
> -       test    %edx,   %edx
> -       jz      L(align64_loop)
> -
> -       pcmpeqb -64(%rax), %xmm0
> -       sub     $80,    %rax
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       jnz     L(exit16)
> -
> -       pcmpeqb 32(%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       jnz     L(exit32)
> -
> -       pcmpeqb 48(%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       jnz     L(exit48)
> -
> -       pcmpeqb 64(%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $64, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit):
> -       sub     %rdi, %rax
> -L(exit_less16):
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit16):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $16, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit32):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $32, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit48):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $48, %rax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit64):
> -       sub     %rdi, %rax
> -       bsf     %rdx, %rdx
> -       add     %rdx, %rax
> -       add     $64, %rax
> -
> -       .p2align 4
> -L(StartStrcpyPart):
> -       lea     (%r9, %rax), %rdi
> -       mov     %rsi, %rcx
> -       mov     %r9, %rax      /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> -       test    %r8, %r8
> -       jz      L(ExitZero)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-sse2-unaligned.S"
> -#endif
> +/* Implemented in strcpy-sse2-unaligned-v2.S  */
> diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> index 8101b91..fd5fba7 100644
> --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
> +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> @@ -1,867 +1 @@
> -/* strcat with SSSE3
> -   Copyright (C) 2011-2013 Free Software Foundation, Inc.
> -   Contributed by Intel Corporation.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <http://www.gnu.org/licenses/>.  */
> -
> -#ifndef NOT_IN_libc
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -#  define STRCAT  __strcat_ssse3
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> -# ifdef USE_AS_STRNCAT
> -       mov     %rdx, %r8
> -# endif
> -
> -
> -/* Inline corresponding strlen file, temporary until new strcpy
> -   implementation gets merged.  */
> -
> -       xor     %eax, %eax
> -       cmpb    $0, (%rdi)
> -       jz      L(exit_tail0)
> -       cmpb    $0, 1(%rdi)
> -       jz      L(exit_tail1)
> -       cmpb    $0, 2(%rdi)
> -       jz      L(exit_tail2)
> -       cmpb    $0, 3(%rdi)
> -       jz      L(exit_tail3)
> -
> -       cmpb    $0, 4(%rdi)
> -       jz      L(exit_tail4)
> -       cmpb    $0, 5(%rdi)
> -       jz      L(exit_tail5)
> -       cmpb    $0, 6(%rdi)
> -       jz      L(exit_tail6)
> -       cmpb    $0, 7(%rdi)
> -       jz      L(exit_tail7)
> -
> -       cmpb    $0, 8(%rdi)
> -       jz      L(exit_tail8)
> -       cmpb    $0, 9(%rdi)
> -       jz      L(exit_tail9)
> -       cmpb    $0, 10(%rdi)
> -       jz      L(exit_tail10)
> -       cmpb    $0, 11(%rdi)
> -       jz      L(exit_tail11)
> -
> -       cmpb    $0, 12(%rdi)
> -       jz      L(exit_tail12)
> -       cmpb    $0, 13(%rdi)
> -       jz      L(exit_tail13)
> -       cmpb    $0, 14(%rdi)
> -       jz      L(exit_tail14)
> -       cmpb    $0, 15(%rdi)
> -       jz      L(exit_tail15)
> -       pxor    %xmm0, %xmm0
> -       lea     16(%rdi), %rcx
> -       lea     16(%rdi), %rax
> -       and     $-16, %rax
> -
> -       pcmpeqb (%rax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       pxor    %xmm1, %xmm1
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       pxor    %xmm2, %xmm2
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       pxor    %xmm3, %xmm3
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       pcmpeqb (%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       test    %edx, %edx
> -       lea     16(%rax), %rax
> -       jnz     L(exit)
> -
> -       and     $-0x40, %rax
> -
> -       .p2align 4
> -L(aligned_64):
> -       pcmpeqb (%rax), %xmm0
> -       pcmpeqb 16(%rax), %xmm1
> -       pcmpeqb 32(%rax), %xmm2
> -       pcmpeqb 48(%rax), %xmm3
> -       pmovmskb %xmm0, %edx
> -       pmovmskb %xmm1, %r11d
> -       pmovmskb %xmm2, %r10d
> -       pmovmskb %xmm3, %r9d
> -       or      %edx, %r9d
> -       or      %r11d, %r9d
> -       or      %r10d, %r9d
> -       lea     64(%rax), %rax
> -       jz      L(aligned_64)
> -
> -       test    %edx, %edx
> -       jnz     L(aligned_64_exit_16)
> -       test    %r11d, %r11d
> -       jnz     L(aligned_64_exit_32)
> -       test    %r10d, %r10d
> -       jnz     L(aligned_64_exit_48)
> -
> -L(aligned_64_exit_64):
> -       pmovmskb %xmm3, %edx
> -       jmp     L(exit)
> -
> -L(aligned_64_exit_48):
> -       lea     -16(%rax), %rax
> -       mov     %r10d, %edx
> -       jmp     L(exit)
> -
> -L(aligned_64_exit_32):
> -       lea     -32(%rax), %rax
> -       mov     %r11d, %edx
> -       jmp     L(exit)
> -
> -L(aligned_64_exit_16):
> -       lea     -48(%rax), %rax
> -
> -L(exit):
> -       sub     %rcx, %rax
> -       test    %dl, %dl
> -       jz      L(exit_high)
> -       test    $0x01, %dl
> -       jnz     L(exit_tail0)
> -
> -       test    $0x02, %dl
> -       jnz     L(exit_tail1)
> -
> -       test    $0x04, %dl
> -       jnz     L(exit_tail2)
> -
> -       test    $0x08, %dl
> -       jnz     L(exit_tail3)
> -
> -       test    $0x10, %dl
> -       jnz     L(exit_tail4)
> -
> -       test    $0x20, %dl
> -       jnz     L(exit_tail5)
> -
> -       test    $0x40, %dl
> -       jnz     L(exit_tail6)
> -       add     $7, %eax
> -L(exit_tail0):
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_high):
> -       add     $8, %eax
> -       test    $0x01, %dh
> -       jnz     L(exit_tail0)
> -
> -       test    $0x02, %dh
> -       jnz     L(exit_tail1)
> -
> -       test    $0x04, %dh
> -       jnz     L(exit_tail2)
> -
> -       test    $0x08, %dh
> -       jnz     L(exit_tail3)
> -
> -       test    $0x10, %dh
> -       jnz     L(exit_tail4)
> -
> -       test    $0x20, %dh
> -       jnz     L(exit_tail5)
> -
> -       test    $0x40, %dh
> -       jnz     L(exit_tail6)
> -       add     $7, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail1):
> -       add     $1, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail2):
> -       add     $2, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail3):
> -       add     $3, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail4):
> -       add     $4, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail5):
> -       add     $5, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail6):
> -       add     $6, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail7):
> -       add     $7, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail8):
> -       add     $8, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail9):
> -       add     $9, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail10):
> -       add     $10, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail11):
> -       add     $11, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail12):
> -       add     $12, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail13):
> -       add     $13, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail14):
> -       add     $14, %eax
> -       jmp     L(StartStrcpyPart)
> -
> -       .p2align 4
> -L(exit_tail15):
> -       add     $15, %eax
> -
> -       .p2align 4
> -L(StartStrcpyPart):
> -       mov     %rsi, %rcx
> -       lea     (%rdi, %rax), %rdx
> -# ifdef USE_AS_STRNCAT
> -       test    %r8, %r8
> -       jz      L(StrncatExit0)
> -       cmp     $8, %r8
> -       jbe     L(StrncatExit8Bytes)
> -# endif
> -       cmpb    $0, (%rcx)
> -       jz      L(Exit1)
> -       cmpb    $0, 1(%rcx)
> -       jz      L(Exit2)
> -       cmpb    $0, 2(%rcx)
> -       jz      L(Exit3)
> -       cmpb    $0, 3(%rcx)
> -       jz      L(Exit4)
> -       cmpb    $0, 4(%rcx)
> -       jz      L(Exit5)
> -       cmpb    $0, 5(%rcx)
> -       jz      L(Exit6)
> -       cmpb    $0, 6(%rcx)
> -       jz      L(Exit7)
> -       cmpb    $0, 7(%rcx)
> -       jz      L(Exit8)
> -       cmpb    $0, 8(%rcx)
> -       jz      L(Exit9)
> -# ifdef USE_AS_STRNCAT
> -       cmp     $16, %r8
> -       jb      L(StrncatExit15Bytes)
> -# endif
> -       cmpb    $0, 9(%rcx)
> -       jz      L(Exit10)
> -       cmpb    $0, 10(%rcx)
> -       jz      L(Exit11)
> -       cmpb    $0, 11(%rcx)
> -       jz      L(Exit12)
> -       cmpb    $0, 12(%rcx)
> -       jz      L(Exit13)
> -       cmpb    $0, 13(%rcx)
> -       jz      L(Exit14)
> -       cmpb    $0, 14(%rcx)
> -       jz      L(Exit15)
> -       cmpb    $0, 15(%rcx)
> -       jz      L(Exit16)
> -# ifdef USE_AS_STRNCAT
> -       cmp     $16, %r8
> -       je      L(StrncatExit16)
> -#  define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-ssse3.S"
> -
> -       .p2align 4
> -L(CopyFrom1To16Bytes):
> -       add     %rsi, %rdx
> -       add     %rsi, %rcx
> -
> -       test    %al, %al
> -       jz      L(ExitHigh)
> -       test    $0x01, %al
> -       jnz     L(Exit1)
> -       test    $0x02, %al
> -       jnz     L(Exit2)
> -       test    $0x04, %al
> -       jnz     L(Exit3)
> -       test    $0x08, %al
> -       jnz     L(Exit4)
> -       test    $0x10, %al
> -       jnz     L(Exit5)
> -       test    $0x20, %al
> -       jnz     L(Exit6)
> -       test    $0x40, %al
> -       jnz     L(Exit7)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(ExitHigh):
> -       test    $0x01, %ah
> -       jnz     L(Exit9)
> -       test    $0x02, %ah
> -       jnz     L(Exit10)
> -       test    $0x04, %ah
> -       jnz     L(Exit11)
> -       test    $0x08, %ah
> -       jnz     L(Exit12)
> -       test    $0x10, %ah
> -       jnz     L(Exit13)
> -       test    $0x20, %ah
> -       jnz     L(Exit14)
> -       test    $0x40, %ah
> -       jnz     L(Exit15)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  8(%rcx), %xmm1
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  %xmm1, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit1):
> -       xor     %ah, %ah
> -       movb    %ah, 1(%rdx)
> -L(Exit1):
> -       movb    (%rcx), %al
> -       movb    %al, (%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit2):
> -       xor     %ah, %ah
> -       movb    %ah, 2(%rdx)
> -L(Exit2):
> -       movw    (%rcx), %ax
> -       movw    %ax, (%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit3):
> -       xor     %ah, %ah
> -       movb    %ah, 3(%rdx)
> -L(Exit3):
> -       movw    (%rcx), %ax
> -       movw    %ax, (%rdx)
> -       movb    2(%rcx), %al
> -       movb    %al, 2(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit4):
> -       xor     %ah, %ah
> -       movb    %ah, 4(%rdx)
> -L(Exit4):
> -       mov     (%rcx), %eax
> -       mov     %eax, (%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit5):
> -       xor     %ah, %ah
> -       movb    %ah, 5(%rdx)
> -L(Exit5):
> -       mov     (%rcx), %eax
> -       mov     %eax, (%rdx)
> -       movb    4(%rcx), %al
> -       movb    %al, 4(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit6):
> -       xor     %ah, %ah
> -       movb    %ah, 6(%rdx)
> -L(Exit6):
> -       mov     (%rcx), %eax
> -       mov     %eax, (%rdx)
> -       movw    4(%rcx), %ax
> -       movw    %ax, 4(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit7):
> -       xor     %ah, %ah
> -       movb    %ah, 7(%rdx)
> -L(Exit7):
> -       mov     (%rcx), %eax
> -       mov     %eax, (%rdx)
> -       mov     3(%rcx), %eax
> -       mov     %eax, 3(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit8):
> -       xor     %ah, %ah
> -       movb    %ah, 8(%rdx)
> -L(Exit8):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit9):
> -       xor     %ah, %ah
> -       movb    %ah, 9(%rdx)
> -L(Exit9):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movb    8(%rcx), %al
> -       movb    %al, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit10):
> -       xor     %ah, %ah
> -       movb    %ah, 10(%rdx)
> -L(Exit10):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movw    8(%rcx), %ax
> -       movw    %ax, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit11):
> -       xor     %ah, %ah
> -       movb    %ah, 11(%rdx)
> -L(Exit11):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       mov     7(%rcx), %eax
> -       mov     %eax, 7(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit12):
> -       xor     %ah, %ah
> -       movb    %ah, 12(%rdx)
> -L(Exit12):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       mov     8(%rcx), %eax
> -       mov     %eax, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit13):
> -       xor     %ah, %ah
> -       movb    %ah, 13(%rdx)
> -L(Exit13):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  5(%rcx), %xmm1
> -       movlpd  %xmm1, 5(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit14):
> -       xor     %ah, %ah
> -       movb    %ah, 14(%rdx)
> -L(Exit14):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  6(%rcx), %xmm1
> -       movlpd  %xmm1, 6(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit15):
> -       xor     %ah, %ah
> -       movb    %ah, 15(%rdx)
> -L(Exit15):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  7(%rcx), %xmm1
> -       movlpd  %xmm1, 7(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit16):
> -       xor     %ah, %ah
> -       movb    %ah, 16(%rdx)
> -L(Exit16):
> -       movlpd  (%rcx), %xmm0
> -       movlpd  8(%rcx), %xmm1
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  %xmm1, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesCase2):
> -       add     $16, %r8
> -       add     %rsi, %rcx
> -       lea     (%rsi, %rdx), %rsi
> -       lea     -9(%r8), %rdx
> -       and     $1<<7, %dh
> -       or      %al, %dh
> -       test    %dh, %dh
> -       lea     (%rsi), %rdx
> -       jz      L(ExitHighCase2)
> -
> -       test    $0x01, %al
> -       jnz     L(Exit1)
> -       cmp     $1, %r8
> -       je      L(StrncatExit1)
> -       test    $0x02, %al
> -       jnz     L(Exit2)
> -       cmp     $2, %r8
> -       je      L(StrncatExit2)
> -       test    $0x04, %al
> -       jnz     L(Exit3)
> -       cmp     $3, %r8
> -       je      L(StrncatExit3)
> -       test    $0x08, %al
> -       jnz     L(Exit4)
> -       cmp     $4, %r8
> -       je      L(StrncatExit4)
> -       test    $0x10, %al
> -       jnz     L(Exit5)
> -       cmp     $5, %r8
> -       je      L(StrncatExit5)
> -       test    $0x20, %al
> -       jnz     L(Exit6)
> -       cmp     $6, %r8
> -       je      L(StrncatExit6)
> -       test    $0x40, %al
> -       jnz     L(Exit7)
> -       cmp     $7, %r8
> -       je      L(StrncatExit7)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       lea     7(%rdx), %rax
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -       xor     %cl, %cl
> -       movb    %cl, (%rax)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(ExitHighCase2):
> -       test    $0x01, %ah
> -       jnz     L(Exit9)
> -       cmp     $9, %r8
> -       je      L(StrncatExit9)
> -       test    $0x02, %ah
> -       jnz     L(Exit10)
> -       cmp     $10, %r8
> -       je      L(StrncatExit10)
> -       test    $0x04, %ah
> -       jnz     L(Exit11)
> -       cmp     $11, %r8
> -       je      L(StrncatExit11)
> -       test    $0x8, %ah
> -       jnz     L(Exit12)
> -       cmp     $12, %r8
> -       je      L(StrncatExit12)
> -       test    $0x10, %ah
> -       jnz     L(Exit13)
> -       cmp     $13, %r8
> -       je      L(StrncatExit13)
> -       test    $0x20, %ah
> -       jnz     L(Exit14)
> -       cmp     $14, %r8
> -       je      L(StrncatExit14)
> -       test    $0x40, %ah
> -       jnz     L(Exit15)
> -       cmp     $15, %r8
> -       je      L(StrncatExit15)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  8(%rcx), %xmm1
> -       movlpd  %xmm1, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> -       test    %rax, %rax
> -       jnz     L(CopyFrom1To16BytesCase2)
> -
> -       .p2align 4
> -L(CopyFrom1To16BytesCase3):
> -       add     $16, %r8
> -       add     %rsi, %rdx
> -       add     %rsi, %rcx
> -
> -       cmp     $8, %r8
> -       ja      L(ExitHighCase3)
> -       cmp     $1, %r8
> -       je      L(StrncatExit1)
> -       cmp     $2, %r8
> -       je      L(StrncatExit2)
> -       cmp     $3, %r8
> -       je      L(StrncatExit3)
> -       cmp     $4, %r8
> -       je      L(StrncatExit4)
> -       cmp     $5, %r8
> -       je      L(StrncatExit5)
> -       cmp     $6, %r8
> -       je      L(StrncatExit6)
> -       cmp     $7, %r8
> -       je      L(StrncatExit7)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       xor     %ah, %ah
> -       movb    %ah, 8(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(ExitHighCase3):
> -       cmp     $9, %r8
> -       je      L(StrncatExit9)
> -       cmp     $10, %r8
> -       je      L(StrncatExit10)
> -       cmp     $11, %r8
> -       je      L(StrncatExit11)
> -       cmp     $12, %r8
> -       je      L(StrncatExit12)
> -       cmp     $13, %r8
> -       je      L(StrncatExit13)
> -       cmp     $14, %r8
> -       je      L(StrncatExit14)
> -       cmp     $15, %r8
> -       je      L(StrncatExit15)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  8(%rcx), %xmm1
> -       movlpd  %xmm1, 8(%rdx)
> -       xor     %ah, %ah
> -       movb    %ah, 16(%rdx)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit0):
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit15Bytes):
> -       cmp     $9, %r8
> -       je      L(StrncatExit9)
> -       cmpb    $0, 9(%rcx)
> -       jz      L(Exit10)
> -       cmp     $10, %r8
> -       je      L(StrncatExit10)
> -       cmpb    $0, 10(%rcx)
> -       jz      L(Exit11)
> -       cmp     $11, %r8
> -       je      L(StrncatExit11)
> -       cmpb    $0, 11(%rcx)
> -       jz      L(Exit12)
> -       cmp     $12, %r8
> -       je      L(StrncatExit12)
> -       cmpb    $0, 12(%rcx)
> -       jz      L(Exit13)
> -       cmp     $13, %r8
> -       je      L(StrncatExit13)
> -       cmpb    $0, 13(%rcx)
> -       jz      L(Exit14)
> -       cmp     $14, %r8
> -       je      L(StrncatExit14)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       movlpd  7(%rcx), %xmm1
> -       movlpd  %xmm1, 7(%rdx)
> -       lea     14(%rdx), %rax
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -       xor     %cl, %cl
> -       movb    %cl, (%rax)
> -       mov     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(StrncatExit8Bytes):
> -       cmpb    $0, (%rcx)
> -       jz      L(Exit1)
> -       cmp     $1, %r8
> -       je      L(StrncatExit1)
> -       cmpb    $0, 1(%rcx)
> -       jz      L(Exit2)
> -       cmp     $2, %r8
> -       je      L(StrncatExit2)
> -       cmpb    $0, 2(%rcx)
> -       jz      L(Exit3)
> -       cmp     $3, %r8
> -       je      L(StrncatExit3)
> -       cmpb    $0, 3(%rcx)
> -       jz      L(Exit4)
> -       cmp     $4, %r8
> -       je      L(StrncatExit4)
> -       cmpb    $0, 4(%rcx)
> -       jz      L(Exit5)
> -       cmp     $5, %r8
> -       je      L(StrncatExit5)
> -       cmpb    $0, 5(%rcx)
> -       jz      L(Exit6)
> -       cmp     $6, %r8
> -       je      L(StrncatExit6)
> -       cmpb    $0, 6(%rcx)
> -       jz      L(Exit7)
> -       cmp     $7, %r8
> -       je      L(StrncatExit7)
> -       movlpd  (%rcx), %xmm0
> -       movlpd  %xmm0, (%rdx)
> -       lea     7(%rdx), %rax
> -       cmpb    $1, (%rax)
> -       sbb     $-1, %rax
> -       xor     %cl, %cl
> -       movb    %cl, (%rax)
> -       mov     %rdi, %rax
> -       ret
> -
> -# endif
> -END (STRCAT)
> -#endif
> +/* Implemented in strcpy-ssse3-v2.S  */
> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> index 9725857..77b9adb 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> @@ -21,18 +21,218 @@
>  # include <sysdep.h>
>
>  # ifndef STRCPY
> -#  define STRCPY_TAIL  __strcpy_sse2_unaligned_tail
>  #  define STRCPY       __strcpy_sse2_unaligned
> +#  define STRCAT       __strcat_sse2_unaligned
>  # endif
>
>  # define ALIGN(x) .p2align x
>
>  #ifndef USE_AS_STPCPY
> -ENTRY (STRCPY_TAIL)
> -       movq  %rsi, %rdx
> -       pxor  %xmm4, %xmm4
> -       jmp L(from_tail)
> -END (STRCPY_TAIL)
> +     /* Optimized strcat: we optimize the path where src and dest are at
> +       most 64 bytes long by finding the terminating zeros in parallel.
> +
> +         if ((src | dest) % 4096 > 4096 - 64)
> +           goto strcat_cross_page;
> +         d = zero64 (dest)
> +          s = zero64 (src)
> +         if (d)
> +            {
> +             found:;
> +             dest += ffs (d)
> +             if (s)
> +               return copy_less64_bytes (dest, src, ffs (s))
> +             copy64_bytes (dest, src)
> +             return strcpy (dest, src)
> +           }
> +          else
> +            {
> +             dest = ALIGN_DOWN (dest,64);
> +             while (1)
> +               {
> +                 dest += 64;
> +                 d = zero64 (dest);
> +                 if (d)
> +                   goto found;
> +               }
> +            }
> +        */
> +ENTRY (STRCAT)
> +       movq    %rdi, %rax
> +       pxor    %xmm5, %xmm5
> +       movq    %rsi, %rdx
> +        pxor   %xmm6, %xmm6
> +       orq     %rdi, %rdx
> +        pxor   %xmm7, %xmm7
> +       andl    $4095, %edx
> +       pxor    %xmm0, %xmm0
> +        cmpq   $4032, %rdx
> +       ja      L(strcat_cross_page)
> +       movdqu  (%rsi), %xmm1
> +       movdqu  16(%rsi), %xmm2
> +       movdqu  32(%rsi), %xmm3
> +       movdqu  48(%rsi), %xmm4
> +        pxor   %xmm8, %xmm8
> +       movdqu  (%rdi), %xmm9
> +       movdqu  16(%rdi), %xmm10
> +       movdqu  32(%rdi), %xmm11
> +       movdqu  48(%rdi), %xmm12
> +       pcmpeqb %xmm1, %xmm5
> +       pcmpeqb %xmm2, %xmm6
> +       pcmpeqb %xmm3, %xmm7
> +       pcmpeqb %xmm4, %xmm0
> +       pcmpeqb %xmm8, %xmm9
> +       pcmpeqb %xmm8, %xmm10
> +       pcmpeqb %xmm8, %xmm11
> +       pcmpeqb %xmm8, %xmm12
> +
> +       pmovmskb %xmm5, %edx
> +       pmovmskb %xmm6, %r8d
> +       pmovmskb %xmm7, %r9d
> +       pmovmskb %xmm0, %r10d
> +       shlq    $16,    %r8
> +       shlq    $32,    %r9
> +       shlq    $48,    %r10
> +       orq     %r8,    %rdx
> +       orq     %r9,    %rdx
> +       orq     %r10,   %rdx
> +
> +       pmovmskb %xmm9, %ecx
> +       pmovmskb %xmm10, %r11d
> +        pmovmskb %xmm11, %r10d
> +        pmovmskb %xmm12, %r9d
> +       shlq    $16,    %r11
> +       shlq    $32,    %r10
> +       shlq    $48,    %r9
> +       orq     %r11,   %rcx
> +       orq     %r10,   %rcx
> +       orq     %r9,    %rcx
> +
> +       testq   %rcx, %rcx
> +       je      L(strcat_loop_start)
> +L(strcat_found_zero):
> +       bsfq    %rcx, %rcx
> +       addq    %rcx, %rdi
> +       testq   %rdx, %rdx
> +       jne     L(last_64_bytes)
> +       jmp     L(strcat_first_64_bytes)
> +
> +L(strcat_loop_start):
> +       andq    $-64, %rdi
> +       ALIGN(4)
> +L(strcat_loop):
> +       movdqa  64(%rdi), %xmm5
> +       pminub  80(%rdi), %xmm5
> +       pminub  96(%rdi), %xmm5
> +       pminub  112(%rdi), %xmm5
> +       addq    $64, %rdi
> +       pcmpeqb  %xmm8, %xmm5
> +       pmovmskb %xmm5, %ecx
> +       testl   %ecx, %ecx
> +       je      L(strcat_loop)
> +       shlq    $48, %rcx
> +       movdqa  (%rdi), %xmm9
> +       movdqa  16(%rdi), %xmm10
> +       movdqa  32(%rdi), %xmm11
> +       pcmpeqb %xmm8, %xmm9
> +       pcmpeqb %xmm8, %xmm10
> +       pcmpeqb %xmm8, %xmm11
> +       pmovmskb        %xmm9, %r9d
> +       pmovmskb        %xmm10, %r10d
> +       pmovmskb        %xmm11, %r11d
> +       salq    $16, %r10
> +       salq    $32, %r11
> +       orq     %r9, %rcx
> +       orq     %r10, %rcx
> +       orq     %r11, %rcx
> +       jmp     L(strcat_found_zero)
> +
> +       /*
> +         d_al = ALIGN_DOWN (dest, 64);
> +         d = zero64 (d_al)
> +         d = d >> (dest - d_al)
> +         if (d)
> +            {
> +             dest += ffs (d)
> +             return strcpy (dest, src)
> +           }
> +          else
> +            {
> +             dest = ALIGN_DOWN (dest,64);
> +             while (1)
> +                {
> +                 dest += 64;
> +                 d = zero64 (dest);
> +                 if (d) {
> +                    dest += ffs (d)
> +                   return strcpy (dest, src)
> +                  }
> +               }
> +            } */
> +       L(strcat_cross_page):
> +       andq    $-64, %rdi
> +
> +       movdqa  48(%rdi), %xmm12
> +       pcmpeqb %xmm8, %xmm12
> +       pmovmskb %xmm12, %rcx
> +       shlq    $48, %rcx
> +       movdqa  (%rdi), %xmm9
> +       movdqa  16(%rdi), %xmm10
> +       movdqa  32(%rdi), %xmm11
> +       pcmpeqb %xmm8, %xmm9
> +       pcmpeqb %xmm8, %xmm10
> +       pcmpeqb %xmm8, %xmm11
> +       pmovmskb        %xmm9, %r9d
> +       pmovmskb        %xmm10, %r10d
> +       pmovmskb        %xmm11, %r11d
> +       salq    $16, %r10
> +       salq    $32, %r11
> +       orq     %r9, %rcx
> +       orq     %r10, %rcx
> +       orq     %r11, %rcx
> +       movq    %rcx, %rdx
> +       movq    %rax, %rcx
> +       shrq    %cl, %rdx /* We use the fact that shifts are done modulo 64.  */
> +       testq   %rdx, %rdx
> +       je      L(strcat_cross_loop)
> +       movq    %rax, %rdi
> +
> +       pxor    %xmm4, %xmm4
> +       bsfq    %rdx, %rcx
> +       addq    %rcx, %rdi
> +       jmp     L(from_strcat)
> +
> +       ALIGN(4)
> +L(strcat_cross_loop):
> +       movdqa  64(%rdi), %xmm5
> +       pminub  80(%rdi), %xmm5
> +       pminub  96(%rdi), %xmm5
> +       pminub  112(%rdi), %xmm5
> +       addq    $64, %rdi
> +       pcmpeqb %xmm8, %xmm5
> +       pmovmskb        %xmm5, %ecx
> +       testl   %ecx, %ecx
> +       je      L(strcat_cross_loop)
> +       shlq    $48, %rcx
> +       movdqa  (%rdi), %xmm9
> +       movdqa  16(%rdi), %xmm10
> +       movdqa  32(%rdi), %xmm11
> +       pcmpeqb %xmm8, %xmm9
> +       pcmpeqb %xmm8, %xmm10
> +       pcmpeqb %xmm8, %xmm11
> +       pmovmskb        %xmm9, %r9d
> +       pmovmskb        %xmm10, %r10d
> +       pmovmskb        %xmm11, %r11d
> +       salq    $16, %r10
> +       salq    $32, %r11
> +       orq     %r9, %rcx
> +       orq     %r10, %rcx
> +       orq     %r11, %rcx
> +
> +       pxor    %xmm4, %xmm4
> +       bsfq    %rcx, %rcx
> +       addq    %rcx, %rdi
> +       jmp     L(from_strcat)
> +END (STRCAT)
>  #endif
>
>  ENTRY (STRCPY)
> @@ -51,7 +251,7 @@ ENTRY (STRCPY)
>         movq    %rsi, %rdx
>         pxor    %xmm4, %xmm4
>         movq    %rdi, %rax
> -L(from_tail):
> +L(from_strcat):
>         pxor    %xmm5, %xmm5
>         andl    $4095, %edx
>         pxor    %xmm6, %xmm6
> @@ -88,6 +288,7 @@ L(from_tail):
>         salq    $48, %rcx
>         orq     %rcx, %rdx
>         jne     L(between_32_64_bytes)
> +L(strcat_first_64_bytes):
>         movdqu  %xmm1, (%rdi)
>         movdqu  %xmm2, 16(%rdi)
>         movdqu  %xmm3, 32(%rdi)
> @@ -137,7 +338,7 @@ L(prepare_loop):
>       /* After loop finished we call following
>         copy_less64_bytes (erdi, ersi, ffs(erdx) + 1);
>          return; */
> -
> +L(last_64_bytes):
>         bsfq    %rdx, %rcx
>  #ifdef USE_AS_STPCPY
>         lea     (%rdi, %rcx), %rax
> diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S b/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
> index 8f70c42..9705e53 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
> @@ -19,8 +19,8 @@
>  #define USE_SSSE3
>
>  #ifndef STRCPY
> -# define STRCPY_TAIL   __strcpy_ssse3_tail
>  # define STRCPY        __strcpy_ssse3
> +# define STRCAT        __strcat_ssse3
>  #endif
>
>  #include "strcpy-sse2-unaligned-v2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> index 133e1d2..34dd69b 100644
> --- a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> @@ -1,3 +1,286 @@
> +/* strncat with SSE2
> +   Copyright (C) 2011-2013 Free Software Foundation, Inc.
> +   Contributed by Intel Corporation.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
>  #define USE_AS_STRNCAT
>  #define STRCAT __strncat_sse2_unaligned
> -#include "strcat-sse2-unaligned.S"
> +
> +#ifndef NOT_IN_libc
> +
> +# include <sysdep.h>
> +
> +# ifndef STRCAT
> +#  define STRCAT  __strcat_sse2_unaligned
> +# endif
> +
> +# define USE_AS_STRCAT
> +
> +.text
> +ENTRY (STRCAT)
> +       mov     %rdi, %r9
> +# ifdef USE_AS_STRNCAT
> +       mov     %rdx, %r8
> +# endif
> +
> +/* Inline corresponding strlen file, temporary until new strcpy
> +   implementation gets merged.  */
> +
> +       xor     %rax, %rax
> +       mov     %edi, %ecx
> +       and     $0x3f, %ecx
> +       pxor    %xmm0, %xmm0
> +       cmp     $0x30, %ecx
> +       ja      L(next)
> +       movdqu  (%rdi), %xmm1
> +       pcmpeqb %xmm1, %xmm0
> +       pmovmskb %xmm0, %edx
> +       test    %edx, %edx
> +       jnz     L(exit_less16)
> +       mov     %rdi, %rax
> +       and     $-16, %rax
> +       jmp     L(align16_start)
> +L(next):
> +       mov     %rdi, %rax
> +       and     $-16, %rax
> +       pcmpeqb (%rax), %xmm0
> +       mov     $-1, %r10d
> +       sub     %rax, %rcx
> +       shl     %cl, %r10d
> +       pmovmskb %xmm0, %edx
> +       and     %r10d, %edx
> +       jnz     L(exit)
> +
> +L(align16_start):
> +       pxor    %xmm0, %xmm0
> +       pxor    %xmm1, %xmm1
> +       pxor    %xmm2, %xmm2
> +       pxor    %xmm3, %xmm3
> +       pcmpeqb 16(%rax), %xmm0
> +       pmovmskb %xmm0, %edx
> +       test    %edx, %edx
> +       jnz     L(exit16)
> +
> +       pcmpeqb 32(%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       test    %edx, %edx
> +       jnz     L(exit32)
> +
> +       pcmpeqb 48(%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       test    %edx, %edx
> +       jnz     L(exit48)
> +
> +       pcmpeqb 64(%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       test    %edx, %edx
> +       jnz     L(exit64)
> +
> +       pcmpeqb 80(%rax), %xmm0
> +       add     $64, %rax
> +       pmovmskb %xmm0, %edx
> +       test    %edx, %edx
> +       jnz     L(exit16)
> +
> +       pcmpeqb 32(%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       test    %edx, %edx
> +       jnz     L(exit32)
> +
> +       pcmpeqb 48(%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       test    %edx, %edx
> +       jnz     L(exit48)
> +
> +       pcmpeqb 64(%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       test    %edx, %edx
> +       jnz     L(exit64)
> +
> +       pcmpeqb 80(%rax), %xmm0
> +       add     $64, %rax
> +       pmovmskb %xmm0, %edx
> +       test    %edx, %edx
> +       jnz     L(exit16)
> +
> +       pcmpeqb 32(%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       test    %edx, %edx
> +       jnz     L(exit32)
> +
> +       pcmpeqb 48(%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       test    %edx, %edx
> +       jnz     L(exit48)
> +
> +       pcmpeqb 64(%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       test    %edx, %edx
> +       jnz     L(exit64)
> +
> +       pcmpeqb 80(%rax), %xmm0
> +       add     $64, %rax
> +       pmovmskb %xmm0, %edx
> +       test    %edx, %edx
> +       jnz     L(exit16)
> +
> +       pcmpeqb 32(%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       test    %edx, %edx
> +       jnz     L(exit32)
> +
> +       pcmpeqb 48(%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       test    %edx, %edx
> +       jnz     L(exit48)
> +
> +       pcmpeqb 64(%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       test    %edx, %edx
> +       jnz     L(exit64)
> +
> +       test    $0x3f, %rax
> +       jz      L(align64_loop)
> +
> +       pcmpeqb 80(%rax), %xmm0
> +       add     $80, %rax
> +       pmovmskb %xmm0, %edx
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       test    $0x3f, %rax
> +       jz      L(align64_loop)
> +
> +       pcmpeqb 16(%rax), %xmm1
> +       add     $16, %rax
> +       pmovmskb %xmm1, %edx
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       test    $0x3f, %rax
> +       jz      L(align64_loop)
> +
> +       pcmpeqb 16(%rax), %xmm2
> +       add     $16, %rax
> +       pmovmskb %xmm2, %edx
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       test    $0x3f, %rax
> +       jz      L(align64_loop)
> +
> +       pcmpeqb 16(%rax), %xmm3
> +       add     $16, %rax
> +       pmovmskb %xmm3, %edx
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       add     $16, %rax
> +       .p2align 4
> +       L(align64_loop):
> +       movaps  (%rax), %xmm4
> +       pminub  16(%rax),       %xmm4
> +       movaps  32(%rax),       %xmm5
> +       pminub  48(%rax),       %xmm5
> +       add     $64,    %rax
> +       pminub  %xmm4,  %xmm5
> +       pcmpeqb %xmm0,  %xmm5
> +       pmovmskb %xmm5, %edx
> +       test    %edx,   %edx
> +       jz      L(align64_loop)
> +
> +       pcmpeqb -64(%rax), %xmm0
> +       sub     $80,    %rax
> +       pmovmskb %xmm0, %edx
> +       test    %edx, %edx
> +       jnz     L(exit16)
> +
> +       pcmpeqb 32(%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       test    %edx, %edx
> +       jnz     L(exit32)
> +
> +       pcmpeqb 48(%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       test    %edx, %edx
> +       jnz     L(exit48)
> +
> +       pcmpeqb 64(%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       sub     %rdi, %rax
> +       bsf     %rdx, %rdx
> +       add     %rdx, %rax
> +       add     $64, %rax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit):
> +       sub     %rdi, %rax
> +L(exit_less16):
> +       bsf     %rdx, %rdx
> +       add     %rdx, %rax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit16):
> +       sub     %rdi, %rax
> +       bsf     %rdx, %rdx
> +       add     %rdx, %rax
> +       add     $16, %rax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit32):
> +       sub     %rdi, %rax
> +       bsf     %rdx, %rdx
> +       add     %rdx, %rax
> +       add     $32, %rax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit48):
> +       sub     %rdi, %rax
> +       bsf     %rdx, %rdx
> +       add     %rdx, %rax
> +       add     $48, %rax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit64):
> +       sub     %rdi, %rax
> +       bsf     %rdx, %rdx
> +       add     %rdx, %rax
> +       add     $64, %rax
> +
> +       .p2align 4
> +L(StartStrcpyPart):
> +       lea     (%r9, %rax), %rdi
> +       mov     %rsi, %rcx
> +       mov     %r9, %rax      /* save result */
> +
> +# ifdef USE_AS_STRNCAT
> +       test    %r8, %r8
> +       jz      L(ExitZero)
> +#  define USE_AS_STRNCPY
> +#  include "strcpy-sse2-unaligned.S"
> +
> +# else
> +       jmp __strcpy_sse2_unaligned_tail
> +  END (STRCAT)
> +# endif
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> index 6c45ff3..a76075c 100644
> --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
> +++ b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> @@ -1,3 +1,870 @@
> +/* strncat with SSSE3
> +   Copyright (C) 2011-2013 Free Software Foundation, Inc.
> +   Contributed by Intel Corporation.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
>  #define USE_AS_STRNCAT
>  #define STRCAT __strncat_ssse3
> -#include "strcat-ssse3.S"
> +
> +#ifndef NOT_IN_libc
> +
> +# include <sysdep.h>
> +
> +# ifndef STRCAT
> +#  define STRCAT  __strcat_ssse3
> +# endif
> +
> +# define USE_AS_STRCAT
> +
> +.text
> +ENTRY (STRCAT)
> +# ifdef USE_AS_STRNCAT
> +       mov     %rdx, %r8
> +# endif
> +
> +
> +/* Inline corresponding strlen file, temporary until new strcpy
> +   implementation gets merged.  */
> +
> +       xor     %eax, %eax
> +       cmpb    $0, (%rdi)
> +       jz      L(exit_tail0)
> +       cmpb    $0, 1(%rdi)
> +       jz      L(exit_tail1)
> +       cmpb    $0, 2(%rdi)
> +       jz      L(exit_tail2)
> +       cmpb    $0, 3(%rdi)
> +       jz      L(exit_tail3)
> +
> +       cmpb    $0, 4(%rdi)
> +       jz      L(exit_tail4)
> +       cmpb    $0, 5(%rdi)
> +       jz      L(exit_tail5)
> +       cmpb    $0, 6(%rdi)
> +       jz      L(exit_tail6)
> +       cmpb    $0, 7(%rdi)
> +       jz      L(exit_tail7)
> +
> +       cmpb    $0, 8(%rdi)
> +       jz      L(exit_tail8)
> +       cmpb    $0, 9(%rdi)
> +       jz      L(exit_tail9)
> +       cmpb    $0, 10(%rdi)
> +       jz      L(exit_tail10)
> +       cmpb    $0, 11(%rdi)
> +       jz      L(exit_tail11)
> +
> +       cmpb    $0, 12(%rdi)
> +       jz      L(exit_tail12)
> +       cmpb    $0, 13(%rdi)
> +       jz      L(exit_tail13)
> +       cmpb    $0, 14(%rdi)
> +       jz      L(exit_tail14)
> +       cmpb    $0, 15(%rdi)
> +       jz      L(exit_tail15)
> +       pxor    %xmm0, %xmm0
> +       lea     16(%rdi), %rcx
> +       lea     16(%rdi), %rax
> +       and     $-16, %rax
> +
> +       pcmpeqb (%rax), %xmm0
> +       pmovmskb %xmm0, %edx
> +       pxor    %xmm1, %xmm1
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       pxor    %xmm2, %xmm2
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       pxor    %xmm3, %xmm3
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm0
> +       pmovmskb %xmm0, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm0
> +       pmovmskb %xmm0, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm0
> +       pmovmskb %xmm0, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       pcmpeqb (%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       test    %edx, %edx
> +       lea     16(%rax), %rax
> +       jnz     L(exit)
> +
> +       and     $-0x40, %rax
> +
> +       .p2align 4
> +L(aligned_64):
> +       pcmpeqb (%rax), %xmm0
> +       pcmpeqb 16(%rax), %xmm1
> +       pcmpeqb 32(%rax), %xmm2
> +       pcmpeqb 48(%rax), %xmm3
> +       pmovmskb %xmm0, %edx
> +       pmovmskb %xmm1, %r11d
> +       pmovmskb %xmm2, %r10d
> +       pmovmskb %xmm3, %r9d
> +       or      %edx, %r9d
> +       or      %r11d, %r9d
> +       or      %r10d, %r9d
> +       lea     64(%rax), %rax
> +       jz      L(aligned_64)
> +
> +       test    %edx, %edx
> +       jnz     L(aligned_64_exit_16)
> +       test    %r11d, %r11d
> +       jnz     L(aligned_64_exit_32)
> +       test    %r10d, %r10d
> +       jnz     L(aligned_64_exit_48)
> +
> +L(aligned_64_exit_64):
> +       pmovmskb %xmm3, %edx
> +       jmp     L(exit)
> +
> +L(aligned_64_exit_48):
> +       lea     -16(%rax), %rax
> +       mov     %r10d, %edx
> +       jmp     L(exit)
> +
> +L(aligned_64_exit_32):
> +       lea     -32(%rax), %rax
> +       mov     %r11d, %edx
> +       jmp     L(exit)
> +
> +L(aligned_64_exit_16):
> +       lea     -48(%rax), %rax
> +
> +L(exit):
> +       sub     %rcx, %rax
> +       test    %dl, %dl
> +       jz      L(exit_high)
> +       test    $0x01, %dl
> +       jnz     L(exit_tail0)
> +
> +       test    $0x02, %dl
> +       jnz     L(exit_tail1)
> +
> +       test    $0x04, %dl
> +       jnz     L(exit_tail2)
> +
> +       test    $0x08, %dl
> +       jnz     L(exit_tail3)
> +
> +       test    $0x10, %dl
> +       jnz     L(exit_tail4)
> +
> +       test    $0x20, %dl
> +       jnz     L(exit_tail5)
> +
> +       test    $0x40, %dl
> +       jnz     L(exit_tail6)
> +       add     $7, %eax
> +L(exit_tail0):
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_high):
> +       add     $8, %eax
> +       test    $0x01, %dh
> +       jnz     L(exit_tail0)
> +
> +       test    $0x02, %dh
> +       jnz     L(exit_tail1)
> +
> +       test    $0x04, %dh
> +       jnz     L(exit_tail2)
> +
> +       test    $0x08, %dh
> +       jnz     L(exit_tail3)
> +
> +       test    $0x10, %dh
> +       jnz     L(exit_tail4)
> +
> +       test    $0x20, %dh
> +       jnz     L(exit_tail5)
> +
> +       test    $0x40, %dh
> +       jnz     L(exit_tail6)
> +       add     $7, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail1):
> +       add     $1, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail2):
> +       add     $2, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail3):
> +       add     $3, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail4):
> +       add     $4, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail5):
> +       add     $5, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail6):
> +       add     $6, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail7):
> +       add     $7, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail8):
> +       add     $8, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail9):
> +       add     $9, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail10):
> +       add     $10, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail11):
> +       add     $11, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail12):
> +       add     $12, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail13):
> +       add     $13, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail14):
> +       add     $14, %eax
> +       jmp     L(StartStrcpyPart)
> +
> +       .p2align 4
> +L(exit_tail15):
> +       add     $15, %eax
> +
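> +/* %rax is the offset of the terminating zero in the destination;
> +   append the source (%rsi) starting at %rdi + %rax.  */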
> +       .p2align 4
> +L(StartStrcpyPart):
> +       mov     %rsi, %rcx
> +       lea     (%rdi, %rax), %rdx
> +# ifdef USE_AS_STRNCAT
> +       test    %r8, %r8
> +       jz      L(StrncatExit0)
> +       cmp     $8, %r8
> +       jbe     L(StrncatExit8Bytes)
> +# endif
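> +/* If the source ends within its first 16 bytes, copy it, including
> +   the terminating zero, through one of the short exits.  */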
> +       cmpb    $0, (%rcx)
> +       jz      L(Exit1)
> +       cmpb    $0, 1(%rcx)
> +       jz      L(Exit2)
> +       cmpb    $0, 2(%rcx)
> +       jz      L(Exit3)
> +       cmpb    $0, 3(%rcx)
> +       jz      L(Exit4)
> +       cmpb    $0, 4(%rcx)
> +       jz      L(Exit5)
> +       cmpb    $0, 5(%rcx)
> +       jz      L(Exit6)
> +       cmpb    $0, 6(%rcx)
> +       jz      L(Exit7)
> +       cmpb    $0, 7(%rcx)
> +       jz      L(Exit8)
> +       cmpb    $0, 8(%rcx)
> +       jz      L(Exit9)
> +# ifdef USE_AS_STRNCAT
> +       cmp     $16, %r8
> +       jb      L(StrncatExit15Bytes)
> +# endif
> +       cmpb    $0, 9(%rcx)
> +       jz      L(Exit10)
> +       cmpb    $0, 10(%rcx)
> +       jz      L(Exit11)
> +       cmpb    $0, 11(%rcx)
> +       jz      L(Exit12)
> +       cmpb    $0, 12(%rcx)
> +       jz      L(Exit13)
> +       cmpb    $0, 13(%rcx)
> +       jz      L(Exit14)
> +       cmpb    $0, 14(%rcx)
> +       jz      L(Exit15)
> +       cmpb    $0, 15(%rcx)
> +       jz      L(Exit16)
> +# ifdef USE_AS_STRNCAT
> +       cmp     $16, %r8
> +       je      L(StrncatExit16)
> +#  define USE_AS_STRNCPY
> +# endif
> +
> +# include "strcpy-ssse3.S"
> +
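> +/* Copy the trailing 1 to 16 bytes: add the offset in %rsi to both
> +   pointers and branch on the zero-byte mask in %al/%ah.  */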
> +       .p2align 4
> +L(CopyFrom1To16Bytes):
> +       add     %rsi, %rdx
> +       add     %rsi, %rcx
> +
> +       test    %al, %al
> +       jz      L(ExitHigh)
> +       test    $0x01, %al
> +       jnz     L(Exit1)
> +       test    $0x02, %al
> +       jnz     L(Exit2)
> +       test    $0x04, %al
> +       jnz     L(Exit3)
> +       test    $0x08, %al
> +       jnz     L(Exit4)
> +       test    $0x10, %al
> +       jnz     L(Exit5)
> +       test    $0x20, %al
> +       jnz     L(Exit6)
> +       test    $0x40, %al
> +       jnz     L(Exit7)
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(ExitHigh):
> +       test    $0x01, %ah
> +       jnz     L(Exit9)
> +       test    $0x02, %ah
> +       jnz     L(Exit10)
> +       test    $0x04, %ah
> +       jnz     L(Exit11)
> +       test    $0x08, %ah
> +       jnz     L(Exit12)
> +       test    $0x10, %ah
> +       jnz     L(Exit13)
> +       test    $0x20, %ah
> +       jnz     L(Exit14)
> +       test    $0x40, %ah
> +       jnz     L(Exit15)
> +       movlpd  (%rcx), %xmm0
> +       movlpd  8(%rcx), %xmm1
> +       movlpd  %xmm0, (%rdx)
> +       movlpd  %xmm1, 8(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit1):
> +       xor     %ah, %ah
> +       movb    %ah, 1(%rdx)
> +L(Exit1):
> +       movb    (%rcx), %al
> +       movb    %al, (%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit2):
> +       xor     %ah, %ah
> +       movb    %ah, 2(%rdx)
> +L(Exit2):
> +       movw    (%rcx), %ax
> +       movw    %ax, (%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit3):
> +       xor     %ah, %ah
> +       movb    %ah, 3(%rdx)
> +L(Exit3):
> +       movw    (%rcx), %ax
> +       movw    %ax, (%rdx)
> +       movb    2(%rcx), %al
> +       movb    %al, 2(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit4):
> +       xor     %ah, %ah
> +       movb    %ah, 4(%rdx)
> +L(Exit4):
> +       mov     (%rcx), %eax
> +       mov     %eax, (%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit5):
> +       xor     %ah, %ah
> +       movb    %ah, 5(%rdx)
> +L(Exit5):
> +       mov     (%rcx), %eax
> +       mov     %eax, (%rdx)
> +       movb    4(%rcx), %al
> +       movb    %al, 4(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit6):
> +       xor     %ah, %ah
> +       movb    %ah, 6(%rdx)
> +L(Exit6):
> +       mov     (%rcx), %eax
> +       mov     %eax, (%rdx)
> +       movw    4(%rcx), %ax
> +       movw    %ax, 4(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit7):
> +       xor     %ah, %ah
> +       movb    %ah, 7(%rdx)
> +L(Exit7):
> +       mov     (%rcx), %eax
> +       mov     %eax, (%rdx)
> +       mov     3(%rcx), %eax
> +       mov     %eax, 3(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit8):
> +       xor     %ah, %ah
> +       movb    %ah, 8(%rdx)
> +L(Exit8):
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit9):
> +       xor     %ah, %ah
> +       movb    %ah, 9(%rdx)
> +L(Exit9):
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       movb    8(%rcx), %al
> +       movb    %al, 8(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit10):
> +       xor     %ah, %ah
> +       movb    %ah, 10(%rdx)
> +L(Exit10):
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       movw    8(%rcx), %ax
> +       movw    %ax, 8(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit11):
> +       xor     %ah, %ah
> +       movb    %ah, 11(%rdx)
> +L(Exit11):
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       mov     7(%rcx), %eax
> +       mov     %eax, 7(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit12):
> +       xor     %ah, %ah
> +       movb    %ah, 12(%rdx)
> +L(Exit12):
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       mov     8(%rcx), %eax
> +       mov     %eax, 8(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit13):
> +       xor     %ah, %ah
> +       movb    %ah, 13(%rdx)
> +L(Exit13):
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       movlpd  5(%rcx), %xmm1
> +       movlpd  %xmm1, 5(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit14):
> +       xor     %ah, %ah
> +       movb    %ah, 14(%rdx)
> +L(Exit14):
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       movlpd  6(%rcx), %xmm1
> +       movlpd  %xmm1, 6(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit15):
> +       xor     %ah, %ah
> +       movb    %ah, 15(%rdx)
> +L(Exit15):
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       movlpd  7(%rcx), %xmm1
> +       movlpd  %xmm1, 7(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit16):
> +       xor     %ah, %ah
> +       movb    %ah, 16(%rdx)
> +L(Exit16):
> +       movlpd  (%rcx), %xmm0
> +       movlpd  8(%rcx), %xmm1
> +       movlpd  %xmm0, (%rdx)
> +       movlpd  %xmm1, 8(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
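> +/* The cases below are used only by strncat; %r8 tracks the remaining
> +   byte count.  */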
> +# ifdef USE_AS_STRNCPY
> +
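> +/* A terminating zero was found, but the count limit may be hit first;
> +   stop at whichever comes first.  */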
> +       .p2align 4
> +L(CopyFrom1To16BytesCase2):
> +       add     $16, %r8
> +       add     %rsi, %rcx
> +       lea     (%rsi, %rdx), %rsi
> +       lea     -9(%r8), %rdx
> +       and     $1<<7, %dh
> +       or      %al, %dh
> +       test    %dh, %dh
> +       lea     (%rsi), %rdx
> +       jz      L(ExitHighCase2)
> +
> +       test    $0x01, %al
> +       jnz     L(Exit1)
> +       cmp     $1, %r8
> +       je      L(StrncatExit1)
> +       test    $0x02, %al
> +       jnz     L(Exit2)
> +       cmp     $2, %r8
> +       je      L(StrncatExit2)
> +       test    $0x04, %al
> +       jnz     L(Exit3)
> +       cmp     $3, %r8
> +       je      L(StrncatExit3)
> +       test    $0x08, %al
> +       jnz     L(Exit4)
> +       cmp     $4, %r8
> +       je      L(StrncatExit4)
> +       test    $0x10, %al
> +       jnz     L(Exit5)
> +       cmp     $5, %r8
> +       je      L(StrncatExit5)
> +       test    $0x20, %al
> +       jnz     L(Exit6)
> +       cmp     $6, %r8
> +       je      L(StrncatExit6)
> +       test    $0x40, %al
> +       jnz     L(Exit7)
> +       cmp     $7, %r8
> +       je      L(StrncatExit7)
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
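> +       /* Write the terminating zero: step past byte 7 only when it is
> +          non-zero (cmpb sets CF when that byte is 0, so sbb $-1 adds
> +          1 - CF), then store 0 there.  */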
> +       lea     7(%rdx), %rax
> +       cmpb    $1, (%rax)
> +       sbb     $-1, %rax
> +       xor     %cl, %cl
> +       movb    %cl, (%rax)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(ExitHighCase2):
> +       test    $0x01, %ah
> +       jnz     L(Exit9)
> +       cmp     $9, %r8
> +       je      L(StrncatExit9)
> +       test    $0x02, %ah
> +       jnz     L(Exit10)
> +       cmp     $10, %r8
> +       je      L(StrncatExit10)
> +       test    $0x04, %ah
> +       jnz     L(Exit11)
> +       cmp     $11, %r8
> +       je      L(StrncatExit11)
> +       test    $0x8, %ah
> +       jnz     L(Exit12)
> +       cmp     $12, %r8
> +       je      L(StrncatExit12)
> +       test    $0x10, %ah
> +       jnz     L(Exit13)
> +       cmp     $13, %r8
> +       je      L(StrncatExit13)
> +       test    $0x20, %ah
> +       jnz     L(Exit14)
> +       cmp     $14, %r8
> +       je      L(StrncatExit14)
> +       test    $0x40, %ah
> +       jnz     L(Exit15)
> +       cmp     $15, %r8
> +       je      L(StrncatExit15)
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       movlpd  8(%rcx), %xmm1
> +       movlpd  %xmm1, 8(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
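> +/* %rax holds the zero-byte mask: non-zero means a terminator was
> +   found (case 2), zero means the count ran out first (case 3).  */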
> +L(CopyFrom1To16BytesCase2OrCase3):
> +       test    %rax, %rax
> +       jnz     L(CopyFrom1To16BytesCase2)
> +
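> +/* The count was exhausted before a zero byte was seen: copy the
> +   remaining bytes and add the terminating zero ourselves.  */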
> +       .p2align 4
> +L(CopyFrom1To16BytesCase3):
> +       add     $16, %r8
> +       add     %rsi, %rdx
> +       add     %rsi, %rcx
> +
> +       cmp     $8, %r8
> +       ja      L(ExitHighCase3)
> +       cmp     $1, %r8
> +       je      L(StrncatExit1)
> +       cmp     $2, %r8
> +       je      L(StrncatExit2)
> +       cmp     $3, %r8
> +       je      L(StrncatExit3)
> +       cmp     $4, %r8
> +       je      L(StrncatExit4)
> +       cmp     $5, %r8
> +       je      L(StrncatExit5)
> +       cmp     $6, %r8
> +       je      L(StrncatExit6)
> +       cmp     $7, %r8
> +       je      L(StrncatExit7)
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       xor     %ah, %ah
> +       movb    %ah, 8(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(ExitHighCase3):
> +       cmp     $9, %r8
> +       je      L(StrncatExit9)
> +       cmp     $10, %r8
> +       je      L(StrncatExit10)
> +       cmp     $11, %r8
> +       je      L(StrncatExit11)
> +       cmp     $12, %r8
> +       je      L(StrncatExit12)
> +       cmp     $13, %r8
> +       je      L(StrncatExit13)
> +       cmp     $14, %r8
> +       je      L(StrncatExit14)
> +       cmp     $15, %r8
> +       je      L(StrncatExit15)
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       movlpd  8(%rcx), %xmm1
> +       movlpd  %xmm1, 8(%rdx)
> +       xor     %ah, %ah
> +       movb    %ah, 16(%rdx)
> +       mov     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(StrncatExit0):
> +       mov     %rdi, %rax
> +       ret
> +
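> +/* Between 9 and 15 bytes may still be written.  */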
> +       .p2align 4
> +L(StrncatExit15Bytes):
> +       cmp     $9, %r8
> +       je      L(StrncatExit9)
> +       cmpb    $0, 9(%rcx)
> +       jz      L(Exit10)
> +       cmp     $10, %r8
> +       je      L(StrncatExit10)
> +       cmpb    $0, 10(%rcx)
> +       jz      L(Exit11)
> +       cmp     $11, %r8
> +       je      L(StrncatExit11)
> +       cmpb    $0, 11(%rcx)
> +       jz      L(Exit12)
> +       cmp     $12, %r8
> +       je      L(StrncatExit12)
> +       cmpb    $0, 12(%rcx)
> +       jz      L(Exit13)
> +       cmp     $13, %r8
> +       je      L(StrncatExit13)
> +       cmpb    $0, 13(%rcx)
> +       jz      L(Exit14)
> +       cmp     $14, %r8
> +       je      L(StrncatExit14)
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       movlpd  7(%rcx), %xmm1
> +       movlpd  %xmm1, 7(%rdx)
> +       lea     14(%rdx), %rax
> +       cmpb    $1, (%rax)
> +       sbb     $-1, %rax
> +       xor     %cl, %cl
> +       movb    %cl, (%rax)
> +       mov     %rdi, %rax
> +       ret
> +
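> +/* At most 8 bytes may still be written (%r8 is between 1 and 8).  */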
> +       .p2align 4
> +L(StrncatExit8Bytes):
> +       cmpb    $0, (%rcx)
> +       jz      L(Exit1)
> +       cmp     $1, %r8
> +       je      L(StrncatExit1)
> +       cmpb    $0, 1(%rcx)
> +       jz      L(Exit2)
> +       cmp     $2, %r8
> +       je      L(StrncatExit2)
> +       cmpb    $0, 2(%rcx)
> +       jz      L(Exit3)
> +       cmp     $3, %r8
> +       je      L(StrncatExit3)
> +       cmpb    $0, 3(%rcx)
> +       jz      L(Exit4)
> +       cmp     $4, %r8
> +       je      L(StrncatExit4)
> +       cmpb    $0, 4(%rcx)
> +       jz      L(Exit5)
> +       cmp     $5, %r8
> +       je      L(StrncatExit5)
> +       cmpb    $0, 5(%rcx)
> +       jz      L(Exit6)
> +       cmp     $6, %r8
> +       je      L(StrncatExit6)
> +       cmpb    $0, 6(%rcx)
> +       jz      L(Exit7)
> +       cmp     $7, %r8
> +       je      L(StrncatExit7)
> +       movlpd  (%rcx), %xmm0
> +       movlpd  %xmm0, (%rdx)
> +       lea     7(%rdx), %rax
> +       cmpb    $1, (%rax)
> +       sbb     $-1, %rax
> +       xor     %cl, %cl
> +       movb    %cl, (%rax)
> +       mov     %rdi, %rax
> +       ret
> +
> +# endif
> +END (STRCAT)
> +#endif
> --
> 1.8.3.2
>
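For reference, the exit_tail* chains above hand-unroll a find-first-set
over the pmovmskb result.  A minimal C sketch of the same idea (the
helper name is hypothetical, not part of the patch):

    #include <emmintrin.h>   /* SSE2 intrinsics.  */

    /* Offset of the first zero byte in the 16 bytes at P, or 16 if there
       is none; mirrors the pcmpeqb/pmovmskb + bit-test dispatch above.  */
    static inline int
    first_zero_offset (const char *p)
    {
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) p);
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk,
                                                    _mm_setzero_si128 ()));
      return mask ? __builtin_ctz (mask) : 16;
    }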

