This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [RFC] Improve strcat
- From: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: "Carlos O'Donell" <carlos at redhat dot com>, Andreas Schwab <schwab at linux-m68k dot org>, GNU C Library <libc-alpha at sourceware dot org>
- Date: Wed, 11 Sep 2013 14:45:50 +0400
- Subject: Re: [RFC] Improve strcat
- Authentication-results: sourceware.org; auth=none
- References: <20130909153051 dot GA23047 at domone dot kolej dot mff dot cuni dot cz> <20130909161112 dot GB23047 at domone dot kolej dot mff dot cuni dot cz> <mvmbo42dkiq dot fsf at hawking dot suse dot de> <20130909171703 dot GA32141 at domone dot kolej dot mff dot cuni dot cz> <87ob81c1yk dot fsf at igel dot home> <20130909191829 dot GA997 at domone dot kolej dot mff dot cuni dot cz> <522E28E9 dot 5000709 at redhat dot com> <20130910142117 dot GB6536 at domone dot kolej dot mff dot cuni dot cz> <20130910202844 dot GA11358 at domone dot kolej dot mff dot cuni dot cz> <20130911102311 dot GA22325 at domone dot kolej dot mff dot cuni dot cz>
Is it possible to make fewer drawdowns in your plots?
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/strcat_profile/results_rand_L3/result.html
Especially #4.
--
Liubov
On Wed, Sep 11, 2013 at 2:23 PM, Ondřej Bílka <neleai@seznam.cz> wrote:
> On Tue, Sep 10, 2013 at 10:28:44PM +0200, Ondřej Bílka wrote:
>> Hi Carlos,
>>
>> Here is strcpy with comments. To get structure I decided to include
>> ssse3 loop in this patch. If you are ok with splitting to loop header
>> an ssse3 could be reviewed separately.
>>
>> I omitted actual strcat calls as I have a patch that uses them ready and
>> it needs a bit of code movement.
>>
> For strcat there was one optimization opportunity left - find trailing
> zeros in source and destination in parallel. This patch does exactly
> that.
>
> This allows us to directly jump to code that copies given amount of
> bytes so I put strcat implementation to file strcpy-sse2-unaligned-v2.S.
>
> I do not handle strncat yet, so I copied old strcat*.S to strncat*.S
>
> I did not optimize instruction scheduling yet to make code easier to
> read.
>
> Results of benchmark are here.
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strcat_profile.html
>
> Comments?
>
> ---
> sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 280 +------
> sysdeps/x86_64/multiarch/strcat-ssse3.S | 868 +-------------------
> .../x86_64/multiarch/strcpy-sse2-unaligned-v2.S | 217 ++++-
> sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S | 2 +-
> sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S | 285 ++++++-
> sysdeps/x86_64/multiarch/strncat-ssse3.S | 869 ++++++++++++++++++++-
> 6 files changed, 1364 insertions(+), 1157 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> index 028c6d3..03c1f18 100644
> --- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
> @@ -1,279 +1 @@
> -/* strcat with SSE2
> - Copyright (C) 2011-2013 Free Software Foundation, Inc.
> - Contributed by Intel Corporation.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -#ifndef NOT_IN_libc
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -# define STRCAT __strcat_sse2_unaligned
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> - mov %rdi, %r9
> -# ifdef USE_AS_STRNCAT
> - mov %rdx, %r8
> -# endif
> -
> -/* Inline corresponding strlen file, temporary until new strcpy
> - implementation gets merged. */
> -
> - xor %rax, %rax
> - mov %edi, %ecx
> - and $0x3f, %ecx
> - pxor %xmm0, %xmm0
> - cmp $0x30, %ecx
> - ja L(next)
> - movdqu (%rdi), %xmm1
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - jnz L(exit_less16)
> - mov %rdi, %rax
> - and $-16, %rax
> - jmp L(align16_start)
> -L(next):
> - mov %rdi, %rax
> - and $-16, %rax
> - pcmpeqb (%rax), %xmm0
> - mov $-1, %r10d
> - sub %rax, %rcx
> - shl %cl, %r10d
> - pmovmskb %xmm0, %edx
> - and %r10d, %edx
> - jnz L(exit)
> -
> -L(align16_start):
> - pxor %xmm0, %xmm0
> - pxor %xmm1, %xmm1
> - pxor %xmm2, %xmm2
> - pxor %xmm3, %xmm3
> - pcmpeqb 16(%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - jnz L(exit16)
> -
> - pcmpeqb 32(%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - jnz L(exit32)
> -
> - pcmpeqb 48(%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - jnz L(exit48)
> -
> - pcmpeqb 64(%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - jnz L(exit64)
> -
> - pcmpeqb 80(%rax), %xmm0
> - add $64, %rax
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - jnz L(exit16)
> -
> - pcmpeqb 32(%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - jnz L(exit32)
> -
> - pcmpeqb 48(%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - jnz L(exit48)
> -
> - pcmpeqb 64(%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - jnz L(exit64)
> -
> - pcmpeqb 80(%rax), %xmm0
> - add $64, %rax
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - jnz L(exit16)
> -
> - pcmpeqb 32(%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - jnz L(exit32)
> -
> - pcmpeqb 48(%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - jnz L(exit48)
> -
> - pcmpeqb 64(%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - jnz L(exit64)
> -
> - pcmpeqb 80(%rax), %xmm0
> - add $64, %rax
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - jnz L(exit16)
> -
> - pcmpeqb 32(%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - jnz L(exit32)
> -
> - pcmpeqb 48(%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - jnz L(exit48)
> -
> - pcmpeqb 64(%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - jnz L(exit64)
> -
> - test $0x3f, %rax
> - jz L(align64_loop)
> -
> - pcmpeqb 80(%rax), %xmm0
> - add $80, %rax
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - jnz L(exit)
> -
> - test $0x3f, %rax
> - jz L(align64_loop)
> -
> - pcmpeqb 16(%rax), %xmm1
> - add $16, %rax
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - jnz L(exit)
> -
> - test $0x3f, %rax
> - jz L(align64_loop)
> -
> - pcmpeqb 16(%rax), %xmm2
> - add $16, %rax
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - jnz L(exit)
> -
> - test $0x3f, %rax
> - jz L(align64_loop)
> -
> - pcmpeqb 16(%rax), %xmm3
> - add $16, %rax
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - jnz L(exit)
> -
> - add $16, %rax
> - .p2align 4
> - L(align64_loop):
> - movaps (%rax), %xmm4
> - pminub 16(%rax), %xmm4
> - movaps 32(%rax), %xmm5
> - pminub 48(%rax), %xmm5
> - add $64, %rax
> - pminub %xmm4, %xmm5
> - pcmpeqb %xmm0, %xmm5
> - pmovmskb %xmm5, %edx
> - test %edx, %edx
> - jz L(align64_loop)
> -
> - pcmpeqb -64(%rax), %xmm0
> - sub $80, %rax
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - jnz L(exit16)
> -
> - pcmpeqb 32(%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - jnz L(exit32)
> -
> - pcmpeqb 48(%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - jnz L(exit48)
> -
> - pcmpeqb 64(%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - sub %rdi, %rax
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - add $64, %rax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit):
> - sub %rdi, %rax
> -L(exit_less16):
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit16):
> - sub %rdi, %rax
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - add $16, %rax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit32):
> - sub %rdi, %rax
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - add $32, %rax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit48):
> - sub %rdi, %rax
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - add $48, %rax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit64):
> - sub %rdi, %rax
> - bsf %rdx, %rdx
> - add %rdx, %rax
> - add $64, %rax
> -
> - .p2align 4
> -L(StartStrcpyPart):
> - lea (%r9, %rax), %rdi
> - mov %rsi, %rcx
> - mov %r9, %rax /* save result */
> -
> -# ifdef USE_AS_STRNCAT
> - test %r8, %r8
> - jz L(ExitZero)
> -# define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-sse2-unaligned.S"
> -#endif
> +/* Implemented in strcpy-sse2-unaligned-v2.S */
> diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> index 8101b91..fd5fba7 100644
> --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
> +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> @@ -1,867 +1 @@
> -/* strcat with SSSE3
> - Copyright (C) 2011-2013 Free Software Foundation, Inc.
> - Contributed by Intel Corporation.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -#ifndef NOT_IN_libc
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -# define STRCAT __strcat_ssse3
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> -# ifdef USE_AS_STRNCAT
> - mov %rdx, %r8
> -# endif
> -
> -
> -/* Inline corresponding strlen file, temporary until new strcpy
> - implementation gets merged. */
> -
> - xor %eax, %eax
> - cmpb $0, (%rdi)
> - jz L(exit_tail0)
> - cmpb $0, 1(%rdi)
> - jz L(exit_tail1)
> - cmpb $0, 2(%rdi)
> - jz L(exit_tail2)
> - cmpb $0, 3(%rdi)
> - jz L(exit_tail3)
> -
> - cmpb $0, 4(%rdi)
> - jz L(exit_tail4)
> - cmpb $0, 5(%rdi)
> - jz L(exit_tail5)
> - cmpb $0, 6(%rdi)
> - jz L(exit_tail6)
> - cmpb $0, 7(%rdi)
> - jz L(exit_tail7)
> -
> - cmpb $0, 8(%rdi)
> - jz L(exit_tail8)
> - cmpb $0, 9(%rdi)
> - jz L(exit_tail9)
> - cmpb $0, 10(%rdi)
> - jz L(exit_tail10)
> - cmpb $0, 11(%rdi)
> - jz L(exit_tail11)
> -
> - cmpb $0, 12(%rdi)
> - jz L(exit_tail12)
> - cmpb $0, 13(%rdi)
> - jz L(exit_tail13)
> - cmpb $0, 14(%rdi)
> - jz L(exit_tail14)
> - cmpb $0, 15(%rdi)
> - jz L(exit_tail15)
> - pxor %xmm0, %xmm0
> - lea 16(%rdi), %rcx
> - lea 16(%rdi), %rax
> - and $-16, %rax
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - pxor %xmm1, %xmm1
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - pxor %xmm2, %xmm2
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - pxor %xmm3, %xmm3
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - and $-0x40, %rax
> -
> - .p2align 4
> -L(aligned_64):
> - pcmpeqb (%rax), %xmm0
> - pcmpeqb 16(%rax), %xmm1
> - pcmpeqb 32(%rax), %xmm2
> - pcmpeqb 48(%rax), %xmm3
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm1, %r11d
> - pmovmskb %xmm2, %r10d
> - pmovmskb %xmm3, %r9d
> - or %edx, %r9d
> - or %r11d, %r9d
> - or %r10d, %r9d
> - lea 64(%rax), %rax
> - jz L(aligned_64)
> -
> - test %edx, %edx
> - jnz L(aligned_64_exit_16)
> - test %r11d, %r11d
> - jnz L(aligned_64_exit_32)
> - test %r10d, %r10d
> - jnz L(aligned_64_exit_48)
> -
> -L(aligned_64_exit_64):
> - pmovmskb %xmm3, %edx
> - jmp L(exit)
> -
> -L(aligned_64_exit_48):
> - lea -16(%rax), %rax
> - mov %r10d, %edx
> - jmp L(exit)
> -
> -L(aligned_64_exit_32):
> - lea -32(%rax), %rax
> - mov %r11d, %edx
> - jmp L(exit)
> -
> -L(aligned_64_exit_16):
> - lea -48(%rax), %rax
> -
> -L(exit):
> - sub %rcx, %rax
> - test %dl, %dl
> - jz L(exit_high)
> - test $0x01, %dl
> - jnz L(exit_tail0)
> -
> - test $0x02, %dl
> - jnz L(exit_tail1)
> -
> - test $0x04, %dl
> - jnz L(exit_tail2)
> -
> - test $0x08, %dl
> - jnz L(exit_tail3)
> -
> - test $0x10, %dl
> - jnz L(exit_tail4)
> -
> - test $0x20, %dl
> - jnz L(exit_tail5)
> -
> - test $0x40, %dl
> - jnz L(exit_tail6)
> - add $7, %eax
> -L(exit_tail0):
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_high):
> - add $8, %eax
> - test $0x01, %dh
> - jnz L(exit_tail0)
> -
> - test $0x02, %dh
> - jnz L(exit_tail1)
> -
> - test $0x04, %dh
> - jnz L(exit_tail2)
> -
> - test $0x08, %dh
> - jnz L(exit_tail3)
> -
> - test $0x10, %dh
> - jnz L(exit_tail4)
> -
> - test $0x20, %dh
> - jnz L(exit_tail5)
> -
> - test $0x40, %dh
> - jnz L(exit_tail6)
> - add $7, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail1):
> - add $1, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail2):
> - add $2, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail3):
> - add $3, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail4):
> - add $4, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail5):
> - add $5, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail6):
> - add $6, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail7):
> - add $7, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail8):
> - add $8, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail9):
> - add $9, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail10):
> - add $10, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail11):
> - add $11, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail12):
> - add $12, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail13):
> - add $13, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail14):
> - add $14, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail15):
> - add $15, %eax
> -
> - .p2align 4
> -L(StartStrcpyPart):
> - mov %rsi, %rcx
> - lea (%rdi, %rax), %rdx
> -# ifdef USE_AS_STRNCAT
> - test %r8, %r8
> - jz L(StrncatExit0)
> - cmp $8, %r8
> - jbe L(StrncatExit8Bytes)
> -# endif
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - cmpb $0, 7(%rcx)
> - jz L(Exit8)
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> -# ifdef USE_AS_STRNCAT
> - cmp $16, %r8
> - jb L(StrncatExit15Bytes)
> -# endif
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - cmpb $0, 14(%rcx)
> - jz L(Exit15)
> - cmpb $0, 15(%rcx)
> - jz L(Exit16)
> -# ifdef USE_AS_STRNCAT
> - cmp $16, %r8
> - je L(StrncatExit16)
> -# define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-ssse3.S"
> -
> - .p2align 4
> -L(CopyFrom1To16Bytes):
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - test %al, %al
> - jz L(ExitHigh)
> - test $0x01, %al
> - jnz L(Exit1)
> - test $0x02, %al
> - jnz L(Exit2)
> - test $0x04, %al
> - jnz L(Exit3)
> - test $0x08, %al
> - jnz L(Exit4)
> - test $0x10, %al
> - jnz L(Exit5)
> - test $0x20, %al
> - jnz L(Exit6)
> - test $0x40, %al
> - jnz L(Exit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(ExitHigh):
> - test $0x01, %ah
> - jnz L(Exit9)
> - test $0x02, %ah
> - jnz L(Exit10)
> - test $0x04, %ah
> - jnz L(Exit11)
> - test $0x08, %ah
> - jnz L(Exit12)
> - test $0x10, %ah
> - jnz L(Exit13)
> - test $0x20, %ah
> - jnz L(Exit14)
> - test $0x40, %ah
> - jnz L(Exit15)
> - movlpd (%rcx), %xmm0
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm0, (%rdx)
> - movlpd %xmm1, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit1):
> - xor %ah, %ah
> - movb %ah, 1(%rdx)
> -L(Exit1):
> - movb (%rcx), %al
> - movb %al, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit2):
> - xor %ah, %ah
> - movb %ah, 2(%rdx)
> -L(Exit2):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit3):
> - xor %ah, %ah
> - movb %ah, 3(%rdx)
> -L(Exit3):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> - movb 2(%rcx), %al
> - movb %al, 2(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit4):
> - xor %ah, %ah
> - movb %ah, 4(%rdx)
> -L(Exit4):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit5):
> - xor %ah, %ah
> - movb %ah, 5(%rdx)
> -L(Exit5):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - movb 4(%rcx), %al
> - movb %al, 4(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit6):
> - xor %ah, %ah
> - movb %ah, 6(%rdx)
> -L(Exit6):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - movw 4(%rcx), %ax
> - movw %ax, 4(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit7):
> - xor %ah, %ah
> - movb %ah, 7(%rdx)
> -L(Exit7):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - mov 3(%rcx), %eax
> - mov %eax, 3(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit8):
> - xor %ah, %ah
> - movb %ah, 8(%rdx)
> -L(Exit8):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit9):
> - xor %ah, %ah
> - movb %ah, 9(%rdx)
> -L(Exit9):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movb 8(%rcx), %al
> - movb %al, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit10):
> - xor %ah, %ah
> - movb %ah, 10(%rdx)
> -L(Exit10):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movw 8(%rcx), %ax
> - movw %ax, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit11):
> - xor %ah, %ah
> - movb %ah, 11(%rdx)
> -L(Exit11):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov 7(%rcx), %eax
> - mov %eax, 7(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit12):
> - xor %ah, %ah
> - movb %ah, 12(%rdx)
> -L(Exit12):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov 8(%rcx), %eax
> - mov %eax, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit13):
> - xor %ah, %ah
> - movb %ah, 13(%rdx)
> -L(Exit13):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 5(%rcx), %xmm1
> - movlpd %xmm1, 5(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit14):
> - xor %ah, %ah
> - movb %ah, 14(%rdx)
> -L(Exit14):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 6(%rcx), %xmm1
> - movlpd %xmm1, 6(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit15):
> - xor %ah, %ah
> - movb %ah, 15(%rdx)
> -L(Exit15):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 7(%rcx), %xmm1
> - movlpd %xmm1, 7(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit16):
> - xor %ah, %ah
> - movb %ah, 16(%rdx)
> -L(Exit16):
> - movlpd (%rcx), %xmm0
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm0, (%rdx)
> - movlpd %xmm1, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase2):
> - add $16, %r8
> - add %rsi, %rcx
> - lea (%rsi, %rdx), %rsi
> - lea -9(%r8), %rdx
> - and $1<<7, %dh
> - or %al, %dh
> - test %dh, %dh
> - lea (%rsi), %rdx
> - jz L(ExitHighCase2)
> -
> - test $0x01, %al
> - jnz L(Exit1)
> - cmp $1, %r8
> - je L(StrncatExit1)
> - test $0x02, %al
> - jnz L(Exit2)
> - cmp $2, %r8
> - je L(StrncatExit2)
> - test $0x04, %al
> - jnz L(Exit3)
> - cmp $3, %r8
> - je L(StrncatExit3)
> - test $0x08, %al
> - jnz L(Exit4)
> - cmp $4, %r8
> - je L(StrncatExit4)
> - test $0x10, %al
> - jnz L(Exit5)
> - cmp $5, %r8
> - je L(StrncatExit5)
> - test $0x20, %al
> - jnz L(Exit6)
> - cmp $6, %r8
> - je L(StrncatExit6)
> - test $0x40, %al
> - jnz L(Exit7)
> - cmp $7, %r8
> - je L(StrncatExit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - lea 7(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> - xor %cl, %cl
> - movb %cl, (%rax)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(ExitHighCase2):
> - test $0x01, %ah
> - jnz L(Exit9)
> - cmp $9, %r8
> - je L(StrncatExit9)
> - test $0x02, %ah
> - jnz L(Exit10)
> - cmp $10, %r8
> - je L(StrncatExit10)
> - test $0x04, %ah
> - jnz L(Exit11)
> - cmp $11, %r8
> - je L(StrncatExit11)
> - test $0x8, %ah
> - jnz L(Exit12)
> - cmp $12, %r8
> - je L(StrncatExit12)
> - test $0x10, %ah
> - jnz L(Exit13)
> - cmp $13, %r8
> - je L(StrncatExit13)
> - test $0x20, %ah
> - jnz L(Exit14)
> - cmp $14, %r8
> - je L(StrncatExit14)
> - test $0x40, %ah
> - jnz L(Exit15)
> - cmp $15, %r8
> - je L(StrncatExit15)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm1, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase3):
> - add $16, %r8
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - cmp $8, %r8
> - ja L(ExitHighCase3)
> - cmp $1, %r8
> - je L(StrncatExit1)
> - cmp $2, %r8
> - je L(StrncatExit2)
> - cmp $3, %r8
> - je L(StrncatExit3)
> - cmp $4, %r8
> - je L(StrncatExit4)
> - cmp $5, %r8
> - je L(StrncatExit5)
> - cmp $6, %r8
> - je L(StrncatExit6)
> - cmp $7, %r8
> - je L(StrncatExit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - xor %ah, %ah
> - movb %ah, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(ExitHighCase3):
> - cmp $9, %r8
> - je L(StrncatExit9)
> - cmp $10, %r8
> - je L(StrncatExit10)
> - cmp $11, %r8
> - je L(StrncatExit11)
> - cmp $12, %r8
> - je L(StrncatExit12)
> - cmp $13, %r8
> - je L(StrncatExit13)
> - cmp $14, %r8
> - je L(StrncatExit14)
> - cmp $15, %r8
> - je L(StrncatExit15)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm1, 8(%rdx)
> - xor %ah, %ah
> - movb %ah, 16(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit0):
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit15Bytes):
> - cmp $9, %r8
> - je L(StrncatExit9)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmp $10, %r8
> - je L(StrncatExit10)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmp $11, %r8
> - je L(StrncatExit11)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmp $12, %r8
> - je L(StrncatExit12)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmp $13, %r8
> - je L(StrncatExit13)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - cmp $14, %r8
> - je L(StrncatExit14)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 7(%rcx), %xmm1
> - movlpd %xmm1, 7(%rdx)
> - lea 14(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> - xor %cl, %cl
> - movb %cl, (%rax)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit8Bytes):
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmp $1, %r8
> - je L(StrncatExit1)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmp $2, %r8
> - je L(StrncatExit2)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmp $3, %r8
> - je L(StrncatExit3)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmp $4, %r8
> - je L(StrncatExit4)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmp $5, %r8
> - je L(StrncatExit5)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmp $6, %r8
> - je L(StrncatExit6)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - cmp $7, %r8
> - je L(StrncatExit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - lea 7(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> - xor %cl, %cl
> - movb %cl, (%rax)
> - mov %rdi, %rax
> - ret
> -
> -# endif
> -END (STRCAT)
> -#endif
> +/* Implemented in strcpy-ssse3-v2.S */
> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> index 9725857..77b9adb 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
> @@ -21,18 +21,218 @@
> # include <sysdep.h>
>
> # ifndef STRCPY
> -# define STRCPY_TAIL __strcpy_sse2_unaligned_tail
> # define STRCPY __strcpy_sse2_unaligned
> +# define STRCAT __strcat_sse2_unaligned
> # endif
>
> # define ALIGN(x) .p2align x
>
> #ifndef USE_AS_STPCPY
> -ENTRY (STRCPY_TAIL)
> - movq %rsi, %rdx
> - pxor %xmm4, %xmm4
> - jmp L(from_tail)
> -END (STRCPY_TAIL)
> +	/* Optimized strcat, where we optimize the path when src and dest are at
> +	   most 64 bytes large by finding terminating zero in parallel.
> +
> + if ((src | dest) % 4096 > 4096 - 64)
> + goto strcat_cross_page;
> + d = zero64 (dest)
> + s = zero64 (src)
> + if (d)
> + {
> + found:;
> + dest += ffs (d)
> + if (s)
> + return copy_less64_bytes (dest, src, ffs (s))
> + copy64_bytes (dest, src)
> + return strcpy (dest, src)
> + }
> + else
> + {
> + dest = ALIGN_DOWN (dest,64);
> + while (1)
> + {
> + dest += 64;
> + d = zero64 (dest);
> + if (d)
> + goto found;
> + }
> + }
> + */
> +ENTRY (STRCAT)
> + movq %rdi, %rax
> + pxor %xmm5, %xmm5
> + movq %rsi, %rdx
> + pxor %xmm6, %xmm6
> + orq %rdi, %rdx
> + pxor %xmm7, %xmm7
> + andl $4095, %edx
> + pxor %xmm0, %xmm0
> + cmpq $4032, %rdx
> + ja L(strcat_cross_page)
> + movdqu (%rsi), %xmm1
> + movdqu 16(%rsi), %xmm2
> + movdqu 32(%rsi), %xmm3
> + movdqu 48(%rsi), %xmm4
> + pxor %xmm8, %xmm8
> + movdqu (%rdi), %xmm9
> + movdqu 16(%rdi), %xmm10
> + movdqu 32(%rdi), %xmm11
> + movdqu 48(%rdi), %xmm12
> + pcmpeqb %xmm1, %xmm5
> + pcmpeqb %xmm2, %xmm6
> + pcmpeqb %xmm3, %xmm7
> + pcmpeqb %xmm4, %xmm0
> + pcmpeqb %xmm8, %xmm9
> + pcmpeqb %xmm8, %xmm10
> + pcmpeqb %xmm8, %xmm11
> + pcmpeqb %xmm8, %xmm12
> +
> + pmovmskb %xmm5, %edx
> + pmovmskb %xmm6, %r8d
> + pmovmskb %xmm7, %r9d
> + pmovmskb %xmm0, %r10d
> + shlq $16, %r8
> + shlq $32, %r9
> + shlq $48, %r10
> + orq %r8, %rdx
> + orq %r9, %rdx
> + orq %r10, %rdx
> +
> + pmovmskb %xmm9, %ecx
> + pmovmskb %xmm10, %r11d
> + pmovmskb %xmm11, %r10d
> + pmovmskb %xmm12, %r9d
> + shlq $16, %r11
> + shlq $32, %r10
> + shlq $48, %r9
> + orq %r11, %rcx
> + orq %r10, %rcx
> + orq %r9, %rcx
> +
> + testq %rcx, %rcx
> + je L(strcat_loop_start)
> +L(strcat_found_zero):
> + bsfq %rcx, %rcx
> + addq %rcx, %rdi
> + testq %rdx, %rdx
> + jne L(last_64_bytes)
> + jmp L(strcat_first_64_bytes)
> +
> +L(strcat_loop_start):
> + andq $-64, %rdi
> + ALIGN(4)
> +L(strcat_loop):
> + movdqa 64(%rdi), %xmm5
> + pminub 80(%rdi), %xmm5
> + pminub 96(%rdi), %xmm5
> + pminub 112(%rdi), %xmm5
> + addq $64, %rdi
> + pcmpeqb %xmm8, %xmm5
> + pmovmskb %xmm5, %ecx
> + testl %ecx, %ecx
> + je L(strcat_loop)
> + shlq $48, %rcx
> + movdqa (%rdi), %xmm9
> + movdqa 16(%rdi), %xmm10
> + movdqa 32(%rdi), %xmm11
> + pcmpeqb %xmm8, %xmm9
> + pcmpeqb %xmm8, %xmm10
> + pcmpeqb %xmm8, %xmm11
> + pmovmskb %xmm9, %r9d
> + pmovmskb %xmm10, %r10d
> + pmovmskb %xmm11, %r11d
> + salq $16, %r10
> + salq $32, %r11
> + orq %r9, %rcx
> + orq %r10, %rcx
> + orq %r11, %rcx
> + jmp L(strcat_found_zero)
> +
> + /*
> + d_al = ALIGN_DOWN (dest, 64);
> + d = zero64 (d_al)
> + d = d >> (dest - d_al)
> + if (d)
> + {
> + dest += ffs (d)
> + return strcpy (dest, src)
> + }
> + else
> + {
> + dest = ALIGN_DOWN (dest,64);
> + while (1)
> + {
> + dest += 64;
> + d = zero64 (dest);
> + if (d) {
> + dest += ffs (d)
> + return strcpy (dest, src)
> + }
> + }
> + } */
> + L(strcat_cross_page):
> + andq $-64, %rdi
> +
> + movdqa 48(%rdi), %xmm12
> + pcmpeqb %xmm8, %xmm12
> + pmovmskb %xmm12, %rcx
> + shlq $48, %rcx
> + movdqa (%rdi), %xmm9
> + movdqa 16(%rdi), %xmm10
> + movdqa 32(%rdi), %xmm11
> + pcmpeqb %xmm8, %xmm9
> + pcmpeqb %xmm8, %xmm10
> + pcmpeqb %xmm8, %xmm11
> + pmovmskb %xmm9, %r9d
> + pmovmskb %xmm10, %r10d
> + pmovmskb %xmm11, %r11d
> + salq $16, %r10
> + salq $32, %r11
> + orq %r9, %rcx
> + orq %r10, %rcx
> + orq %r11, %rcx
> + movq %rcx, %rdx
> + movq %rax, %rcx
> + shrq %cl, %rdx /* We use fact that shifts are done modulo 64. */
> + testq %rdx, %rdx
> + je L(strcat_cross_loop)
> + movq %rax, %rdi
> +
> + pxor %xmm4, %xmm4
> + bsfq %rdx, %rcx
> + addq %rcx, %rdi
> + jmp L(from_strcat)
> +
> + ALIGN(4)
> +L(strcat_cross_loop):
> + movdqa 64(%rdi), %xmm5
> + pminub 80(%rdi), %xmm5
> + pminub 96(%rdi), %xmm5
> + pminub 112(%rdi), %xmm5
> + addq $64, %rdi
> + pcmpeqb %xmm8, %xmm5
> + pmovmskb %xmm5, %ecx
> + testl %ecx, %ecx
> + je L(strcat_cross_loop)
> + shlq $48, %rcx
> + movdqa (%rdi), %xmm9
> + movdqa 16(%rdi), %xmm10
> + movdqa 32(%rdi), %xmm11
> + pcmpeqb %xmm8, %xmm9
> + pcmpeqb %xmm8, %xmm10
> + pcmpeqb %xmm8, %xmm11
> + pmovmskb %xmm9, %r9d
> + pmovmskb %xmm10, %r10d
> + pmovmskb %xmm11, %r11d
> + salq $16, %r10
> + salq $32, %r11
> + orq %r9, %rcx
> + orq %r10, %rcx
> + orq %r11, %rcx
> +
> + pxor %xmm4, %xmm4
> + bsfq %rcx, %rcx
> + addq %rcx, %rdi
> + jmp L(from_strcat)
> +END (STRCAT)
> #endif
>
> ENTRY (STRCPY)
> @@ -51,7 +251,7 @@ ENTRY (STRCPY)
> movq %rsi, %rdx
> pxor %xmm4, %xmm4
> movq %rdi, %rax
> -L(from_tail):
> +L(from_strcat):
> pxor %xmm5, %xmm5
> andl $4095, %edx
> pxor %xmm6, %xmm6
> @@ -88,6 +288,7 @@ L(from_tail):
> salq $48, %rcx
> orq %rcx, %rdx
> jne L(between_32_64_bytes)
> +L(strcat_first_64_bytes):
> movdqu %xmm1, (%rdi)
> movdqu %xmm2, 16(%rdi)
> movdqu %xmm3, 32(%rdi)
> @@ -137,7 +338,7 @@ L(prepare_loop):
> /* After loop finished we call following
> copy_less64_bytes (erdi, ersi, ffs(erdx) + 1);
> return; */
> -
> +L(last_64_bytes):
> bsfq %rdx, %rcx
> #ifdef USE_AS_STPCPY
> lea (%rdi, %rcx), %rax
> diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S b/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
> index 8f70c42..9705e53 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
> @@ -19,8 +19,8 @@
> #define USE_SSSE3
>
> #ifndef STRCPY
> -# define STRCPY_TAIL __strcpy_ssse3_tail
> # define STRCPY __strcpy_ssse3
> +# define STRCAT __strcat_ssse3
> #endif
>
> #include "strcpy-sse2-unaligned-v2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> index 133e1d2..34dd69b 100644
> --- a/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> +++ b/sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S
> @@ -1,3 +1,286 @@
> +/* strncat with SSE2
> + Copyright (C) 2011-2013 Free Software Foundation, Inc.
> + Contributed by Intel Corporation.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> #define USE_AS_STRNCAT
> #define STRCAT __strncat_sse2_unaligned
> -#include "strcat-sse2-unaligned.S"
> +
> +#ifndef NOT_IN_libc
> +
> +# include <sysdep.h>
> +
> +# ifndef STRCAT
> +# define STRCAT __strcat_sse2_unaligned
> +# endif
> +
> +# define USE_AS_STRCAT
> +
> +.text
> +ENTRY (STRCAT)
> + mov %rdi, %r9
> +# ifdef USE_AS_STRNCAT
> + mov %rdx, %r8
> +# endif
> +
> +/* Inline corresponding strlen file, temporary until new strcpy
> + implementation gets merged. */
> +
> + xor %rax, %rax
> + mov %edi, %ecx
> + and $0x3f, %ecx
> + pxor %xmm0, %xmm0
> + cmp $0x30, %ecx
> + ja L(next)
> + movdqu (%rdi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pmovmskb %xmm0, %edx
> + test %edx, %edx
> + jnz L(exit_less16)
> + mov %rdi, %rax
> + and $-16, %rax
> + jmp L(align16_start)
> +L(next):
> + mov %rdi, %rax
> + and $-16, %rax
> + pcmpeqb (%rax), %xmm0
> + mov $-1, %r10d
> + sub %rax, %rcx
> + shl %cl, %r10d
> + pmovmskb %xmm0, %edx
> + and %r10d, %edx
> + jnz L(exit)
> +
> +L(align16_start):
> + pxor %xmm0, %xmm0
> + pxor %xmm1, %xmm1
> + pxor %xmm2, %xmm2
> + pxor %xmm3, %xmm3
> + pcmpeqb 16(%rax), %xmm0
> + pmovmskb %xmm0, %edx
> + test %edx, %edx
> + jnz L(exit16)
> +
> + pcmpeqb 32(%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + test %edx, %edx
> + jnz L(exit32)
> +
> + pcmpeqb 48(%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + test %edx, %edx
> + jnz L(exit48)
> +
> + pcmpeqb 64(%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + test %edx, %edx
> + jnz L(exit64)
> +
> + pcmpeqb 80(%rax), %xmm0
> + add $64, %rax
> + pmovmskb %xmm0, %edx
> + test %edx, %edx
> + jnz L(exit16)
> +
> + pcmpeqb 32(%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + test %edx, %edx
> + jnz L(exit32)
> +
> + pcmpeqb 48(%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + test %edx, %edx
> + jnz L(exit48)
> +
> + pcmpeqb 64(%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + test %edx, %edx
> + jnz L(exit64)
> +
> + pcmpeqb 80(%rax), %xmm0
> + add $64, %rax
> + pmovmskb %xmm0, %edx
> + test %edx, %edx
> + jnz L(exit16)
> +
> + pcmpeqb 32(%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + test %edx, %edx
> + jnz L(exit32)
> +
> + pcmpeqb 48(%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + test %edx, %edx
> + jnz L(exit48)
> +
> + pcmpeqb 64(%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + test %edx, %edx
> + jnz L(exit64)
> +
> + pcmpeqb 80(%rax), %xmm0
> + add $64, %rax
> + pmovmskb %xmm0, %edx
> + test %edx, %edx
> + jnz L(exit16)
> +
> + pcmpeqb 32(%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + test %edx, %edx
> + jnz L(exit32)
> +
> + pcmpeqb 48(%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + test %edx, %edx
> + jnz L(exit48)
> +
> + pcmpeqb 64(%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + test %edx, %edx
> + jnz L(exit64)
> +
> + test $0x3f, %rax
> + jz L(align64_loop)
> +
> + pcmpeqb 80(%rax), %xmm0
> + add $80, %rax
> + pmovmskb %xmm0, %edx
> + test %edx, %edx
> + jnz L(exit)
> +
> + test $0x3f, %rax
> + jz L(align64_loop)
> +
> + pcmpeqb 16(%rax), %xmm1
> + add $16, %rax
> + pmovmskb %xmm1, %edx
> + test %edx, %edx
> + jnz L(exit)
> +
> + test $0x3f, %rax
> + jz L(align64_loop)
> +
> + pcmpeqb 16(%rax), %xmm2
> + add $16, %rax
> + pmovmskb %xmm2, %edx
> + test %edx, %edx
> + jnz L(exit)
> +
> + test $0x3f, %rax
> + jz L(align64_loop)
> +
> + pcmpeqb 16(%rax), %xmm3
> + add $16, %rax
> + pmovmskb %xmm3, %edx
> + test %edx, %edx
> + jnz L(exit)
> +
> + add $16, %rax
> + .p2align 4
> + L(align64_loop):
> + movaps (%rax), %xmm4
> + pminub 16(%rax), %xmm4
> + movaps 32(%rax), %xmm5
> + pminub 48(%rax), %xmm5
> + add $64, %rax
> + pminub %xmm4, %xmm5
> + pcmpeqb %xmm0, %xmm5
> + pmovmskb %xmm5, %edx
> + test %edx, %edx
> + jz L(align64_loop)
> +
> + pcmpeqb -64(%rax), %xmm0
> + sub $80, %rax
> + pmovmskb %xmm0, %edx
> + test %edx, %edx
> + jnz L(exit16)
> +
> + pcmpeqb 32(%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + test %edx, %edx
> + jnz L(exit32)
> +
> + pcmpeqb 48(%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + test %edx, %edx
> + jnz L(exit48)
> +
> + pcmpeqb 64(%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + sub %rdi, %rax
> + bsf %rdx, %rdx
> + add %rdx, %rax
> + add $64, %rax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit):
> + sub %rdi, %rax
> +L(exit_less16):
> + bsf %rdx, %rdx
> + add %rdx, %rax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit16):
> + sub %rdi, %rax
> + bsf %rdx, %rdx
> + add %rdx, %rax
> + add $16, %rax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit32):
> + sub %rdi, %rax
> + bsf %rdx, %rdx
> + add %rdx, %rax
> + add $32, %rax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit48):
> + sub %rdi, %rax
> + bsf %rdx, %rdx
> + add %rdx, %rax
> + add $48, %rax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit64):
> + sub %rdi, %rax
> + bsf %rdx, %rdx
> + add %rdx, %rax
> + add $64, %rax
> +
> + .p2align 4
> +L(StartStrcpyPart):
> + lea (%r9, %rax), %rdi
> + mov %rsi, %rcx
> + mov %r9, %rax /* save result */
> +
> +# ifdef USE_AS_STRNCAT
> + test %r8, %r8
> + jz L(ExitZero)
> +# define USE_AS_STRNCPY
> +# include "strcpy-sse2-unaligned.S"
> +
> +# else
> + jmp __strcpy_sse2_unaligned_tail
> + END (STRCAT)
> +# endif
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> index 6c45ff3..a76075c 100644
> --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
> +++ b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> @@ -1,3 +1,870 @@
> +/* strncat with SSSE3
> + Copyright (C) 2011-2013 Free Software Foundation, Inc.
> + Contributed by Intel Corporation.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> #define USE_AS_STRNCAT
> #define STRCAT __strncat_ssse3
> -#include "strcat-ssse3.S"
> +
> +#ifndef NOT_IN_libc
> +
> +# include <sysdep.h>
> +
> +# ifndef STRCAT
> +# define STRCAT __strcat_ssse3
> +# endif
> +
> +# define USE_AS_STRCAT
> +
> +.text
> +ENTRY (STRCAT)
> +# ifdef USE_AS_STRNCAT
> + mov %rdx, %r8
> +# endif
> +
> +
> +/* Inline corresponding strlen file, temporary until new strcpy
> + implementation gets merged. */
> +
> + xor %eax, %eax
> + cmpb $0, (%rdi)
> + jz L(exit_tail0)
> + cmpb $0, 1(%rdi)
> + jz L(exit_tail1)
> + cmpb $0, 2(%rdi)
> + jz L(exit_tail2)
> + cmpb $0, 3(%rdi)
> + jz L(exit_tail3)
> +
> + cmpb $0, 4(%rdi)
> + jz L(exit_tail4)
> + cmpb $0, 5(%rdi)
> + jz L(exit_tail5)
> + cmpb $0, 6(%rdi)
> + jz L(exit_tail6)
> + cmpb $0, 7(%rdi)
> + jz L(exit_tail7)
> +
> + cmpb $0, 8(%rdi)
> + jz L(exit_tail8)
> + cmpb $0, 9(%rdi)
> + jz L(exit_tail9)
> + cmpb $0, 10(%rdi)
> + jz L(exit_tail10)
> + cmpb $0, 11(%rdi)
> + jz L(exit_tail11)
> +
> + cmpb $0, 12(%rdi)
> + jz L(exit_tail12)
> + cmpb $0, 13(%rdi)
> + jz L(exit_tail13)
> + cmpb $0, 14(%rdi)
> + jz L(exit_tail14)
> + cmpb $0, 15(%rdi)
> + jz L(exit_tail15)
> + pxor %xmm0, %xmm0
> + lea 16(%rdi), %rcx
> + lea 16(%rdi), %rax
> + and $-16, %rax
> +
> + pcmpeqb (%rax), %xmm0
> + pmovmskb %xmm0, %edx
> + pxor %xmm1, %xmm1
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + pxor %xmm2, %xmm2
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + pxor %xmm3, %xmm3
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm0
> + pmovmskb %xmm0, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm0
> + pmovmskb %xmm0, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm0
> + pmovmskb %xmm0, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + pcmpeqb (%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + test %edx, %edx
> + lea 16(%rax), %rax
> + jnz L(exit)
> +
> + and $-0x40, %rax
> +
> + .p2align 4
> +L(aligned_64):
> + pcmpeqb (%rax), %xmm0
> + pcmpeqb 16(%rax), %xmm1
> + pcmpeqb 32(%rax), %xmm2
> + pcmpeqb 48(%rax), %xmm3
> + pmovmskb %xmm0, %edx
> + pmovmskb %xmm1, %r11d
> + pmovmskb %xmm2, %r10d
> + pmovmskb %xmm3, %r9d
> + or %edx, %r9d
> + or %r11d, %r9d
> + or %r10d, %r9d
> + lea 64(%rax), %rax
> + jz L(aligned_64)
> +
> + test %edx, %edx
> + jnz L(aligned_64_exit_16)
> + test %r11d, %r11d
> + jnz L(aligned_64_exit_32)
> + test %r10d, %r10d
> + jnz L(aligned_64_exit_48)
> +
> +L(aligned_64_exit_64):
> + pmovmskb %xmm3, %edx
> + jmp L(exit)
> +
> +L(aligned_64_exit_48):
> + lea -16(%rax), %rax
> + mov %r10d, %edx
> + jmp L(exit)
> +
> +L(aligned_64_exit_32):
> + lea -32(%rax), %rax
> + mov %r11d, %edx
> + jmp L(exit)
> +
> +L(aligned_64_exit_16):
> + lea -48(%rax), %rax
> +
> +L(exit):
> + sub %rcx, %rax
> + test %dl, %dl
> + jz L(exit_high)
> + test $0x01, %dl
> + jnz L(exit_tail0)
> +
> + test $0x02, %dl
> + jnz L(exit_tail1)
> +
> + test $0x04, %dl
> + jnz L(exit_tail2)
> +
> + test $0x08, %dl
> + jnz L(exit_tail3)
> +
> + test $0x10, %dl
> + jnz L(exit_tail4)
> +
> + test $0x20, %dl
> + jnz L(exit_tail5)
> +
> + test $0x40, %dl
> + jnz L(exit_tail6)
> + add $7, %eax
> +L(exit_tail0):
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_high):
> + add $8, %eax
> + test $0x01, %dh
> + jnz L(exit_tail0)
> +
> + test $0x02, %dh
> + jnz L(exit_tail1)
> +
> + test $0x04, %dh
> + jnz L(exit_tail2)
> +
> + test $0x08, %dh
> + jnz L(exit_tail3)
> +
> + test $0x10, %dh
> + jnz L(exit_tail4)
> +
> + test $0x20, %dh
> + jnz L(exit_tail5)
> +
> + test $0x40, %dh
> + jnz L(exit_tail6)
> + add $7, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail1):
> + add $1, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail2):
> + add $2, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail3):
> + add $3, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail4):
> + add $4, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail5):
> + add $5, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail6):
> + add $6, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail7):
> + add $7, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail8):
> + add $8, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail9):
> + add $9, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail10):
> + add $10, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail11):
> + add $11, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail12):
> + add $12, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail13):
> + add $13, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail14):
> + add $14, %eax
> + jmp L(StartStrcpyPart)
> +
> + .p2align 4
> +L(exit_tail15):
> + add $15, %eax
> +
> + .p2align 4
> +L(StartStrcpyPart):
> + mov %rsi, %rcx
> + lea (%rdi, %rax), %rdx
> +# ifdef USE_AS_STRNCAT
> + test %r8, %r8
> + jz L(StrncatExit0)
> + cmp $8, %r8
> + jbe L(StrncatExit8Bytes)
> +# endif
> + cmpb $0, (%rcx)
> + jz L(Exit1)
> + cmpb $0, 1(%rcx)
> + jz L(Exit2)
> + cmpb $0, 2(%rcx)
> + jz L(Exit3)
> + cmpb $0, 3(%rcx)
> + jz L(Exit4)
> + cmpb $0, 4(%rcx)
> + jz L(Exit5)
> + cmpb $0, 5(%rcx)
> + jz L(Exit6)
> + cmpb $0, 6(%rcx)
> + jz L(Exit7)
> + cmpb $0, 7(%rcx)
> + jz L(Exit8)
> + cmpb $0, 8(%rcx)
> + jz L(Exit9)
> +# ifdef USE_AS_STRNCAT
> + cmp $16, %r8
> + jb L(StrncatExit15Bytes)
> +# endif
> + cmpb $0, 9(%rcx)
> + jz L(Exit10)
> + cmpb $0, 10(%rcx)
> + jz L(Exit11)
> + cmpb $0, 11(%rcx)
> + jz L(Exit12)
> + cmpb $0, 12(%rcx)
> + jz L(Exit13)
> + cmpb $0, 13(%rcx)
> + jz L(Exit14)
> + cmpb $0, 14(%rcx)
> + jz L(Exit15)
> + cmpb $0, 15(%rcx)
> + jz L(Exit16)
> +# ifdef USE_AS_STRNCAT
> + cmp $16, %r8
> + je L(StrncatExit16)
> +# define USE_AS_STRNCPY
> +# endif
> +
> +# include "strcpy-ssse3.S"
> +
> + .p2align 4
> +L(CopyFrom1To16Bytes):
> + add %rsi, %rdx
> + add %rsi, %rcx
> +
> + test %al, %al
> + jz L(ExitHigh)
> + test $0x01, %al
> + jnz L(Exit1)
> + test $0x02, %al
> + jnz L(Exit2)
> + test $0x04, %al
> + jnz L(Exit3)
> + test $0x08, %al
> + jnz L(Exit4)
> + test $0x10, %al
> + jnz L(Exit5)
> + test $0x20, %al
> + jnz L(Exit6)
> + test $0x40, %al
> + jnz L(Exit7)
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(ExitHigh):
> + test $0x01, %ah
> + jnz L(Exit9)
> + test $0x02, %ah
> + jnz L(Exit10)
> + test $0x04, %ah
> + jnz L(Exit11)
> + test $0x08, %ah
> + jnz L(Exit12)
> + test $0x10, %ah
> + jnz L(Exit13)
> + test $0x20, %ah
> + jnz L(Exit14)
> + test $0x40, %ah
> + jnz L(Exit15)
> + movlpd (%rcx), %xmm0
> + movlpd 8(%rcx), %xmm1
> + movlpd %xmm0, (%rdx)
> + movlpd %xmm1, 8(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit1):
> + xor %ah, %ah
> + movb %ah, 1(%rdx)
> +L(Exit1):
> + movb (%rcx), %al
> + movb %al, (%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit2):
> + xor %ah, %ah
> + movb %ah, 2(%rdx)
> +L(Exit2):
> + movw (%rcx), %ax
> + movw %ax, (%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit3):
> + xor %ah, %ah
> + movb %ah, 3(%rdx)
> +L(Exit3):
> + movw (%rcx), %ax
> + movw %ax, (%rdx)
> + movb 2(%rcx), %al
> + movb %al, 2(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit4):
> + xor %ah, %ah
> + movb %ah, 4(%rdx)
> +L(Exit4):
> + mov (%rcx), %eax
> + mov %eax, (%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit5):
> + xor %ah, %ah
> + movb %ah, 5(%rdx)
> +L(Exit5):
> + mov (%rcx), %eax
> + mov %eax, (%rdx)
> + movb 4(%rcx), %al
> + movb %al, 4(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit6):
> + xor %ah, %ah
> + movb %ah, 6(%rdx)
> +L(Exit6):
> + mov (%rcx), %eax
> + mov %eax, (%rdx)
> + movw 4(%rcx), %ax
> + movw %ax, 4(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit7):
> + xor %ah, %ah
> + movb %ah, 7(%rdx)
> +L(Exit7):
> + mov (%rcx), %eax
> + mov %eax, (%rdx)
> + mov 3(%rcx), %eax
> + mov %eax, 3(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit8):
> + xor %ah, %ah
> + movb %ah, 8(%rdx)
> +L(Exit8):
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit9):
> + xor %ah, %ah
> + movb %ah, 9(%rdx)
> +L(Exit9):
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + movb 8(%rcx), %al
> + movb %al, 8(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit10):
> + xor %ah, %ah
> + movb %ah, 10(%rdx)
> +L(Exit10):
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + movw 8(%rcx), %ax
> + movw %ax, 8(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit11):
> + xor %ah, %ah
> + movb %ah, 11(%rdx)
> +L(Exit11):
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + mov 7(%rcx), %eax
> + mov %eax, 7(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit12):
> + xor %ah, %ah
> + movb %ah, 12(%rdx)
> +L(Exit12):
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + mov 8(%rcx), %eax
> + mov %eax, 8(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit13):
> + xor %ah, %ah
> + movb %ah, 13(%rdx)
> +L(Exit13):
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + movlpd 5(%rcx), %xmm1
> + movlpd %xmm1, 5(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit14):
> + xor %ah, %ah
> + movb %ah, 14(%rdx)
> +L(Exit14):
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + movlpd 6(%rcx), %xmm1
> + movlpd %xmm1, 6(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit15):
> + xor %ah, %ah
> + movb %ah, 15(%rdx)
> +L(Exit15):
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + movlpd 7(%rcx), %xmm1
> + movlpd %xmm1, 7(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit16):
> + xor %ah, %ah
> + movb %ah, 16(%rdx)
> +L(Exit16):
> + movlpd (%rcx), %xmm0
> + movlpd 8(%rcx), %xmm1
> + movlpd %xmm0, (%rdx)
> + movlpd %xmm1, 8(%rdx)
> + mov %rdi, %rax
> + ret
> +
> +# ifdef USE_AS_STRNCPY
> +
> + .p2align 4
> +L(CopyFrom1To16BytesCase2):
> + add $16, %r8
> + add %rsi, %rcx
> + lea (%rsi, %rdx), %rsi
> + lea -9(%r8), %rdx
> + and $1<<7, %dh
> + or %al, %dh
> + test %dh, %dh
> + lea (%rsi), %rdx
> + jz L(ExitHighCase2)
> +
> + test $0x01, %al
> + jnz L(Exit1)
> + cmp $1, %r8
> + je L(StrncatExit1)
> + test $0x02, %al
> + jnz L(Exit2)
> + cmp $2, %r8
> + je L(StrncatExit2)
> + test $0x04, %al
> + jnz L(Exit3)
> + cmp $3, %r8
> + je L(StrncatExit3)
> + test $0x08, %al
> + jnz L(Exit4)
> + cmp $4, %r8
> + je L(StrncatExit4)
> + test $0x10, %al
> + jnz L(Exit5)
> + cmp $5, %r8
> + je L(StrncatExit5)
> + test $0x20, %al
> + jnz L(Exit6)
> + cmp $6, %r8
> + je L(StrncatExit6)
> + test $0x40, %al
> + jnz L(Exit7)
> + cmp $7, %r8
> + je L(StrncatExit7)
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + lea 7(%rdx), %rax
> + cmpb $1, (%rax)
> + sbb $-1, %rax
> + xor %cl, %cl
> + movb %cl, (%rax)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(ExitHighCase2):
> + test $0x01, %ah
> + jnz L(Exit9)
> + cmp $9, %r8
> + je L(StrncatExit9)
> + test $0x02, %ah
> + jnz L(Exit10)
> + cmp $10, %r8
> + je L(StrncatExit10)
> + test $0x04, %ah
> + jnz L(Exit11)
> + cmp $11, %r8
> + je L(StrncatExit11)
> + test $0x8, %ah
> + jnz L(Exit12)
> + cmp $12, %r8
> + je L(StrncatExit12)
> + test $0x10, %ah
> + jnz L(Exit13)
> + cmp $13, %r8
> + je L(StrncatExit13)
> + test $0x20, %ah
> + jnz L(Exit14)
> + cmp $14, %r8
> + je L(StrncatExit14)
> + test $0x40, %ah
> + jnz L(Exit15)
> + cmp $15, %r8
> + je L(StrncatExit15)
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + movlpd 8(%rcx), %xmm1
> + movlpd %xmm1, 8(%rdx)
> + mov %rdi, %rax
> + ret
> +
> +L(CopyFrom1To16BytesCase2OrCase3):
> + test %rax, %rax
> + jnz L(CopyFrom1To16BytesCase2)
> +
> + .p2align 4
> +L(CopyFrom1To16BytesCase3):
> + add $16, %r8
> + add %rsi, %rdx
> + add %rsi, %rcx
> +
> + cmp $8, %r8
> + ja L(ExitHighCase3)
> + cmp $1, %r8
> + je L(StrncatExit1)
> + cmp $2, %r8
> + je L(StrncatExit2)
> + cmp $3, %r8
> + je L(StrncatExit3)
> + cmp $4, %r8
> + je L(StrncatExit4)
> + cmp $5, %r8
> + je L(StrncatExit5)
> + cmp $6, %r8
> + je L(StrncatExit6)
> + cmp $7, %r8
> + je L(StrncatExit7)
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + xor %ah, %ah
> + movb %ah, 8(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(ExitHighCase3):
> + cmp $9, %r8
> + je L(StrncatExit9)
> + cmp $10, %r8
> + je L(StrncatExit10)
> + cmp $11, %r8
> + je L(StrncatExit11)
> + cmp $12, %r8
> + je L(StrncatExit12)
> + cmp $13, %r8
> + je L(StrncatExit13)
> + cmp $14, %r8
> + je L(StrncatExit14)
> + cmp $15, %r8
> + je L(StrncatExit15)
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + movlpd 8(%rcx), %xmm1
> + movlpd %xmm1, 8(%rdx)
> + xor %ah, %ah
> + movb %ah, 16(%rdx)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit0):
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit15Bytes):
> + cmp $9, %r8
> + je L(StrncatExit9)
> + cmpb $0, 9(%rcx)
> + jz L(Exit10)
> + cmp $10, %r8
> + je L(StrncatExit10)
> + cmpb $0, 10(%rcx)
> + jz L(Exit11)
> + cmp $11, %r8
> + je L(StrncatExit11)
> + cmpb $0, 11(%rcx)
> + jz L(Exit12)
> + cmp $12, %r8
> + je L(StrncatExit12)
> + cmpb $0, 12(%rcx)
> + jz L(Exit13)
> + cmp $13, %r8
> + je L(StrncatExit13)
> + cmpb $0, 13(%rcx)
> + jz L(Exit14)
> + cmp $14, %r8
> + je L(StrncatExit14)
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + movlpd 7(%rcx), %xmm1
> + movlpd %xmm1, 7(%rdx)
> + lea 14(%rdx), %rax
> + cmpb $1, (%rax)
> + sbb $-1, %rax
> + xor %cl, %cl
> + movb %cl, (%rax)
> + mov %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(StrncatExit8Bytes):
> + cmpb $0, (%rcx)
> + jz L(Exit1)
> + cmp $1, %r8
> + je L(StrncatExit1)
> + cmpb $0, 1(%rcx)
> + jz L(Exit2)
> + cmp $2, %r8
> + je L(StrncatExit2)
> + cmpb $0, 2(%rcx)
> + jz L(Exit3)
> + cmp $3, %r8
> + je L(StrncatExit3)
> + cmpb $0, 3(%rcx)
> + jz L(Exit4)
> + cmp $4, %r8
> + je L(StrncatExit4)
> + cmpb $0, 4(%rcx)
> + jz L(Exit5)
> + cmp $5, %r8
> + je L(StrncatExit5)
> + cmpb $0, 5(%rcx)
> + jz L(Exit6)
> + cmp $6, %r8
> + je L(StrncatExit6)
> + cmpb $0, 6(%rcx)
> + jz L(Exit7)
> + cmp $7, %r8
> + je L(StrncatExit7)
> + movlpd (%rcx), %xmm0
> + movlpd %xmm0, (%rdx)
> + lea 7(%rdx), %rax
> + cmpb $1, (%rax)
> + sbb $-1, %rax
> + xor %cl, %cl
> + movb %cl, (%rax)
> + mov %rdi, %rax
> + ret
> +
> +# endif
> +END (STRCAT)
> +#endif
> --
> 1.8.3.2
>