This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[PATCH v3] Improve strcpy: Faster unaligned loads.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Carlos O'Donell <carlos at redhat dot com>
- Cc: Andreas Schwab <schwab at linux-m68k dot org>, libc-alpha at sourceware dot org
- Date: Tue, 10 Sep 2013 22:28:44 +0200
- Subject: [PATCH v3] Improve strcpy: Faster unaligned loads.
- Authentication-results: sourceware.org; auth=none
- References: <20130909153051 dot GA23047 at domone dot kolej dot mff dot cuni dot cz> <20130909161112 dot GB23047 at domone dot kolej dot mff dot cuni dot cz> <mvmbo42dkiq dot fsf at hawking dot suse dot de> <20130909171703 dot GA32141 at domone dot kolej dot mff dot cuni dot cz> <87ob81c1yk dot fsf at igel dot home> <20130909191829 dot GA997 at domone dot kolej dot mff dot cuni dot cz> <522E28E9 dot 5000709 at redhat dot com> <20130910142117 dot GB6536 at domone dot kolej dot mff dot cuni dot cz>
Hi Carlos,
Here is strcpy with comments. To show the structure I decided to include
the ssse3 loop in this patch. If you are OK with splitting it, the loop
header and the ssse3 part could be reviewed separately.
I omitted the actual strcat calls, as I have a patch that uses them ready
and it needs a bit of code movement.
* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S: New file.
* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-loop.S: New file.
* sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S: Include
strcpy-sse2-unaligned-v2.S.
* sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S: New file.
* sysdeps/x86_64/multiarch/strcpy-ssse3-loop.S: New file.
* sysdeps/x86_64/multiarch/stpcpy-ssse3.S: Include strcpy-ssse3-v2.S.
* sysdeps/x86_64/multiarch/Makefile: Update.
---
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S | 2 +-
sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 2 +-
.../x86_64/multiarch/strcpy-sse2-unaligned-loop.S | 36 +++
.../x86_64/multiarch/strcpy-sse2-unaligned-v2.S | 337 +++++++++++++++++++++
sysdeps/x86_64/multiarch/strcpy-ssse3-loop.S | 133 ++++++++
sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S | 26 ++
7 files changed, 536 insertions(+), 4 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-loop.S
create mode 100644 sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
create mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3-loop.S
create mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5ab950a..ca2e4f8 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,8 +13,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
- strcpy-sse2-unaligned strncpy-sse2-unaligned \
+ strcpy-ssse3-v2 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
+ strcpy-sse2-unaligned-v2 strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
index 34231f8..363b692 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2-unaligned.S
@@ -1,3 +1,3 @@
#define USE_AS_STPCPY
#define STRCPY __stpcpy_sse2_unaligned
-#include "strcpy-sse2-unaligned.S"
+#include "strcpy-sse2-unaligned-v2.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
index d971c2d..02cf8ea 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
@@ -1,3 +1,3 @@
#define USE_AS_STPCPY
#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
+#include "strcpy-ssse3-v2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-loop.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-loop.S
new file mode 100644
index 0000000..9c4bb69
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-loop.S
@@ -0,0 +1,36 @@
+ jmp L(loop_entry)
+
+ ALIGN (4)
+L(loop):
+ movdqu %xmm1, (%rdi)
+ addq $64, %rsi
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm3, 32(%rdi)
+ movdqu %xmm4, 48(%rdi)
+ addq $64, %rdi
+L(loop_entry):
+ movdqa 32(%rsi), %xmm3
+ movdqa 48(%rsi), %xmm4
+ movdqa %xmm3, %xmm0
+ movdqa 16(%rsi), %xmm2
+ pminub %xmm4, %xmm0
+ movdqa (%rsi), %xmm1
+ pminub %xmm2, %xmm0
+ pminub %xmm1, %xmm0
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ testq %rdx, %rdx
+ je L(loop)
+
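+ /* A zero was found. Combine the per-vector masks into the 64-bit erdx
+ expected by the caller: the exact masks of xmm1-xmm3 fill bits 0-47,
+ and the combined mask already in %edx stands in for xmm4 in bits
+ 48-63. A bit there can be spurious (a zero seen in xmm1-xmm3 at the
+ same byte offset), but then an exact lower bit is also set, so bsf
+ is unaffected. */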
+ salq $48, %rdx
+ pcmpeqb %xmm1, %xmm5
+ pcmpeqb %xmm2, %xmm6
+ pmovmskb %xmm5, %ecx
+ pmovmskb %xmm6, %r8d
+ pcmpeqb %xmm3, %xmm7
+ orq %rcx, %rdx
+ pmovmskb %xmm7, %r9d
+ salq $16, %r8
+ orq %r8, %rdx
+ salq $32, %r9
+ orq %r9, %rdx
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
new file mode 100644
index 0000000..9725857
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned-v2.S
@@ -0,0 +1,337 @@
+/* strcpy with SSE2 and unaligned load
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef STRCPY
+# define STRCPY_TAIL __strcpy_sse2_unaligned_tail
+# define STRCPY __strcpy_sse2_unaligned
+# endif
+
+# define ALIGN(x) .p2align x
+
+#ifndef USE_AS_STPCPY
+ENTRY (STRCPY_TAIL)
+ movq %rsi, %rdx
+ pxor %xmm4, %xmm4
+ jmp L(from_tail)
+END (STRCPY_TAIL)
+#endif
+
+ENTRY (STRCPY)
+ /* We use the basic loop described in
+ http://sourceware.org/glibc/wiki/Optimizations/string_functions
+
+ We use high-level C fragments as explanation. For the explanation we
+ add the following functions:
+
+ get_zero32 (s) / get_zero64 (s) return a bitmask that has the i-th
+ bit set if and only if s[i] is a zero byte.
+
+ copy_between_x_y (x, y, len) is equivalent to memcpy (x, y, len) but
+ with restrictions on the len sizes.
+ */
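+
+ /* For illustration, get_zero64 corresponds roughly to this C fragment
+ (SSE2 intrinsics from <emmintrin.h>, <stdint.h> types; the helper
+ exists only in this explanation, not in the sources):
+
+ uint64_t
+ get_zero64 (const char *s)
+ {
+ __m128i zero = _mm_setzero_si128 ();
+ uint64_t m = 0;
+ for (int i = 0; i < 4; i++)
+ {
+ __m128i x = _mm_loadu_si128 ((const __m128i *) (s + 16 * i));
+ uint64_t mi = _mm_movemask_epi8 (_mm_cmpeq_epi8 (x, zero));
+ m |= mi << (16 * i);
+ }
+ return m;
+ } */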
+ movq %rsi, %rdx
+ pxor %xmm4, %xmm4
+ movq %rdi, %rax
+L(from_tail):
+ pxor %xmm5, %xmm5
+ andl $4095, %edx
+ pxor %xmm6, %xmm6
+ cmpq $4032, %rdx
+ ja L(cross_page)
+ /* We copy the first 64 bytes with unaligned loads.
+ if (src % 4096 > 4096 - 64)
+ goto cross_page;
+ else {
+ if (z = get_zero32 (src))
+ return copy_less32_bytes (dest, src, ffs (z) + 1);
+ if (z = get_zero32 (src + 32))
+ return copy_between_32_64_bytes (dest, src, ffs (z) + 32 + 1);
+ copy_64_bytes (dest, src);
+ } */
+
+ movdqu (%rsi), %xmm1
+ pxor %xmm7, %xmm7
+ movdqu 16(%rsi), %xmm2
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %ecx
+ pcmpeqb %xmm2, %xmm5
+ pmovmskb %xmm5, %edx
+ salq $16, %rdx
+ orq %rcx, %rdx
+ jne L(less_32_bytes)
+ movdqu 32(%rsi), %xmm3
+ movdqu 48(%rsi), %xmm4
+ pcmpeqb %xmm3, %xmm6
+ pcmpeqb %xmm4, %xmm7
+ pmovmskb %xmm6, %edx
+ pmovmskb %xmm7, %ecx
+ salq $32, %rdx
+ salq $48, %rcx
+ orq %rcx, %rdx
+ jne L(between_32_64_bytes)
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm3, 32(%rdi)
+ movdqu %xmm4, 48(%rdi)
+
+L(prepare_loop):
+ /* Align the source up to a 64-byte boundary.
+ s_al = ALIGN_DOWN(s + 64, 64);
+ dest += s_al - s;
+ s = s_al; */
+ leaq 64(%rsi), %rdx
+ andq $-64, %rdx
+ addq %rdx, %rdi
+ pxor %xmm5, %xmm5
+ subq %rsi, %rdi
+ movq %rdx, %rsi
+
+ /* Now we enter the loop. The register values at this point are:
+
+ %rsi (irsi) - source, aligned to 64 bytes
+ %rdi (irdi) - destination
+ %rax - return value
+ %xmm4-%xmm7 - initialized to 0
+
+ A loop implementation needs to write data until it finds the
+ terminating zero. At that point it passes control back with the
+ following registers set, so that at most the remaining 64 bytes
+ are written:
+
+ %rax - return value - needs to be kept.
+ %rsi (ersi) - end source.
+ %rdi (erdi) - end destination.
+ %rdx (erdx) - mask of the terminating zero; it should hold that
+ erdx = get_zero64 (ersi).
+ %xmm1 - set by movdqu (%rsi), %xmm1
+ %xmm2 - set by movdqu 16(%rsi), %xmm2
+
+ The loop is required to copy the bytes in the irsi...ersi (exclusive)
+ range to the irdi...erdi range and to compute erdx as above. */
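+
+ /* In C terms the contract is roughly this sketch, using the helpers
+ introduced above:
+
+ while (!(m = get_zero64 (s)))
+ {
+ copy_64_bytes (d, s);
+ s += 64; d += 64;
+ }
+ ersi = s; erdi = d; erdx = m;
+ */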
+
+# ifdef USE_SSSE3
+# include "strcpy-ssse3-loop.S"
+# else
+# include "strcpy-sse2-unaligned-loop.S"
+# endif
+
+ /* After the loop finishes we execute the following:
+ copy_less64_bytes (erdi, ersi, ffs (erdx) + 1);
+ return; */
+
+ bsfq %rdx, %rcx
+#ifdef USE_AS_STPCPY
+ lea (%rdi, %rcx), %rax
+#endif
+ cmpq $32, %rcx
+ jb L(less_32_bytes)
+ movdqu -31(%rsi,%rcx), %xmm3
+ movdqu -15(%rsi,%rcx), %xmm4
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm3, -31(%rdi,%rcx)
+ movdqu %xmm4, -15(%rdi,%rcx)
+ ret
+
+ /* These correspond to the copy_less64_bytes pieces.
+ copy_less64_bytes (d, s, l)
+ {
+ if (l < 32)
+ {
+ if (l < 16)
+ copy_less16_bytes (d, s, l);
+ else
+ copy_between_16_31_bytes (d, s, l);
+ }
+ else
+ copy_between_32_64_bytes (d, s, l);
+ } */
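+
+ /* The pieces use two possibly overlapping copies, one anchored at the
+ start and one at the terminating zero. As a hypothetical C fragment
+ (l is the index of the terminating zero, 15 <= l <= 31 here):
+
+ copy_between_16_31_bytes (char *d, const char *s, size_t l)
+ {
+ memcpy (d, s, 16);
+ memcpy (d + l - 15, s + l - 15, 16);
+ }
+ */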
+
+ ALIGN (3)
+L(less_32_bytes):
+ bsfq %rdx, %rdx
+#ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+#endif
+ cmpq $15, %rdx
+ jae L(between_16_31_bytes)
+ cmpq $7, %rdx
+ jae L(between_8_15_bytes)
+ cmpq $3, %rdx
+ jae L(between_4_7_bytes)
+ cmpq $1, %rdx
+ jb L(between_1_1_bytes) /* We need to write the terminating zero. */
+ movzwl -1(%rsi,%rdx), %ecx
+ movzwl (%rsi), %esi
+ movw %si, (%rdi)
+ movw %cx, -1(%rdi,%rdx)
+ ret
+
+ ALIGN (3)
+L(between_32_64_bytes):
+ bsfq %rdx, %rdx
+#ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+#endif
+ movdqu -31(%rsi,%rdx), %xmm3
+ movdqu -15(%rsi,%rdx), %xmm4
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm3, -31(%rdi,%rdx)
+ movdqu %xmm4, -15(%rdi,%rdx)
+ ret
+
+ ALIGN (3)
+L(between_16_31_bytes):
+ movdqu -15(%rsi,%rdx), %xmm2
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, -15(%rdi,%rdx)
+ ret
+
+ ALIGN (3)
+L(between_8_15_bytes):
+ movq -7(%rsi,%rdx), %rcx
+ movq (%rsi), %rsi
+ movq %rsi, (%rdi)
+ movq %rcx, -7(%rdi,%rdx)
+ ret
+
+ ALIGN (3)
+L(between_4_7_bytes):
+ movl -3(%rsi,%rdx), %ecx
+ movl (%rsi), %esi
+ movl %esi, (%rdi)
+ movl %ecx, -3(%rdi,%rdx)
+ ret
+
+L(between_1_1_bytes):
+ movzbl (%rsi), %edx
+ movb %dl, (%rdi)
+ ret
+
+
+ /* Handle situations where reading 64 bytes could cross a page boundary.
+ s_al = ALIGN_DOWN (src, 64);
+ m = get_zero64 (s_al);
+ m = m | (1L << 63);
+ m = m >> (src - s_al);
+ f = ffs (m);
+ copy_less64_bytes (d, s, f);
+ if (!s[f])
+ return;
+ goto prepare_loop; */
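+
+ /* As a hypothetical C fragment (get_zero64 as above; the sentinel
+ bit 63 guarantees that a set bit is found inside the aligned block):
+
+ uint64_t
+ cross_page_mask (const char *src)
+ {
+ const char *s_al = (const char *) ((uintptr_t) src & ~(uintptr_t) 63);
+ uint64_t m = get_zero64 (s_al) | (1ULL << 63);
+ return m >> (src - s_al);
+ }
+ */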
+
+ ALIGN(4)
+L(cross_page):
+ movq %rsi, %rcx
+ pxor %xmm0, %xmm0
+ andq $-64, %rcx
+ movabsq $-9223372036854775808, %r10
+ movdqa (%rcx), %xmm4
+ movdqa 16(%rcx), %xmm3
+ pcmpeqb %xmm0, %xmm4
+ movdqa 32(%rcx), %xmm2
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm4, %edx
+ movdqa 48(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm0, %xmm1
+ orq %r10, %rdx
+ pmovmskb %xmm3, %r10d
+ pmovmskb %xmm2, %r9d
+ salq $16, %r10
+ orq %r10, %rdx
+ pmovmskb %xmm1, %r8d
+ salq $32, %r9
+ orq %r9, %rdx
+ salq $48, %r8
+ orq %r8, %rdx
+ movq %rsi, %r10
+ subq %rcx, %r10
+ movq %r10, %rcx
+ shrq %cl, %rdx
+ bsfq %rdx, %rdx
+#ifdef USE_AS_STPCPY
+ lea (%rdi, %rdx), %rax
+#endif
+ cmpq $15, %rdx
+ jbe L(copy_less_16_bytes)
+ cmpq $31, %rdx
+ jbe L(copy_16_32_bytes)
+ movdqu (%rsi), %xmm3
+ movdqu 16(%rsi), %xmm2
+ movdqu -31(%rsi,%rdx), %xmm1
+ movdqu -15(%rsi,%rdx), %xmm0
+ movdqu %xmm3, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm1, -31(%rdi,%rdx)
+ movdqu %xmm0, -15(%rdi,%rdx)
+L(copied_cross_page):
+ pxor %xmm4, %xmm4
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ pxor %xmm7, %xmm7
+ cmpb $0, (%rsi,%rdx)
+ jne L(prepare_loop)
+ ret
+
+ ALIGN (3)
+L(copy_less_16_bytes):
+ cmpq $7, %rdx
+ jae L(copy_8_15_bytes)
+ cmpq $3, %rdx
+ jae L(copy_4_7_bytes)
+ cmpq $1, %rdx
+ jb L(copy_1_byte)
+ movzwl (%rsi), %ecx
+ movw %cx, (%rdi)
+ movzwl -1(%rsi,%rdx), %ecx
+ movw %cx, -1(%rdi,%rdx)
+ jmp L(copied_cross_page)
+
+ ALIGN (3)
+L(copy_16_32_bytes):
+ movdqu (%rsi), %xmm1
+ movdqu -15(%rsi,%rdx), %xmm0
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm0, -15(%rdi,%rdx)
+ jmp L(copied_cross_page)
+
+L(copy_8_15_bytes):
+ movq (%rsi), %r9
+ movq -7(%rsi,%rdx), %rcx
+ movq %r9, (%rdi)
+ movq %rcx, -7(%rdi,%rdx)
+ jmp L(copied_cross_page)
+
+L(copy_4_7_bytes):
+ movl (%rsi), %r9d
+ movl -3(%rsi,%rdx), %ecx
+ movl %r9d, (%rdi)
+ movl %ecx, -3(%rdi,%rdx)
+ jmp L(copied_cross_page)
+
+L(copy_1_byte):
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+ jmp L(copied_cross_page)
+
+END (STRCPY)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3-loop.S b/sysdeps/x86_64/multiarch/strcpy-ssse3-loop.S
new file mode 100644
index 0000000..dc873f1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3-loop.S
@@ -0,0 +1,133 @@
+
+
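+ /* SSSE3 variant of the loop. The first 64-byte iteration stores with
+ unaligned movdqu; afterwards the destination is aligned to 16 bytes
+ and we dispatch, by destination misalignment, to one of 16 loop
+ instances that use palignr to shift aligned source reads into
+ aligned destination writes. */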
+ movdqa (%rsi), %xmm1
+ movdqa 16(%rsi), %xmm2
+ movdqa %xmm1, %xmm0
+ movdqa 32(%rsi), %xmm3
+ pminub %xmm2, %xmm0
+ movdqa 48(%rsi), %xmm4
+ pminub %xmm3, %xmm0
+ pminub %xmm4, %xmm0
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(loop_found_zero)
+ movdqu %xmm1, (%rdi)
+ movdqu %xmm2, 16(%rdi)
+ movdqu %xmm3, 32(%rdi)
+ movdqu %xmm4, 48(%rdi)
+ addq $64, %rsi
+ addq $64, %rdi
+
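+ /* Dispatch: r11 = dest & 15; align dest and jump to the loop instance
+ for this misalignment. Each LOOP(shift, align) instance below is
+ apparently padded to exactly 128 bytes (the stray rep prefix looks
+ like such padding), so the target is L(entry0) - 128 * r11. */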
+ movq %rdi, %r11
+ andl $15, %r11d
+ subq %r11, %rdi
+ addq $16, %rdi
+ leaq L(entry0)(%rip), %rdx
+ movdqu -16(%rsi), %xmm0
+ movq %r11, %rcx
+ imul $128, %rcx
+ subq %rcx, %rdx
+ jmp *%rdx
+
+
+#define LOOP(shift, align) \
+ ALIGN (4) ;\
+L(loop##align): ;\
+ movdqa %xmm1, -16(%rdi) ;\
+ addq $64, %rsi ;\
+ movdqa %xmm2, (%rdi) ;\
+ movdqa %xmm6, %xmm0 ;\
+ movdqa %xmm3, 16(%rdi) ;\
+ movdqa %xmm4, 32(%rdi) ;\
+ rep ; \
+ addq $64, %rdi ;\
+L(entry##align): ;\
+ movdqa (%rsi), %xmm5 ;\
+ movdqa %xmm5, %xmm1 ;\
+ movdqa 16(%rsi), %xmm2 ;\
+ pminub %xmm2, %xmm5 ;\
+ movdqa 32(%rsi), %xmm3 ;\
+ pminub %xmm3, %xmm5 ;\
+ movdqa 48(%rsi), %xmm4 ;\
+ pminub %xmm4, %xmm5 ;\
+ movdqa %xmm4, %xmm6 ;\
+ pcmpeqb %xmm7, %xmm5 ;\
+ pmovmskb %xmm5, %edx ;\
+ testl %edx, %edx ;\
+ jne L(return) ;\
+ palignr shift, %xmm3, %xmm4 ;\
+ palignr shift, %xmm2, %xmm3 ;\
+ palignr shift, %xmm1, %xmm2 ;\
+ palignr shift, %xmm0, %xmm1 ;\
+ jmp L(loop##align)
+
+
+ LOOP($1,15)
+ LOOP($2,14)
+ LOOP($3,13)
+ LOOP($4,12)
+ LOOP($5,11)
+ LOOP($6,10)
+ LOOP($7,9)
+ LOOP($8,8)
+ LOOP($9,7)
+ LOOP($10,6)
+ LOOP($11,5)
+ LOOP($12,4)
+ LOOP($13,3)
+ LOOP($14,2)
+ LOOP($15,1)
+
+ ALIGN (4)
+ test %edx, %edx
+ ALIGN (4)
+L(loop0):
+ movdqa %xmm1, -16(%rdi)
+ addq $64, %rsi
+ movdqa %xmm2, (%rdi)
+ movdqa %xmm3, 16(%rdi)
+ movdqa %xmm4, 32(%rdi)
+ addq $64, %rdi
+ ALIGN (4)
+L(entry0):
+ movdqa 32(%rsi), %xmm3
+ movdqa 48(%rsi), %xmm4
+ movdqa %xmm3, %xmm0
+ movdqa 16(%rsi), %xmm2
+ pminub %xmm4, %xmm0
+ movdqa (%rsi), %xmm1
+ pminub %xmm2, %xmm0
+ pminub %xmm1, %xmm0
+ pcmpeqb %xmm5, %xmm0
+ pmovmskb %xmm0, %edx
+ testq %rdx, %rdx
+ je L(loop0)
+
+L(return):
+ pxor %xmm5, %xmm5
+ pxor %xmm6, %xmm6
+ addq %r11, %rdi
+ subq $16, %rdi
+ movdqu -16(%rsi), %xmm0
+ movdqu %xmm0, -16(%rdi)
+
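+ /* A zero byte lies in the current 64-byte block: reload it and build
+ the 64-bit erdx mask expected by the caller. */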
+L(loop_found_zero):
+ movdqa (%rsi), %xmm1
+ movdqa 16(%rsi), %xmm2
+ movdqa 32(%rsi), %xmm3
+ movdqa 48(%rsi), %xmm4
+ pcmpeqb %xmm5, %xmm4
+ pmovmskb %xmm4, %edx
+ salq $48, %rdx
+ pcmpeqb %xmm1, %xmm5
+ pcmpeqb %xmm2, %xmm6
+ pmovmskb %xmm5, %ecx
+ pmovmskb %xmm6, %r8d
+ pcmpeqb %xmm3, %xmm7
+ orq %rcx, %rdx
+ pmovmskb %xmm7, %r9d
+ salq $16, %r8
+ orq %r8, %rdx
+ salq $32, %r9
+ orq %r9, %rdx
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S b/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
new file mode 100644
index 0000000..8f70c42
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3-v2.S
@@ -0,0 +1,26 @@
+/* strcpy with SSSE3
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_SSSE3
+
+#ifndef STRCPY
+# define STRCPY_TAIL __strcpy_ssse3_tail
+# define STRCPY __strcpy_ssse3
+#endif
+
+#include "strcpy-sse2-unaligned-v2.S"
--
1.8.3.2