This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch, master, updated. glibc-2.12-47-g6fb8cbc
- From: drepper at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 30 Jun 2010 15:26:42 -0000
- Subject: GNU C Library master sources branch, master, updated. glibc-2.12-47-g6fb8cbc
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 6fb8cbcb58a29fff73eb2101b34caa19a7f88eba (commit)
from d85f8ff66711fd3b1c5753330499c7403fa46d81 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6fb8cbcb58a29fff73eb2101b34caa19a7f88eba
commit 6fb8cbcb58a29fff73eb2101b34caa19a7f88eba
Author: H.J. Lu <hongjiu.lu@intel.com>
Date: Wed Jun 30 08:26:11 2010 -0700
Improve 64bit memcpy/memmove for Atom, Core 2 and Core i7
This patch includes optimized 64bit memcpy/memmove for Atom, Core 2 and
Core i7. It improves memcpy by up to 3X on Atom, up to 4X on Core 2 and
up to 1X on Core i7. It also improves memmove by up to 3X on Atom, up to
4X on Core 2 and up to 2X on Core i7.
diff --git a/ChangeLog b/ChangeLog
index eaf5749..175c6ed 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,35 @@
+2010-06-25 H.J. Lu <hongjiu.lu@intel.com>
+
+ * debug/memmove_chk.c (__memmove_chk): Renamed to ...
+ (MEMMOVE_CHK): ...this. Default to __memmove_chk.
+ * string/memmove.c (memmove): Renamed to ...
+ (MEMMOVE): ...this. Default to memmove.
+ * sysdeps/x86_64/memcpy.S: Use ENTRY_CHK and END_CHK.
+ * sysdeps/x86_64/sysdep.h (ENTRY_CHK): Define.
+ (END_CHK): Define.
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ memcpy-ssse3 mempcpy-ssse3 memmove-ssse3 memcpy-ssse3-back
+ mempcpy-ssse3-back memmove-ssse3-back.
+ * sysdeps/x86_64/multiarch/bcopy.S: New file .
+ * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy_chk.S: New file.
+ * sysdeps/x86_64/multiarch/memmove-ssse3-back.S: New file.
+ * sysdeps/x86_64/multiarch/memmove-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/memmove.c: New file.
+ * sysdeps/x86_64/multiarch/memmove_chk.c: New file.
+ * sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S: New file.
+ * sysdeps/x86_64/multiarch/mempcpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/mempcpy.S: New file.
+ * sysdeps/x86_64/multiarch/mempcpy_chk.S: New file.
+ * sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Copy_Backward):
+ Define.
+ (index_Fast_Copy_Backward): Define.
+ (HAS_ARCH_FEATURE): Define.
+ (HAS_FAST_REP_STRING): Define.
+ (HAS_FAST_COPY_BACKWARD): Define.
+
2010-06-21 Andreas Schwab <schwab@redhat.com>
* sysdeps/unix/sysv/linux/getlogin_r.c (__getlogin_r_loginuid):
diff --git a/debug/memmove_chk.c b/debug/memmove_chk.c
index f3b74d2..6a3e157 100644
--- a/debug/memmove_chk.c
+++ b/debug/memmove_chk.c
@@ -23,8 +23,12 @@
#include <memcopy.h>
#include <pagecopy.h>
+#ifndef MEMMOVE_CHK
+# define MEMMOVE_CHK __memmove_chk
+#endif
+
void *
-__memmove_chk (dest, src, len, destlen)
+MEMMOVE_CHK (dest, src, len, destlen)
void *dest;
const void *src;
size_t len;
diff --git a/string/memmove.c b/string/memmove.c
index 16671f7..8e36e7c 100644
--- a/string/memmove.c
+++ b/string/memmove.c
@@ -37,9 +37,12 @@
#define rettype void *
#endif
+#ifndef MEMMOVE
+#define MEMMOVE memmove
+#endif
rettype
-memmove (a1, a2, len)
+MEMMOVE (a1, a2, len)
a1const void *a1;
a2const void *a2;
size_t len;
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index b25646b..b4545ac 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -40,12 +40,12 @@
.text
#if defined PIC && !defined NOT_IN_libc
-ENTRY (__memcpy_chk)
+ENTRY_CHK (__memcpy_chk)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memcpy_chk)
+END_CHK (__memcpy_chk)
#endif
ENTRY(memcpy) /* (void *, const void*, size_t) */
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index c61cf70..0ca914a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -5,7 +5,9 @@ endif
ifeq ($(subdir),string)
sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
- strend-sse4 memcmp-sse4
+ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
+ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+ memmove-ssse3-back
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
new file mode 100644
index 0000000..11e250f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/bcopy.S
@@ -0,0 +1,7 @@
+#include <sysdep.h>
+
+ .text
+ENTRY(bcopy)
+ xchg %rdi, %rsi
+ jmp HIDDEN_BUILTIN_JUMPTARGET(memmove)
+END(bcopy)
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index f13a9f4..55c9f54 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -78,10 +78,13 @@ __init_cpu_features (void)
case 0x25:
case 0x2e:
case 0x2f:
- /* Rep string instructions are fast on Intel Core i3, i5
- and i7. */
+ /* Rep string instructions and copy backward are fast on
+ Intel Core i3, i5 and i7. */
+#if index_Fast_Rep_String != index_Fast_Copy_Backward
+# error index_Fast_Rep_String != index_Fast_Copy_Backward
+#endif
__cpu_features.feature[index_Fast_Rep_String]
- |= bit_Fast_Rep_String;
+ |= bit_Fast_Rep_String | bit_Fast_Copy_Backward;
break;
}
}
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index b2f2de3..4a211c0 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -17,6 +17,7 @@
02111-1307 USA. */
#define bit_Fast_Rep_String (1 << 0)
+#define bit_Fast_Copy_Backward (1 << 1)
#ifdef __ASSEMBLER__
@@ -32,7 +33,8 @@
# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-#define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */
@@ -102,6 +104,16 @@ extern const struct cpu_features *__get_cpu_features (void)
# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
# define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
-# define index_Fast_Rep_String FEATURE_INDEX_1
+# define index_Fast_Rep_String FEATURE_INDEX_1
+# define index_Fast_Copy_Backward FEATURE_INDEX_1
+
+#define HAS_ARCH_FEATURE(idx, bit) \
+ ((__get_cpu_features ()->feature[idx] & (bit)) != 0)
+
+#define HAS_FAST_REP_STRING \
+ HAS_ARCH_FEATURE (index_Fast_Rep_String, bit_Fast_Rep_String)
+
+#define HAS_FAST_COPY_BACKWARD \
+ HAS_ARCH_FEATURE (index_Fast_Copy_Backward, bit_Fast_Copy_Backward)
#endif /* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
new file mode 100644
index 0000000..48c974e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -0,0 +1,3169 @@
+/* memcpy with SSSE3 and REP string
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3_back
+# define MEMCPY_CHK __memcpy_chk_ssse3_back
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#define JMPTBL(I, B) I - B
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ relative offsets. INDEX is a register contains the index into the
+ jump table. SCALE is the scale of INDEX. */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), INDEX; \
+ lea (%r11, INDEX), INDEX; \
+ jmp *INDEX; \
+ ud2
+
+ .section .text.ssse3,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+ mov %rdi, %rax
+#ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+ cmp %rsi, %rdi
+ jb L(copy_forward)
+ je L(bwd_write_0bytes)
+ cmp $144, %rdx
+ jae L(copy_backward)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+L(copy_forward):
+#endif
+ cmp $144, %rdx
+ jae L(144bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+ cmp %dil, %sil
+ jbe L(bk_write)
+#endif
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+#endif
+
+ ALIGN (4)
+L(144bytesormore):
+
+#ifndef USE_AS_MEMMOVE
+ cmp %dil, %sil
+ jle L(copy_backward)
+#endif
+ movdqu (%rsi), %xmm0
+ mov %rdi, %r8
+ and $-16, %rdi
+ add $16, %rdi
+ mov %rdi, %r9
+ sub %r8, %r9
+ sub %r9, %rdx
+ add %r9, %rsi
+ mov %rsi, %r9
+ and $0xf, %r9
+ jz L(shl_0)
+#ifdef DATA_CACHE_SIZE
+ mov $DATA_CACHE_SIZE, %rcx
+#else
+ mov __x86_64_data_cache_size(%rip), %rcx
+#endif
+ cmp %rcx, %rdx
+ jae L(gobble_mem_fwd)
+ lea L(shl_table_fwd)(%rip), %r11
+ sub $0x80, %rdx
+ movslq (%r11, %r9, 4), %r9
+ add %r11, %r9
+ jmp *%r9
+ ud2
+
+ ALIGN (4)
+L(copy_backward):
+#ifdef DATA_CACHE_SIZE
+ mov $DATA_CACHE_SIZE, %rcx
+#else
+ mov __x86_64_data_cache_size(%rip), %rcx
+#endif
+ shl $1, %rcx
+ cmp %rcx, %rdx
+ ja L(gobble_mem_bwd)
+
+ add %rdx, %rdi
+ add %rdx, %rsi
+ movdqu -16(%rsi), %xmm0
+ lea -16(%rdi), %r8
+ mov %rdi, %r9
+ and $0xf, %r9
+ xor %r9, %rdi
+ sub %r9, %rsi
+ sub %r9, %rdx
+ mov %rsi, %r9
+ and $0xf, %r9
+ jz L(shl_0_bwd)
+ lea L(shl_table_bwd)(%rip), %r11
+ sub $0x80, %rdx
+ movslq (%r11, %r9, 4), %r9
+ add %r11, %r9
+ jmp *%r9
+ ud2
+
+ ALIGN (4)
+L(shl_0):
+
+ mov %rdx, %r9
+ shr $8, %r9
+ add %rdx, %r9
+#ifdef DATA_CACHE_SIZE
+ cmp $DATA_CACHE_SIZE_HALF, %r9
+#else
+ cmp __x86_64_data_cache_size_half(%rip), %r9
+#endif
+ jae L(gobble_mem_fwd)
+ sub $0x80, %rdx
+ ALIGN (4)
+L(shl_0_loop):
+ movdqa (%rsi), %xmm1
+ movdqa %xmm1, (%rdi)
+ movaps 0x10(%rsi), %xmm2
+ movaps %xmm2, 0x10(%rdi)
+ movaps 0x20(%rsi), %xmm3
+ movaps %xmm3, 0x20(%rdi)
+ movaps 0x30(%rsi), %xmm4
+ movaps %xmm4, 0x30(%rdi)
+ movaps 0x40(%rsi), %xmm1
+ movaps %xmm1, 0x40(%rdi)
+ movaps 0x50(%rsi), %xmm2
+ movaps %xmm2, 0x50(%rdi)
+ movaps 0x60(%rsi), %xmm3
+ movaps %xmm3, 0x60(%rdi)
+ movaps 0x70(%rsi), %xmm4
+ movaps %xmm4, 0x70(%rdi)
+ sub $0x80, %rdx
+ lea 0x80(%rsi), %rsi
+ lea 0x80(%rdi), %rdi
+ jae L(shl_0_loop)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_0_bwd):
+ sub $0x80, %rdx
+L(copy_backward_loop):
+ movaps -0x10(%rsi), %xmm1
+ movaps %xmm1, -0x10(%rdi)
+ movaps -0x20(%rsi), %xmm2
+ movaps %xmm2, -0x20(%rdi)
+ movaps -0x30(%rsi), %xmm3
+ movaps %xmm3, -0x30(%rdi)
+ movaps -0x40(%rsi), %xmm4
+ movaps %xmm4, -0x40(%rdi)
+ movaps -0x50(%rsi), %xmm5
+ movaps %xmm5, -0x50(%rdi)
+ movaps -0x60(%rsi), %xmm5
+ movaps %xmm5, -0x60(%rdi)
+ movaps -0x70(%rsi), %xmm5
+ movaps %xmm5, -0x70(%rdi)
+ movaps -0x80(%rsi), %xmm5
+ movaps %xmm5, -0x80(%rdi)
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(copy_backward_loop)
+
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_1):
+ sub $0x80, %rdx
+ movaps -0x01(%rsi), %xmm1
+ movaps 0x0f(%rsi), %xmm2
+ movaps 0x1f(%rsi), %xmm3
+ movaps 0x2f(%rsi), %xmm4
+ movaps 0x3f(%rsi), %xmm5
+ movaps 0x4f(%rsi), %xmm6
+ movaps 0x5f(%rsi), %xmm7
+ movaps 0x6f(%rsi), %xmm8
+ movaps 0x7f(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $1, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $1, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $1, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $1, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $1, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $1, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $1, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_1)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_1_bwd):
+ movaps -0x01(%rsi), %xmm1
+
+ movaps -0x11(%rsi), %xmm2
+ palignr $1, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x21(%rsi), %xmm3
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x31(%rsi), %xmm4
+ palignr $1, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x41(%rsi), %xmm5
+ palignr $1, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x51(%rsi), %xmm6
+ palignr $1, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x61(%rsi), %xmm7
+ palignr $1, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x71(%rsi), %xmm8
+ palignr $1, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x81(%rsi), %xmm9
+ palignr $1, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_1_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_2):
+ sub $0x80, %rdx
+ movaps -0x02(%rsi), %xmm1
+ movaps 0x0e(%rsi), %xmm2
+ movaps 0x1e(%rsi), %xmm3
+ movaps 0x2e(%rsi), %xmm4
+ movaps 0x3e(%rsi), %xmm5
+ movaps 0x4e(%rsi), %xmm6
+ movaps 0x5e(%rsi), %xmm7
+ movaps 0x6e(%rsi), %xmm8
+ movaps 0x7e(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $2, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $2, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $2, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $2, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $2, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $2, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $2, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_2)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_2_bwd):
+ movaps -0x02(%rsi), %xmm1
+
+ movaps -0x12(%rsi), %xmm2
+ palignr $2, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x22(%rsi), %xmm3
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x32(%rsi), %xmm4
+ palignr $2, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x42(%rsi), %xmm5
+ palignr $2, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x52(%rsi), %xmm6
+ palignr $2, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x62(%rsi), %xmm7
+ palignr $2, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x72(%rsi), %xmm8
+ palignr $2, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x82(%rsi), %xmm9
+ palignr $2, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_2_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_3):
+ sub $0x80, %rdx
+ movaps -0x03(%rsi), %xmm1
+ movaps 0x0d(%rsi), %xmm2
+ movaps 0x1d(%rsi), %xmm3
+ movaps 0x2d(%rsi), %xmm4
+ movaps 0x3d(%rsi), %xmm5
+ movaps 0x4d(%rsi), %xmm6
+ movaps 0x5d(%rsi), %xmm7
+ movaps 0x6d(%rsi), %xmm8
+ movaps 0x7d(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $3, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $3, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $3, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $3, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $3, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $3, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $3, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_3)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_3_bwd):
+ movaps -0x03(%rsi), %xmm1
+
+ movaps -0x13(%rsi), %xmm2
+ palignr $3, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x23(%rsi), %xmm3
+ palignr $3, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x33(%rsi), %xmm4
+ palignr $3, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x43(%rsi), %xmm5
+ palignr $3, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x53(%rsi), %xmm6
+ palignr $3, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x63(%rsi), %xmm7
+ palignr $3, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x73(%rsi), %xmm8
+ palignr $3, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x83(%rsi), %xmm9
+ palignr $3, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_3_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_4):
+ sub $0x80, %rdx
+ movaps -0x04(%rsi), %xmm1
+ movaps 0x0c(%rsi), %xmm2
+ movaps 0x1c(%rsi), %xmm3
+ movaps 0x2c(%rsi), %xmm4
+ movaps 0x3c(%rsi), %xmm5
+ movaps 0x4c(%rsi), %xmm6
+ movaps 0x5c(%rsi), %xmm7
+ movaps 0x6c(%rsi), %xmm8
+ movaps 0x7c(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $4, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $4, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $4, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $4, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $4, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $4, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $4, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_4)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_4_bwd):
+ movaps -0x04(%rsi), %xmm1
+
+ movaps -0x14(%rsi), %xmm2
+ palignr $4, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x24(%rsi), %xmm3
+ palignr $4, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x34(%rsi), %xmm4
+ palignr $4, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x44(%rsi), %xmm5
+ palignr $4, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x54(%rsi), %xmm6
+ palignr $4, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x64(%rsi), %xmm7
+ palignr $4, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x74(%rsi), %xmm8
+ palignr $4, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x84(%rsi), %xmm9
+ palignr $4, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_4_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_5):
+ sub $0x80, %rdx
+ movaps -0x05(%rsi), %xmm1
+ movaps 0x0b(%rsi), %xmm2
+ movaps 0x1b(%rsi), %xmm3
+ movaps 0x2b(%rsi), %xmm4
+ movaps 0x3b(%rsi), %xmm5
+ movaps 0x4b(%rsi), %xmm6
+ movaps 0x5b(%rsi), %xmm7
+ movaps 0x6b(%rsi), %xmm8
+ movaps 0x7b(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $5, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $5, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $5, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $5, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $5, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $5, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $5, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_5)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_5_bwd):
+ movaps -0x05(%rsi), %xmm1
+
+ movaps -0x15(%rsi), %xmm2
+ palignr $5, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x25(%rsi), %xmm3
+ palignr $5, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x35(%rsi), %xmm4
+ palignr $5, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x45(%rsi), %xmm5
+ palignr $5, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x55(%rsi), %xmm6
+ palignr $5, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x65(%rsi), %xmm7
+ palignr $5, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x75(%rsi), %xmm8
+ palignr $5, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x85(%rsi), %xmm9
+ palignr $5, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_5_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_6):
+ sub $0x80, %rdx
+ movaps -0x06(%rsi), %xmm1
+ movaps 0x0a(%rsi), %xmm2
+ movaps 0x1a(%rsi), %xmm3
+ movaps 0x2a(%rsi), %xmm4
+ movaps 0x3a(%rsi), %xmm5
+ movaps 0x4a(%rsi), %xmm6
+ movaps 0x5a(%rsi), %xmm7
+ movaps 0x6a(%rsi), %xmm8
+ movaps 0x7a(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $6, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $6, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $6, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $6, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $6, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $6, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $6, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_6)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_6_bwd):
+ movaps -0x06(%rsi), %xmm1
+
+ movaps -0x16(%rsi), %xmm2
+ palignr $6, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x26(%rsi), %xmm3
+ palignr $6, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x36(%rsi), %xmm4
+ palignr $6, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x46(%rsi), %xmm5
+ palignr $6, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x56(%rsi), %xmm6
+ palignr $6, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x66(%rsi), %xmm7
+ palignr $6, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x76(%rsi), %xmm8
+ palignr $6, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x86(%rsi), %xmm9
+ palignr $6, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_6_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_7):
+ sub $0x80, %rdx
+ movaps -0x07(%rsi), %xmm1
+ movaps 0x09(%rsi), %xmm2
+ movaps 0x19(%rsi), %xmm3
+ movaps 0x29(%rsi), %xmm4
+ movaps 0x39(%rsi), %xmm5
+ movaps 0x49(%rsi), %xmm6
+ movaps 0x59(%rsi), %xmm7
+ movaps 0x69(%rsi), %xmm8
+ movaps 0x79(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $7, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $7, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $7, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $7, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $7, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $7, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $7, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_7)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_7_bwd):
+ movaps -0x07(%rsi), %xmm1
+
+ movaps -0x17(%rsi), %xmm2
+ palignr $7, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x27(%rsi), %xmm3
+ palignr $7, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x37(%rsi), %xmm4
+ palignr $7, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x47(%rsi), %xmm5
+ palignr $7, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x57(%rsi), %xmm6
+ palignr $7, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x67(%rsi), %xmm7
+ palignr $7, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x77(%rsi), %xmm8
+ palignr $7, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x87(%rsi), %xmm9
+ palignr $7, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_7_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_8):
+ sub $0x80, %rdx
+ movaps -0x08(%rsi), %xmm1
+ movaps 0x08(%rsi), %xmm2
+ movaps 0x18(%rsi), %xmm3
+ movaps 0x28(%rsi), %xmm4
+ movaps 0x38(%rsi), %xmm5
+ movaps 0x48(%rsi), %xmm6
+ movaps 0x58(%rsi), %xmm7
+ movaps 0x68(%rsi), %xmm8
+ movaps 0x78(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $8, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $8, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $8, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $8, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $8, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $8, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $8, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_8)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_8_bwd):
+ movaps -0x08(%rsi), %xmm1
+
+ movaps -0x18(%rsi), %xmm2
+ palignr $8, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x28(%rsi), %xmm3
+ palignr $8, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x38(%rsi), %xmm4
+ palignr $8, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x48(%rsi), %xmm5
+ palignr $8, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x58(%rsi), %xmm6
+ palignr $8, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x68(%rsi), %xmm7
+ palignr $8, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x78(%rsi), %xmm8
+ palignr $8, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x88(%rsi), %xmm9
+ palignr $8, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_8_bwd)
+L(shl_8_end_bwd):
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_9):
+ sub $0x80, %rdx
+ movaps -0x09(%rsi), %xmm1
+ movaps 0x07(%rsi), %xmm2
+ movaps 0x17(%rsi), %xmm3
+ movaps 0x27(%rsi), %xmm4
+ movaps 0x37(%rsi), %xmm5
+ movaps 0x47(%rsi), %xmm6
+ movaps 0x57(%rsi), %xmm7
+ movaps 0x67(%rsi), %xmm8
+ movaps 0x77(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $9, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $9, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $9, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $9, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $9, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $9, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $9, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_9)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_9_bwd):
+ movaps -0x09(%rsi), %xmm1
+
+ movaps -0x19(%rsi), %xmm2
+ palignr $9, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x29(%rsi), %xmm3
+ palignr $9, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x39(%rsi), %xmm4
+ palignr $9, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x49(%rsi), %xmm5
+ palignr $9, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x59(%rsi), %xmm6
+ palignr $9, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x69(%rsi), %xmm7
+ palignr $9, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x79(%rsi), %xmm8
+ palignr $9, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x89(%rsi), %xmm9
+ palignr $9, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_9_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_10):
+ sub $0x80, %rdx
+ movaps -0x0a(%rsi), %xmm1
+ movaps 0x06(%rsi), %xmm2
+ movaps 0x16(%rsi), %xmm3
+ movaps 0x26(%rsi), %xmm4
+ movaps 0x36(%rsi), %xmm5
+ movaps 0x46(%rsi), %xmm6
+ movaps 0x56(%rsi), %xmm7
+ movaps 0x66(%rsi), %xmm8
+ movaps 0x76(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $10, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $10, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $10, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $10, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $10, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $10, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $10, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_10)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_10_bwd):
+ movaps -0x0a(%rsi), %xmm1
+
+ movaps -0x1a(%rsi), %xmm2
+ palignr $10, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x2a(%rsi), %xmm3
+ palignr $10, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x3a(%rsi), %xmm4
+ palignr $10, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x4a(%rsi), %xmm5
+ palignr $10, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x5a(%rsi), %xmm6
+ palignr $10, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x6a(%rsi), %xmm7
+ palignr $10, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x7a(%rsi), %xmm8
+ palignr $10, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x8a(%rsi), %xmm9
+ palignr $10, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_10_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_11):
+ sub $0x80, %rdx
+ movaps -0x0b(%rsi), %xmm1
+ movaps 0x05(%rsi), %xmm2
+ movaps 0x15(%rsi), %xmm3
+ movaps 0x25(%rsi), %xmm4
+ movaps 0x35(%rsi), %xmm5
+ movaps 0x45(%rsi), %xmm6
+ movaps 0x55(%rsi), %xmm7
+ movaps 0x65(%rsi), %xmm8
+ movaps 0x75(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $11, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $11, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $11, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $11, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $11, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $11, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $11, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_11)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_11_bwd):
+ movaps -0x0b(%rsi), %xmm1
+
+ movaps -0x1b(%rsi), %xmm2
+ palignr $11, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x2b(%rsi), %xmm3
+ palignr $11, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x3b(%rsi), %xmm4
+ palignr $11, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x4b(%rsi), %xmm5
+ palignr $11, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x5b(%rsi), %xmm6
+ palignr $11, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x6b(%rsi), %xmm7
+ palignr $11, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x7b(%rsi), %xmm8
+ palignr $11, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x8b(%rsi), %xmm9
+ palignr $11, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_11_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_12):
+ sub $0x80, %rdx
+ movdqa -0x0c(%rsi), %xmm1
+ movaps 0x04(%rsi), %xmm2
+ movaps 0x14(%rsi), %xmm3
+ movaps 0x24(%rsi), %xmm4
+ movaps 0x34(%rsi), %xmm5
+ movaps 0x44(%rsi), %xmm6
+ movaps 0x54(%rsi), %xmm7
+ movaps 0x64(%rsi), %xmm8
+ movaps 0x74(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $12, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $12, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $12, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $12, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $12, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $12, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $12, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+
+ lea 0x80(%rdi), %rdi
+ jae L(shl_12)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_12_bwd):
+ movaps -0x0c(%rsi), %xmm1
+
+ movaps -0x1c(%rsi), %xmm2
+ palignr $12, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x2c(%rsi), %xmm3
+ palignr $12, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x3c(%rsi), %xmm4
+ palignr $12, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x4c(%rsi), %xmm5
+ palignr $12, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x5c(%rsi), %xmm6
+ palignr $12, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x6c(%rsi), %xmm7
+ palignr $12, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x7c(%rsi), %xmm8
+ palignr $12, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x8c(%rsi), %xmm9
+ palignr $12, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_12_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_13):
+ sub $0x80, %rdx
+ movaps -0x0d(%rsi), %xmm1
+ movaps 0x03(%rsi), %xmm2
+ movaps 0x13(%rsi), %xmm3
+ movaps 0x23(%rsi), %xmm4
+ movaps 0x33(%rsi), %xmm5
+ movaps 0x43(%rsi), %xmm6
+ movaps 0x53(%rsi), %xmm7
+ movaps 0x63(%rsi), %xmm8
+ movaps 0x73(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $13, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $13, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $13, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $13, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $13, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $13, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $13, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_13)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_13_bwd):
+ movaps -0x0d(%rsi), %xmm1
+
+ movaps -0x1d(%rsi), %xmm2
+ palignr $13, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x2d(%rsi), %xmm3
+ palignr $13, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x3d(%rsi), %xmm4
+ palignr $13, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x4d(%rsi), %xmm5
+ palignr $13, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x5d(%rsi), %xmm6
+ palignr $13, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x6d(%rsi), %xmm7
+ palignr $13, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x7d(%rsi), %xmm8
+ palignr $13, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x8d(%rsi), %xmm9
+ palignr $13, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_13_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_14):
+ sub $0x80, %rdx
+ movaps -0x0e(%rsi), %xmm1
+ movaps 0x02(%rsi), %xmm2
+ movaps 0x12(%rsi), %xmm3
+ movaps 0x22(%rsi), %xmm4
+ movaps 0x32(%rsi), %xmm5
+ movaps 0x42(%rsi), %xmm6
+ movaps 0x52(%rsi), %xmm7
+ movaps 0x62(%rsi), %xmm8
+ movaps 0x72(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $14, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $14, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $14, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $14, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $14, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $14, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $14, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_14)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_14_bwd):
+ movaps -0x0e(%rsi), %xmm1
+
+ movaps -0x1e(%rsi), %xmm2
+ palignr $14, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x2e(%rsi), %xmm3
+ palignr $14, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x3e(%rsi), %xmm4
+ palignr $14, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x4e(%rsi), %xmm5
+ palignr $14, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x5e(%rsi), %xmm6
+ palignr $14, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x6e(%rsi), %xmm7
+ palignr $14, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x7e(%rsi), %xmm8
+ palignr $14, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x8e(%rsi), %xmm9
+ palignr $14, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_14_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_15):
+ sub $0x80, %rdx
+ movaps -0x0f(%rsi), %xmm1
+ movaps 0x01(%rsi), %xmm2
+ movaps 0x11(%rsi), %xmm3
+ movaps 0x21(%rsi), %xmm4
+ movaps 0x31(%rsi), %xmm5
+ movaps 0x41(%rsi), %xmm6
+ movaps 0x51(%rsi), %xmm7
+ movaps 0x61(%rsi), %xmm8
+ movaps 0x71(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $15, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $15, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $15, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $15, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $15, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $15, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $15, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_15)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_15_bwd):
+ movaps -0x0f(%rsi), %xmm1
+
+ movaps -0x1f(%rsi), %xmm2
+ palignr $15, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x2f(%rsi), %xmm3
+ palignr $15, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x3f(%rsi), %xmm4
+ palignr $15, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x4f(%rsi), %xmm5
+ palignr $15, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x5f(%rsi), %xmm6
+ palignr $15, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x6f(%rsi), %xmm7
+ palignr $15, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x7f(%rsi), %xmm8
+ palignr $15, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x8f(%rsi), %xmm9
+ palignr $15, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_15_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(gobble_mem_fwd):
+ movdqu (%rsi), %xmm1
+ movdqu %xmm0, (%r8)
+ movdqa %xmm1, (%rdi)
+ sub $16, %rdx
+ add $16, %rsi
+ add $16, %rdi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+#ifdef USE_AS_MEMMOVE
+ mov %rsi, %r9
+ sub %rdi, %r9
+ cmp %rdx, %r9
+ jae L(memmove_is_memcpy_fwd)
+ cmp %rcx, %r9
+ jbe L(ll_cache_copy_fwd_start)
+L(memmove_is_memcpy_fwd):
+#endif
+ cmp %rcx, %rdx
+ ja L(bigger_in_fwd)
+ mov %rdx, %rcx
+L(bigger_in_fwd):
+ sub %rcx, %rdx
+ cmp $0x1000, %rdx
+ jbe L(ll_cache_copy_fwd)
+
+ mov %rcx, %r9
+ shl $3, %r9
+ cmp %r9, %rdx
+ jbe L(2steps_copy_fwd)
+ add %rcx, %rdx
+ xor %rcx, %rcx
+L(2steps_copy_fwd):
+ sub $0x80, %rdx
+L(gobble_mem_fwd_loop):
+ sub $0x80, %rdx
+ prefetcht0 0x200(%rsi)
+ prefetcht0 0x300(%rsi)
+ movdqu (%rsi), %xmm0
+ movdqu 0x10(%rsi), %xmm1
+ movdqu 0x20(%rsi), %xmm2
+ movdqu 0x30(%rsi), %xmm3
+ movdqu 0x40(%rsi), %xmm4
+ movdqu 0x50(%rsi), %xmm5
+ movdqu 0x60(%rsi), %xmm6
+ movdqu 0x70(%rsi), %xmm7
+ lfence
+ movntdq %xmm0, (%rdi)
+ movntdq %xmm1, 0x10(%rdi)
+ movntdq %xmm2, 0x20(%rdi)
+ movntdq %xmm3, 0x30(%rdi)
+ movntdq %xmm4, 0x40(%rdi)
+ movntdq %xmm5, 0x50(%rdi)
+ movntdq %xmm6, 0x60(%rdi)
+ movntdq %xmm7, 0x70(%rdi)
+ lea 0x80(%rsi), %rsi
+ lea 0x80(%rdi), %rdi
+ jae L(gobble_mem_fwd_loop)
+ sfence
+ cmp $0x80, %rcx
+ jb L(gobble_mem_fwd_end)
+ add $0x80, %rdx
+L(ll_cache_copy_fwd):
+ add %rcx, %rdx
+L(ll_cache_copy_fwd_start):
+ sub $0x80, %rdx
+L(gobble_ll_loop_fwd):
+ prefetchnta 0x1c0(%rsi)
+ prefetchnta 0x280(%rsi)
+ prefetchnta 0x1c0(%rdi)
+ prefetchnta 0x280(%rdi)
+ sub $0x80, %rdx
+ movdqu (%rsi), %xmm0
+ movdqu 0x10(%rsi), %xmm1
+ movdqu 0x20(%rsi), %xmm2
+ movdqu 0x30(%rsi), %xmm3
+ movdqu 0x40(%rsi), %xmm4
+ movdqu 0x50(%rsi), %xmm5
+ movdqu 0x60(%rsi), %xmm6
+ movdqu 0x70(%rsi), %xmm7
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm1, 0x10(%rdi)
+ movdqa %xmm2, 0x20(%rdi)
+ movdqa %xmm3, 0x30(%rdi)
+ movdqa %xmm4, 0x40(%rdi)
+ movdqa %xmm5, 0x50(%rdi)
+ movdqa %xmm6, 0x60(%rdi)
+ movdqa %xmm7, 0x70(%rdi)
+ lea 0x80(%rsi), %rsi
+ lea 0x80(%rdi), %rdi
+ jae L(gobble_ll_loop_fwd)
+L(gobble_mem_fwd_end):
+ add $0x80, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(gobble_mem_bwd):
+ add %rdx, %rsi
+ add %rdx, %rdi
+
+ movdqu -16(%rsi), %xmm0
+ lea -16(%rdi), %r8
+ mov %rdi, %r9
+ and $-16, %rdi
+ sub %rdi, %r9
+ sub %r9, %rsi
+ sub %r9, %rdx
+
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+#ifdef USE_AS_MEMMOVE
+ mov %rdi, %r9
+ sub %rsi, %r9
+ cmp %rdx, %r9
+ jae L(memmove_is_memcpy_bwd)
+ cmp %rcx, %r9
+ jbe L(ll_cache_copy_bwd_start)
+L(memmove_is_memcpy_bwd):
+#endif
+ cmp %rcx, %rdx
+ ja L(bigger)
+ mov %rdx, %rcx
+L(bigger):
+ sub %rcx, %rdx
+ cmp $0x1000, %rdx
+ jbe L(ll_cache_copy)
+
+ mov %rcx, %r9
+ shl $3, %r9
+ cmp %r9, %rdx
+ jbe L(2steps_copy)
+ add %rcx, %rdx
+ xor %rcx, %rcx
+L(2steps_copy):
+ sub $0x80, %rdx
+L(gobble_mem_bwd_loop):
+ sub $0x80, %rdx
+ prefetcht0 -0x200(%rsi)
+ prefetcht0 -0x300(%rsi)
+ movdqu -0x10(%rsi), %xmm1
+ movdqu -0x20(%rsi), %xmm2
+ movdqu -0x30(%rsi), %xmm3
+ movdqu -0x40(%rsi), %xmm4
+ movdqu -0x50(%rsi), %xmm5
+ movdqu -0x60(%rsi), %xmm6
+ movdqu -0x70(%rsi), %xmm7
+ movdqu -0x80(%rsi), %xmm8
+ lfence
+ movntdq %xmm1, -0x10(%rdi)
+ movntdq %xmm2, -0x20(%rdi)
+ movntdq %xmm3, -0x30(%rdi)
+ movntdq %xmm4, -0x40(%rdi)
+ movntdq %xmm5, -0x50(%rdi)
+ movntdq %xmm6, -0x60(%rdi)
+ movntdq %xmm7, -0x70(%rdi)
+ movntdq %xmm8, -0x80(%rdi)
+ lea -0x80(%rsi), %rsi
+ lea -0x80(%rdi), %rdi
+ jae L(gobble_mem_bwd_loop)
+ sfence
+ cmp $0x80, %rcx
+ jb L(gobble_mem_bwd_end)
+ add $0x80, %rdx
+L(ll_cache_copy):
+ add %rcx, %rdx
+L(ll_cache_copy_bwd_start):
+ sub $0x80, %rdx
+L(gobble_ll_loop):
+ prefetchnta -0x1c0(%rsi)
+ prefetchnta -0x280(%rsi)
+ prefetchnta -0x1c0(%rdi)
+ prefetchnta -0x280(%rdi)
+ sub $0x80, %rdx
+ movdqu -0x10(%rsi), %xmm1
+ movdqu -0x20(%rsi), %xmm2
+ movdqu -0x30(%rsi), %xmm3
+ movdqu -0x40(%rsi), %xmm4
+ movdqu -0x50(%rsi), %xmm5
+ movdqu -0x60(%rsi), %xmm6
+ movdqu -0x70(%rsi), %xmm7
+ movdqu -0x80(%rsi), %xmm8
+ movdqa %xmm1, -0x10(%rdi)
+ movdqa %xmm2, -0x20(%rdi)
+ movdqa %xmm3, -0x30(%rdi)
+ movdqa %xmm4, -0x40(%rdi)
+ movdqa %xmm5, -0x50(%rdi)
+ movdqa %xmm6, -0x60(%rdi)
+ movdqa %xmm7, -0x70(%rdi)
+ movdqa %xmm8, -0x80(%rdi)
+ lea -0x80(%rsi), %rsi
+ lea -0x80(%rdi), %rdi
+ jae L(gobble_ll_loop)
+L(gobble_mem_bwd_end):
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rsi
+ sub %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ .p2align 4
+L(fwd_write_128bytes):
+ lddqu -128(%rsi), %xmm0
+ movdqu %xmm0, -128(%rdi)
+L(fwd_write_112bytes):
+ lddqu -112(%rsi), %xmm0
+ movdqu %xmm0, -112(%rdi)
+L(fwd_write_96bytes):
+ lddqu -96(%rsi), %xmm0
+ movdqu %xmm0, -96(%rdi)
+L(fwd_write_80bytes):
+ lddqu -80(%rsi), %xmm0
+ movdqu %xmm0, -80(%rdi)
+L(fwd_write_64bytes):
+ lddqu -64(%rsi), %xmm0
+ movdqu %xmm0, -64(%rdi)
+L(fwd_write_48bytes):
+ lddqu -48(%rsi), %xmm0
+ movdqu %xmm0, -48(%rdi)
+L(fwd_write_32bytes):
+ lddqu -32(%rsi), %xmm0
+ movdqu %xmm0, -32(%rdi)
+L(fwd_write_16bytes):
+ lddqu -16(%rsi), %xmm0
+ movdqu %xmm0, -16(%rdi)
+L(fwd_write_0bytes):
+ ret
+
+
+ .p2align 4
+L(fwd_write_143bytes):
+ lddqu -143(%rsi), %xmm0
+ movdqu %xmm0, -143(%rdi)
+L(fwd_write_127bytes):
+ lddqu -127(%rsi), %xmm0
+ movdqu %xmm0, -127(%rdi)
+L(fwd_write_111bytes):
+ lddqu -111(%rsi), %xmm0
+ movdqu %xmm0, -111(%rdi)
+L(fwd_write_95bytes):
+ lddqu -95(%rsi), %xmm0
+ movdqu %xmm0, -95(%rdi)
+L(fwd_write_79bytes):
+ lddqu -79(%rsi), %xmm0
+ movdqu %xmm0, -79(%rdi)
+L(fwd_write_63bytes):
+ lddqu -63(%rsi), %xmm0
+ movdqu %xmm0, -63(%rdi)
+L(fwd_write_47bytes):
+ lddqu -47(%rsi), %xmm0
+ movdqu %xmm0, -47(%rdi)
+L(fwd_write_31bytes):
+ lddqu -31(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -31(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_15bytes):
+ mov -15(%rsi), %rdx
+ mov -8(%rsi), %rcx
+ mov %rdx, -15(%rdi)
+ mov %rcx, -8(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_142bytes):
+ lddqu -142(%rsi), %xmm0
+ movdqu %xmm0, -142(%rdi)
+L(fwd_write_126bytes):
+ lddqu -126(%rsi), %xmm0
+ movdqu %xmm0, -126(%rdi)
+L(fwd_write_110bytes):
+ lddqu -110(%rsi), %xmm0
+ movdqu %xmm0, -110(%rdi)
+L(fwd_write_94bytes):
+ lddqu -94(%rsi), %xmm0
+ movdqu %xmm0, -94(%rdi)
+L(fwd_write_78bytes):
+ lddqu -78(%rsi), %xmm0
+ movdqu %xmm0, -78(%rdi)
+L(fwd_write_62bytes):
+ lddqu -62(%rsi), %xmm0
+ movdqu %xmm0, -62(%rdi)
+L(fwd_write_46bytes):
+ lddqu -46(%rsi), %xmm0
+ movdqu %xmm0, -46(%rdi)
+L(fwd_write_30bytes):
+ lddqu -30(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -30(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_14bytes):
+ mov -14(%rsi), %rdx
+ mov -8(%rsi), %rcx
+ mov %rdx, -14(%rdi)
+ mov %rcx, -8(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_141bytes):
+ lddqu -141(%rsi), %xmm0
+ movdqu %xmm0, -141(%rdi)
+L(fwd_write_125bytes):
+ lddqu -125(%rsi), %xmm0
+ movdqu %xmm0, -125(%rdi)
+L(fwd_write_109bytes):
+ lddqu -109(%rsi), %xmm0
+ movdqu %xmm0, -109(%rdi)
+L(fwd_write_93bytes):
+ lddqu -93(%rsi), %xmm0
+ movdqu %xmm0, -93(%rdi)
+L(fwd_write_77bytes):
+ lddqu -77(%rsi), %xmm0
+ movdqu %xmm0, -77(%rdi)
+L(fwd_write_61bytes):
+ lddqu -61(%rsi), %xmm0
+ movdqu %xmm0, -61(%rdi)
+L(fwd_write_45bytes):
+ lddqu -45(%rsi), %xmm0
+ movdqu %xmm0, -45(%rdi)
+L(fwd_write_29bytes):
+ lddqu -29(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -29(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_13bytes):
+ mov -13(%rsi), %rdx
+ mov -8(%rsi), %rcx
+ mov %rdx, -13(%rdi)
+ mov %rcx, -8(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_140bytes):
+ lddqu -140(%rsi), %xmm0
+ movdqu %xmm0, -140(%rdi)
+L(fwd_write_124bytes):
+ lddqu -124(%rsi), %xmm0
+ movdqu %xmm0, -124(%rdi)
+L(fwd_write_108bytes):
+ lddqu -108(%rsi), %xmm0
+ movdqu %xmm0, -108(%rdi)
+L(fwd_write_92bytes):
+ lddqu -92(%rsi), %xmm0
+ movdqu %xmm0, -92(%rdi)
+L(fwd_write_76bytes):
+ lddqu -76(%rsi), %xmm0
+ movdqu %xmm0, -76(%rdi)
+L(fwd_write_60bytes):
+ lddqu -60(%rsi), %xmm0
+ movdqu %xmm0, -60(%rdi)
+L(fwd_write_44bytes):
+ lddqu -44(%rsi), %xmm0
+ movdqu %xmm0, -44(%rdi)
+L(fwd_write_28bytes):
+ lddqu -28(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -28(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_12bytes):
+ mov -12(%rsi), %rdx
+ mov -4(%rsi), %ecx
+ mov %rdx, -12(%rdi)
+ mov %ecx, -4(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_139bytes):
+ lddqu -139(%rsi), %xmm0
+ movdqu %xmm0, -139(%rdi)
+L(fwd_write_123bytes):
+ lddqu -123(%rsi), %xmm0
+ movdqu %xmm0, -123(%rdi)
+L(fwd_write_107bytes):
+ lddqu -107(%rsi), %xmm0
+ movdqu %xmm0, -107(%rdi)
+L(fwd_write_91bytes):
+ lddqu -91(%rsi), %xmm0
+ movdqu %xmm0, -91(%rdi)
+L(fwd_write_75bytes):
+ lddqu -75(%rsi), %xmm0
+ movdqu %xmm0, -75(%rdi)
+L(fwd_write_59bytes):
+ lddqu -59(%rsi), %xmm0
+ movdqu %xmm0, -59(%rdi)
+L(fwd_write_43bytes):
+ lddqu -43(%rsi), %xmm0
+ movdqu %xmm0, -43(%rdi)
+L(fwd_write_27bytes):
+ lddqu -27(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -27(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_11bytes):
+ mov -11(%rsi), %rdx
+ mov -4(%rsi), %ecx
+ mov %rdx, -11(%rdi)
+ mov %ecx, -4(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_138bytes):
+ lddqu -138(%rsi), %xmm0
+ movdqu %xmm0, -138(%rdi)
+L(fwd_write_122bytes):
+ lddqu -122(%rsi), %xmm0
+ movdqu %xmm0, -122(%rdi)
+L(fwd_write_106bytes):
+ lddqu -106(%rsi), %xmm0
+ movdqu %xmm0, -106(%rdi)
+L(fwd_write_90bytes):
+ lddqu -90(%rsi), %xmm0
+ movdqu %xmm0, -90(%rdi)
+L(fwd_write_74bytes):
+ lddqu -74(%rsi), %xmm0
+ movdqu %xmm0, -74(%rdi)
+L(fwd_write_58bytes):
+ lddqu -58(%rsi), %xmm0
+ movdqu %xmm0, -58(%rdi)
+L(fwd_write_42bytes):
+ lddqu -42(%rsi), %xmm0
+ movdqu %xmm0, -42(%rdi)
+L(fwd_write_26bytes):
+ lddqu -26(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -26(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_10bytes):
+ mov -10(%rsi), %rdx
+ mov -4(%rsi), %ecx
+ mov %rdx, -10(%rdi)
+ mov %ecx, -4(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_137bytes):
+ lddqu -137(%rsi), %xmm0
+ movdqu %xmm0, -137(%rdi)
+L(fwd_write_121bytes):
+ lddqu -121(%rsi), %xmm0
+ movdqu %xmm0, -121(%rdi)
+L(fwd_write_105bytes):
+ lddqu -105(%rsi), %xmm0
+ movdqu %xmm0, -105(%rdi)
+L(fwd_write_89bytes):
+ lddqu -89(%rsi), %xmm0
+ movdqu %xmm0, -89(%rdi)
+L(fwd_write_73bytes):
+ lddqu -73(%rsi), %xmm0
+ movdqu %xmm0, -73(%rdi)
+L(fwd_write_57bytes):
+ lddqu -57(%rsi), %xmm0
+ movdqu %xmm0, -57(%rdi)
+L(fwd_write_41bytes):
+ lddqu -41(%rsi), %xmm0
+ movdqu %xmm0, -41(%rdi)
+L(fwd_write_25bytes):
+ lddqu -25(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -25(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_9bytes):
+ mov -9(%rsi), %rdx
+ mov -4(%rsi), %ecx
+ mov %rdx, -9(%rdi)
+ mov %ecx, -4(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_136bytes):
+ lddqu -136(%rsi), %xmm0
+ movdqu %xmm0, -136(%rdi)
+L(fwd_write_120bytes):
+ lddqu -120(%rsi), %xmm0
+ movdqu %xmm0, -120(%rdi)
+L(fwd_write_104bytes):
+ lddqu -104(%rsi), %xmm0
+ movdqu %xmm0, -104(%rdi)
+L(fwd_write_88bytes):
+ lddqu -88(%rsi), %xmm0
+ movdqu %xmm0, -88(%rdi)
+L(fwd_write_72bytes):
+ lddqu -72(%rsi), %xmm0
+ movdqu %xmm0, -72(%rdi)
+L(fwd_write_56bytes):
+ lddqu -56(%rsi), %xmm0
+ movdqu %xmm0, -56(%rdi)
+L(fwd_write_40bytes):
+ lddqu -40(%rsi), %xmm0
+ movdqu %xmm0, -40(%rdi)
+L(fwd_write_24bytes):
+ lddqu -24(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -24(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_8bytes):
+ mov -8(%rsi), %rdx
+ mov %rdx, -8(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_135bytes):
+ lddqu -135(%rsi), %xmm0
+ movdqu %xmm0, -135(%rdi)
+L(fwd_write_119bytes):
+ lddqu -119(%rsi), %xmm0
+ movdqu %xmm0, -119(%rdi)
+L(fwd_write_103bytes):
+ lddqu -103(%rsi), %xmm0
+ movdqu %xmm0, -103(%rdi)
+L(fwd_write_87bytes):
+ lddqu -87(%rsi), %xmm0
+ movdqu %xmm0, -87(%rdi)
+L(fwd_write_71bytes):
+ lddqu -71(%rsi), %xmm0
+ movdqu %xmm0, -71(%rdi)
+L(fwd_write_55bytes):
+ lddqu -55(%rsi), %xmm0
+ movdqu %xmm0, -55(%rdi)
+L(fwd_write_39bytes):
+ lddqu -39(%rsi), %xmm0
+ movdqu %xmm0, -39(%rdi)
+L(fwd_write_23bytes):
+ lddqu -23(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -23(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_7bytes):
+ mov -7(%rsi), %edx
+ mov -4(%rsi), %ecx
+ mov %edx, -7(%rdi)
+ mov %ecx, -4(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_134bytes):
+ lddqu -134(%rsi), %xmm0
+ movdqu %xmm0, -134(%rdi)
+L(fwd_write_118bytes):
+ lddqu -118(%rsi), %xmm0
+ movdqu %xmm0, -118(%rdi)
+L(fwd_write_102bytes):
+ lddqu -102(%rsi), %xmm0
+ movdqu %xmm0, -102(%rdi)
+L(fwd_write_86bytes):
+ lddqu -86(%rsi), %xmm0
+ movdqu %xmm0, -86(%rdi)
+L(fwd_write_70bytes):
+ lddqu -70(%rsi), %xmm0
+ movdqu %xmm0, -70(%rdi)
+L(fwd_write_54bytes):
+ lddqu -54(%rsi), %xmm0
+ movdqu %xmm0, -54(%rdi)
+L(fwd_write_38bytes):
+ lddqu -38(%rsi), %xmm0
+ movdqu %xmm0, -38(%rdi)
+L(fwd_write_22bytes):
+ lddqu -22(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -22(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_6bytes):
+ mov -6(%rsi), %edx
+ mov -4(%rsi), %ecx
+ mov %edx, -6(%rdi)
+ mov %ecx, -4(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_133bytes):
+ lddqu -133(%rsi), %xmm0
+ movdqu %xmm0, -133(%rdi)
+L(fwd_write_117bytes):
+ lddqu -117(%rsi), %xmm0
+ movdqu %xmm0, -117(%rdi)
+L(fwd_write_101bytes):
+ lddqu -101(%rsi), %xmm0
+ movdqu %xmm0, -101(%rdi)
+L(fwd_write_85bytes):
+ lddqu -85(%rsi), %xmm0
+ movdqu %xmm0, -85(%rdi)
+L(fwd_write_69bytes):
+ lddqu -69(%rsi), %xmm0
+ movdqu %xmm0, -69(%rdi)
+L(fwd_write_53bytes):
+ lddqu -53(%rsi), %xmm0
+ movdqu %xmm0, -53(%rdi)
+L(fwd_write_37bytes):
+ lddqu -37(%rsi), %xmm0
+ movdqu %xmm0, -37(%rdi)
+L(fwd_write_21bytes):
+ lddqu -21(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -21(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_5bytes):
+ mov -5(%rsi), %edx
+ mov -4(%rsi), %ecx
+ mov %edx, -5(%rdi)
+ mov %ecx, -4(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_132bytes):
+ lddqu -132(%rsi), %xmm0
+ movdqu %xmm0, -132(%rdi)
+L(fwd_write_116bytes):
+ lddqu -116(%rsi), %xmm0
+ movdqu %xmm0, -116(%rdi)
+L(fwd_write_100bytes):
+ lddqu -100(%rsi), %xmm0
+ movdqu %xmm0, -100(%rdi)
+L(fwd_write_84bytes):
+ lddqu -84(%rsi), %xmm0
+ movdqu %xmm0, -84(%rdi)
+L(fwd_write_68bytes):
+ lddqu -68(%rsi), %xmm0
+ movdqu %xmm0, -68(%rdi)
+L(fwd_write_52bytes):
+ lddqu -52(%rsi), %xmm0
+ movdqu %xmm0, -52(%rdi)
+L(fwd_write_36bytes):
+ lddqu -36(%rsi), %xmm0
+ movdqu %xmm0, -36(%rdi)
+L(fwd_write_20bytes):
+ lddqu -20(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -20(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_4bytes):
+ mov -4(%rsi), %edx
+ mov %edx, -4(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_131bytes):
+ lddqu -131(%rsi), %xmm0
+ movdqu %xmm0, -131(%rdi)
+L(fwd_write_115bytes):
+ lddqu -115(%rsi), %xmm0
+ movdqu %xmm0, -115(%rdi)
+L(fwd_write_99bytes):
+ lddqu -99(%rsi), %xmm0
+ movdqu %xmm0, -99(%rdi)
+L(fwd_write_83bytes):
+ lddqu -83(%rsi), %xmm0
+ movdqu %xmm0, -83(%rdi)
+L(fwd_write_67bytes):
+ lddqu -67(%rsi), %xmm0
+ movdqu %xmm0, -67(%rdi)
+L(fwd_write_51bytes):
+ lddqu -51(%rsi), %xmm0
+ movdqu %xmm0, -51(%rdi)
+L(fwd_write_35bytes):
+ lddqu -35(%rsi), %xmm0
+ movdqu %xmm0, -35(%rdi)
+L(fwd_write_19bytes):
+ lddqu -19(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -19(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_3bytes):
+ mov -3(%rsi), %dx
+ mov -2(%rsi), %cx
+ mov %dx, -3(%rdi)
+ mov %cx, -2(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_130bytes):
+ lddqu -130(%rsi), %xmm0
+ movdqu %xmm0, -130(%rdi)
+L(fwd_write_114bytes):
+ lddqu -114(%rsi), %xmm0
+ movdqu %xmm0, -114(%rdi)
+L(fwd_write_98bytes):
+ lddqu -98(%rsi), %xmm0
+ movdqu %xmm0, -98(%rdi)
+L(fwd_write_82bytes):
+ lddqu -82(%rsi), %xmm0
+ movdqu %xmm0, -82(%rdi)
+L(fwd_write_66bytes):
+ lddqu -66(%rsi), %xmm0
+ movdqu %xmm0, -66(%rdi)
+L(fwd_write_50bytes):
+ lddqu -50(%rsi), %xmm0
+ movdqu %xmm0, -50(%rdi)
+L(fwd_write_34bytes):
+ lddqu -34(%rsi), %xmm0
+ movdqu %xmm0, -34(%rdi)
+L(fwd_write_18bytes):
+ lddqu -18(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -18(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_2bytes):
+ movzwl -2(%rsi), %edx
+ mov %dx, -2(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_129bytes):
+ lddqu -129(%rsi), %xmm0
+ movdqu %xmm0, -129(%rdi)
+L(fwd_write_113bytes):
+ lddqu -113(%rsi), %xmm0
+ movdqu %xmm0, -113(%rdi)
+L(fwd_write_97bytes):
+ lddqu -97(%rsi), %xmm0
+ movdqu %xmm0, -97(%rdi)
+L(fwd_write_81bytes):
+ lddqu -81(%rsi), %xmm0
+ movdqu %xmm0, -81(%rdi)
+L(fwd_write_65bytes):
+ lddqu -65(%rsi), %xmm0
+ movdqu %xmm0, -65(%rdi)
+L(fwd_write_49bytes):
+ lddqu -49(%rsi), %xmm0
+ movdqu %xmm0, -49(%rdi)
+L(fwd_write_33bytes):
+ lddqu -33(%rsi), %xmm0
+ movdqu %xmm0, -33(%rdi)
+L(fwd_write_17bytes):
+ lddqu -17(%rsi), %xmm0
+ lddqu -16(%rsi), %xmm1
+ movdqu %xmm0, -17(%rdi)
+ movdqu %xmm1, -16(%rdi)
+ ret
+
+ .p2align 4
+L(fwd_write_1bytes):
+ movzbl -1(%rsi), %edx
+ mov %dl, -1(%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_128bytes):
+ lddqu 112(%rsi), %xmm0
+ movdqu %xmm0, 112(%rdi)
+L(bwd_write_112bytes):
+ lddqu 96(%rsi), %xmm0
+ movdqu %xmm0, 96(%rdi)
+L(bwd_write_96bytes):
+ lddqu 80(%rsi), %xmm0
+ movdqu %xmm0, 80(%rdi)
+L(bwd_write_80bytes):
+ lddqu 64(%rsi), %xmm0
+ movdqu %xmm0, 64(%rdi)
+L(bwd_write_64bytes):
+ lddqu 48(%rsi), %xmm0
+ movdqu %xmm0, 48(%rdi)
+L(bwd_write_48bytes):
+ lddqu 32(%rsi), %xmm0
+ movdqu %xmm0, 32(%rdi)
+L(bwd_write_32bytes):
+ lddqu 16(%rsi), %xmm0
+ movdqu %xmm0, 16(%rdi)
+L(bwd_write_16bytes):
+ lddqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+L(bwd_write_0bytes):
+ ret
+
+ .p2align 4
+L(bwd_write_143bytes):
+ lddqu 127(%rsi), %xmm0
+ movdqu %xmm0, 127(%rdi)
+L(bwd_write_127bytes):
+ lddqu 111(%rsi), %xmm0
+ movdqu %xmm0, 111(%rdi)
+L(bwd_write_111bytes):
+ lddqu 95(%rsi), %xmm0
+ movdqu %xmm0, 95(%rdi)
+L(bwd_write_95bytes):
+ lddqu 79(%rsi), %xmm0
+ movdqu %xmm0, 79(%rdi)
+L(bwd_write_79bytes):
+ lddqu 63(%rsi), %xmm0
+ movdqu %xmm0, 63(%rdi)
+L(bwd_write_63bytes):
+ lddqu 47(%rsi), %xmm0
+ movdqu %xmm0, 47(%rdi)
+L(bwd_write_47bytes):
+ lddqu 31(%rsi), %xmm0
+ movdqu %xmm0, 31(%rdi)
+L(bwd_write_31bytes):
+ lddqu 15(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 15(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+
+ .p2align 4
+L(bwd_write_15bytes):
+ mov 7(%rsi), %rdx
+ mov (%rsi), %rcx
+ mov %rdx, 7(%rdi)
+ mov %rcx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_142bytes):
+ lddqu 126(%rsi), %xmm0
+ movdqu %xmm0, 126(%rdi)
+L(bwd_write_126bytes):
+ lddqu 110(%rsi), %xmm0
+ movdqu %xmm0, 110(%rdi)
+L(bwd_write_110bytes):
+ lddqu 94(%rsi), %xmm0
+ movdqu %xmm0, 94(%rdi)
+L(bwd_write_94bytes):
+ lddqu 78(%rsi), %xmm0
+ movdqu %xmm0, 78(%rdi)
+L(bwd_write_78bytes):
+ lddqu 62(%rsi), %xmm0
+ movdqu %xmm0, 62(%rdi)
+L(bwd_write_62bytes):
+ lddqu 46(%rsi), %xmm0
+ movdqu %xmm0, 46(%rdi)
+L(bwd_write_46bytes):
+ lddqu 30(%rsi), %xmm0
+ movdqu %xmm0, 30(%rdi)
+L(bwd_write_30bytes):
+ lddqu 14(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 14(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_14bytes):
+ mov 6(%rsi), %rdx
+ mov (%rsi), %rcx
+ mov %rdx, 6(%rdi)
+ mov %rcx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_141bytes):
+ lddqu 125(%rsi), %xmm0
+ movdqu %xmm0, 125(%rdi)
+L(bwd_write_125bytes):
+ lddqu 109(%rsi), %xmm0
+ movdqu %xmm0, 109(%rdi)
+L(bwd_write_109bytes):
+ lddqu 93(%rsi), %xmm0
+ movdqu %xmm0, 93(%rdi)
+L(bwd_write_93bytes):
+ lddqu 77(%rsi), %xmm0
+ movdqu %xmm0, 77(%rdi)
+L(bwd_write_77bytes):
+ lddqu 61(%rsi), %xmm0
+ movdqu %xmm0, 61(%rdi)
+L(bwd_write_61bytes):
+ lddqu 45(%rsi), %xmm0
+ movdqu %xmm0, 45(%rdi)
+L(bwd_write_45bytes):
+ lddqu 29(%rsi), %xmm0
+ movdqu %xmm0, 29(%rdi)
+L(bwd_write_29bytes):
+ lddqu 13(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 13(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_13bytes):
+ mov 5(%rsi), %rdx
+ mov (%rsi), %rcx
+ mov %rdx, 5(%rdi)
+ mov %rcx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_140bytes):
+ lddqu 124(%rsi), %xmm0
+ movdqu %xmm0, 124(%rdi)
+L(bwd_write_124bytes):
+ lddqu 108(%rsi), %xmm0
+ movdqu %xmm0, 108(%rdi)
+L(bwd_write_108bytes):
+ lddqu 92(%rsi), %xmm0
+ movdqu %xmm0, 92(%rdi)
+L(bwd_write_92bytes):
+ lddqu 76(%rsi), %xmm0
+ movdqu %xmm0, 76(%rdi)
+L(bwd_write_76bytes):
+ lddqu 60(%rsi), %xmm0
+ movdqu %xmm0, 60(%rdi)
+L(bwd_write_60bytes):
+ lddqu 44(%rsi), %xmm0
+ movdqu %xmm0, 44(%rdi)
+L(bwd_write_44bytes):
+ lddqu 28(%rsi), %xmm0
+ movdqu %xmm0, 28(%rdi)
+L(bwd_write_28bytes):
+ lddqu 12(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 12(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_12bytes):
+ mov 4(%rsi), %rdx
+ mov (%rsi), %rcx
+ mov %rdx, 4(%rdi)
+ mov %rcx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_139bytes):
+ lddqu 123(%rsi), %xmm0
+ movdqu %xmm0, 123(%rdi)
+L(bwd_write_123bytes):
+ lddqu 107(%rsi), %xmm0
+ movdqu %xmm0, 107(%rdi)
+L(bwd_write_107bytes):
+ lddqu 91(%rsi), %xmm0
+ movdqu %xmm0, 91(%rdi)
+L(bwd_write_91bytes):
+ lddqu 75(%rsi), %xmm0
+ movdqu %xmm0, 75(%rdi)
+L(bwd_write_75bytes):
+ lddqu 59(%rsi), %xmm0
+ movdqu %xmm0, 59(%rdi)
+L(bwd_write_59bytes):
+ lddqu 43(%rsi), %xmm0
+ movdqu %xmm0, 43(%rdi)
+L(bwd_write_43bytes):
+ lddqu 27(%rsi), %xmm0
+ movdqu %xmm0, 27(%rdi)
+L(bwd_write_27bytes):
+ lddqu 11(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 11(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_11bytes):
+ mov 3(%rsi), %rdx
+ mov (%rsi), %rcx
+ mov %rdx, 3(%rdi)
+ mov %rcx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_138bytes):
+ lddqu 122(%rsi), %xmm0
+ movdqu %xmm0, 122(%rdi)
+L(bwd_write_122bytes):
+ lddqu 106(%rsi), %xmm0
+ movdqu %xmm0, 106(%rdi)
+L(bwd_write_106bytes):
+ lddqu 90(%rsi), %xmm0
+ movdqu %xmm0, 90(%rdi)
+L(bwd_write_90bytes):
+ lddqu 74(%rsi), %xmm0
+ movdqu %xmm0, 74(%rdi)
+L(bwd_write_74bytes):
+ lddqu 58(%rsi), %xmm0
+ movdqu %xmm0, 58(%rdi)
+L(bwd_write_58bytes):
+ lddqu 42(%rsi), %xmm0
+ movdqu %xmm0, 42(%rdi)
+L(bwd_write_42bytes):
+ lddqu 26(%rsi), %xmm0
+ movdqu %xmm0, 26(%rdi)
+L(bwd_write_26bytes):
+ lddqu 10(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 10(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_10bytes):
+ mov 2(%rsi), %rdx
+ mov (%rsi), %rcx
+ mov %rdx, 2(%rdi)
+ mov %rcx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_137bytes):
+ lddqu 121(%rsi), %xmm0
+ movdqu %xmm0, 121(%rdi)
+L(bwd_write_121bytes):
+ lddqu 105(%rsi), %xmm0
+ movdqu %xmm0, 105(%rdi)
+L(bwd_write_105bytes):
+ lddqu 89(%rsi), %xmm0
+ movdqu %xmm0, 89(%rdi)
+L(bwd_write_89bytes):
+ lddqu 73(%rsi), %xmm0
+ movdqu %xmm0, 73(%rdi)
+L(bwd_write_73bytes):
+ lddqu 57(%rsi), %xmm0
+ movdqu %xmm0, 57(%rdi)
+L(bwd_write_57bytes):
+ lddqu 41(%rsi), %xmm0
+ movdqu %xmm0, 41(%rdi)
+L(bwd_write_41bytes):
+ lddqu 25(%rsi), %xmm0
+ movdqu %xmm0, 25(%rdi)
+L(bwd_write_25bytes):
+ lddqu 9(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 9(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_9bytes):
+ mov 1(%rsi), %rdx
+ mov (%rsi), %rcx
+ mov %rdx, 1(%rdi)
+ mov %rcx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_136bytes):
+ lddqu 120(%rsi), %xmm0
+ movdqu %xmm0, 120(%rdi)
+L(bwd_write_120bytes):
+ lddqu 104(%rsi), %xmm0
+ movdqu %xmm0, 104(%rdi)
+L(bwd_write_104bytes):
+ lddqu 88(%rsi), %xmm0
+ movdqu %xmm0, 88(%rdi)
+L(bwd_write_88bytes):
+ lddqu 72(%rsi), %xmm0
+ movdqu %xmm0, 72(%rdi)
+L(bwd_write_72bytes):
+ lddqu 56(%rsi), %xmm0
+ movdqu %xmm0, 56(%rdi)
+L(bwd_write_56bytes):
+ lddqu 40(%rsi), %xmm0
+ movdqu %xmm0, 40(%rdi)
+L(bwd_write_40bytes):
+ lddqu 24(%rsi), %xmm0
+ movdqu %xmm0, 24(%rdi)
+L(bwd_write_24bytes):
+ lddqu 8(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 8(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_8bytes):
+ mov (%rsi), %rdx
+ mov %rdx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_135bytes):
+ lddqu 119(%rsi), %xmm0
+ movdqu %xmm0, 119(%rdi)
+L(bwd_write_119bytes):
+ lddqu 103(%rsi), %xmm0
+ movdqu %xmm0, 103(%rdi)
+L(bwd_write_103bytes):
+ lddqu 87(%rsi), %xmm0
+ movdqu %xmm0, 87(%rdi)
+L(bwd_write_87bytes):
+ lddqu 71(%rsi), %xmm0
+ movdqu %xmm0, 71(%rdi)
+L(bwd_write_71bytes):
+ lddqu 55(%rsi), %xmm0
+ movdqu %xmm0, 55(%rdi)
+L(bwd_write_55bytes):
+ lddqu 39(%rsi), %xmm0
+ movdqu %xmm0, 39(%rdi)
+L(bwd_write_39bytes):
+ lddqu 23(%rsi), %xmm0
+ movdqu %xmm0, 23(%rdi)
+L(bwd_write_23bytes):
+ lddqu 7(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 7(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_7bytes):
+ mov 3(%rsi), %edx
+ mov (%rsi), %ecx
+ mov %edx, 3(%rdi)
+ mov %ecx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_134bytes):
+ lddqu 118(%rsi), %xmm0
+ movdqu %xmm0, 118(%rdi)
+L(bwd_write_118bytes):
+ lddqu 102(%rsi), %xmm0
+ movdqu %xmm0, 102(%rdi)
+L(bwd_write_102bytes):
+ lddqu 86(%rsi), %xmm0
+ movdqu %xmm0, 86(%rdi)
+L(bwd_write_86bytes):
+ lddqu 70(%rsi), %xmm0
+ movdqu %xmm0, 70(%rdi)
+L(bwd_write_70bytes):
+ lddqu 54(%rsi), %xmm0
+ movdqu %xmm0, 54(%rdi)
+L(bwd_write_54bytes):
+ lddqu 38(%rsi), %xmm0
+ movdqu %xmm0, 38(%rdi)
+L(bwd_write_38bytes):
+ lddqu 22(%rsi), %xmm0
+ movdqu %xmm0, 22(%rdi)
+L(bwd_write_22bytes):
+ lddqu 6(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 6(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_6bytes):
+ mov 2(%rsi), %edx
+ mov (%rsi), %ecx
+ mov %edx, 2(%rdi)
+ mov %ecx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_133bytes):
+ lddqu 117(%rsi), %xmm0
+ movdqu %xmm0, 117(%rdi)
+L(bwd_write_117bytes):
+ lddqu 101(%rsi), %xmm0
+ movdqu %xmm0, 101(%rdi)
+L(bwd_write_101bytes):
+ lddqu 85(%rsi), %xmm0
+ movdqu %xmm0, 85(%rdi)
+L(bwd_write_85bytes):
+ lddqu 69(%rsi), %xmm0
+ movdqu %xmm0, 69(%rdi)
+L(bwd_write_69bytes):
+ lddqu 53(%rsi), %xmm0
+ movdqu %xmm0, 53(%rdi)
+L(bwd_write_53bytes):
+ lddqu 37(%rsi), %xmm0
+ movdqu %xmm0, 37(%rdi)
+L(bwd_write_37bytes):
+ lddqu 21(%rsi), %xmm0
+ movdqu %xmm0, 21(%rdi)
+L(bwd_write_21bytes):
+ lddqu 5(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 5(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_5bytes):
+ mov 1(%rsi), %edx
+ mov (%rsi), %ecx
+ mov %edx, 1(%rdi)
+ mov %ecx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_132bytes):
+ lddqu 116(%rsi), %xmm0
+ movdqu %xmm0, 116(%rdi)
+L(bwd_write_116bytes):
+ lddqu 100(%rsi), %xmm0
+ movdqu %xmm0, 100(%rdi)
+L(bwd_write_100bytes):
+ lddqu 84(%rsi), %xmm0
+ movdqu %xmm0, 84(%rdi)
+L(bwd_write_84bytes):
+ lddqu 68(%rsi), %xmm0
+ movdqu %xmm0, 68(%rdi)
+L(bwd_write_68bytes):
+ lddqu 52(%rsi), %xmm0
+ movdqu %xmm0, 52(%rdi)
+L(bwd_write_52bytes):
+ lddqu 36(%rsi), %xmm0
+ movdqu %xmm0, 36(%rdi)
+L(bwd_write_36bytes):
+ lddqu 20(%rsi), %xmm0
+ movdqu %xmm0, 20(%rdi)
+L(bwd_write_20bytes):
+ lddqu 4(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 4(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_4bytes):
+ mov (%rsi), %edx
+ mov %edx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_131bytes):
+ lddqu 115(%rsi), %xmm0
+ movdqu %xmm0, 115(%rdi)
+L(bwd_write_115bytes):
+ lddqu 99(%rsi), %xmm0
+ movdqu %xmm0, 99(%rdi)
+L(bwd_write_99bytes):
+ lddqu 83(%rsi), %xmm0
+ movdqu %xmm0, 83(%rdi)
+L(bwd_write_83bytes):
+ lddqu 67(%rsi), %xmm0
+ movdqu %xmm0, 67(%rdi)
+L(bwd_write_67bytes):
+ lddqu 51(%rsi), %xmm0
+ movdqu %xmm0, 51(%rdi)
+L(bwd_write_51bytes):
+ lddqu 35(%rsi), %xmm0
+ movdqu %xmm0, 35(%rdi)
+L(bwd_write_35bytes):
+ lddqu 19(%rsi), %xmm0
+ movdqu %xmm0, 19(%rdi)
+L(bwd_write_19bytes):
+ lddqu 3(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 3(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_3bytes):
+ mov 1(%rsi), %dx
+ mov (%rsi), %cx
+ mov %dx, 1(%rdi)
+ mov %cx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_130bytes):
+ lddqu 114(%rsi), %xmm0
+ movdqu %xmm0, 114(%rdi)
+L(bwd_write_114bytes):
+ lddqu 98(%rsi), %xmm0
+ movdqu %xmm0, 98(%rdi)
+L(bwd_write_98bytes):
+ lddqu 82(%rsi), %xmm0
+ movdqu %xmm0, 82(%rdi)
+L(bwd_write_82bytes):
+ lddqu 66(%rsi), %xmm0
+ movdqu %xmm0, 66(%rdi)
+L(bwd_write_66bytes):
+ lddqu 50(%rsi), %xmm0
+ movdqu %xmm0, 50(%rdi)
+L(bwd_write_50bytes):
+ lddqu 34(%rsi), %xmm0
+ movdqu %xmm0, 34(%rdi)
+L(bwd_write_34bytes):
+ lddqu 18(%rsi), %xmm0
+ movdqu %xmm0, 18(%rdi)
+L(bwd_write_18bytes):
+ lddqu 2(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 2(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_2bytes):
+ movzwl (%rsi), %edx
+ mov %dx, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_129bytes):
+ lddqu 113(%rsi), %xmm0
+ movdqu %xmm0, 113(%rdi)
+L(bwd_write_113bytes):
+ lddqu 97(%rsi), %xmm0
+ movdqu %xmm0, 97(%rdi)
+L(bwd_write_97bytes):
+ lddqu 81(%rsi), %xmm0
+ movdqu %xmm0, 81(%rdi)
+L(bwd_write_81bytes):
+ lddqu 65(%rsi), %xmm0
+ movdqu %xmm0, 65(%rdi)
+L(bwd_write_65bytes):
+ lddqu 49(%rsi), %xmm0
+ movdqu %xmm0, 49(%rdi)
+L(bwd_write_49bytes):
+ lddqu 33(%rsi), %xmm0
+ movdqu %xmm0, 33(%rdi)
+L(bwd_write_33bytes):
+ lddqu 17(%rsi), %xmm0
+ movdqu %xmm0, 17(%rdi)
+L(bwd_write_17bytes):
+ lddqu 1(%rsi), %xmm0
+ lddqu (%rsi), %xmm1
+ movdqu %xmm0, 1(%rdi)
+ movdqu %xmm1, (%rdi)
+ ret
+
+ .p2align 4
+L(bwd_write_1bytes):
+ movzbl (%rsi), %edx
+ mov %dl, (%rdi)
+ ret
+
+END (MEMCPY)
+
+ .section .rodata.ssse3,"a",@progbits
+/* Jump table for the backward (tail-first) copy paths: entry N is the
+   32-bit offset of L(bwd_write_Nbytes) relative to the table base
+   (see JMPTBL / BRANCH_TO_JMPTBL_ENTRY), for N = 0..143.  */
+ ALIGN (3)
+L(table_144_bytes_bwd):
+ .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
+ .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
+
+/* Jump table for the forward copy paths: entry N is the 32-bit offset
+   of L(fwd_write_Nbytes) (defined earlier in this file, outside this
+   hunk) relative to the table base, for N = 0..143.  */
+ ALIGN (3)
+L(table_144_bytes_fwd):
+ .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
+ .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
+
+/* 16-entry jump table mapping a byte shift amount 0..15 to the
+   corresponding L(shl_N) forward-copy loop; entries are 32-bit
+   offsets relative to the table base.  Presumably indexed by the
+   source-alignment residue (src & 0xf) — confirm against the
+   dispatch code earlier in this file.  */
+ ALIGN (3)
+L(shl_table_fwd):
+ .int JMPTBL (L(shl_0), L(shl_table_fwd))
+ .int JMPTBL (L(shl_1), L(shl_table_fwd))
+ .int JMPTBL (L(shl_2), L(shl_table_fwd))
+ .int JMPTBL (L(shl_3), L(shl_table_fwd))
+ .int JMPTBL (L(shl_4), L(shl_table_fwd))
+ .int JMPTBL (L(shl_5), L(shl_table_fwd))
+ .int JMPTBL (L(shl_6), L(shl_table_fwd))
+ .int JMPTBL (L(shl_7), L(shl_table_fwd))
+ .int JMPTBL (L(shl_8), L(shl_table_fwd))
+ .int JMPTBL (L(shl_9), L(shl_table_fwd))
+ .int JMPTBL (L(shl_10), L(shl_table_fwd))
+ .int JMPTBL (L(shl_11), L(shl_table_fwd))
+ .int JMPTBL (L(shl_12), L(shl_table_fwd))
+ .int JMPTBL (L(shl_13), L(shl_table_fwd))
+ .int JMPTBL (L(shl_14), L(shl_table_fwd))
+ .int JMPTBL (L(shl_15), L(shl_table_fwd))
+
+/* 16-entry jump table mapping a byte shift amount 0..15 to the
+   corresponding L(shl_N_bwd) backward-copy loop; entries are 32-bit
+   offsets relative to the table base.  */
+ ALIGN (3)
+L(shl_table_bwd):
+ .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
+ .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
new file mode 100644
index 0000000..9a878d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -0,0 +1,3139 @@
+/* memcpy with SSSE3
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+/* Entry-point names; default to the SSSE3 memcpy symbols unless the
+   includer pre-defines them to reuse this body under another name
+   (e.g. for the memmove variant mentioned at the top of this file).  */
+#ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+#endif
+
+/* ALIGN (n): align to a 2^n-byte boundary.  */
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+/* JMPTBL (I, B): the offset of label I from table base B; each jump
+   table below stores these as 32-bit (.int) entries.  */
+#define JMPTBL(I, B) I - B
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register containing the index into
+   the jump table.  SCALE is the scale of INDEX.  Clobbers %r11 and
+   INDEX; does not fall through (the trailing ud2 is unreachable).  */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), INDEX; \
+ lea (%r11, INDEX), INDEX; \
+ jmp *INDEX; \
+ ud2
+
+ .section .text.ssse3,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc
+/* __memcpy_chk_ssse3: buffer-overflow-checking entry point.  Per the
+   __memcpy_chk ABI, %rdx is the copy length and %rcx the destination
+   buffer size; branch to __chk_fail when the buffer is too small
+   (unsigned %rcx < %rdx), otherwise fall through into MEMCPY below.  */
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+ mov %rdi, %rax
+#ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+ cmp %rsi, %rdi
+ jb L(copy_forward)
+ je L(write_0bytes)
+ cmp $79, %rdx
+ jbe L(copy_forward)
+ jmp L(copy_backward)
+L(copy_forward):
+#endif
+ cmp $79, %rdx
+ lea L(table_less_80bytes)(%rip), %r11
+ ja L(80bytesormore)
+ movslq (%r11, %rdx, 4), %r9
+ add %rdx, %rsi
+ add %rdx, %rdi
+ add %r11, %r9
+ jmp *%r9
+ ud2
+
+ ALIGN (4)
+L(80bytesormore):
+#ifndef USE_AS_MEMMOVE
+ cmp %dil, %sil
+ jle L(copy_backward)
+#endif
+
+ movdqu (%rsi), %xmm0
+ mov %rdi, %rcx
+ and $-16, %rdi
+ add $16, %rdi
+ mov %rcx, %r8
+ sub %rdi, %rcx
+ add %rcx, %rdx
+ sub %rcx, %rsi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+ cmp %rcx, %rdx
+ mov %rsi, %r9
+ ja L(large_page_fwd)
+ and $0xf, %r9
+ jz L(shl_0)
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_64_data_cache_size_half(%rip), %rcx
+#endif
+ BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
+
+ ALIGN (4)
+L(copy_backward):
+ movdqu -16(%rsi, %rdx), %xmm0
+ add %rdx, %rsi
+ lea -16(%rdi, %rdx), %r8
+ add %rdx, %rdi
+
+ mov %rdi, %rcx
+ and $0xf, %rcx
+ xor %rcx, %rdi
+ sub %rcx, %rdx
+ sub %rcx, %rsi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ mov $SHARED_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+
+ cmp %rcx, %rdx
+ mov %rsi, %r9
+ ja L(large_page_bwd)
+ and $0xf, %r9
+ jz L(shl_0_bwd)
+#ifdef DATA_CACHE_SIZE_HALF
+ mov $DATA_CACHE_SIZE_HALF, %rcx
+#else
+ mov __x86_64_data_cache_size_half(%rip), %rcx
+#endif
+ BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
+
+ ALIGN (4)
+L(shl_0):
+ sub $16, %rdx
+ movdqa (%rsi), %xmm1
+ add $16, %rsi
+ movdqa %xmm1, (%rdi)
+ add $16, %rdi
+ cmp $128, %rdx
+ movdqu %xmm0, (%r8)
+ ja L(shl_0_gobble)
+ cmp $64, %rdx
+ jb L(shl_0_less_64bytes)
+ movaps (%rsi), %xmm4
+ movaps 16(%rsi), %xmm1
+ movaps 32(%rsi), %xmm2
+ movaps 48(%rsi), %xmm3
+ movaps %xmm4, (%rdi)
+ movaps %xmm1, 16(%rdi)
+ movaps %xmm2, 32(%rdi)
+ movaps %xmm3, 48(%rdi)
+ sub $64, %rdx
+ add $64, %rsi
+ add $64, %rdi
+L(shl_0_less_64bytes):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_0_gobble):
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %rdx
+#else
+ cmp __x86_64_data_cache_size_half(%rip), %rdx
+#endif
+ lea -128(%rdx), %rdx
+ jae L(shl_0_gobble_mem_loop)
+L(shl_0_gobble_cache_loop):
+ movdqa (%rsi), %xmm4
+ movaps 0x10(%rsi), %xmm1
+ movaps 0x20(%rsi), %xmm2
+ movaps 0x30(%rsi), %xmm3
+
+ movdqa %xmm4, (%rdi)
+ movaps %xmm1, 0x10(%rdi)
+ movaps %xmm2, 0x20(%rdi)
+ movaps %xmm3, 0x30(%rdi)
+
+ sub $128, %rdx
+ movaps 0x40(%rsi), %xmm4
+ movaps 0x50(%rsi), %xmm5
+ movaps 0x60(%rsi), %xmm6
+ movaps 0x70(%rsi), %xmm7
+ lea 0x80(%rsi), %rsi
+ movaps %xmm4, 0x40(%rdi)
+ movaps %xmm5, 0x50(%rdi)
+ movaps %xmm6, 0x60(%rdi)
+ movaps %xmm7, 0x70(%rdi)
+ lea 0x80(%rdi), %rdi
+
+ jae L(shl_0_gobble_cache_loop)
+ cmp $-0x40, %rdx
+ lea 0x80(%rdx), %rdx
+ jl L(shl_0_cache_less_64bytes)
+
+ movdqa (%rsi), %xmm4
+ sub $0x40, %rdx
+ movdqa 0x10(%rsi), %xmm1
+
+ movdqa %xmm4, (%rdi)
+ movdqa %xmm1, 0x10(%rdi)
+
+ movdqa 0x20(%rsi), %xmm4
+ movdqa 0x30(%rsi), %xmm1
+ add $0x40, %rsi
+
+ movdqa %xmm4, 0x20(%rdi)
+ movdqa %xmm1, 0x30(%rdi)
+ add $0x40, %rdi
+L(shl_0_cache_less_64bytes):
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_0_gobble_mem_loop):
+ prefetcht0 0x1c0(%rsi)
+ prefetcht0 0x280(%rsi)
+
+ movdqa (%rsi), %xmm0
+ movdqa 0x10(%rsi), %xmm1
+ movdqa 0x20(%rsi), %xmm2
+ movdqa 0x30(%rsi), %xmm3
+ movdqa 0x40(%rsi), %xmm4
+ movdqa 0x50(%rsi), %xmm5
+ movdqa 0x60(%rsi), %xmm6
+ movdqa 0x70(%rsi), %xmm7
+ lea 0x80(%rsi), %rsi
+ sub $0x80, %rdx
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm1, 0x10(%rdi)
+ movdqa %xmm2, 0x20(%rdi)
+ movdqa %xmm3, 0x30(%rdi)
+ movdqa %xmm4, 0x40(%rdi)
+ movdqa %xmm5, 0x50(%rdi)
+ movdqa %xmm6, 0x60(%rdi)
+ movdqa %xmm7, 0x70(%rdi)
+ lea 0x80(%rdi), %rdi
+
+ jae L(shl_0_gobble_mem_loop)
+ cmp $-0x40, %rdx
+ lea 0x80(%rdx), %rdx
+ jl L(shl_0_mem_less_64bytes)
+
+ movdqa (%rsi), %xmm0
+ sub $0x40, %rdx
+ movdqa 0x10(%rsi), %xmm1
+
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm1, 0x10(%rdi)
+
+ movdqa 0x20(%rsi), %xmm0
+ movdqa 0x30(%rsi), %xmm1
+ add $0x40, %rsi
+
+ movdqa %xmm0, 0x20(%rdi)
+ movdqa %xmm1, 0x30(%rdi)
+ add $0x40, %rdi
+L(shl_0_mem_less_64bytes):
+ cmp $0x20, %rdx
+ jb L(shl_0_mem_less_32bytes)
+ movdqa (%rsi), %xmm0
+ sub $0x20, %rdx
+ movdqa 0x10(%rsi), %xmm1
+ add $0x20, %rsi
+ movdqa %xmm0, (%rdi)
+ movdqa %xmm1, 0x10(%rdi)
+ add $0x20, %rdi
+L(shl_0_mem_less_32bytes):
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_0_bwd):
+ sub $16, %rdx
+ movdqa -0x10(%rsi), %xmm1
+ sub $16, %rsi
+ movdqa %xmm1, -0x10(%rdi)
+ sub $16, %rdi
+ cmp $0x80, %rdx
+ movdqu %xmm0, (%r8)
+ ja L(shl_0_gobble_bwd)
+ cmp $64, %rdx
+ jb L(shl_0_less_64bytes_bwd)
+ movaps -0x10(%rsi), %xmm0
+ movaps -0x20(%rsi), %xmm1
+ movaps -0x30(%rsi), %xmm2
+ movaps -0x40(%rsi), %xmm3
+ movaps %xmm0, -0x10(%rdi)
+ movaps %xmm1, -0x20(%rdi)
+ movaps %xmm2, -0x30(%rdi)
+ movaps %xmm3, -0x40(%rdi)
+ sub $64, %rdx
+ sub $0x40, %rsi
+ sub $0x40, %rdi
+L(shl_0_less_64bytes_bwd):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_0_gobble_bwd):
+#ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %rdx
+#else
+ cmp __x86_64_data_cache_size_half(%rip), %rdx
+#endif
+ lea -128(%rdx), %rdx
+ jae L(shl_0_gobble_mem_bwd_loop)
+L(shl_0_gobble_bwd_loop):
+ movdqa -0x10(%rsi), %xmm0
+ movaps -0x20(%rsi), %xmm1
+ movaps -0x30(%rsi), %xmm2
+ movaps -0x40(%rsi), %xmm3
+
+ movdqa %xmm0, -0x10(%rdi)
+ movaps %xmm1, -0x20(%rdi)
+ movaps %xmm2, -0x30(%rdi)
+ movaps %xmm3, -0x40(%rdi)
+
+ sub $0x80, %rdx
+ movaps -0x50(%rsi), %xmm4
+ movaps -0x60(%rsi), %xmm5
+ movaps -0x70(%rsi), %xmm6
+ movaps -0x80(%rsi), %xmm7
+ lea -0x80(%rsi), %rsi
+ movaps %xmm4, -0x50(%rdi)
+ movaps %xmm5, -0x60(%rdi)
+ movaps %xmm6, -0x70(%rdi)
+ movaps %xmm7, -0x80(%rdi)
+ lea -0x80(%rdi), %rdi
+
+ jae L(shl_0_gobble_bwd_loop)
+ cmp $-0x40, %rdx
+ lea 0x80(%rdx), %rdx
+ jl L(shl_0_gobble_bwd_less_64bytes)
+
+ movdqa -0x10(%rsi), %xmm0
+ sub $0x40, %rdx
+ movdqa -0x20(%rsi), %xmm1
+
+ movdqa %xmm0, -0x10(%rdi)
+ movdqa %xmm1, -0x20(%rdi)
+
+ movdqa -0x30(%rsi), %xmm0
+ movdqa -0x40(%rsi), %xmm1
+ sub $0x40, %rsi
+
+ movdqa %xmm0, -0x30(%rdi)
+ movdqa %xmm1, -0x40(%rdi)
+ sub $0x40, %rdi
+L(shl_0_gobble_bwd_less_64bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_0_gobble_mem_bwd_loop):
+ prefetcht0 -0x1c0(%rsi)
+ prefetcht0 -0x280(%rsi)
+ movdqa -0x10(%rsi), %xmm0
+ movdqa -0x20(%rsi), %xmm1
+ movdqa -0x30(%rsi), %xmm2
+ movdqa -0x40(%rsi), %xmm3
+ movdqa -0x50(%rsi), %xmm4
+ movdqa -0x60(%rsi), %xmm5
+ movdqa -0x70(%rsi), %xmm6
+ movdqa -0x80(%rsi), %xmm7
+ lea -0x80(%rsi), %rsi
+ sub $0x80, %rdx
+ movdqa %xmm0, -0x10(%rdi)
+ movdqa %xmm1, -0x20(%rdi)
+ movdqa %xmm2, -0x30(%rdi)
+ movdqa %xmm3, -0x40(%rdi)
+ movdqa %xmm4, -0x50(%rdi)
+ movdqa %xmm5, -0x60(%rdi)
+ movdqa %xmm6, -0x70(%rdi)
+ movdqa %xmm7, -0x80(%rdi)
+ lea -0x80(%rdi), %rdi
+
+ jae L(shl_0_gobble_mem_bwd_loop)
+ cmp $-0x40, %rdx
+ lea 0x80(%rdx), %rdx
+ jl L(shl_0_mem_bwd_less_64bytes)
+
+ movdqa -0x10(%rsi), %xmm0
+ sub $0x40, %rdx
+ movdqa -0x20(%rsi), %xmm1
+
+ movdqa %xmm0, -0x10(%rdi)
+ movdqa %xmm1, -0x20(%rdi)
+
+ movdqa -0x30(%rsi), %xmm0
+ movdqa -0x40(%rsi), %xmm1
+ sub $0x40, %rsi
+
+ movdqa %xmm0, -0x30(%rdi)
+ movdqa %xmm1, -0x40(%rdi)
+ sub $0x40, %rdi
+L(shl_0_mem_bwd_less_64bytes):
+ cmp $0x20, %rdx
+ jb L(shl_0_mem_bwd_less_32bytes)
+ movdqa -0x10(%rsi), %xmm0
+ sub $0x20, %rdx
+ movdqa -0x20(%rsi), %xmm1
+ sub $0x20, %rsi
+ movdqa %xmm0, -0x10(%rdi)
+ movdqa %xmm1, -0x20(%rdi)
+ sub $0x20, %rdi
+L(shl_0_mem_bwd_less_32bytes):
+ BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_1):
+ lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x01(%rsi), %xmm1
+ jb L(L1_fwd)
+ lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
+L(L1_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_1_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_1_loop_L1):
+ sub $64, %rdx
+ movaps 0x0f(%rsi), %xmm2
+ movaps 0x1f(%rsi), %xmm3
+ movaps 0x2f(%rsi), %xmm4
+ movaps 0x3f(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $1, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $1, %xmm3, %xmm4
+ palignr $1, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $1, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_1_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_1_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_1_bwd):
+ lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x01(%rsi), %xmm1
+ jb L(L1_bwd)
+ lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
+L(L1_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_1_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_1_bwd_loop_L1):
+ movaps -0x11(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x21(%rsi), %xmm3
+ movaps -0x31(%rsi), %xmm4
+ movaps -0x41(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $1, %xmm2, %xmm1
+ palignr $1, %xmm3, %xmm2
+ palignr $1, %xmm4, %xmm3
+ palignr $1, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_1_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_1_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_2):
+ lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x02(%rsi), %xmm1
+ jb L(L2_fwd)
+ lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
+L(L2_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_2_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_2_loop_L1):
+ sub $64, %rdx
+ movaps 0x0e(%rsi), %xmm2
+ movaps 0x1e(%rsi), %xmm3
+ movaps 0x2e(%rsi), %xmm4
+ movaps 0x3e(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $2, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $2, %xmm3, %xmm4
+ palignr $2, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $2, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_2_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_2_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_2_bwd):
+ lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x02(%rsi), %xmm1
+ jb L(L2_bwd)
+ lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
+L(L2_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_2_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_2_bwd_loop_L1):
+ movaps -0x12(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x22(%rsi), %xmm3
+ movaps -0x32(%rsi), %xmm4
+ movaps -0x42(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $2, %xmm2, %xmm1
+ palignr $2, %xmm3, %xmm2
+ palignr $2, %xmm4, %xmm3
+ palignr $2, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_2_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_2_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_3):
+ lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x03(%rsi), %xmm1
+ jb L(L3_fwd)
+ lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
+L(L3_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_3_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_3_loop_L1):
+ sub $64, %rdx
+ movaps 0x0d(%rsi), %xmm2
+ movaps 0x1d(%rsi), %xmm3
+ movaps 0x2d(%rsi), %xmm4
+ movaps 0x3d(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $3, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $3, %xmm3, %xmm4
+ palignr $3, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $3, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_3_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_3_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_3_bwd):
+ lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x03(%rsi), %xmm1
+ jb L(L3_bwd)
+ lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
+L(L3_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_3_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_3_bwd_loop_L1):
+ movaps -0x13(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x23(%rsi), %xmm3
+ movaps -0x33(%rsi), %xmm4
+ movaps -0x43(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $3, %xmm2, %xmm1
+ palignr $3, %xmm3, %xmm2
+ palignr $3, %xmm4, %xmm3
+ palignr $3, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_3_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_3_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_4):
+ lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x04(%rsi), %xmm1
+ jb L(L4_fwd)
+ lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
+L(L4_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_4_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_4_loop_L1):
+ sub $64, %rdx
+ movaps 0x0c(%rsi), %xmm2
+ movaps 0x1c(%rsi), %xmm3
+ movaps 0x2c(%rsi), %xmm4
+ movaps 0x3c(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $4, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $4, %xmm3, %xmm4
+ palignr $4, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $4, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_4_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_4_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_4_bwd):
+ lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x04(%rsi), %xmm1
+ jb L(L4_bwd)
+ lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
+L(L4_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_4_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_4_bwd_loop_L1):
+ movaps -0x14(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x24(%rsi), %xmm3
+ movaps -0x34(%rsi), %xmm4
+ movaps -0x44(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $4, %xmm2, %xmm1
+ palignr $4, %xmm3, %xmm2
+ palignr $4, %xmm4, %xmm3
+ palignr $4, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_4_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_4_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_5):
+ lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x05(%rsi), %xmm1
+ jb L(L5_fwd)
+ lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
+L(L5_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_5_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_5_loop_L1):
+ sub $64, %rdx
+ movaps 0x0b(%rsi), %xmm2
+ movaps 0x1b(%rsi), %xmm3
+ movaps 0x2b(%rsi), %xmm4
+ movaps 0x3b(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $5, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $5, %xmm3, %xmm4
+ palignr $5, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $5, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_5_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_5_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_5_bwd):
+ lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x05(%rsi), %xmm1
+ jb L(L5_bwd)
+ lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
+L(L5_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_5_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_5_bwd_loop_L1):
+ movaps -0x15(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x25(%rsi), %xmm3
+ movaps -0x35(%rsi), %xmm4
+ movaps -0x45(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $5, %xmm2, %xmm1
+ palignr $5, %xmm3, %xmm2
+ palignr $5, %xmm4, %xmm3
+ palignr $5, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_5_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_5_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_6):
+ lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x06(%rsi), %xmm1
+ jb L(L6_fwd)
+ lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
+L(L6_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_6_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_6_loop_L1):
+ sub $64, %rdx
+ movaps 0x0a(%rsi), %xmm2
+ movaps 0x1a(%rsi), %xmm3
+ movaps 0x2a(%rsi), %xmm4
+ movaps 0x3a(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $6, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $6, %xmm3, %xmm4
+ palignr $6, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $6, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_6_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_6_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_6_bwd):
+ lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x06(%rsi), %xmm1
+ jb L(L6_bwd)
+ lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
+L(L6_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_6_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_6_bwd_loop_L1):
+ movaps -0x16(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x26(%rsi), %xmm3
+ movaps -0x36(%rsi), %xmm4
+ movaps -0x46(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $6, %xmm2, %xmm1
+ palignr $6, %xmm3, %xmm2
+ palignr $6, %xmm4, %xmm3
+ palignr $6, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_6_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_6_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_7):
+ lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x07(%rsi), %xmm1
+ jb L(L7_fwd)
+ lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
+L(L7_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_7_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_7_loop_L1):
+ sub $64, %rdx
+ movaps 0x09(%rsi), %xmm2
+ movaps 0x19(%rsi), %xmm3
+ movaps 0x29(%rsi), %xmm4
+ movaps 0x39(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $7, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $7, %xmm3, %xmm4
+ palignr $7, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $7, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_7_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_7_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_7_bwd):
+ lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x07(%rsi), %xmm1
+ jb L(L7_bwd)
+ lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
+L(L7_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_7_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_7_bwd_loop_L1):
+ movaps -0x17(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x27(%rsi), %xmm3
+ movaps -0x37(%rsi), %xmm4
+ movaps -0x47(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $7, %xmm2, %xmm1
+ palignr $7, %xmm3, %xmm2
+ palignr $7, %xmm4, %xmm3
+ palignr $7, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_7_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_7_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_8):
+ lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x08(%rsi), %xmm1
+ jb L(L8_fwd)
+ lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
+L(L8_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+L(shl_8_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_8_loop_L1):
+ sub $64, %rdx
+ movaps 0x08(%rsi), %xmm2
+ movaps 0x18(%rsi), %xmm3
+ movaps 0x28(%rsi), %xmm4
+ movaps 0x38(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $8, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $8, %xmm3, %xmm4
+ palignr $8, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $8, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_8_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+ ALIGN (4)
+L(shl_8_end):
+ lea 64(%rdx), %rdx
+ movaps %xmm4, -0x20(%rdi)
+ add %rdx, %rsi
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_8_bwd):
+ lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x08(%rsi), %xmm1
+ jb L(L8_bwd)
+ lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
+L(L8_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_8_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_8_bwd_loop_L1):
+ movaps -0x18(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x28(%rsi), %xmm3
+ movaps -0x38(%rsi), %xmm4
+ movaps -0x48(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $8, %xmm2, %xmm1
+ palignr $8, %xmm3, %xmm2
+ palignr $8, %xmm4, %xmm3
+ palignr $8, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_8_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_8_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_9):
+ lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x09(%rsi), %xmm1
+ jb L(L9_fwd)
+ lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
+L(L9_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_9_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_9_loop_L1):
+ sub $64, %rdx
+ movaps 0x07(%rsi), %xmm2
+ movaps 0x17(%rsi), %xmm3
+ movaps 0x27(%rsi), %xmm4
+ movaps 0x37(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $9, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $9, %xmm3, %xmm4
+ palignr $9, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $9, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_9_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_9_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_9_bwd):
+ lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x09(%rsi), %xmm1
+ jb L(L9_bwd)
+ lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
+L(L9_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_9_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_9_bwd_loop_L1):
+ movaps -0x19(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x29(%rsi), %xmm3
+ movaps -0x39(%rsi), %xmm4
+ movaps -0x49(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $9, %xmm2, %xmm1
+ palignr $9, %xmm3, %xmm2
+ palignr $9, %xmm4, %xmm3
+ palignr $9, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_9_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_9_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_10):
+ lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0a(%rsi), %xmm1
+ jb L(L10_fwd)
+ lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
+L(L10_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_10_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_10_loop_L1):
+ sub $64, %rdx
+ movaps 0x06(%rsi), %xmm2
+ movaps 0x16(%rsi), %xmm3
+ movaps 0x26(%rsi), %xmm4
+ movaps 0x36(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $10, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $10, %xmm3, %xmm4
+ palignr $10, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $10, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_10_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_10_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_10_bwd):
+ lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0a(%rsi), %xmm1
+ jb L(L10_bwd)
+ lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
+L(L10_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_10_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_10_bwd_loop_L1):
+ movaps -0x1a(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x2a(%rsi), %xmm3
+ movaps -0x3a(%rsi), %xmm4
+ movaps -0x4a(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $10, %xmm2, %xmm1
+ palignr $10, %xmm3, %xmm2
+ palignr $10, %xmm4, %xmm3
+ palignr $10, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_10_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_10_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_11):
+ lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0b(%rsi), %xmm1
+ jb L(L11_fwd)
+ lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
+L(L11_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_11_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_11_loop_L1):
+ sub $64, %rdx
+ movaps 0x05(%rsi), %xmm2
+ movaps 0x15(%rsi), %xmm3
+ movaps 0x25(%rsi), %xmm4
+ movaps 0x35(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $11, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $11, %xmm3, %xmm4
+ palignr $11, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $11, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_11_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_11_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_11_bwd):
+ lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0b(%rsi), %xmm1
+ jb L(L11_bwd)
+ lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
+L(L11_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_11_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_11_bwd_loop_L1):
+ movaps -0x1b(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x2b(%rsi), %xmm3
+ movaps -0x3b(%rsi), %xmm4
+ movaps -0x4b(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $11, %xmm2, %xmm1
+ palignr $11, %xmm3, %xmm2
+ palignr $11, %xmm4, %xmm3
+ palignr $11, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_11_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_11_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_12):
+ lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0c(%rsi), %xmm1
+ jb L(L12_fwd)
+ lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
+L(L12_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_12_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_12_loop_L1):
+ sub $64, %rdx
+ movaps 0x04(%rsi), %xmm2
+ movaps 0x14(%rsi), %xmm3
+ movaps 0x24(%rsi), %xmm4
+ movaps 0x34(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $12, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $12, %xmm3, %xmm4
+ palignr $12, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $12, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_12_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_12_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_12_bwd):
+ lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0c(%rsi), %xmm1
+ jb L(L12_bwd)
+ lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
+L(L12_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_12_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_12_bwd_loop_L1):
+ movaps -0x1c(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x2c(%rsi), %xmm3
+ movaps -0x3c(%rsi), %xmm4
+ movaps -0x4c(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $12, %xmm2, %xmm1
+ palignr $12, %xmm3, %xmm2
+ palignr $12, %xmm4, %xmm3
+ palignr $12, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_12_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_12_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_13):
+ lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0d(%rsi), %xmm1
+ jb L(L13_fwd)
+ lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
+L(L13_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_13_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_13_loop_L1):
+ sub $64, %rdx
+ movaps 0x03(%rsi), %xmm2
+ movaps 0x13(%rsi), %xmm3
+ movaps 0x23(%rsi), %xmm4
+ movaps 0x33(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $13, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $13, %xmm3, %xmm4
+ palignr $13, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $13, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_13_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_13_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_13_bwd):
+ lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0d(%rsi), %xmm1
+ jb L(L13_bwd)
+ lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
+L(L13_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_13_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_13_bwd_loop_L1):
+ movaps -0x1d(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x2d(%rsi), %xmm3
+ movaps -0x3d(%rsi), %xmm4
+ movaps -0x4d(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $13, %xmm2, %xmm1
+ palignr $13, %xmm3, %xmm2
+ palignr $13, %xmm4, %xmm3
+ palignr $13, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_13_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_13_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_14):
+ lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0e(%rsi), %xmm1
+ jb L(L14_fwd)
+ lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
+L(L14_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_14_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_14_loop_L1):
+ sub $64, %rdx
+ movaps 0x02(%rsi), %xmm2
+ movaps 0x12(%rsi), %xmm3
+ movaps 0x22(%rsi), %xmm4
+ movaps 0x32(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $14, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $14, %xmm3, %xmm4
+ palignr $14, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $14, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_14_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_14_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_14_bwd):
+ lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0e(%rsi), %xmm1
+ jb L(L14_bwd)
+ lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
+L(L14_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_14_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_14_bwd_loop_L1):
+ movaps -0x1e(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x2e(%rsi), %xmm3
+ movaps -0x3e(%rsi), %xmm4
+ movaps -0x4e(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $14, %xmm2, %xmm1
+ palignr $14, %xmm3, %xmm2
+ palignr $14, %xmm4, %xmm3
+ palignr $14, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_14_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_14_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_15):
+ lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0f(%rsi), %xmm1
+ jb L(L15_fwd)
+ lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
+L(L15_fwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_15_loop_L2):
+ prefetchnta 0x1c0(%rsi)
+L(shl_15_loop_L1):
+ sub $64, %rdx
+ movaps 0x01(%rsi), %xmm2
+ movaps 0x11(%rsi), %xmm3
+ movaps 0x21(%rsi), %xmm4
+ movaps 0x31(%rsi), %xmm5
+ movdqa %xmm5, %xmm6
+ palignr $15, %xmm4, %xmm5
+ lea 64(%rsi), %rsi
+ palignr $15, %xmm3, %xmm4
+ palignr $15, %xmm2, %xmm3
+ lea 64(%rdi), %rdi
+ palignr $15, %xmm1, %xmm2
+ movdqa %xmm6, %xmm1
+ movdqa %xmm2, -0x40(%rdi)
+ movaps %xmm3, -0x30(%rdi)
+ jb L(shl_15_end)
+ movaps %xmm4, -0x20(%rdi)
+ movaps %xmm5, -0x10(%rdi)
+ jmp *%r9
+ ud2
+L(shl_15_end):
+ movaps %xmm4, -0x20(%rdi)
+ lea 64(%rdx), %rdx
+ movaps %xmm5, -0x10(%rdi)
+ add %rdx, %rdi
+ movdqu %xmm0, (%r8)
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+ ALIGN (4)
+L(shl_15_bwd):
+ lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
+ cmp %rcx, %rdx
+ movaps -0x0f(%rsi), %xmm1
+ jb L(L15_bwd)
+ lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
+L(L15_bwd):
+ lea -64(%rdx), %rdx
+ jmp *%r9
+ ud2
+L(shl_15_bwd_loop_L2):
+ prefetchnta -0x1c0(%rsi)
+L(shl_15_bwd_loop_L1):
+ movaps -0x1f(%rsi), %xmm2
+ sub $0x40, %rdx
+ movaps -0x2f(%rsi), %xmm3
+ movaps -0x3f(%rsi), %xmm4
+ movaps -0x4f(%rsi), %xmm5
+ lea -0x40(%rsi), %rsi
+ palignr $15, %xmm2, %xmm1
+ palignr $15, %xmm3, %xmm2
+ palignr $15, %xmm4, %xmm3
+ palignr $15, %xmm5, %xmm4
+
+ movaps %xmm1, -0x10(%rdi)
+ movaps %xmm5, %xmm1
+
+ movaps %xmm2, -0x20(%rdi)
+ lea -0x40(%rdi), %rdi
+
+ movaps %xmm3, 0x10(%rdi)
+ jb L(shl_15_bwd_end)
+ movaps %xmm4, (%rdi)
+ jmp *%r9
+ ud2
+L(shl_15_bwd_end):
+ movaps %xmm4, (%rdi)
+ lea 64(%rdx), %rdx
+ movdqu %xmm0, (%r8)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+/* Tail copies for lengths that are multiples of 8 (0..72).  Entered via
+   the L(table_less_80bytes) jump table with %rsi and %rdi pointing one
+   past the end of the copy, so all accesses use negative offsets.  */
+L(write_72bytes):
+	movdqu	-72(%rsi), %xmm0
+	movdqu	-56(%rsi), %xmm1
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rcx
+	movdqu	%xmm0, -72(%rdi)
+	movdqu	%xmm1, -56(%rdi)
+	mov	%r8, -40(%rdi)
+	mov	%r9, -32(%rdi)
+	mov	%r10, -24(%rdi)
+	mov	%r11, -16(%rdi)
+	mov	%rcx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_64bytes):
+	movdqu	-64(%rsi), %xmm0
+	mov	-48(%rsi), %rcx
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -64(%rdi)
+	mov	%rcx, -48(%rdi)
+	mov	%r8, -40(%rdi)
+	mov	%r9, -32(%rdi)
+	mov	%r10, -24(%rdi)
+	mov	%r11, -16(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_56bytes):
+	movdqu	-56(%rsi), %xmm0
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rcx
+	movdqu	%xmm0, -56(%rdi)
+	mov	%r8, -40(%rdi)
+	mov	%r9, -32(%rdi)
+	mov	%r10, -24(%rdi)
+	mov	%r11, -16(%rdi)
+	mov	%rcx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_48bytes):
+	mov	-48(%rsi), %rcx
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	%rcx, -48(%rdi)
+	mov	%r8, -40(%rdi)
+	mov	%r9, -32(%rdi)
+	mov	%r10, -24(%rdi)
+	mov	%r11, -16(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_40bytes):
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	%r8, -40(%rdi)
+	mov	%r9, -32(%rdi)
+	mov	%r10, -24(%rdi)
+	mov	%r11, -16(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_32bytes):
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	%r9, -32(%rdi)
+	mov	%r10, -24(%rdi)
+	mov	%r11, -16(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_24bytes):
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	%r10, -24(%rdi)
+	mov	%r11, -16(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_16bytes):
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	%r11, -16(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_8bytes):
+	mov	-8(%rsi), %rdx
+	mov	%rdx, -8(%rdi)
+L(write_0bytes):
+	ret
+
+	ALIGN (4)
+/* Tail copies for lengths == 1 (mod 8), 1..73 bytes.  The odd final
+   byte is handled either by an overlapping 4-byte load at -4 or by a
+   single byte at -1, depending on the entry.  */
+L(write_73bytes):
+	movdqu	-73(%rsi), %xmm0
+	movdqu	-57(%rsi), %xmm1
+	mov	-41(%rsi), %rcx
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %r8
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -73(%rdi)
+	movdqu	%xmm1, -57(%rdi)
+	mov	%rcx, -41(%rdi)
+	mov	%r9, -33(%rdi)
+	mov	%r10, -25(%rdi)
+	mov	%r11, -17(%rdi)
+	mov	%r8, -9(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_65bytes):
+	movdqu	-65(%rsi), %xmm0
+	movdqu	-49(%rsi), %xmm1
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -65(%rdi)
+	movdqu	%xmm1, -49(%rdi)
+	mov	%r9, -33(%rdi)
+	mov	%r10, -25(%rdi)
+	mov	%r11, -17(%rdi)
+	mov	%rcx, -9(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_57bytes):
+	movdqu	-57(%rsi), %xmm0
+	mov	-41(%rsi), %r8
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -57(%rdi)
+	mov	%r8, -41(%rdi)
+	mov	%r9, -33(%rdi)
+	mov	%r10, -25(%rdi)
+	mov	%r11, -17(%rdi)
+	mov	%rcx, -9(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_49bytes):
+	movdqu	-49(%rsi), %xmm0
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -49(%rdi)
+	mov	%r9, -33(%rdi)
+	mov	%r10, -25(%rdi)
+	mov	%r11, -17(%rdi)
+	mov	%rcx, -9(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_41bytes):
+	mov	-41(%rsi), %r8
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-1(%rsi), %dl
+	mov	%r8, -41(%rdi)
+	mov	%r9, -33(%rdi)
+	mov	%r10, -25(%rdi)
+	mov	%r11, -17(%rdi)
+	mov	%rcx, -9(%rdi)
+	mov	%dl, -1(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_33bytes):
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-1(%rsi), %dl
+	mov	%r9, -33(%rdi)
+	mov	%r10, -25(%rdi)
+	mov	%r11, -17(%rdi)
+	mov	%rcx, -9(%rdi)
+	mov	%dl, -1(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_25bytes):
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-1(%rsi), %dl
+	mov	%r10, -25(%rdi)
+	mov	%r11, -17(%rdi)
+	mov	%rcx, -9(%rdi)
+	mov	%dl, -1(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_17bytes):
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r11, -17(%rdi)
+	mov	%rcx, -9(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_9bytes):
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%rcx, -9(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_1bytes):
+	mov	-1(%rsi), %dl
+	mov	%dl, -1(%rdi)
+	ret
+
+	ALIGN (4)
+/* Tail copies for lengths == 2 (mod 8), 2..74 bytes; the trailing
+   2 odd bytes are folded into an overlapping 4-byte tail load at -4
+   (or a 2-byte load for the 2-byte case).  */
+L(write_74bytes):
+	movdqu	-74(%rsi), %xmm0
+	movdqu	-58(%rsi), %xmm1
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -74(%rdi)
+	movdqu	%xmm1, -58(%rdi)
+	mov	%r8, -42(%rdi)
+	mov	%r9, -34(%rdi)
+	mov	%r10, -26(%rdi)
+	mov	%r11, -18(%rdi)
+	mov	%rcx, -10(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_66bytes):
+	movdqu	-66(%rsi), %xmm0
+	movdqu	-50(%rsi), %xmm1
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -66(%rdi)
+	movdqu	%xmm1, -50(%rdi)
+	mov	%r8, -42(%rdi)
+	mov	%r9, -34(%rdi)
+	mov	%r10, -26(%rdi)
+	mov	%r11, -18(%rdi)
+	mov	%rcx, -10(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+/* NOTE(review): this entry uses %xmm1 where every sibling uses %xmm0.
+   Harmless (the register is dead on entry) but inconsistent.  */
+L(write_58bytes):
+	movdqu	-58(%rsi), %xmm1
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm1, -58(%rdi)
+	mov	%r8, -42(%rdi)
+	mov	%r9, -34(%rdi)
+	mov	%r10, -26(%rdi)
+	mov	%r11, -18(%rdi)
+	mov	%rcx, -10(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_50bytes):
+	movdqu	-50(%rsi), %xmm0
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -50(%rdi)
+	mov	%r9, -34(%rdi)
+	mov	%r10, -26(%rdi)
+	mov	%r11, -18(%rdi)
+	mov	%rcx, -10(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_42bytes):
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r8, -42(%rdi)
+	mov	%r9, -34(%rdi)
+	mov	%r10, -26(%rdi)
+	mov	%r11, -18(%rdi)
+	mov	%rcx, -10(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_34bytes):
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r9, -34(%rdi)
+	mov	%r10, -26(%rdi)
+	mov	%r11, -18(%rdi)
+	mov	%rcx, -10(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_26bytes):
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r10, -26(%rdi)
+	mov	%r11, -18(%rdi)
+	mov	%rcx, -10(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_18bytes):
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r11, -18(%rdi)
+	mov	%rcx, -10(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_10bytes):
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%rcx, -10(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_2bytes):
+	mov	-2(%rsi), %dx
+	mov	%dx, -2(%rdi)
+	ret
+
+	ALIGN (4)
+/* Tail copies for lengths == 3 (mod 8), 3..75 bytes.  The 3 odd bytes
+   are covered by an overlapping 4-byte tail load (or 2+2 bytes for the
+   3-byte case).  */
+L(write_75bytes):
+	movdqu	-75(%rsi), %xmm0
+	movdqu	-59(%rsi), %xmm1
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -75(%rdi)
+	movdqu	%xmm1, -59(%rdi)
+	mov	%r8, -43(%rdi)
+	mov	%r9, -35(%rdi)
+	mov	%r10, -27(%rdi)
+	mov	%r11, -19(%rdi)
+	mov	%rcx, -11(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+/* NOTE(review): the second vector here is at -59 (as in write_75bytes)
+   rather than the pattern-consistent -51; the copy is still complete
+   because xmm0 covers -67..-52 and xmm1 covers -59..-44 (overlap,
+   no gap) — confirm intentional.  */
+L(write_67bytes):
+	movdqu	-67(%rsi), %xmm0
+	movdqu	-59(%rsi), %xmm1
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -67(%rdi)
+	movdqu	%xmm1, -59(%rdi)
+	mov	%r8, -43(%rdi)
+	mov	%r9, -35(%rdi)
+	mov	%r10, -27(%rdi)
+	mov	%r11, -19(%rdi)
+	mov	%rcx, -11(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_59bytes):
+	movdqu	-59(%rsi), %xmm0
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -59(%rdi)
+	mov	%r8, -43(%rdi)
+	mov	%r9, -35(%rdi)
+	mov	%r10, -27(%rdi)
+	mov	%r11, -19(%rdi)
+	mov	%rcx, -11(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_51bytes):
+	movdqu	-51(%rsi), %xmm0
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -51(%rdi)
+	mov	%r9, -35(%rdi)
+	mov	%r10, -27(%rdi)
+	mov	%r11, -19(%rdi)
+	mov	%rcx, -11(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_43bytes):
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r8, -43(%rdi)
+	mov	%r9, -35(%rdi)
+	mov	%r10, -27(%rdi)
+	mov	%r11, -19(%rdi)
+	mov	%rcx, -11(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_35bytes):
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r9, -35(%rdi)
+	mov	%r10, -27(%rdi)
+	mov	%r11, -19(%rdi)
+	mov	%rcx, -11(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_27bytes):
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r10, -27(%rdi)
+	mov	%r11, -19(%rdi)
+	mov	%rcx, -11(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_19bytes):
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r11, -19(%rdi)
+	mov	%rcx, -11(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_11bytes):
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%rcx, -11(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_3bytes):
+	mov	-3(%rsi), %dx
+	mov	-2(%rsi), %cx
+	mov	%dx, -3(%rdi)
+	mov	%cx, -2(%rdi)
+	ret
+
+	ALIGN (4)
+/* Tail copies for lengths == 4 (mod 8), 4..76 bytes; the last 4 bytes
+   are an exact 4-byte load/store at -4.  */
+L(write_76bytes):
+	movdqu	-76(%rsi), %xmm0
+	movdqu	-60(%rsi), %xmm1
+	mov	-44(%rsi), %r8
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -76(%rdi)
+	movdqu	%xmm1, -60(%rdi)
+	mov	%r8, -44(%rdi)
+	mov	%r9, -36(%rdi)
+	mov	%r10, -28(%rdi)
+	mov	%r11, -20(%rdi)
+	mov	%rcx, -12(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_68bytes):
+	movdqu	-68(%rsi), %xmm0
+	movdqu	-52(%rsi), %xmm1
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -68(%rdi)
+	movdqu	%xmm1, -52(%rdi)
+	mov	%r9, -36(%rdi)
+	mov	%r10, -28(%rdi)
+	mov	%r11, -20(%rdi)
+	mov	%rcx, -12(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_60bytes):
+	movdqu	-60(%rsi), %xmm0
+	mov	-44(%rsi), %r8
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -60(%rdi)
+	mov	%r8, -44(%rdi)
+	mov	%r9, -36(%rdi)
+	mov	%r10, -28(%rdi)
+	mov	%r11, -20(%rdi)
+	mov	%rcx, -12(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_52bytes):
+	movdqu	-52(%rsi), %xmm0
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	%xmm0, -52(%rdi)
+	mov	%r9, -36(%rdi)
+	mov	%r10, -28(%rdi)
+	mov	%r11, -20(%rdi)
+	mov	%rcx, -12(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_44bytes):
+	mov	-44(%rsi), %r8
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r8, -44(%rdi)
+	mov	%r9, -36(%rdi)
+	mov	%r10, -28(%rdi)
+	mov	%r11, -20(%rdi)
+	mov	%rcx, -12(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_36bytes):
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r9, -36(%rdi)
+	mov	%r10, -28(%rdi)
+	mov	%r11, -20(%rdi)
+	mov	%rcx, -12(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_28bytes):
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r10, -28(%rdi)
+	mov	%r11, -20(%rdi)
+	mov	%rcx, -12(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_20bytes):
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%r11, -20(%rdi)
+	mov	%rcx, -12(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_12bytes):
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	%rcx, -12(%rdi)
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_4bytes):
+	mov	-4(%rsi), %edx
+	mov	%edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+/* Tail copies for lengths == 5 (mod 8), 5..77 bytes; the 5 odd bytes
+   use an overlapping 8-byte load at -8 (or 4+4 bytes for the 5-byte
+   case).  */
+L(write_77bytes):
+	movdqu	-77(%rsi), %xmm0
+	movdqu	-61(%rsi), %xmm1
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -77(%rdi)
+	movdqu	%xmm1, -61(%rdi)
+	mov	%r8, -45(%rdi)
+	mov	%r9, -37(%rdi)
+	mov	%r10, -29(%rdi)
+	mov	%r11, -21(%rdi)
+	mov	%rcx, -13(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_69bytes):
+	movdqu	-69(%rsi), %xmm0
+	movdqu	-53(%rsi), %xmm1
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -69(%rdi)
+	movdqu	%xmm1, -53(%rdi)
+	mov	%r9, -37(%rdi)
+	mov	%r10, -29(%rdi)
+	mov	%r11, -21(%rdi)
+	mov	%rcx, -13(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_61bytes):
+	movdqu	-61(%rsi), %xmm0
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -61(%rdi)
+	mov	%r8, -45(%rdi)
+	mov	%r9, -37(%rdi)
+	mov	%r10, -29(%rdi)
+	mov	%r11, -21(%rdi)
+	mov	%rcx, -13(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+/* NOTE(review): %r8 is loaded from -45 but never stored.  The copy is
+   still complete (the xmm0 store covers -53..-38, so -45..-38 is
+   already written); the load is dead and could be dropped.  */
+L(write_53bytes):
+	movdqu	-53(%rsi), %xmm0
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -53(%rdi)
+	mov	%r9, -37(%rdi)
+	mov	%r10, -29(%rdi)
+	mov	%r11, -21(%rdi)
+	mov	%rcx, -13(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_45bytes):
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r8, -45(%rdi)
+	mov	%r9, -37(%rdi)
+	mov	%r10, -29(%rdi)
+	mov	%r11, -21(%rdi)
+	mov	%rcx, -13(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_37bytes):
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r9, -37(%rdi)
+	mov	%r10, -29(%rdi)
+	mov	%r11, -21(%rdi)
+	mov	%rcx, -13(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_29bytes):
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r10, -29(%rdi)
+	mov	%r11, -21(%rdi)
+	mov	%rcx, -13(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_21bytes):
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r11, -21(%rdi)
+	mov	%rcx, -13(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_13bytes):
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%rcx, -13(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_5bytes):
+	mov	-5(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	%edx, -5(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+/* Tail copies for lengths == 6 (mod 8), 6..78 bytes; the 6 odd bytes
+   use an overlapping 8-byte load at -8 (or 4+4 bytes for the 6-byte
+   case).  */
+L(write_78bytes):
+	movdqu	-78(%rsi), %xmm0
+	movdqu	-62(%rsi), %xmm1
+	mov	-46(%rsi), %r8
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -78(%rdi)
+	movdqu	%xmm1, -62(%rdi)
+	mov	%r8, -46(%rdi)
+	mov	%r9, -38(%rdi)
+	mov	%r10, -30(%rdi)
+	mov	%r11, -22(%rdi)
+	mov	%rcx, -14(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_70bytes):
+	movdqu	-70(%rsi), %xmm0
+	movdqu	-54(%rsi), %xmm1
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -70(%rdi)
+	movdqu	%xmm1, -54(%rdi)
+	mov	%r9, -38(%rdi)
+	mov	%r10, -30(%rdi)
+	mov	%r11, -22(%rdi)
+	mov	%rcx, -14(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_62bytes):
+	movdqu	-62(%rsi), %xmm0
+	mov	-46(%rsi), %r8
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -62(%rdi)
+	mov	%r8, -46(%rdi)
+	mov	%r9, -38(%rdi)
+	mov	%r10, -30(%rdi)
+	mov	%r11, -22(%rdi)
+	mov	%rcx, -14(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_54bytes):
+	movdqu	-54(%rsi), %xmm0
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -54(%rdi)
+	mov	%r9, -38(%rdi)
+	mov	%r10, -30(%rdi)
+	mov	%r11, -22(%rdi)
+	mov	%rcx, -14(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_46bytes):
+	mov	-46(%rsi), %r8
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r8, -46(%rdi)
+	mov	%r9, -38(%rdi)
+	mov	%r10, -30(%rdi)
+	mov	%r11, -22(%rdi)
+	mov	%rcx, -14(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_38bytes):
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r9, -38(%rdi)
+	mov	%r10, -30(%rdi)
+	mov	%r11, -22(%rdi)
+	mov	%rcx, -14(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_30bytes):
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r10, -30(%rdi)
+	mov	%r11, -22(%rdi)
+	mov	%rcx, -14(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_22bytes):
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r11, -22(%rdi)
+	mov	%rcx, -14(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_14bytes):
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%rcx, -14(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_6bytes):
+	mov	-6(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	%edx, -6(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+/* Tail copies for lengths == 7 (mod 8), 7..79 bytes; the 7 odd bytes
+   use an overlapping 8-byte load at -8 (or overlapping 4+4 bytes for
+   the 7-byte case).  */
+L(write_79bytes):
+	movdqu	-79(%rsi), %xmm0
+	movdqu	-63(%rsi), %xmm1
+	mov	-47(%rsi), %r8
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -79(%rdi)
+	movdqu	%xmm1, -63(%rdi)
+	mov	%r8, -47(%rdi)
+	mov	%r9, -39(%rdi)
+	mov	%r10, -31(%rdi)
+	mov	%r11, -23(%rdi)
+	mov	%rcx, -15(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_71bytes):
+	movdqu	-71(%rsi), %xmm0
+	movdqu	-55(%rsi), %xmm1
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -71(%rdi)
+	movdqu	%xmm1, -55(%rdi)
+	mov	%r9, -39(%rdi)
+	mov	%r10, -31(%rdi)
+	mov	%r11, -23(%rdi)
+	mov	%rcx, -15(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_63bytes):
+	movdqu	-63(%rsi), %xmm0
+	mov	-47(%rsi), %r8
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -63(%rdi)
+	mov	%r8, -47(%rdi)
+	mov	%r9, -39(%rdi)
+	mov	%r10, -31(%rdi)
+	mov	%r11, -23(%rdi)
+	mov	%rcx, -15(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_55bytes):
+	movdqu	-55(%rsi), %xmm0
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	%xmm0, -55(%rdi)
+	mov	%r9, -39(%rdi)
+	mov	%r10, -31(%rdi)
+	mov	%r11, -23(%rdi)
+	mov	%rcx, -15(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_47bytes):
+	mov	-47(%rsi), %r8
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r8, -47(%rdi)
+	mov	%r9, -39(%rdi)
+	mov	%r10, -31(%rdi)
+	mov	%r11, -23(%rdi)
+	mov	%rcx, -15(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_39bytes):
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r9, -39(%rdi)
+	mov	%r10, -31(%rdi)
+	mov	%r11, -23(%rdi)
+	mov	%rcx, -15(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_31bytes):
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r10, -31(%rdi)
+	mov	%r11, -23(%rdi)
+	mov	%rcx, -15(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_23bytes):
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%r11, -23(%rdi)
+	mov	%rcx, -15(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_15bytes):
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	%rcx, -15(%rdi)
+	mov	%rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_7bytes):
+	mov	-7(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	%edx, -7(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+/* Forward copy for sizes beyond the cache threshold: stream 128 bytes
+   per iteration with non-temporal stores (movntdq) to avoid polluting
+   the cache.  %xmm0 holds the saved head bytes, stored to %r8 here.
+   For memmove, if src and dst overlap within the copy length (or the
+   length is below 4*%rcx — presumably a last-level-cache bound; TODO
+   confirm against the threshold setup earlier in the file), fall back
+   to the cached-copy loop below instead of streaming.  */
+L(large_page_fwd):
+	movdqu	(%rsi), %xmm1
+	lea	16(%rsi), %rsi
+	movdqu	%xmm0, (%r8)
+	movntdq	%xmm1, (%rdi)
+	lea	16(%rdi), %rdi
+	lea	-0x90(%rdx), %rdx
+#ifdef USE_AS_MEMMOVE
+	mov	%rsi, %r9
+	sub	%rdi, %r9
+	cmp	%rdx, %r9
+	jae	L(memmove_is_memcpy_fwd)
+	shl	$2, %rcx
+	cmp	%rcx, %rdx
+	jb	L(ll_cache_copy_fwd_start)
+L(memmove_is_memcpy_fwd):
+#endif
+L(large_page_loop):
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	movdqu	0x40(%rsi), %xmm4
+	movdqu	0x50(%rsi), %xmm5
+	movdqu	0x60(%rsi), %xmm6
+	movdqu	0x70(%rsi), %xmm7
+	lea	0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx
+	movntdq	%xmm0, (%rdi)
+	movntdq	%xmm1, 0x10(%rdi)
+	movntdq	%xmm2, 0x20(%rdi)
+	movntdq	%xmm3, 0x30(%rdi)
+	movntdq	%xmm4, 0x40(%rdi)
+	movntdq	%xmm5, 0x50(%rdi)
+	movntdq	%xmm6, 0x60(%rdi)
+	movntdq	%xmm7, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(large_page_loop)
+	/* Handle a remaining 64-byte chunk, then dispatch the tail.  */
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(large_page_less_64bytes)
+
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	lea	0x40(%rsi), %rsi
+
+	movntdq	%xmm0, (%rdi)
+	movntdq	%xmm1, 0x10(%rdi)
+	movntdq	%xmm2, 0x20(%rdi)
+	movntdq	%xmm3, 0x30(%rdi)
+	lea	0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_less_64bytes):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	/* Order the non-temporal stores before the tail's normal stores.  */
+	sfence
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+/* Overlapping-memmove variant of the large-copy loop: same structure
+   but with prefetcht0 and ordinary cached (movaps) stores, since the
+   source may be read back shortly after being written.  */
+L(ll_cache_copy_fwd_start):
+	prefetcht0 0x1c0(%rsi)
+	prefetcht0 0x200(%rsi)
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	movdqu	0x40(%rsi), %xmm4
+	movdqu	0x50(%rsi), %xmm5
+	movdqu	0x60(%rsi), %xmm6
+	movdqu	0x70(%rsi), %xmm7
+	lea	0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx
+	movaps	%xmm0, (%rdi)
+	movaps	%xmm1, 0x10(%rdi)
+	movaps	%xmm2, 0x20(%rdi)
+	movaps	%xmm3, 0x30(%rdi)
+	movaps	%xmm4, 0x40(%rdi)
+	movaps	%xmm5, 0x50(%rdi)
+	movaps	%xmm6, 0x60(%rdi)
+	movaps	%xmm7, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(ll_cache_copy_fwd_start)
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(large_page_ll_less_fwd_64bytes)
+
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	lea	0x40(%rsi), %rsi
+
+	movaps	%xmm0, (%rdi)
+	movaps	%xmm1, 0x10(%rdi)
+	movaps	%xmm2, 0x20(%rdi)
+	movaps	%xmm3, 0x30(%rdi)
+	lea	0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_ll_less_fwd_64bytes):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+#endif
+	ALIGN (4)
+/* Backward counterpart of large_page_fwd: stream 128 bytes per
+   iteration downward with non-temporal stores.  NOTE(review): the
+   memmove fallback check here compares %r9 against raw %rcx, while the
+   forward path compares %rdx against %rcx<<2 — the asymmetry looks
+   deliberate but is worth confirming against the threshold setup
+   earlier in the file.  */
+L(large_page_bwd):
+	movdqu	-0x10(%rsi), %xmm1
+	lea	-16(%rsi), %rsi
+	movdqu	%xmm0, (%r8)
+	movdqa	%xmm1, -0x10(%rdi)
+	lea	-16(%rdi), %rdi
+	lea	-0x90(%rdx), %rdx
+#ifdef USE_AS_MEMMOVE
+	mov	%rdi, %r9
+	sub	%rsi, %r9
+	cmp	%rdx, %r9
+	jae	L(memmove_is_memcpy_bwd)
+	cmp	%rcx, %r9
+	jb	L(ll_cache_copy_bwd_start)
+L(memmove_is_memcpy_bwd):
+#endif
+L(large_page_bwd_loop):
+	movdqu	-0x10(%rsi), %xmm0
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	movdqu	-0x50(%rsi), %xmm4
+	movdqu	-0x60(%rsi), %xmm5
+	movdqu	-0x70(%rsi), %xmm6
+	movdqu	-0x80(%rsi), %xmm7
+	lea	-0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx
+	movntdq	%xmm0, -0x10(%rdi)
+	movntdq	%xmm1, -0x20(%rdi)
+	movntdq	%xmm2, -0x30(%rdi)
+	movntdq	%xmm3, -0x40(%rdi)
+	movntdq	%xmm4, -0x50(%rdi)
+	movntdq	%xmm5, -0x60(%rdi)
+	movntdq	%xmm6, -0x70(%rdi)
+	movntdq	%xmm7, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+	jae	L(large_page_bwd_loop)
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(large_page_less_bwd_64bytes)
+
+	movdqu	-0x10(%rsi), %xmm0
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	lea	-0x40(%rsi), %rsi
+
+	movntdq	%xmm0, -0x10(%rdi)
+	movntdq	%xmm1, -0x20(%rdi)
+	movntdq	%xmm2, -0x30(%rdi)
+	movntdq	%xmm3, -0x40(%rdi)
+	lea	-0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_less_bwd_64bytes):
+	/* Order the non-temporal stores before the tail's normal stores.  */
+	sfence
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+/* Overlapping-memmove variant: cached (movaps) stores + prefetcht0,
+   mirroring ll_cache_copy_fwd_start but descending.  */
+L(ll_cache_copy_bwd_start):
+	prefetcht0 -0x1c0(%rsi)
+	prefetcht0 -0x200(%rsi)
+	movdqu	-0x10(%rsi), %xmm0
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	movdqu	-0x50(%rsi), %xmm4
+	movdqu	-0x60(%rsi), %xmm5
+	movdqu	-0x70(%rsi), %xmm6
+	movdqu	-0x80(%rsi), %xmm7
+	lea	-0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx
+	movaps	%xmm0, -0x10(%rdi)
+	movaps	%xmm1, -0x20(%rdi)
+	movaps	%xmm2, -0x30(%rdi)
+	movaps	%xmm3, -0x40(%rdi)
+	movaps	%xmm4, -0x50(%rdi)
+	movaps	%xmm5, -0x60(%rdi)
+	movaps	%xmm6, -0x70(%rdi)
+	movaps	%xmm7, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+	jae	L(ll_cache_copy_bwd_start)
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(large_page_ll_less_bwd_64bytes)
+
+	movdqu	-0x10(%rsi), %xmm0
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	lea	-0x40(%rsi), %rsi
+
+	movaps	%xmm0, -0x10(%rdi)
+	movaps	%xmm1, -0x20(%rdi)
+	movaps	%xmm2, -0x30(%rdi)
+	movaps	%xmm3, -0x40(%rdi)
+	lea	-0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_ll_less_bwd_64bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+#endif
+
+END (MEMCPY)
+
+	.section .rodata.ssse3,"a",@progbits
+	ALIGN (3)
+/* Jump table for copies of 0..79 bytes: entry N dispatches to
+   L(write_Nbytes).  Entries are 32-bit offsets relative to the table
+   base (JMPTBL), resolved by BRANCH_TO_JMPTBL_ENTRY.  */
+L(table_less_80bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_32bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_33bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_34bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_35bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_36bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_37bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_38bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_39bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_40bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_41bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_42bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_43bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_44bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_45bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_46bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_47bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_48bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_49bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_50bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_51bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_52bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_53bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_54bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_55bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_56bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_57bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_58bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_59bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_60bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_61bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_62bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_63bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_64bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_65bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_66bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_67bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_68bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_69bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_70bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_71bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_72bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_73bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_74bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_75bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_76bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_77bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_78bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_79bytes), L(table_less_80bytes))
+
+
+	ALIGN (3)
+/* Forward-copy dispatch by source/destination misalignment: entry K
+   jumps to the palignr $K forward loop.  */
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+
+	ALIGN (3)
+/* Backward-copy dispatch by misalignment: entry K jumps to the
+   palignr $K backward loop.  */
+L(shl_table_bwd):
+	.int	JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
+
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
new file mode 100644
index 0000000..8e9fb19
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -0,0 +1,73 @@
+/* Multiple versions of memcpy
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+ DSO. In static binaries we need memcpy before the initialization
+ happened. */
+#if defined SHARED && !defined NOT_IN_libc
+ .text
+ENTRY(memcpy)
+ .type memcpy, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __memcpy_sse2(%rip), %rax
+ testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jz 2f
+ leaq __memcpy_ssse3(%rip), %rax
+ testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+ jz 2f
+ leaq __memcpy_ssse3_back(%rip), %rax
+2: ret
+END(memcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __memcpy_sse2, @function; \
+ .p2align 4; \
+ __memcpy_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __memcpy_chk_sse2, @function; \
+ .globl __memcpy_chk_sse2; \
+ .p2align 4; \
+ __memcpy_chk_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2
+
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memcpy calls through a PLT.
+ The speedup we get from using SSSE3 instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memcpy; __GI_memcpy = __memcpy_sse2
+#endif
+
+#include "../memcpy.S"
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
new file mode 100644
index 0000000..948f61c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -0,0 +1,47 @@
+/* Multiple versions of __memcpy_chk
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+ DSO. There are no multiarch memcpy functions for static binaries.
+ */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .text
+ENTRY(__memcpy_chk)
+ .type __memcpy_chk, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __memcpy_chk_sse2(%rip), %rax
+ testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jz 2f
+ leaq __memcpy_chk_ssse3(%rip), %rax
+ testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+ jz 2f
+ leaq __memcpy_chk_ssse3_back(%rip), %rax
+2: ret
+END(__memcpy_chk)
+# else
+# include "../memcpy_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
new file mode 100644
index 0000000..f9a4e9a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_ssse3_back
+#define MEMCPY_CHK __memmove_chk_ssse3_back
+#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
new file mode 100644
index 0000000..295430b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_ssse3
+#define MEMCPY_CHK __memmove_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
new file mode 100644
index 0000000..bbe9627
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -0,0 +1,24 @@
+#ifndef NOT_IN_libc
+#include "init-arch.h"
+
+#define MEMMOVE __memmove_sse2
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+ __hidden_ver1 (__memmove_sse2, __GI_memmove, __memmove_sse2);
+#endif
+#endif
+
+#include "string/memmove.c"
+
+#ifndef NOT_IN_libc
+extern __typeof (__memmove_sse2) __memmove_sse2 attribute_hidden;
+extern __typeof (__memmove_sse2) __memmove_ssse3 attribute_hidden;
+extern __typeof (__memmove_sse2) __memmove_ssse3_back attribute_hidden;
+
+libc_ifunc (memmove,
+ HAS_SSSE3
+ ? (HAS_FAST_COPY_BACKWARD
+ ? __memmove_ssse3_back : __memmove_ssse3)
+ : __memmove_sse2);
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
new file mode 100644
index 0000000..a474f5f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -0,0 +1,15 @@
+#include "init-arch.h"
+
+#define MEMMOVE_CHK __memmove_chk_sse2
+
+#include "debug/memmove_chk.c"
+
+extern __typeof (__memmove_chk_sse2) __memmove_chk_sse2 attribute_hidden;
+extern __typeof (__memmove_chk_sse2) __memmove_chk_ssse3 attribute_hidden;
+extern __typeof (__memmove_chk_sse2) __memmove_chk_ssse3_back attribute_hidden;
+
+libc_ifunc (__memmove_chk,
+ HAS_SSSE3
+ ? (HAS_FAST_COPY_BACKWARD
+ ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
+ : __memmove_chk_sse2);
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
new file mode 100644
index 0000000..82ffacb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_ssse3_back
+#define MEMCPY_CHK __mempcpy_chk_ssse3_back
+#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
new file mode 100644
index 0000000..822d98e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_ssse3
+#define MEMCPY_CHK __mempcpy_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
new file mode 100644
index 0000000..e8152d6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -0,0 +1,75 @@
+/* Multiple versions of mempcpy
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+ DSO. In static binaries we need mempcpy before the initialization
+ happened. */
+#if defined SHARED && !defined NOT_IN_libc
+ENTRY(__mempcpy)
+ .type __mempcpy, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __mempcpy_sse2(%rip), %rax
+ testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jz 2f
+ leaq __mempcpy_ssse3(%rip), %rax
+ testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+ jz 2f
+ leaq __mempcpy_ssse3_back(%rip), %rax
+2: ret
+END(__mempcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __mempcpy_sse2, @function; \
+ .p2align 4; \
+ __mempcpy_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+ .type __mempcpy_chk_sse2, @function; \
+ .globl __mempcpy_chk_sse2; \
+ .p2align 4; \
+ __mempcpy_chk_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+ cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
+
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal mempcpy calls through a PLT.
+ The speedup we get from using SSSE3 instruction is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_def(name) \
+ .globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2
+# define libc_hidden_builtin_def(name) \
+ .globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2
+#endif
+
+#include "../mempcpy.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
new file mode 100644
index 0000000..024c775
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -0,0 +1,47 @@
+/* Multiple versions of __mempcpy_chk
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+ DSO. There are no multiarch mempcpy functions for static binaries.
+ */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+ .text
+ENTRY(__mempcpy_chk)
+ .type __mempcpy_chk, @gnu_indirect_function
+ cmpl $0, KIND_OFFSET+__cpu_features(%rip)
+ jne 1f
+ call __init_cpu_features
+1: leaq __mempcpy_chk_sse2(%rip), %rax
+ testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jz 2f
+ leaq __mempcpy_chk_ssse3(%rip), %rax
+ testl $bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+ jz 2f
+ leaq __mempcpy_chk_ssse3_back(%rip), %rax
+2: ret
+END(__mempcpy_chk)
+# else
+# include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 122270f..1d35f8f 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -58,6 +58,9 @@
cfi_endproc; \
ASM_SIZE_DIRECTIVE(name)
+#define ENTRY_CHK(name) ENTRY (name)
+#define END_CHK(name) END (name)
+
/* If compiled for profiling, call `mcount' at the start of each function. */
#ifdef PROF
/* The mcount code relies on a normal frame pointer being on the stack
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 32 +
debug/memmove_chk.c | 6 +-
string/memmove.c | 5 +-
sysdeps/x86_64/memcpy.S | 4 +-
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/bcopy.S | 7 +
sysdeps/x86_64/multiarch/init-arch.c | 9 +-
sysdeps/x86_64/multiarch/init-arch.h | 16 +-
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3169 ++++++++++++++++++++
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3139 +++++++++++++++++++
sysdeps/x86_64/multiarch/memcpy.S | 73 +
sysdeps/x86_64/multiarch/memcpy_chk.S | 47 +
sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 +
.../i686 => x86_64}/multiarch/memmove-ssse3.S | 0
sysdeps/x86_64/multiarch/memmove.c | 24 +
sysdeps/x86_64/multiarch/memmove_chk.c | 15 +
sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S | 4 +
.../i686 => x86_64}/multiarch/mempcpy-ssse3.S | 0
sysdeps/x86_64/multiarch/mempcpy.S | 75 +
sysdeps/x86_64/multiarch/mempcpy_chk.S | 47 +
sysdeps/x86_64/sysdep.h | 3 +
21 files changed, 6673 insertions(+), 10 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/bcopy.S
create mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
create mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
create mode 100644 sysdeps/x86_64/multiarch/memcpy.S
create mode 100644 sysdeps/x86_64/multiarch/memcpy_chk.S
create mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S
copy sysdeps/{i386/i686 => x86_64}/multiarch/memmove-ssse3.S (100%)
create mode 100644 sysdeps/x86_64/multiarch/memmove.c
create mode 100644 sysdeps/x86_64/multiarch/memmove_chk.c
create mode 100644 sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
copy sysdeps/{i386/i686 => x86_64}/multiarch/mempcpy-ssse3.S (100%)
create mode 100644 sysdeps/x86_64/multiarch/mempcpy.S
create mode 100644 sysdeps/x86_64/multiarch/mempcpy_chk.S
hooks/post-receive
--
GNU C Library master sources