This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch, master, updated. glibc-2.12-47-g6fb8cbc


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  6fb8cbcb58a29fff73eb2101b34caa19a7f88eba (commit)
      from  d85f8ff66711fd3b1c5753330499c7403fa46d81 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6fb8cbcb58a29fff73eb2101b34caa19a7f88eba

commit 6fb8cbcb58a29fff73eb2101b34caa19a7f88eba
Author: H.J. Lu <hongjiu.lu@intel.com>
Date:   Wed Jun 30 08:26:11 2010 -0700

    Improve 64bit memcpy/memmove for Atom, Core 2 and Core i7
    
    This patch includes optimized 64bit memcpy/memmove for Atom, Core 2 and
    Core i7.  It improves memcpy by up to 3X on Atom, up to 4X on Core 2 and
    up to 1X on Core i7.  It also improves memmove by up to 3X on Atom, up to
    4X on Core 2 and up to 2X on Core i7.

diff --git a/ChangeLog b/ChangeLog
index eaf5749..175c6ed 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,35 @@
+2010-06-25  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* debug/memmove_chk.c (__memmove_chk): Renamed to ...
+	(MEMMOVE_CHK): ...this.  Default to __memmove_chk.
+	* string/memmove.c (memmove): Renamed to ...
+	(MEMMOVE): ...this.  Default to memmove.
+	* sysdeps/x86_64/memcpy.S: Use ENTRY_CHK and END_CHK.
+	* sysdeps/x86_64/sysdep.h (ENTRY_CHK): Define.
+	(END_CHK): Define.
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	memcpy-ssse3 mempcpy-ssse3 memmove-ssse3 memcpy-ssse3-back
+	mempcpy-ssse3-back memmove-ssse3-back.
+	* sysdeps/x86_64/multiarch/bcopy.S: New file .
+	* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: New file.
+	* sysdeps/x86_64/multiarch/memcpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/memcpy.S: New file.
+	* sysdeps/x86_64/multiarch/memcpy_chk.S: New file.
+	* sysdeps/x86_64/multiarch/memmove-ssse3-back.S: New file.
+	* sysdeps/x86_64/multiarch/memmove-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/memmove.c: New file.
+	* sysdeps/x86_64/multiarch/memmove_chk.c: New file.
+	* sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S: New file.
+	* sysdeps/x86_64/multiarch/mempcpy-ssse3.S: New file.
+	* sysdeps/x86_64/multiarch/mempcpy.S: New file.
+	* sysdeps/x86_64/multiarch/mempcpy_chk.S: New file.
+	* sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Copy_Backward):
+	Define.
+	(index_Fast_Copy_Backward): Define.
+	(HAS_ARCH_FEATURE): Define.
+	(HAS_FAST_REP_STRING): Define.
+	(HAS_FAST_COPY_BACKWARD): Define.
+
 2010-06-21  Andreas Schwab  <schwab@redhat.com>
 
 	* sysdeps/unix/sysv/linux/getlogin_r.c (__getlogin_r_loginuid):
diff --git a/debug/memmove_chk.c b/debug/memmove_chk.c
index f3b74d2..6a3e157 100644
--- a/debug/memmove_chk.c
+++ b/debug/memmove_chk.c
@@ -23,8 +23,12 @@
 #include <memcopy.h>
 #include <pagecopy.h>
 
+#ifndef MEMMOVE_CHK
+# define MEMMOVE_CHK __memmove_chk
+#endif
+
 void *
-__memmove_chk (dest, src, len, destlen)
+MEMMOVE_CHK (dest, src, len, destlen)
      void *dest;
      const void *src;
      size_t len;
diff --git a/string/memmove.c b/string/memmove.c
index 16671f7..8e36e7c 100644
--- a/string/memmove.c
+++ b/string/memmove.c
@@ -37,9 +37,12 @@
 #define	rettype		void *
 #endif
 
+#ifndef MEMMOVE
+#define MEMMOVE memmove
+#endif
 
 rettype
-memmove (a1, a2, len)
+MEMMOVE (a1, a2, len)
      a1const void *a1;
      a2const void *a2;
      size_t len;
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index b25646b..b4545ac 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -40,12 +40,12 @@
         .text
 
 #if defined PIC && !defined NOT_IN_libc
-ENTRY (__memcpy_chk)
+ENTRY_CHK (__memcpy_chk)
 
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 
-END (__memcpy_chk)
+END_CHK (__memcpy_chk)
 #endif
 
 ENTRY(memcpy)				/* (void *, const void*, size_t) */
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index c61cf70..0ca914a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -5,7 +5,9 @@ endif
 
 ifeq ($(subdir),string)
 sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
-		   strend-sse4 memcmp-sse4
+		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
+		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+		   memmove-ssse3-back
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
new file mode 100644
index 0000000..11e250f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/bcopy.S
@@ -0,0 +1,7 @@
+#include <sysdep.h>
+
+	.text
+ENTRY(bcopy)
+	xchg	%rdi, %rsi
+	jmp	HIDDEN_BUILTIN_JUMPTARGET(memmove)
+END(bcopy)
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index f13a9f4..55c9f54 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -78,10 +78,13 @@ __init_cpu_features (void)
 	    case 0x25:
 	    case 0x2e:
 	    case 0x2f:
-	      /* Rep string instructions are fast on Intel Core i3, i5
-		 and i7.  */
+	      /* Rep string instructions and copy backward are fast on
+		 Intel Core i3, i5 and i7.  */
+#if index_Fast_Rep_String != index_Fast_Copy_Backward
+# error index_Fast_Rep_String != index_Fast_Copy_Backward
+#endif
 	      __cpu_features.feature[index_Fast_Rep_String]
-		|= bit_Fast_Rep_String;
+		|= bit_Fast_Rep_String | bit_Fast_Copy_Backward;
 	      break;
 	    }
 	}
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index b2f2de3..4a211c0 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -17,6 +17,7 @@
    02111-1307 USA.  */
 
 #define bit_Fast_Rep_String	(1 << 0)
+#define bit_Fast_Copy_Backward	(1 << 1)
 
 #ifdef	__ASSEMBLER__
 
@@ -32,7 +33,8 @@
 # define index_SSE4_1	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 # define index_SSE4_2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 
-#define index_Fast_Rep_String	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Fast_Rep_String		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Fast_Copy_Backward	FEATURE_INDEX_1*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
 
@@ -102,6 +104,16 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define HAS_SSE4_2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
 # define HAS_FMA	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
 
-# define index_Fast_Rep_String	FEATURE_INDEX_1
+# define index_Fast_Rep_String		FEATURE_INDEX_1
+# define index_Fast_Copy_Backward	FEATURE_INDEX_1
+
+#define HAS_ARCH_FEATURE(idx, bit) \
+  ((__get_cpu_features ()->feature[idx] & (bit)) != 0)
+
+#define HAS_FAST_REP_STRING \
+  HAS_ARCH_FEATURE (index_Fast_Rep_String, bit_Fast_Rep_String)
+
+#define HAS_FAST_COPY_BACKWARD \
+  HAS_ARCH_FEATURE (index_Fast_Copy_Backward, bit_Fast_Copy_Backward)
 
 #endif	/* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
new file mode 100644
index 0000000..48c974e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -0,0 +1,3169 @@
+/* memcpy with SSSE3 and REP string
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+        || defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3_back
+# define MEMCPY_CHK	__memcpy_chk_ssse3_back
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#define JMPTBL(I, B)	I - B
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register contains the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+  lea		TABLE(%rip), %r11;				\
+  movslq	(%r11, INDEX, SCALE), INDEX;			\
+  lea		(%r11, INDEX), INDEX;				\
+  jmp		*INDEX;						\
+  ud2
+
+	.section .text.ssse3,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc
+ENTRY (MEMCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+	mov	%rdi, %rax
+#ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%rsi, %rdi
+	jb	L(copy_forward)
+	je	L(bwd_write_0bytes)
+	cmp	$144, %rdx
+	jae	L(copy_backward)
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+L(copy_forward):
+#endif
+	cmp	$144, %rdx
+	jae	L(144bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dil, %sil
+	jbe	L(bk_write)
+#endif
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+#endif
+
+	ALIGN (4)
+L(144bytesormore):
+
+#ifndef USE_AS_MEMMOVE
+	cmp	%dil, %sil
+	jle	L(copy_backward)
+#endif
+	movdqu	(%rsi), %xmm0
+	mov	%rdi, %r8
+	and	$-16, %rdi
+	add	$16, %rdi
+	mov	%rdi, %r9
+	sub	%r8, %r9
+	sub	%r9, %rdx
+	add	%r9, %rsi
+	mov	%rsi, %r9
+	and	$0xf, %r9
+	jz	L(shl_0)
+#ifdef DATA_CACHE_SIZE
+	mov	$DATA_CACHE_SIZE, %rcx
+#else
+	mov	__x86_64_data_cache_size(%rip), %rcx
+#endif
+	cmp	%rcx, %rdx
+	jae	L(gobble_mem_fwd)
+	lea    	L(shl_table_fwd)(%rip), %r11
+	sub	$0x80, %rdx
+	movslq	(%r11, %r9, 4), %r9
+	add	%r11, %r9
+	jmp	*%r9
+	ud2
+
+	ALIGN (4)
+L(copy_backward):
+#ifdef DATA_CACHE_SIZE
+	mov	$DATA_CACHE_SIZE, %rcx
+#else
+	mov	__x86_64_data_cache_size(%rip), %rcx
+#endif
+	shl	$1, %rcx
+	cmp	%rcx, %rdx
+	ja	L(gobble_mem_bwd)
+
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	movdqu	-16(%rsi), %xmm0
+	lea	-16(%rdi), %r8
+	mov	%rdi, %r9
+	and	$0xf, %r9
+	xor	%r9, %rdi
+	sub	%r9, %rsi
+	sub	%r9, %rdx
+	mov	%rsi, %r9
+	and	$0xf, %r9
+	jz	L(shl_0_bwd)
+	lea    	L(shl_table_bwd)(%rip), %r11
+	sub	$0x80, %rdx
+	movslq	(%r11, %r9, 4), %r9
+	add	%r11, %r9
+	jmp	*%r9
+	ud2
+
+	ALIGN (4)
+L(shl_0):
+
+	mov	%rdx, %r9
+	shr	$8, %r9
+	add	%rdx, %r9
+#ifdef DATA_CACHE_SIZE
+	cmp	$DATA_CACHE_SIZE_HALF, %r9
+#else
+	cmp	__x86_64_data_cache_size_half(%rip), %r9
+#endif
+	jae	L(gobble_mem_fwd)
+	sub	$0x80, %rdx
+	ALIGN (4)
+L(shl_0_loop):
+	movdqa	(%rsi), %xmm1
+	movdqa	%xmm1, (%rdi)
+	movaps	0x10(%rsi), %xmm2
+	movaps	%xmm2, 0x10(%rdi)
+	movaps	0x20(%rsi), %xmm3
+	movaps	%xmm3, 0x20(%rdi)
+	movaps	0x30(%rsi), %xmm4
+	movaps	%xmm4, 0x30(%rdi)
+	movaps	0x40(%rsi), %xmm1
+	movaps	%xmm1, 0x40(%rdi)
+	movaps	0x50(%rsi), %xmm2
+	movaps	%xmm2, 0x50(%rdi)
+	movaps	0x60(%rsi), %xmm3
+	movaps	%xmm3, 0x60(%rdi)
+	movaps	0x70(%rsi), %xmm4
+	movaps	%xmm4, 0x70(%rdi)
+	sub	$0x80, %rdx
+	lea	0x80(%rsi), %rsi
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_0_loop)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_0_bwd):
+	sub	$0x80, %rdx
+L(copy_backward_loop):
+	movaps	-0x10(%rsi), %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	-0x20(%rsi), %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+	movaps	-0x30(%rsi), %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+	movaps	-0x40(%rsi), %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+	movaps	-0x50(%rsi), %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+	movaps	-0x60(%rsi), %xmm5
+	movaps	%xmm5, -0x60(%rdi)
+	movaps	-0x70(%rsi), %xmm5
+	movaps	%xmm5, -0x70(%rdi)
+	movaps	-0x80(%rsi), %xmm5
+	movaps	%xmm5, -0x80(%rdi)
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(copy_backward_loop)
+
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_1):
+	sub	$0x80, %rdx
+	movaps	-0x01(%rsi), %xmm1
+	movaps	0x0f(%rsi), %xmm2
+	movaps	0x1f(%rsi), %xmm3
+	movaps	0x2f(%rsi), %xmm4
+	movaps	0x3f(%rsi), %xmm5
+	movaps	0x4f(%rsi), %xmm6
+	movaps	0x5f(%rsi), %xmm7
+	movaps	0x6f(%rsi), %xmm8
+	movaps	0x7f(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$1, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$1, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$1, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$1, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$1, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$1, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$1, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_1)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_1_bwd):
+	movaps	-0x01(%rsi), %xmm1
+
+	movaps	-0x11(%rsi), %xmm2
+	palignr	$1, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x21(%rsi), %xmm3
+	palignr	$1, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x31(%rsi), %xmm4
+	palignr	$1, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x41(%rsi), %xmm5
+	palignr	$1, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x51(%rsi), %xmm6
+	palignr	$1, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x61(%rsi), %xmm7
+	palignr	$1, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x71(%rsi), %xmm8
+	palignr	$1, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x81(%rsi), %xmm9
+	palignr	$1, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_1_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_2):
+	sub	$0x80, %rdx
+	movaps	-0x02(%rsi), %xmm1
+	movaps	0x0e(%rsi), %xmm2
+	movaps	0x1e(%rsi), %xmm3
+	movaps	0x2e(%rsi), %xmm4
+	movaps	0x3e(%rsi), %xmm5
+	movaps	0x4e(%rsi), %xmm6
+	movaps	0x5e(%rsi), %xmm7
+	movaps	0x6e(%rsi), %xmm8
+	movaps	0x7e(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$2, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$2, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$2, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$2, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$2, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$2, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$2, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_2)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_2_bwd):
+	movaps	-0x02(%rsi), %xmm1
+
+	movaps	-0x12(%rsi), %xmm2
+	palignr	$2, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x22(%rsi), %xmm3
+	palignr	$2, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x32(%rsi), %xmm4
+	palignr	$2, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x42(%rsi), %xmm5
+	palignr	$2, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x52(%rsi), %xmm6
+	palignr	$2, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x62(%rsi), %xmm7
+	palignr	$2, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x72(%rsi), %xmm8
+	palignr	$2, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x82(%rsi), %xmm9
+	palignr	$2, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_2_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_3):
+	sub	$0x80, %rdx
+	movaps -0x03(%rsi), %xmm1
+	movaps	0x0d(%rsi), %xmm2
+	movaps	0x1d(%rsi), %xmm3
+	movaps	0x2d(%rsi), %xmm4
+	movaps	0x3d(%rsi), %xmm5
+	movaps	0x4d(%rsi), %xmm6
+	movaps	0x5d(%rsi), %xmm7
+	movaps	0x6d(%rsi), %xmm8
+	movaps	0x7d(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$3, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$3, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$3, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$3, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$3, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$3, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$3, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_3)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_3_bwd):
+	movaps	-0x03(%rsi), %xmm1
+
+	movaps	-0x13(%rsi), %xmm2
+	palignr	$3, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x23(%rsi), %xmm3
+	palignr	$3, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x33(%rsi), %xmm4
+	palignr	$3, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x43(%rsi), %xmm5
+	palignr	$3, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x53(%rsi), %xmm6
+	palignr	$3, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x63(%rsi), %xmm7
+	palignr	$3, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x73(%rsi), %xmm8
+	palignr	$3, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x83(%rsi), %xmm9
+	palignr	$3, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_3_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_4):
+	sub	$0x80, %rdx
+	movaps	-0x04(%rsi), %xmm1
+	movaps	0x0c(%rsi), %xmm2
+	movaps	0x1c(%rsi), %xmm3
+	movaps	0x2c(%rsi), %xmm4
+	movaps	0x3c(%rsi), %xmm5
+	movaps	0x4c(%rsi), %xmm6
+	movaps	0x5c(%rsi), %xmm7
+	movaps	0x6c(%rsi), %xmm8
+	movaps	0x7c(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$4, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$4, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$4, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$4, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$4, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$4, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$4, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_4)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_4_bwd):
+	movaps	-0x04(%rsi), %xmm1
+
+	movaps	-0x14(%rsi), %xmm2
+	palignr	$4, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x24(%rsi), %xmm3
+	palignr	$4, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x34(%rsi), %xmm4
+	palignr	$4, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x44(%rsi), %xmm5
+	palignr	$4, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x54(%rsi), %xmm6
+	palignr	$4, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x64(%rsi), %xmm7
+	palignr	$4, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x74(%rsi), %xmm8
+	palignr	$4, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x84(%rsi), %xmm9
+	palignr	$4, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_4_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_5):
+	sub	$0x80, %rdx
+	movaps	-0x05(%rsi), %xmm1
+	movaps	0x0b(%rsi), %xmm2
+	movaps	0x1b(%rsi), %xmm3
+	movaps	0x2b(%rsi), %xmm4
+	movaps	0x3b(%rsi), %xmm5
+	movaps	0x4b(%rsi), %xmm6
+	movaps	0x5b(%rsi), %xmm7
+	movaps	0x6b(%rsi), %xmm8
+	movaps	0x7b(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$5, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$5, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$5, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$5, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$5, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$5, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$5, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$5, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_5)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_5_bwd):
+	movaps	-0x05(%rsi), %xmm1
+
+	movaps	-0x15(%rsi), %xmm2
+	palignr	$5, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x25(%rsi), %xmm3
+	palignr	$5, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x35(%rsi), %xmm4
+	palignr	$5, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x45(%rsi), %xmm5
+	palignr	$5, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x55(%rsi), %xmm6
+	palignr	$5, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x65(%rsi), %xmm7
+	palignr	$5, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x75(%rsi), %xmm8
+	palignr	$5, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x85(%rsi), %xmm9
+	palignr	$5, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_5_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_6):
+	sub	$0x80, %rdx
+	movaps	-0x06(%rsi), %xmm1
+	movaps	0x0a(%rsi), %xmm2
+	movaps	0x1a(%rsi), %xmm3
+	movaps	0x2a(%rsi), %xmm4
+	movaps	0x3a(%rsi), %xmm5
+	movaps	0x4a(%rsi), %xmm6
+	movaps	0x5a(%rsi), %xmm7
+	movaps	0x6a(%rsi), %xmm8
+	movaps	0x7a(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$6, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$6, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$6, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$6, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$6, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$6, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$6, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$6, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_6)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_6_bwd):
+	movaps	-0x06(%rsi), %xmm1
+
+	movaps	-0x16(%rsi), %xmm2
+	palignr	$6, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x26(%rsi), %xmm3
+	palignr	$6, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x36(%rsi), %xmm4
+	palignr	$6, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x46(%rsi), %xmm5
+	palignr	$6, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x56(%rsi), %xmm6
+	palignr	$6, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x66(%rsi), %xmm7
+	palignr	$6, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x76(%rsi), %xmm8
+	palignr	$6, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x86(%rsi), %xmm9
+	palignr	$6, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_6_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_7):
+	sub	$0x80, %rdx
+	movaps	-0x07(%rsi), %xmm1
+	movaps	0x09(%rsi), %xmm2
+	movaps	0x19(%rsi), %xmm3
+	movaps	0x29(%rsi), %xmm4
+	movaps	0x39(%rsi), %xmm5
+	movaps	0x49(%rsi), %xmm6
+	movaps	0x59(%rsi), %xmm7
+	movaps	0x69(%rsi), %xmm8
+	movaps	0x79(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$7, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$7, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$7, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$7, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$7, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$7, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$7, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$7, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_7)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_7_bwd):
+	movaps	-0x07(%rsi), %xmm1
+
+	movaps	-0x17(%rsi), %xmm2
+	palignr	$7, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x27(%rsi), %xmm3
+	palignr	$7, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x37(%rsi), %xmm4
+	palignr	$7, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x47(%rsi), %xmm5
+	palignr	$7, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x57(%rsi), %xmm6
+	palignr	$7, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x67(%rsi), %xmm7
+	palignr	$7, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x77(%rsi), %xmm8
+	palignr	$7, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x87(%rsi), %xmm9
+	palignr	$7, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_7_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_8):
+	sub	$0x80, %rdx
+	movaps	-0x08(%rsi), %xmm1
+	movaps	0x08(%rsi), %xmm2
+	movaps	0x18(%rsi), %xmm3
+	movaps	0x28(%rsi), %xmm4
+	movaps	0x38(%rsi), %xmm5
+	movaps	0x48(%rsi), %xmm6
+	movaps	0x58(%rsi), %xmm7
+	movaps	0x68(%rsi), %xmm8
+	movaps	0x78(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$8, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$8, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$8, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$8, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$8, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$8, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$8, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$8, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_8)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_8_bwd):
+	movaps	-0x08(%rsi), %xmm1
+
+	movaps	-0x18(%rsi), %xmm2
+	palignr	$8, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x28(%rsi), %xmm3
+	palignr	$8, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x38(%rsi), %xmm4
+	palignr	$8, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x48(%rsi), %xmm5
+	palignr	$8, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x58(%rsi), %xmm6
+	palignr	$8, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x68(%rsi), %xmm7
+	palignr	$8, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x78(%rsi), %xmm8
+	palignr	$8, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x88(%rsi), %xmm9
+	palignr	$8, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_8_bwd)
+L(shl_8_end_bwd):
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_9):
+	sub	$0x80, %rdx
+	movaps	-0x09(%rsi), %xmm1
+	movaps	0x07(%rsi), %xmm2
+	movaps	0x17(%rsi), %xmm3
+	movaps	0x27(%rsi), %xmm4
+	movaps	0x37(%rsi), %xmm5
+	movaps	0x47(%rsi), %xmm6
+	movaps	0x57(%rsi), %xmm7
+	movaps	0x67(%rsi), %xmm8
+	movaps	0x77(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$9, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$9, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$9, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$9, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$9, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$9, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$9, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$9, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_9)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_9_bwd):
+	movaps	-0x09(%rsi), %xmm1
+
+	movaps	-0x19(%rsi), %xmm2
+	palignr	$9, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x29(%rsi), %xmm3
+	palignr	$9, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x39(%rsi), %xmm4
+	palignr	$9, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x49(%rsi), %xmm5
+	palignr	$9, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x59(%rsi), %xmm6
+	palignr	$9, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x69(%rsi), %xmm7
+	palignr	$9, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x79(%rsi), %xmm8
+	palignr	$9, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x89(%rsi), %xmm9
+	palignr	$9, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_9_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_10):
+	sub	$0x80, %rdx
+	movaps	-0x0a(%rsi), %xmm1
+	movaps	0x06(%rsi), %xmm2
+	movaps	0x16(%rsi), %xmm3
+	movaps	0x26(%rsi), %xmm4
+	movaps	0x36(%rsi), %xmm5
+	movaps	0x46(%rsi), %xmm6
+	movaps	0x56(%rsi), %xmm7
+	movaps	0x66(%rsi), %xmm8
+	movaps	0x76(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$10, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$10, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$10, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$10, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$10, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$10, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$10, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$10, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_10)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_10_bwd):
+	movaps	-0x0a(%rsi), %xmm1
+
+	movaps	-0x1a(%rsi), %xmm2
+	palignr	$10, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2a(%rsi), %xmm3
+	palignr	$10, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3a(%rsi), %xmm4
+	palignr	$10, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4a(%rsi), %xmm5
+	palignr	$10, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5a(%rsi), %xmm6
+	palignr	$10, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6a(%rsi), %xmm7
+	palignr	$10, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7a(%rsi), %xmm8
+	palignr	$10, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8a(%rsi), %xmm9
+	palignr	$10, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_10_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_11):
+	sub	$0x80, %rdx
+	movaps	-0x0b(%rsi), %xmm1
+	movaps	0x05(%rsi), %xmm2
+	movaps	0x15(%rsi), %xmm3
+	movaps	0x25(%rsi), %xmm4
+	movaps	0x35(%rsi), %xmm5
+	movaps	0x45(%rsi), %xmm6
+	movaps	0x55(%rsi), %xmm7
+	movaps	0x65(%rsi), %xmm8
+	movaps	0x75(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$11, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$11, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$11, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$11, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$11, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$11, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$11, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$11, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_11)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_11_bwd):
+	movaps	-0x0b(%rsi), %xmm1
+
+	movaps	-0x1b(%rsi), %xmm2
+	palignr	$11, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2b(%rsi), %xmm3
+	palignr	$11, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3b(%rsi), %xmm4
+	palignr	$11, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4b(%rsi), %xmm5
+	palignr	$11, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5b(%rsi), %xmm6
+	palignr	$11, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6b(%rsi), %xmm7
+	palignr	$11, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7b(%rsi), %xmm8
+	palignr	$11, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8b(%rsi), %xmm9
+	palignr	$11, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_11_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_12):
+	sub	$0x80, %rdx
+	movdqa	-0x0c(%rsi), %xmm1
+	movaps	0x04(%rsi), %xmm2
+	movaps	0x14(%rsi), %xmm3
+	movaps	0x24(%rsi), %xmm4
+	movaps	0x34(%rsi), %xmm5
+	movaps	0x44(%rsi), %xmm6
+	movaps	0x54(%rsi), %xmm7
+	movaps	0x64(%rsi), %xmm8
+	movaps	0x74(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$12, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$12, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$12, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$12, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$12, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$12, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$12, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$12, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_12)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_12_bwd):
+	movaps	-0x0c(%rsi), %xmm1
+
+	movaps	-0x1c(%rsi), %xmm2
+	palignr	$12, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2c(%rsi), %xmm3
+	palignr	$12, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3c(%rsi), %xmm4
+	palignr	$12, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4c(%rsi), %xmm5
+	palignr	$12, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5c(%rsi), %xmm6
+	palignr	$12, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6c(%rsi), %xmm7
+	palignr	$12, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7c(%rsi), %xmm8
+	palignr	$12, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8c(%rsi), %xmm9
+	palignr	$12, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_12_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_13):
+	sub	$0x80, %rdx
+	movaps	-0x0d(%rsi), %xmm1
+	movaps	0x03(%rsi), %xmm2
+	movaps	0x13(%rsi), %xmm3
+	movaps	0x23(%rsi), %xmm4
+	movaps	0x33(%rsi), %xmm5
+	movaps	0x43(%rsi), %xmm6
+	movaps	0x53(%rsi), %xmm7
+	movaps	0x63(%rsi), %xmm8
+	movaps	0x73(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$13, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$13, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$13, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$13, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$13, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$13, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$13, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$13, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_13)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_13_bwd):
+	movaps	-0x0d(%rsi), %xmm1
+
+	movaps	-0x1d(%rsi), %xmm2
+	palignr	$13, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2d(%rsi), %xmm3
+	palignr	$13, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3d(%rsi), %xmm4
+	palignr	$13, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4d(%rsi), %xmm5
+	palignr	$13, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5d(%rsi), %xmm6
+	palignr	$13, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6d(%rsi), %xmm7
+	palignr	$13, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7d(%rsi), %xmm8
+	palignr	$13, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8d(%rsi), %xmm9
+	palignr	$13, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_13_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_14):
+	sub	$0x80, %rdx
+	movaps	-0x0e(%rsi), %xmm1
+	movaps	0x02(%rsi), %xmm2
+	movaps	0x12(%rsi), %xmm3
+	movaps	0x22(%rsi), %xmm4
+	movaps	0x32(%rsi), %xmm5
+	movaps	0x42(%rsi), %xmm6
+	movaps	0x52(%rsi), %xmm7
+	movaps	0x62(%rsi), %xmm8
+	movaps	0x72(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$14, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$14, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$14, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$14, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$14, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$14, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$14, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$14, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_14)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_14_bwd):
+	movaps	-0x0e(%rsi), %xmm1
+
+	movaps	-0x1e(%rsi), %xmm2
+	palignr	$14, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2e(%rsi), %xmm3
+	palignr	$14, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3e(%rsi), %xmm4
+	palignr	$14, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4e(%rsi), %xmm5
+	palignr	$14, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5e(%rsi), %xmm6
+	palignr	$14, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6e(%rsi), %xmm7
+	palignr	$14, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7e(%rsi), %xmm8
+	palignr	$14, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8e(%rsi), %xmm9
+	palignr	$14, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_14_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_15):
+	sub	$0x80, %rdx
+	movaps	-0x0f(%rsi), %xmm1
+	movaps	0x01(%rsi), %xmm2
+	movaps	0x11(%rsi), %xmm3
+	movaps	0x21(%rsi), %xmm4
+	movaps	0x31(%rsi), %xmm5
+	movaps	0x41(%rsi), %xmm6
+	movaps	0x51(%rsi), %xmm7
+	movaps	0x61(%rsi), %xmm8
+	movaps	0x71(%rsi), %xmm9
+	lea	0x80(%rsi), %rsi
+	palignr	$15, %xmm8, %xmm9
+	movaps	%xmm9, 0x70(%rdi)
+	palignr	$15, %xmm7, %xmm8
+	movaps	%xmm8, 0x60(%rdi)
+	palignr	$15, %xmm6, %xmm7
+	movaps	%xmm7, 0x50(%rdi)
+	palignr	$15, %xmm5, %xmm6
+	movaps	%xmm6, 0x40(%rdi)
+	palignr	$15, %xmm4, %xmm5
+	movaps	%xmm5, 0x30(%rdi)
+	palignr	$15, %xmm3, %xmm4
+	movaps	%xmm4, 0x20(%rdi)
+	palignr	$15, %xmm2, %xmm3
+	movaps	%xmm3, 0x10(%rdi)
+	palignr	$15, %xmm1, %xmm2
+	movaps	%xmm2, (%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(shl_15)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(shl_15_bwd):
+	movaps	-0x0f(%rsi), %xmm1
+
+	movaps	-0x1f(%rsi), %xmm2
+	palignr	$15, %xmm2, %xmm1
+	movaps	%xmm1, -0x10(%rdi)
+
+	movaps	-0x2f(%rsi), %xmm3
+	palignr	$15, %xmm3, %xmm2
+	movaps	%xmm2, -0x20(%rdi)
+
+	movaps	-0x3f(%rsi), %xmm4
+	palignr	$15, %xmm4, %xmm3
+	movaps	%xmm3, -0x30(%rdi)
+
+	movaps	-0x4f(%rsi), %xmm5
+	palignr	$15, %xmm5, %xmm4
+	movaps	%xmm4, -0x40(%rdi)
+
+	movaps	-0x5f(%rsi), %xmm6
+	palignr	$15, %xmm6, %xmm5
+	movaps	%xmm5, -0x50(%rdi)
+
+	movaps	-0x6f(%rsi), %xmm7
+	palignr	$15, %xmm7, %xmm6
+	movaps	%xmm6, -0x60(%rdi)
+
+	movaps	-0x7f(%rsi), %xmm8
+	palignr	$15, %xmm8, %xmm7
+	movaps	%xmm7, -0x70(%rdi)
+
+	movaps	-0x8f(%rsi), %xmm9
+	palignr	$15, %xmm9, %xmm8
+	movaps	%xmm8, -0x80(%rdi)
+
+	sub	$0x80, %rdx
+	lea	-0x80(%rdi), %rdi
+	lea	-0x80(%rsi), %rsi
+	jae	L(shl_15_bwd)
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rdi
+	sub	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	ALIGN (4)
+L(gobble_mem_fwd):
+	movdqu	(%rsi), %xmm1
+	movdqu	%xmm0, (%r8)
+	movdqa	%xmm1, (%rdi)
+	sub	$16, %rdx
+	add	$16, %rsi
+	add	$16, %rdi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov	__x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+#ifdef USE_AS_MEMMOVE
+	mov	%rsi, %r9
+	sub	%rdi, %r9
+	cmp	%rdx, %r9
+	jae	L(memmove_is_memcpy_fwd)
+	cmp	%rcx, %r9
+	jbe	L(ll_cache_copy_fwd_start)
+L(memmove_is_memcpy_fwd):
+#endif
+	cmp	%rcx, %rdx
+	ja	L(bigger_in_fwd)
+	mov	%rdx, %rcx
+L(bigger_in_fwd):
+	sub	%rcx, %rdx
+	cmp	$0x1000, %rdx
+	jbe	L(ll_cache_copy_fwd)
+
+	mov	%rcx, %r9
+	shl	$3, %r9
+	cmp	%r9, %rdx
+	jbe	L(2steps_copy_fwd)
+	add	%rcx, %rdx
+	xor	%rcx, %rcx
+L(2steps_copy_fwd):
+	sub	$0x80, %rdx
+L(gobble_mem_fwd_loop):
+	sub	$0x80, %rdx
+	prefetcht0 0x200(%rsi)
+	prefetcht0 0x300(%rsi)
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	movdqu	0x40(%rsi), %xmm4
+	movdqu	0x50(%rsi), %xmm5
+	movdqu	0x60(%rsi), %xmm6
+	movdqu	0x70(%rsi), %xmm7
+	lfence
+	movntdq	%xmm0, (%rdi)
+	movntdq	%xmm1, 0x10(%rdi)
+	movntdq	%xmm2, 0x20(%rdi)
+	movntdq	%xmm3, 0x30(%rdi)
+	movntdq	%xmm4, 0x40(%rdi)
+	movntdq	%xmm5, 0x50(%rdi)
+	movntdq	%xmm6, 0x60(%rdi)
+	movntdq	%xmm7, 0x70(%rdi)
+	lea	0x80(%rsi), %rsi
+	lea	0x80(%rdi), %rdi
+	jae	L(gobble_mem_fwd_loop)
+	sfence
+	cmp	$0x80, %rcx
+	jb	L(gobble_mem_fwd_end)
+	add	$0x80, %rdx
+L(ll_cache_copy_fwd):
+	add	%rcx, %rdx
+L(ll_cache_copy_fwd_start):
+	sub	$0x80, %rdx
+L(gobble_ll_loop_fwd):
+	prefetchnta 0x1c0(%rsi)
+	prefetchnta 0x280(%rsi)
+	prefetchnta 0x1c0(%rdi)
+	prefetchnta 0x280(%rdi)
+	sub	$0x80, %rdx
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	movdqu	0x40(%rsi), %xmm4
+	movdqu	0x50(%rsi), %xmm5
+	movdqu	0x60(%rsi), %xmm6
+	movdqu	0x70(%rsi), %xmm7
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm1, 0x10(%rdi)
+	movdqa	%xmm2, 0x20(%rdi)
+	movdqa	%xmm3, 0x30(%rdi)
+	movdqa	%xmm4, 0x40(%rdi)
+	movdqa	%xmm5, 0x50(%rdi)
+	movdqa	%xmm6, 0x60(%rdi)
+	movdqa	%xmm7, 0x70(%rdi)
+	lea	0x80(%rsi), %rsi
+	lea	0x80(%rdi), %rdi
+	jae	L(gobble_ll_loop_fwd)
+L(gobble_mem_fwd_end):
+	add	$0x80, %rdx
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+	ALIGN (4)
+L(gobble_mem_bwd):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+
+	movdqu	-16(%rsi), %xmm0
+	lea	-16(%rdi), %r8
+	mov	%rdi, %r9
+	and	$-16, %rdi
+	sub	%rdi, %r9
+	sub	%r9, %rsi
+	sub	%r9, %rdx
+
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov	__x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+#ifdef USE_AS_MEMMOVE
+	mov	%rdi, %r9
+	sub	%rsi, %r9
+	cmp	%rdx, %r9
+	jae	L(memmove_is_memcpy_bwd)
+	cmp	%rcx, %r9
+	jbe	L(ll_cache_copy_bwd_start)
+L(memmove_is_memcpy_bwd):
+#endif
+	cmp	%rcx, %rdx
+	ja	L(bigger)
+	mov	%rdx, %rcx
+L(bigger):
+	sub	%rcx, %rdx
+	cmp	$0x1000, %rdx
+	jbe	L(ll_cache_copy)
+
+	mov	%rcx, %r9
+	shl	$3, %r9
+	cmp	%r9, %rdx
+	jbe	L(2steps_copy)
+	add	%rcx, %rdx
+	xor	%rcx, %rcx
+L(2steps_copy):
+	sub	$0x80, %rdx
+L(gobble_mem_bwd_loop):
+	sub	$0x80, %rdx
+	prefetcht0 -0x200(%rsi)
+	prefetcht0 -0x300(%rsi)
+	movdqu	-0x10(%rsi), %xmm1
+	movdqu	-0x20(%rsi), %xmm2
+	movdqu	-0x30(%rsi), %xmm3
+	movdqu	-0x40(%rsi), %xmm4
+	movdqu	-0x50(%rsi), %xmm5
+	movdqu	-0x60(%rsi), %xmm6
+	movdqu	-0x70(%rsi), %xmm7
+	movdqu	-0x80(%rsi), %xmm8
+	lfence
+	movntdq	%xmm1, -0x10(%rdi)
+	movntdq	%xmm2, -0x20(%rdi)
+	movntdq	%xmm3, -0x30(%rdi)
+	movntdq	%xmm4, -0x40(%rdi)
+	movntdq	%xmm5, -0x50(%rdi)
+	movntdq	%xmm6, -0x60(%rdi)
+	movntdq	%xmm7, -0x70(%rdi)
+	movntdq	%xmm8, -0x80(%rdi)
+	lea	-0x80(%rsi), %rsi
+	lea	-0x80(%rdi), %rdi
+	jae	L(gobble_mem_bwd_loop)
+	sfence
+	cmp	$0x80, %rcx
+	jb	L(gobble_mem_bwd_end)
+	add	$0x80, %rdx
+L(ll_cache_copy):
+	add	%rcx, %rdx
+L(ll_cache_copy_bwd_start):
+	sub	$0x80, %rdx
+L(gobble_ll_loop):
+	prefetchnta -0x1c0(%rsi)
+	prefetchnta -0x280(%rsi)
+	prefetchnta -0x1c0(%rdi)
+	prefetchnta -0x280(%rdi)
+	sub	$0x80, %rdx
+	movdqu	-0x10(%rsi), %xmm1
+	movdqu	-0x20(%rsi), %xmm2
+	movdqu	-0x30(%rsi), %xmm3
+	movdqu	-0x40(%rsi), %xmm4
+	movdqu	-0x50(%rsi), %xmm5
+	movdqu	-0x60(%rsi), %xmm6
+	movdqu	-0x70(%rsi), %xmm7
+	movdqu	-0x80(%rsi), %xmm8
+	movdqa	%xmm1, -0x10(%rdi)
+	movdqa	%xmm2, -0x20(%rdi)
+	movdqa	%xmm3, -0x30(%rdi)
+	movdqa	%xmm4, -0x40(%rdi)
+	movdqa	%xmm5, -0x50(%rdi)
+	movdqa	%xmm6, -0x60(%rdi)
+	movdqa	%xmm7, -0x70(%rdi)
+	movdqa	%xmm8, -0x80(%rdi)
+	lea	-0x80(%rsi), %rsi
+	lea	-0x80(%rdi), %rdi
+	jae	L(gobble_ll_loop)
+L(gobble_mem_bwd_end):
+	movdqu	%xmm0, (%r8)
+	add	$0x80, %rdx
+	sub	%rdx, %rsi
+	sub	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+	.p2align 4
+L(fwd_write_128bytes):
+	lddqu	-128(%rsi), %xmm0
+	movdqu	%xmm0, -128(%rdi)
+L(fwd_write_112bytes):
+	lddqu	-112(%rsi), %xmm0
+	movdqu	%xmm0, -112(%rdi)
+L(fwd_write_96bytes):
+	lddqu	-96(%rsi), %xmm0
+	movdqu	%xmm0, -96(%rdi)
+L(fwd_write_80bytes):
+	lddqu	-80(%rsi), %xmm0
+	movdqu	%xmm0, -80(%rdi)
+L(fwd_write_64bytes):
+	lddqu	-64(%rsi), %xmm0
+	movdqu	%xmm0, -64(%rdi)
+L(fwd_write_48bytes):
+	lddqu	-48(%rsi), %xmm0
+	movdqu	%xmm0, -48(%rdi)
+L(fwd_write_32bytes):
+	lddqu	-32(%rsi), %xmm0
+	movdqu	%xmm0, -32(%rdi)
+L(fwd_write_16bytes):
+	lddqu	-16(%rsi), %xmm0
+	movdqu	%xmm0, -16(%rdi)
+L(fwd_write_0bytes):
+	ret
+
+
+	.p2align 4
+L(fwd_write_143bytes):
+	lddqu	-143(%rsi), %xmm0
+	movdqu	%xmm0, -143(%rdi)
+L(fwd_write_127bytes):
+	lddqu	-127(%rsi), %xmm0
+	movdqu	%xmm0, -127(%rdi)
+L(fwd_write_111bytes):
+	lddqu	-111(%rsi), %xmm0
+	movdqu	%xmm0, -111(%rdi)
+L(fwd_write_95bytes):
+	lddqu	-95(%rsi), %xmm0
+	movdqu	%xmm0, -95(%rdi)
+L(fwd_write_79bytes):
+	lddqu	-79(%rsi), %xmm0
+	movdqu	%xmm0, -79(%rdi)
+L(fwd_write_63bytes):
+	lddqu	-63(%rsi), %xmm0
+	movdqu	%xmm0, -63(%rdi)
+L(fwd_write_47bytes):
+	lddqu	-47(%rsi), %xmm0
+	movdqu	%xmm0, -47(%rdi)
+L(fwd_write_31bytes):
+	lddqu	-31(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -31(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_15bytes):
+	mov	-15(%rsi), %rdx
+	mov	-8(%rsi), %rcx
+	mov	%rdx, -15(%rdi)
+	mov	%rcx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_142bytes):
+	lddqu	-142(%rsi), %xmm0
+	movdqu	%xmm0, -142(%rdi)
+L(fwd_write_126bytes):
+	lddqu	-126(%rsi), %xmm0
+	movdqu	%xmm0, -126(%rdi)
+L(fwd_write_110bytes):
+	lddqu	-110(%rsi), %xmm0
+	movdqu	%xmm0, -110(%rdi)
+L(fwd_write_94bytes):
+	lddqu	-94(%rsi), %xmm0
+	movdqu	%xmm0, -94(%rdi)
+L(fwd_write_78bytes):
+	lddqu	-78(%rsi), %xmm0
+	movdqu	%xmm0, -78(%rdi)
+L(fwd_write_62bytes):
+	lddqu	-62(%rsi), %xmm0
+	movdqu	%xmm0, -62(%rdi)
+L(fwd_write_46bytes):
+	lddqu	-46(%rsi), %xmm0
+	movdqu	%xmm0, -46(%rdi)
+L(fwd_write_30bytes):
+	lddqu	-30(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -30(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_14bytes):
+	mov	-14(%rsi), %rdx
+	mov	-8(%rsi), %rcx
+	mov	%rdx, -14(%rdi)
+	mov	%rcx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_141bytes):
+	lddqu	-141(%rsi), %xmm0
+	movdqu	%xmm0, -141(%rdi)
+L(fwd_write_125bytes):
+	lddqu	-125(%rsi), %xmm0
+	movdqu	%xmm0, -125(%rdi)
+L(fwd_write_109bytes):
+	lddqu	-109(%rsi), %xmm0
+	movdqu	%xmm0, -109(%rdi)
+L(fwd_write_93bytes):
+	lddqu	-93(%rsi), %xmm0
+	movdqu	%xmm0, -93(%rdi)
+L(fwd_write_77bytes):
+	lddqu	-77(%rsi), %xmm0
+	movdqu	%xmm0, -77(%rdi)
+L(fwd_write_61bytes):
+	lddqu	-61(%rsi), %xmm0
+	movdqu	%xmm0, -61(%rdi)
+L(fwd_write_45bytes):
+	lddqu	-45(%rsi), %xmm0
+	movdqu	%xmm0, -45(%rdi)
+L(fwd_write_29bytes):
+	lddqu	-29(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -29(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_13bytes):
+	mov	-13(%rsi), %rdx
+	mov	-8(%rsi), %rcx
+	mov	%rdx, -13(%rdi)
+	mov	%rcx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_140bytes):
+	lddqu	-140(%rsi), %xmm0
+	movdqu	%xmm0, -140(%rdi)
+L(fwd_write_124bytes):
+	lddqu	-124(%rsi), %xmm0
+	movdqu	%xmm0, -124(%rdi)
+L(fwd_write_108bytes):
+	lddqu	-108(%rsi), %xmm0
+	movdqu	%xmm0, -108(%rdi)
+L(fwd_write_92bytes):
+	lddqu	-92(%rsi), %xmm0
+	movdqu	%xmm0, -92(%rdi)
+L(fwd_write_76bytes):
+	lddqu	-76(%rsi), %xmm0
+	movdqu	%xmm0, -76(%rdi)
+L(fwd_write_60bytes):
+	lddqu	-60(%rsi), %xmm0
+	movdqu	%xmm0, -60(%rdi)
+L(fwd_write_44bytes):
+	lddqu	-44(%rsi), %xmm0
+	movdqu	%xmm0, -44(%rdi)
+L(fwd_write_28bytes):
+	lddqu	-28(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -28(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_12bytes):
+	mov	-12(%rsi), %rdx
+	mov	-4(%rsi), %ecx
+	mov	%rdx, -12(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_139bytes):
+	lddqu	-139(%rsi), %xmm0
+	movdqu	%xmm0, -139(%rdi)
+L(fwd_write_123bytes):
+	lddqu	-123(%rsi), %xmm0
+	movdqu	%xmm0, -123(%rdi)
+L(fwd_write_107bytes):
+	lddqu	-107(%rsi), %xmm0
+	movdqu	%xmm0, -107(%rdi)
+L(fwd_write_91bytes):
+	lddqu	-91(%rsi), %xmm0
+	movdqu	%xmm0, -91(%rdi)
+L(fwd_write_75bytes):
+	lddqu	-75(%rsi), %xmm0
+	movdqu	%xmm0, -75(%rdi)
+L(fwd_write_59bytes):
+	lddqu	-59(%rsi), %xmm0
+	movdqu	%xmm0, -59(%rdi)
+L(fwd_write_43bytes):
+	lddqu	-43(%rsi), %xmm0
+	movdqu	%xmm0, -43(%rdi)
+L(fwd_write_27bytes):
+	lddqu	-27(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -27(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_11bytes):
+	mov	-11(%rsi), %rdx
+	mov	-4(%rsi), %ecx
+	mov	%rdx, -11(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_138bytes):
+	lddqu	-138(%rsi), %xmm0
+	movdqu	%xmm0, -138(%rdi)
+L(fwd_write_122bytes):
+	lddqu	-122(%rsi), %xmm0
+	movdqu	%xmm0, -122(%rdi)
+L(fwd_write_106bytes):
+	lddqu	-106(%rsi), %xmm0
+	movdqu	%xmm0, -106(%rdi)
+L(fwd_write_90bytes):
+	lddqu	-90(%rsi), %xmm0
+	movdqu	%xmm0, -90(%rdi)
+L(fwd_write_74bytes):
+	lddqu	-74(%rsi), %xmm0
+	movdqu	%xmm0, -74(%rdi)
+L(fwd_write_58bytes):
+	lddqu	-58(%rsi), %xmm0
+	movdqu	%xmm0, -58(%rdi)
+L(fwd_write_42bytes):
+	lddqu	-42(%rsi), %xmm0
+	movdqu	%xmm0, -42(%rdi)
+L(fwd_write_26bytes):
+	lddqu	-26(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -26(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_10bytes):
+	mov	-10(%rsi), %rdx
+	mov	-4(%rsi), %ecx
+	mov	%rdx, -10(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_137bytes):
+	lddqu	-137(%rsi), %xmm0
+	movdqu	%xmm0, -137(%rdi)
+L(fwd_write_121bytes):
+	lddqu	-121(%rsi), %xmm0
+	movdqu	%xmm0, -121(%rdi)
+L(fwd_write_105bytes):
+	lddqu	-105(%rsi), %xmm0
+	movdqu	%xmm0, -105(%rdi)
+L(fwd_write_89bytes):
+	lddqu	-89(%rsi), %xmm0
+	movdqu	%xmm0, -89(%rdi)
+L(fwd_write_73bytes):
+	lddqu	-73(%rsi), %xmm0
+	movdqu	%xmm0, -73(%rdi)
+L(fwd_write_57bytes):
+	lddqu	-57(%rsi), %xmm0
+	movdqu	%xmm0, -57(%rdi)
+L(fwd_write_41bytes):
+	lddqu	-41(%rsi), %xmm0
+	movdqu	%xmm0, -41(%rdi)
+L(fwd_write_25bytes):
+	lddqu	-25(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -25(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_9bytes):
+	mov	-9(%rsi), %rdx
+	mov	-4(%rsi), %ecx
+	mov	%rdx, -9(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_136bytes):
+	lddqu	-136(%rsi), %xmm0
+	movdqu	%xmm0, -136(%rdi)
+L(fwd_write_120bytes):
+	lddqu	-120(%rsi), %xmm0
+	movdqu	%xmm0, -120(%rdi)
+L(fwd_write_104bytes):
+	lddqu	-104(%rsi), %xmm0
+	movdqu	%xmm0, -104(%rdi)
+L(fwd_write_88bytes):
+	lddqu	-88(%rsi), %xmm0
+	movdqu	%xmm0, -88(%rdi)
+L(fwd_write_72bytes):
+	lddqu	-72(%rsi), %xmm0
+	movdqu	%xmm0, -72(%rdi)
+L(fwd_write_56bytes):
+	lddqu	-56(%rsi), %xmm0
+	movdqu	%xmm0, -56(%rdi)
+L(fwd_write_40bytes):
+	lddqu	-40(%rsi), %xmm0
+	movdqu	%xmm0, -40(%rdi)
+L(fwd_write_24bytes):
+	lddqu	-24(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -24(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_8bytes):
+	mov	-8(%rsi), %rdx
+	mov	%rdx, -8(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_135bytes):
+	lddqu	-135(%rsi), %xmm0
+	movdqu	%xmm0, -135(%rdi)
+L(fwd_write_119bytes):
+	lddqu	-119(%rsi), %xmm0
+	movdqu	%xmm0, -119(%rdi)
+L(fwd_write_103bytes):
+	lddqu	-103(%rsi), %xmm0
+	movdqu	%xmm0, -103(%rdi)
+L(fwd_write_87bytes):
+	lddqu	-87(%rsi), %xmm0
+	movdqu	%xmm0, -87(%rdi)
+L(fwd_write_71bytes):
+	lddqu	-71(%rsi), %xmm0
+	movdqu	%xmm0, -71(%rdi)
+L(fwd_write_55bytes):
+	lddqu	-55(%rsi), %xmm0
+	movdqu	%xmm0, -55(%rdi)
+L(fwd_write_39bytes):
+	lddqu	-39(%rsi), %xmm0
+	movdqu	%xmm0, -39(%rdi)
+L(fwd_write_23bytes):
+	lddqu	-23(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -23(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_7bytes):
+	mov	-7(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	%edx, -7(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_134bytes):
+	lddqu	-134(%rsi), %xmm0
+	movdqu	%xmm0, -134(%rdi)
+L(fwd_write_118bytes):
+	lddqu	-118(%rsi), %xmm0
+	movdqu	%xmm0, -118(%rdi)
+L(fwd_write_102bytes):
+	lddqu	-102(%rsi), %xmm0
+	movdqu	%xmm0, -102(%rdi)
+L(fwd_write_86bytes):
+	lddqu	-86(%rsi), %xmm0
+	movdqu	%xmm0, -86(%rdi)
+L(fwd_write_70bytes):
+	lddqu	-70(%rsi), %xmm0
+	movdqu	%xmm0, -70(%rdi)
+L(fwd_write_54bytes):
+	lddqu	-54(%rsi), %xmm0
+	movdqu	%xmm0, -54(%rdi)
+L(fwd_write_38bytes):
+	lddqu	-38(%rsi), %xmm0
+	movdqu	%xmm0, -38(%rdi)
+L(fwd_write_22bytes):
+	lddqu	-22(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -22(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_6bytes):
+	mov	-6(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	%edx, -6(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_133bytes):
+	lddqu	-133(%rsi), %xmm0
+	movdqu	%xmm0, -133(%rdi)
+L(fwd_write_117bytes):
+	lddqu	-117(%rsi), %xmm0
+	movdqu	%xmm0, -117(%rdi)
+L(fwd_write_101bytes):
+	lddqu	-101(%rsi), %xmm0
+	movdqu	%xmm0, -101(%rdi)
+L(fwd_write_85bytes):
+	lddqu	-85(%rsi), %xmm0
+	movdqu	%xmm0, -85(%rdi)
+L(fwd_write_69bytes):
+	lddqu	-69(%rsi), %xmm0
+	movdqu	%xmm0, -69(%rdi)
+L(fwd_write_53bytes):
+	lddqu	-53(%rsi), %xmm0
+	movdqu	%xmm0, -53(%rdi)
+L(fwd_write_37bytes):
+	lddqu	-37(%rsi), %xmm0
+	movdqu	%xmm0, -37(%rdi)
+L(fwd_write_21bytes):
+	lddqu	-21(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -21(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_5bytes):
+	mov	-5(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	%edx, -5(%rdi)
+	mov	%ecx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_132bytes):
+	lddqu	-132(%rsi), %xmm0
+	movdqu	%xmm0, -132(%rdi)
+L(fwd_write_116bytes):
+	lddqu	-116(%rsi), %xmm0
+	movdqu	%xmm0, -116(%rdi)
+L(fwd_write_100bytes):
+	lddqu	-100(%rsi), %xmm0
+	movdqu	%xmm0, -100(%rdi)
+L(fwd_write_84bytes):
+	lddqu	-84(%rsi), %xmm0
+	movdqu	%xmm0, -84(%rdi)
+L(fwd_write_68bytes):
+	lddqu	-68(%rsi), %xmm0
+	movdqu	%xmm0, -68(%rdi)
+L(fwd_write_52bytes):
+	lddqu	-52(%rsi), %xmm0
+	movdqu	%xmm0, -52(%rdi)
+L(fwd_write_36bytes):
+	lddqu	-36(%rsi), %xmm0
+	movdqu	%xmm0, -36(%rdi)
+L(fwd_write_20bytes):
+	lddqu	-20(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -20(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_4bytes):
+	mov	-4(%rsi), %edx
+	mov	%edx, -4(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_131bytes):
+	lddqu	-131(%rsi), %xmm0
+	movdqu	%xmm0, -131(%rdi)
+L(fwd_write_115bytes):
+	lddqu	-115(%rsi), %xmm0
+	movdqu	%xmm0, -115(%rdi)
+L(fwd_write_99bytes):
+	lddqu	-99(%rsi), %xmm0
+	movdqu	%xmm0, -99(%rdi)
+L(fwd_write_83bytes):
+	lddqu	-83(%rsi), %xmm0
+	movdqu	%xmm0, -83(%rdi)
+L(fwd_write_67bytes):
+	lddqu	-67(%rsi), %xmm0
+	movdqu	%xmm0, -67(%rdi)
+L(fwd_write_51bytes):
+	lddqu	-51(%rsi), %xmm0
+	movdqu	%xmm0, -51(%rdi)
+L(fwd_write_35bytes):
+	lddqu	-35(%rsi), %xmm0
+	movdqu	%xmm0, -35(%rdi)
+L(fwd_write_19bytes):
+	lddqu	-19(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -19(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_3bytes):
+	mov	-3(%rsi), %dx
+	mov	-2(%rsi), %cx
+	mov	%dx, -3(%rdi)
+	mov	%cx, -2(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_130bytes):
+	lddqu	-130(%rsi), %xmm0
+	movdqu	%xmm0, -130(%rdi)
+L(fwd_write_114bytes):
+	lddqu	-114(%rsi), %xmm0
+	movdqu	%xmm0, -114(%rdi)
+L(fwd_write_98bytes):
+	lddqu	-98(%rsi), %xmm0
+	movdqu	%xmm0, -98(%rdi)
+L(fwd_write_82bytes):
+	lddqu	-82(%rsi), %xmm0
+	movdqu	%xmm0, -82(%rdi)
+L(fwd_write_66bytes):
+	lddqu	-66(%rsi), %xmm0
+	movdqu	%xmm0, -66(%rdi)
+L(fwd_write_50bytes):
+	lddqu	-50(%rsi), %xmm0
+	movdqu	%xmm0, -50(%rdi)
+L(fwd_write_34bytes):
+	lddqu	-34(%rsi), %xmm0
+	movdqu	%xmm0, -34(%rdi)
+L(fwd_write_18bytes):
+	lddqu	-18(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -18(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_2bytes):
+	movzwl	-2(%rsi), %edx
+	mov	%dx, -2(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_129bytes):
+	lddqu	-129(%rsi), %xmm0
+	movdqu	%xmm0, -129(%rdi)
+L(fwd_write_113bytes):
+	lddqu	-113(%rsi), %xmm0
+	movdqu	%xmm0, -113(%rdi)
+L(fwd_write_97bytes):
+	lddqu	-97(%rsi), %xmm0
+	movdqu	%xmm0, -97(%rdi)
+L(fwd_write_81bytes):
+	lddqu	-81(%rsi), %xmm0
+	movdqu	%xmm0, -81(%rdi)
+L(fwd_write_65bytes):
+	lddqu	-65(%rsi), %xmm0
+	movdqu	%xmm0, -65(%rdi)
+L(fwd_write_49bytes):
+	lddqu	-49(%rsi), %xmm0
+	movdqu	%xmm0, -49(%rdi)
+L(fwd_write_33bytes):
+	lddqu	-33(%rsi), %xmm0
+	movdqu	%xmm0, -33(%rdi)
+L(fwd_write_17bytes):
+	lddqu	-17(%rsi), %xmm0
+	lddqu	-16(%rsi), %xmm1
+	movdqu	%xmm0, -17(%rdi)
+	movdqu	%xmm1, -16(%rdi)
+	ret
+
+	.p2align 4
+L(fwd_write_1bytes):
+	movzbl	-1(%rsi), %edx
+	mov	%dl, -1(%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_128bytes):
+	lddqu	112(%rsi), %xmm0
+	movdqu	%xmm0, 112(%rdi)
+L(bwd_write_112bytes):
+	lddqu	96(%rsi), %xmm0
+	movdqu	%xmm0, 96(%rdi)
+L(bwd_write_96bytes):
+	lddqu	80(%rsi), %xmm0
+	movdqu	%xmm0, 80(%rdi)
+L(bwd_write_80bytes):
+	lddqu	64(%rsi), %xmm0
+	movdqu	%xmm0, 64(%rdi)
+L(bwd_write_64bytes):
+	lddqu	48(%rsi), %xmm0
+	movdqu	%xmm0, 48(%rdi)
+L(bwd_write_48bytes):
+	lddqu	32(%rsi), %xmm0
+	movdqu	%xmm0, 32(%rdi)
+L(bwd_write_32bytes):
+	lddqu	16(%rsi), %xmm0
+	movdqu	%xmm0, 16(%rdi)
+L(bwd_write_16bytes):
+	lddqu	(%rsi), %xmm0
+	movdqu	%xmm0, (%rdi)
+L(bwd_write_0bytes):
+	ret
+
+	.p2align 4
+L(bwd_write_143bytes):
+	lddqu	127(%rsi), %xmm0
+	movdqu	%xmm0, 127(%rdi)
+L(bwd_write_127bytes):
+	lddqu	111(%rsi), %xmm0
+	movdqu	%xmm0, 111(%rdi)
+L(bwd_write_111bytes):
+	lddqu	95(%rsi), %xmm0
+	movdqu	%xmm0, 95(%rdi)
+L(bwd_write_95bytes):
+	lddqu	79(%rsi), %xmm0
+	movdqu	%xmm0, 79(%rdi)
+L(bwd_write_79bytes):
+	lddqu	63(%rsi), %xmm0
+	movdqu	%xmm0, 63(%rdi)
+L(bwd_write_63bytes):
+	lddqu	47(%rsi), %xmm0
+	movdqu	%xmm0, 47(%rdi)
+L(bwd_write_47bytes):
+	lddqu	31(%rsi), %xmm0
+	movdqu	%xmm0, 31(%rdi)
+L(bwd_write_31bytes):
+	lddqu	15(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 15(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+
+	.p2align 4
+L(bwd_write_15bytes):
+	mov	7(%rsi), %rdx
+	mov	(%rsi), %rcx
+	mov	%rdx, 7(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_142bytes):
+	lddqu	126(%rsi), %xmm0
+	movdqu	%xmm0, 126(%rdi)
+L(bwd_write_126bytes):
+	lddqu	110(%rsi), %xmm0
+	movdqu	%xmm0, 110(%rdi)
+L(bwd_write_110bytes):
+	lddqu	94(%rsi), %xmm0
+	movdqu	%xmm0, 94(%rdi)
+L(bwd_write_94bytes):
+	lddqu	78(%rsi), %xmm0
+	movdqu	%xmm0, 78(%rdi)
+L(bwd_write_78bytes):
+	lddqu	62(%rsi), %xmm0
+	movdqu	%xmm0, 62(%rdi)
+L(bwd_write_62bytes):
+	lddqu	46(%rsi), %xmm0
+	movdqu	%xmm0, 46(%rdi)
+L(bwd_write_46bytes):
+	lddqu	30(%rsi), %xmm0
+	movdqu	%xmm0, 30(%rdi)
+L(bwd_write_30bytes):
+	lddqu	14(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 14(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_14bytes):
+	mov	6(%rsi), %rdx
+	mov	(%rsi), %rcx
+	mov	%rdx, 6(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_141bytes):
+	lddqu	125(%rsi), %xmm0
+	movdqu	%xmm0, 125(%rdi)
+L(bwd_write_125bytes):
+	lddqu	109(%rsi), %xmm0
+	movdqu	%xmm0, 109(%rdi)
+L(bwd_write_109bytes):
+	lddqu	93(%rsi), %xmm0
+	movdqu	%xmm0, 93(%rdi)
+L(bwd_write_93bytes):
+	lddqu	77(%rsi), %xmm0
+	movdqu	%xmm0, 77(%rdi)
+L(bwd_write_77bytes):
+	lddqu	61(%rsi), %xmm0
+	movdqu	%xmm0, 61(%rdi)
+L(bwd_write_61bytes):
+	lddqu	45(%rsi), %xmm0
+	movdqu	%xmm0, 45(%rdi)
+L(bwd_write_45bytes):
+	lddqu	29(%rsi), %xmm0
+	movdqu	%xmm0, 29(%rdi)
+L(bwd_write_29bytes):
+	lddqu	13(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 13(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_13bytes):
+	mov	5(%rsi), %rdx
+	mov	(%rsi), %rcx
+	mov	%rdx, 5(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_140bytes):
+	lddqu	124(%rsi), %xmm0
+	movdqu	%xmm0, 124(%rdi)
+L(bwd_write_124bytes):
+	lddqu	108(%rsi), %xmm0
+	movdqu	%xmm0, 108(%rdi)
+L(bwd_write_108bytes):
+	lddqu	92(%rsi), %xmm0
+	movdqu	%xmm0, 92(%rdi)
+L(bwd_write_92bytes):
+	lddqu	76(%rsi), %xmm0
+	movdqu	%xmm0, 76(%rdi)
+L(bwd_write_76bytes):
+	lddqu	60(%rsi), %xmm0
+	movdqu	%xmm0, 60(%rdi)
+L(bwd_write_60bytes):
+	lddqu	44(%rsi), %xmm0
+	movdqu	%xmm0, 44(%rdi)
+L(bwd_write_44bytes):
+	lddqu	28(%rsi), %xmm0
+	movdqu	%xmm0, 28(%rdi)
+L(bwd_write_28bytes):
+	lddqu	12(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 12(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_12bytes):
+	mov	4(%rsi), %rdx
+	mov	(%rsi), %rcx
+	mov	%rdx, 4(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_139bytes):
+	lddqu	123(%rsi), %xmm0
+	movdqu	%xmm0, 123(%rdi)
+L(bwd_write_123bytes):
+	lddqu	107(%rsi), %xmm0
+	movdqu	%xmm0, 107(%rdi)
+L(bwd_write_107bytes):
+	lddqu	91(%rsi), %xmm0
+	movdqu	%xmm0, 91(%rdi)
+L(bwd_write_91bytes):
+	lddqu	75(%rsi), %xmm0
+	movdqu	%xmm0, 75(%rdi)
+L(bwd_write_75bytes):
+	lddqu	59(%rsi), %xmm0
+	movdqu	%xmm0, 59(%rdi)
+L(bwd_write_59bytes):
+	lddqu	43(%rsi), %xmm0
+	movdqu	%xmm0, 43(%rdi)
+L(bwd_write_43bytes):
+	lddqu	27(%rsi), %xmm0
+	movdqu	%xmm0, 27(%rdi)
+L(bwd_write_27bytes):
+	lddqu	11(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 11(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_11bytes):
+	mov	3(%rsi), %rdx
+	mov	(%rsi), %rcx
+	mov	%rdx, 3(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_138bytes):
+	lddqu	122(%rsi), %xmm0
+	movdqu	%xmm0, 122(%rdi)
+L(bwd_write_122bytes):
+	lddqu	106(%rsi), %xmm0
+	movdqu	%xmm0, 106(%rdi)
+L(bwd_write_106bytes):
+	lddqu	90(%rsi), %xmm0
+	movdqu	%xmm0, 90(%rdi)
+L(bwd_write_90bytes):
+	lddqu	74(%rsi), %xmm0
+	movdqu	%xmm0, 74(%rdi)
+L(bwd_write_74bytes):
+	lddqu	58(%rsi), %xmm0
+	movdqu	%xmm0, 58(%rdi)
+L(bwd_write_58bytes):
+	lddqu	42(%rsi), %xmm0
+	movdqu	%xmm0, 42(%rdi)
+L(bwd_write_42bytes):
+	lddqu	26(%rsi), %xmm0
+	movdqu	%xmm0, 26(%rdi)
+L(bwd_write_26bytes):
+	lddqu	10(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 10(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_10bytes):
+	mov	2(%rsi), %rdx
+	mov	(%rsi), %rcx
+	mov	%rdx, 2(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_137bytes):
+	lddqu	121(%rsi), %xmm0
+	movdqu	%xmm0, 121(%rdi)
+L(bwd_write_121bytes):
+	lddqu	105(%rsi), %xmm0
+	movdqu	%xmm0, 105(%rdi)
+L(bwd_write_105bytes):
+	lddqu	89(%rsi), %xmm0
+	movdqu	%xmm0, 89(%rdi)
+L(bwd_write_89bytes):
+	lddqu	73(%rsi), %xmm0
+	movdqu	%xmm0, 73(%rdi)
+L(bwd_write_73bytes):
+	lddqu	57(%rsi), %xmm0
+	movdqu	%xmm0, 57(%rdi)
+L(bwd_write_57bytes):
+	lddqu	41(%rsi), %xmm0
+	movdqu	%xmm0, 41(%rdi)
+L(bwd_write_41bytes):
+	lddqu	25(%rsi), %xmm0
+	movdqu	%xmm0, 25(%rdi)
+L(bwd_write_25bytes):
+	lddqu	9(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 9(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_9bytes):
+	mov	1(%rsi), %rdx
+	mov	(%rsi), %rcx
+	mov	%rdx, 1(%rdi)
+	mov	%rcx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_136bytes):
+	lddqu	120(%rsi), %xmm0
+	movdqu	%xmm0, 120(%rdi)
+L(bwd_write_120bytes):
+	lddqu	104(%rsi), %xmm0
+	movdqu	%xmm0, 104(%rdi)
+L(bwd_write_104bytes):
+	lddqu	88(%rsi), %xmm0
+	movdqu	%xmm0, 88(%rdi)
+L(bwd_write_88bytes):
+	lddqu	72(%rsi), %xmm0
+	movdqu	%xmm0, 72(%rdi)
+L(bwd_write_72bytes):
+	lddqu	56(%rsi), %xmm0
+	movdqu	%xmm0, 56(%rdi)
+L(bwd_write_56bytes):
+	lddqu	40(%rsi), %xmm0
+	movdqu	%xmm0, 40(%rdi)
+L(bwd_write_40bytes):
+	lddqu	24(%rsi), %xmm0
+	movdqu	%xmm0, 24(%rdi)
+L(bwd_write_24bytes):
+	lddqu	8(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 8(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_8bytes):
+	mov	(%rsi), %rdx
+	mov	%rdx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_135bytes):
+	lddqu	119(%rsi), %xmm0
+	movdqu	%xmm0, 119(%rdi)
+L(bwd_write_119bytes):
+	lddqu	103(%rsi), %xmm0
+	movdqu	%xmm0, 103(%rdi)
+L(bwd_write_103bytes):
+	lddqu	87(%rsi), %xmm0
+	movdqu	%xmm0, 87(%rdi)
+L(bwd_write_87bytes):
+	lddqu	71(%rsi), %xmm0
+	movdqu	%xmm0, 71(%rdi)
+L(bwd_write_71bytes):
+	lddqu	55(%rsi), %xmm0
+	movdqu	%xmm0, 55(%rdi)
+L(bwd_write_55bytes):
+	lddqu	39(%rsi), %xmm0
+	movdqu	%xmm0, 39(%rdi)
+L(bwd_write_39bytes):
+	lddqu	23(%rsi), %xmm0
+	movdqu	%xmm0, 23(%rdi)
+L(bwd_write_23bytes):
+	lddqu	7(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 7(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_7bytes):
+	mov	3(%rsi), %edx
+	mov	(%rsi), %ecx
+	mov	%edx, 3(%rdi)
+	mov	%ecx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_134bytes):
+	lddqu	118(%rsi), %xmm0
+	movdqu	%xmm0, 118(%rdi)
+L(bwd_write_118bytes):
+	lddqu	102(%rsi), %xmm0
+	movdqu	%xmm0, 102(%rdi)
+L(bwd_write_102bytes):
+	lddqu	86(%rsi), %xmm0
+	movdqu	%xmm0, 86(%rdi)
+L(bwd_write_86bytes):
+	lddqu	70(%rsi), %xmm0
+	movdqu	%xmm0, 70(%rdi)
+L(bwd_write_70bytes):
+	lddqu	54(%rsi), %xmm0
+	movdqu	%xmm0, 54(%rdi)
+L(bwd_write_54bytes):
+	lddqu	38(%rsi), %xmm0
+	movdqu	%xmm0, 38(%rdi)
+L(bwd_write_38bytes):
+	lddqu	22(%rsi), %xmm0
+	movdqu	%xmm0, 22(%rdi)
+L(bwd_write_22bytes):
+	lddqu	6(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 6(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_6bytes):
+	mov	2(%rsi), %edx
+	mov	(%rsi), %ecx
+	mov	%edx, 2(%rdi)
+	mov	%ecx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_133bytes):
+	lddqu	117(%rsi), %xmm0
+	movdqu	%xmm0, 117(%rdi)
+L(bwd_write_117bytes):
+	lddqu	101(%rsi), %xmm0
+	movdqu	%xmm0, 101(%rdi)
+L(bwd_write_101bytes):
+	lddqu	85(%rsi), %xmm0
+	movdqu	%xmm0, 85(%rdi)
+L(bwd_write_85bytes):
+	lddqu	69(%rsi), %xmm0
+	movdqu	%xmm0, 69(%rdi)
+L(bwd_write_69bytes):
+	lddqu	53(%rsi), %xmm0
+	movdqu	%xmm0, 53(%rdi)
+L(bwd_write_53bytes):
+	lddqu	37(%rsi), %xmm0
+	movdqu	%xmm0, 37(%rdi)
+L(bwd_write_37bytes):
+	lddqu	21(%rsi), %xmm0
+	movdqu	%xmm0, 21(%rdi)
+L(bwd_write_21bytes):
+	lddqu	5(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 5(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_5bytes):
+	mov	1(%rsi), %edx
+	mov	(%rsi), %ecx
+	mov	%edx, 1(%rdi)
+	mov	%ecx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_132bytes):
+	lddqu	116(%rsi), %xmm0
+	movdqu	%xmm0, 116(%rdi)
+L(bwd_write_116bytes):
+	lddqu	100(%rsi), %xmm0
+	movdqu	%xmm0, 100(%rdi)
+L(bwd_write_100bytes):
+	lddqu	84(%rsi), %xmm0
+	movdqu	%xmm0, 84(%rdi)
+L(bwd_write_84bytes):
+	lddqu	68(%rsi), %xmm0
+	movdqu	%xmm0, 68(%rdi)
+L(bwd_write_68bytes):
+	lddqu	52(%rsi), %xmm0
+	movdqu	%xmm0, 52(%rdi)
+L(bwd_write_52bytes):
+	lddqu	36(%rsi), %xmm0
+	movdqu	%xmm0, 36(%rdi)
+L(bwd_write_36bytes):
+	lddqu	20(%rsi), %xmm0
+	movdqu	%xmm0, 20(%rdi)
+L(bwd_write_20bytes):
+	lddqu	4(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 4(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_4bytes):
+	mov	(%rsi), %edx
+	mov	%edx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_131bytes):
+	lddqu	115(%rsi), %xmm0
+	movdqu	%xmm0, 115(%rdi)
+L(bwd_write_115bytes):
+	lddqu	99(%rsi), %xmm0
+	movdqu	%xmm0, 99(%rdi)
+L(bwd_write_99bytes):
+	lddqu	83(%rsi), %xmm0
+	movdqu	%xmm0, 83(%rdi)
+L(bwd_write_83bytes):
+	lddqu	67(%rsi), %xmm0
+	movdqu	%xmm0, 67(%rdi)
+L(bwd_write_67bytes):
+	lddqu	51(%rsi), %xmm0
+	movdqu	%xmm0, 51(%rdi)
+L(bwd_write_51bytes):
+	lddqu	35(%rsi), %xmm0
+	movdqu	%xmm0, 35(%rdi)
+L(bwd_write_35bytes):
+	lddqu	19(%rsi), %xmm0
+	movdqu	%xmm0, 19(%rdi)
+L(bwd_write_19bytes):
+	lddqu	3(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 3(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_3bytes):
+	mov	1(%rsi), %dx
+	mov	(%rsi), %cx
+	mov	%dx, 1(%rdi)
+	mov	%cx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_130bytes):
+	lddqu	114(%rsi), %xmm0
+	movdqu	%xmm0, 114(%rdi)
+L(bwd_write_114bytes):
+	lddqu	98(%rsi), %xmm0
+	movdqu	%xmm0, 98(%rdi)
+L(bwd_write_98bytes):
+	lddqu	82(%rsi), %xmm0
+	movdqu	%xmm0, 82(%rdi)
+L(bwd_write_82bytes):
+	lddqu	66(%rsi), %xmm0
+	movdqu	%xmm0, 66(%rdi)
+L(bwd_write_66bytes):
+	lddqu	50(%rsi), %xmm0
+	movdqu	%xmm0, 50(%rdi)
+L(bwd_write_50bytes):
+	lddqu	34(%rsi), %xmm0
+	movdqu	%xmm0, 34(%rdi)
+L(bwd_write_34bytes):
+	lddqu	18(%rsi), %xmm0
+	movdqu	%xmm0, 18(%rdi)
+L(bwd_write_18bytes):
+	lddqu	2(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 2(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_2bytes):
+	movzwl	(%rsi), %edx
+	mov	%dx, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_129bytes):
+	lddqu	113(%rsi), %xmm0
+	movdqu	%xmm0, 113(%rdi)
+L(bwd_write_113bytes):
+	lddqu	97(%rsi), %xmm0
+	movdqu	%xmm0, 97(%rdi)
+L(bwd_write_97bytes):
+	lddqu	81(%rsi), %xmm0
+	movdqu	%xmm0, 81(%rdi)
+L(bwd_write_81bytes):
+	lddqu	65(%rsi), %xmm0
+	movdqu	%xmm0, 65(%rdi)
+L(bwd_write_65bytes):
+	lddqu	49(%rsi), %xmm0
+	movdqu	%xmm0, 49(%rdi)
+L(bwd_write_49bytes):
+	lddqu	33(%rsi), %xmm0
+	movdqu	%xmm0, 33(%rdi)
+L(bwd_write_33bytes):
+	lddqu	17(%rsi), %xmm0
+	movdqu	%xmm0, 17(%rdi)
+L(bwd_write_17bytes):
+	lddqu	1(%rsi), %xmm0
+	lddqu	(%rsi), %xmm1
+	movdqu	%xmm0, 1(%rdi)
+	movdqu	%xmm1, (%rdi)
+	ret
+
+	.p2align 4
+L(bwd_write_1bytes):
+	movzbl	(%rsi), %edx
+	mov	%dl, (%rdi)
+	ret
+
+END (MEMCPY)
+
+	.section .rodata.ssse3,"a",@progbits
+	ALIGN (3)
+L(table_144_bytes_bwd):
+	.int	JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
+	.int	JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
+
+	ALIGN (3)
+L(table_144_bytes_fwd):
+	.int	JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
+	.int	JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
+
+	ALIGN (3)
+L(shl_table_fwd):
+	.int	JMPTBL (L(shl_0), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_1), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_2), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_3), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_4), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_5), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_6), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_7), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_8), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_9), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_10), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_11), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_12), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_13), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_14), L(shl_table_fwd))
+	.int	JMPTBL (L(shl_15), L(shl_table_fwd))
+
+	ALIGN (3)
+L(shl_table_bwd):
+	.int	JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
new file mode 100644
index 0000000..9a878d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -0,0 +1,3139 @@
+/* memcpy with SSSE3
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+        || defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY		__memcpy_ssse3
+# define MEMCPY_CHK	__memcpy_chk_ssse3
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#define JMPTBL(I, B)	I - B
+
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+   relative offsets.  INDEX is a register contains the index into the
+   jump table.  SCALE is the scale of INDEX.  */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+  lea		TABLE(%rip), %r11;				\
+  movslq	(%r11, INDEX, SCALE), INDEX;			\
+  lea		(%r11, INDEX), INDEX;				\
+  jmp		*INDEX;						\
+  ud2
+
+	.section .text.ssse3,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc
+ENTRY (MEMCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+	mov	%rdi, %rax
+#ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%rsi, %rdi
+	jb	L(copy_forward)
+	je	L(write_0bytes)
+	cmp	$79, %rdx
+	jbe	L(copy_forward)
+	jmp	L(copy_backward)
+L(copy_forward):
+#endif
+	cmp	$79, %rdx
+	lea     L(table_less_80bytes)(%rip), %r11
+	ja	L(80bytesormore)
+	movslq	(%r11, %rdx, 4), %r9
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	add	%r11, %r9
+	jmp	*%r9
+	ud2
+
+	ALIGN (4)
+L(80bytesormore):
+#ifndef USE_AS_MEMMOVE
+	cmp	%dil, %sil
+	jle	L(copy_backward)
+#endif
+
+	movdqu	(%rsi), %xmm0
+	mov	%rdi, %rcx
+	and	$-16, %rdi
+	add	$16, %rdi
+	mov	%rcx, %r8
+	sub	%rdi, %rcx
+	add	%rcx, %rdx
+	sub	%rcx, %rsi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov	__x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+	cmp	%rcx, %rdx
+	mov	%rsi, %r9
+	ja	L(large_page_fwd)
+	and	$0xf, %r9
+	jz	L(shl_0)
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %rcx
+#else
+	mov	__x86_64_data_cache_size_half(%rip), %rcx
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
+
+	ALIGN (4)
+L(copy_backward):
+	movdqu	-16(%rsi, %rdx), %xmm0
+	add	%rdx, %rsi
+	lea	-16(%rdi, %rdx), %r8
+	add	%rdx, %rdi
+
+	mov	%rdi, %rcx
+	and	$0xf, %rcx
+	xor	%rcx, %rdi
+	sub	%rcx, %rdx
+	sub	%rcx, %rsi
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov	__x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+
+	cmp	%rcx, %rdx
+	mov	%rsi, %r9
+	ja	L(large_page_bwd)
+	and	$0xf, %r9
+	jz	L(shl_0_bwd)
+#ifdef DATA_CACHE_SIZE_HALF
+	mov	$DATA_CACHE_SIZE_HALF, %rcx
+#else
+	mov	__x86_64_data_cache_size_half(%rip), %rcx
+#endif
+	BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
+
+	ALIGN (4)
+L(shl_0):
+	sub	$16, %rdx
+	movdqa	(%rsi), %xmm1
+	add	$16, %rsi
+	movdqa	%xmm1, (%rdi)
+	add	$16, %rdi
+	cmp	$128, %rdx
+	movdqu	%xmm0, (%r8)
+	ja	L(shl_0_gobble)
+	cmp	$64, %rdx
+	jb	L(shl_0_less_64bytes)
+	movaps	(%rsi), %xmm4
+	movaps	16(%rsi), %xmm1
+	movaps	32(%rsi), %xmm2
+	movaps	48(%rsi), %xmm3
+	movaps	%xmm4, (%rdi)
+	movaps	%xmm1, 16(%rdi)
+	movaps	%xmm2, 32(%rdi)
+	movaps	%xmm3, 48(%rdi)
+	sub	$64, %rdx
+	add	$64, %rsi
+	add	$64, %rdi
+L(shl_0_less_64bytes):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_0_gobble):
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %rdx
+#else
+	cmp	__x86_64_data_cache_size_half(%rip), %rdx
+#endif
+	lea	-128(%rdx), %rdx
+	jae	L(shl_0_gobble_mem_loop)
+L(shl_0_gobble_cache_loop):
+	movdqa	(%rsi), %xmm4
+	movaps	0x10(%rsi), %xmm1
+	movaps	0x20(%rsi), %xmm2
+	movaps	0x30(%rsi), %xmm3
+
+	movdqa	%xmm4, (%rdi)
+	movaps	%xmm1, 0x10(%rdi)
+	movaps	%xmm2, 0x20(%rdi)
+	movaps	%xmm3, 0x30(%rdi)
+
+	sub	$128, %rdx
+	movaps	0x40(%rsi), %xmm4
+	movaps	0x50(%rsi), %xmm5
+	movaps	0x60(%rsi), %xmm6
+	movaps	0x70(%rsi), %xmm7
+	lea	0x80(%rsi), %rsi
+	movaps	%xmm4, 0x40(%rdi)
+	movaps	%xmm5, 0x50(%rdi)
+	movaps	%xmm6, 0x60(%rdi)
+	movaps	%xmm7, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+
+	jae	L(shl_0_gobble_cache_loop)
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(shl_0_cache_less_64bytes)
+
+	movdqa	(%rsi), %xmm4
+	sub	$0x40, %rdx
+	movdqa	0x10(%rsi), %xmm1
+
+	movdqa	%xmm4, (%rdi)
+	movdqa	%xmm1, 0x10(%rdi)
+
+	movdqa	0x20(%rsi), %xmm4
+	movdqa	0x30(%rsi), %xmm1
+	add	$0x40, %rsi
+
+	movdqa	%xmm4, 0x20(%rdi)
+	movdqa	%xmm1, 0x30(%rdi)
+	add	$0x40, %rdi
+L(shl_0_cache_less_64bytes):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_0_gobble_mem_loop):
+	prefetcht0 0x1c0(%rsi)
+	prefetcht0 0x280(%rsi)
+
+	movdqa	(%rsi), %xmm0
+	movdqa	0x10(%rsi), %xmm1
+	movdqa	0x20(%rsi), %xmm2
+	movdqa	0x30(%rsi), %xmm3
+	movdqa	0x40(%rsi), %xmm4
+	movdqa	0x50(%rsi), %xmm5
+	movdqa	0x60(%rsi), %xmm6
+	movdqa	0x70(%rsi), %xmm7
+	lea	0x80(%rsi), %rsi
+	sub	$0x80, %rdx
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm1, 0x10(%rdi)
+	movdqa	%xmm2, 0x20(%rdi)
+	movdqa	%xmm3, 0x30(%rdi)
+	movdqa	%xmm4, 0x40(%rdi)
+	movdqa	%xmm5, 0x50(%rdi)
+	movdqa	%xmm6, 0x60(%rdi)
+	movdqa	%xmm7, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+
+	jae	L(shl_0_gobble_mem_loop)
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(shl_0_mem_less_64bytes)
+
+	movdqa	(%rsi), %xmm0
+	sub	$0x40, %rdx
+	movdqa	0x10(%rsi), %xmm1
+
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm1, 0x10(%rdi)
+
+	movdqa	0x20(%rsi), %xmm0
+	movdqa	0x30(%rsi), %xmm1
+	add	$0x40, %rsi
+
+	movdqa	%xmm0, 0x20(%rdi)
+	movdqa	%xmm1, 0x30(%rdi)
+	add	$0x40, %rdi
+L(shl_0_mem_less_64bytes):
+	cmp	$0x20, %rdx
+	jb	L(shl_0_mem_less_32bytes)
+	movdqa	(%rsi), %xmm0
+	sub	$0x20, %rdx
+	movdqa	0x10(%rsi), %xmm1
+	add	$0x20, %rsi
+	movdqa	%xmm0, (%rdi)
+	movdqa	%xmm1, 0x10(%rdi)
+	add	$0x20, %rdi
+L(shl_0_mem_less_32bytes):
+	add	%rdx, %rdi
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_0_bwd):
+	sub	$16, %rdx
+	movdqa	-0x10(%rsi), %xmm1
+	sub	$16, %rsi
+	movdqa	%xmm1, -0x10(%rdi)
+	sub	$16, %rdi
+	cmp	$0x80, %rdx
+	movdqu	%xmm0, (%r8)
+	ja	L(shl_0_gobble_bwd)
+	cmp	$64, %rdx
+	jb	L(shl_0_less_64bytes_bwd)
+	movaps	-0x10(%rsi), %xmm0
+	movaps	-0x20(%rsi), %xmm1
+	movaps	-0x30(%rsi), %xmm2
+	movaps	-0x40(%rsi), %xmm3
+	movaps	%xmm0, -0x10(%rdi)
+	movaps	%xmm1, -0x20(%rdi)
+	movaps	%xmm2, -0x30(%rdi)
+	movaps	%xmm3, -0x40(%rdi)
+	sub	$64, %rdx
+	sub	$0x40, %rsi
+	sub	$0x40, %rdi
+L(shl_0_less_64bytes_bwd):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_0_gobble_bwd):
+#ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %rdx
+#else
+	cmp	__x86_64_data_cache_size_half(%rip), %rdx
+#endif
+	lea	-128(%rdx), %rdx
+	jae	L(shl_0_gobble_mem_bwd_loop)
+L(shl_0_gobble_bwd_loop):
+	movdqa	-0x10(%rsi), %xmm0
+	movaps	-0x20(%rsi), %xmm1
+	movaps	-0x30(%rsi), %xmm2
+	movaps	-0x40(%rsi), %xmm3
+
+	movdqa	%xmm0, -0x10(%rdi)
+	movaps	%xmm1, -0x20(%rdi)
+	movaps	%xmm2, -0x30(%rdi)
+	movaps	%xmm3, -0x40(%rdi)
+
+	sub	$0x80, %rdx
+	movaps	-0x50(%rsi), %xmm4
+	movaps	-0x60(%rsi), %xmm5
+	movaps	-0x70(%rsi), %xmm6
+	movaps	-0x80(%rsi), %xmm7
+	lea	-0x80(%rsi), %rsi
+	movaps	%xmm4, -0x50(%rdi)
+	movaps	%xmm5, -0x60(%rdi)
+	movaps	%xmm6, -0x70(%rdi)
+	movaps	%xmm7, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+
+	jae	L(shl_0_gobble_bwd_loop)
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(shl_0_gobble_bwd_less_64bytes)
+
+	movdqa	-0x10(%rsi), %xmm0
+	sub	$0x40, %rdx
+	movdqa	-0x20(%rsi), %xmm1
+
+	movdqa	%xmm0, -0x10(%rdi)
+	movdqa	%xmm1, -0x20(%rdi)
+
+	movdqa	-0x30(%rsi), %xmm0
+	movdqa	-0x40(%rsi), %xmm1
+	sub	$0x40, %rsi
+
+	movdqa	%xmm0, -0x30(%rdi)
+	movdqa	%xmm1, -0x40(%rdi)
+	sub	$0x40, %rdi
+L(shl_0_gobble_bwd_less_64bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_0_gobble_mem_bwd_loop):
+	prefetcht0 -0x1c0(%rsi)
+	prefetcht0 -0x280(%rsi)
+	movdqa	-0x10(%rsi), %xmm0
+	movdqa	-0x20(%rsi), %xmm1
+	movdqa	-0x30(%rsi), %xmm2
+	movdqa	-0x40(%rsi), %xmm3
+	movdqa	-0x50(%rsi), %xmm4
+	movdqa	-0x60(%rsi), %xmm5
+	movdqa	-0x70(%rsi), %xmm6
+	movdqa	-0x80(%rsi), %xmm7
+	lea	-0x80(%rsi), %rsi
+	sub	$0x80, %rdx
+	movdqa	%xmm0, -0x10(%rdi)
+	movdqa	%xmm1, -0x20(%rdi)
+	movdqa	%xmm2, -0x30(%rdi)
+	movdqa	%xmm3, -0x40(%rdi)
+	movdqa	%xmm4, -0x50(%rdi)
+	movdqa	%xmm5, -0x60(%rdi)
+	movdqa	%xmm6, -0x70(%rdi)
+	movdqa	%xmm7, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+
+	jae	L(shl_0_gobble_mem_bwd_loop)
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(shl_0_mem_bwd_less_64bytes)
+
+	movdqa	-0x10(%rsi), %xmm0
+	sub	$0x40, %rdx
+	movdqa	-0x20(%rsi), %xmm1
+
+	movdqa	%xmm0, -0x10(%rdi)
+	movdqa	%xmm1, -0x20(%rdi)
+
+	movdqa	-0x30(%rsi), %xmm0
+	movdqa	-0x40(%rsi), %xmm1
+	sub	$0x40, %rsi
+
+	movdqa	%xmm0, -0x30(%rdi)
+	movdqa	%xmm1, -0x40(%rdi)
+	sub	$0x40, %rdi
+L(shl_0_mem_bwd_less_64bytes):
+	cmp	$0x20, %rdx
+	jb	L(shl_0_mem_bwd_less_32bytes)
+	movdqa	-0x10(%rsi), %xmm0
+	sub	$0x20, %rdx
+	movdqa	-0x20(%rsi), %xmm1
+	sub	$0x20, %rsi
+	movdqa	%xmm0, -0x10(%rdi)
+	movdqa	%xmm1, -0x20(%rdi)
+	sub	$0x20, %rdi
+L(shl_0_mem_bwd_less_32bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_1):
+	lea	(L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x01(%rsi), %xmm1
+	jb	L(L1_fwd)
+	lea	(L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
+L(L1_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_1_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_1_loop_L1):
+	sub	$64, %rdx
+	movaps	0x0f(%rsi), %xmm2
+	movaps	0x1f(%rsi), %xmm3
+	movaps	0x2f(%rsi), %xmm4
+	movaps	0x3f(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$1, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$1, %xmm3, %xmm4
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$1, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_1_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_1_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_1_bwd):
+	lea	(L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x01(%rsi), %xmm1
+	jb	L(L1_bwd)
+	lea	(L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
+L(L1_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_1_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_1_bwd_loop_L1):
+	movaps	-0x11(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x21(%rsi), %xmm3
+	movaps	-0x31(%rsi), %xmm4
+	movaps	-0x41(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$1, %xmm2, %xmm1
+	palignr	$1, %xmm3, %xmm2
+	palignr	$1, %xmm4, %xmm3
+	palignr	$1, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_1_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_1_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_2):
+	lea	(L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x02(%rsi), %xmm1
+	jb	L(L2_fwd)
+	lea	(L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
+L(L2_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_2_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_2_loop_L1):
+	sub	$64, %rdx
+	movaps	0x0e(%rsi), %xmm2
+	movaps	0x1e(%rsi), %xmm3
+	movaps	0x2e(%rsi), %xmm4
+	movaps	0x3e(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$2, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$2, %xmm3, %xmm4
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$2, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_2_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_2_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_2_bwd):
+	lea	(L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x02(%rsi), %xmm1
+	jb	L(L2_bwd)
+	lea	(L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
+L(L2_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_2_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_2_bwd_loop_L1):
+	movaps	-0x12(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x22(%rsi), %xmm3
+	movaps	-0x32(%rsi), %xmm4
+	movaps	-0x42(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$2, %xmm2, %xmm1
+	palignr	$2, %xmm3, %xmm2
+	palignr	$2, %xmm4, %xmm3
+	palignr	$2, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_2_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_2_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_3):
+	lea	(L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x03(%rsi), %xmm1
+	jb	L(L3_fwd)
+	lea	(L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
+L(L3_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_3_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_3_loop_L1):
+	sub	$64, %rdx
+	movaps	0x0d(%rsi), %xmm2
+	movaps	0x1d(%rsi), %xmm3
+	movaps	0x2d(%rsi), %xmm4
+	movaps	0x3d(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$3, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$3, %xmm3, %xmm4
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$3, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_3_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_3_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_3_bwd):
+	lea	(L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x03(%rsi), %xmm1
+	jb	L(L3_bwd)
+	lea	(L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
+L(L3_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_3_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_3_bwd_loop_L1):
+	movaps	-0x13(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x23(%rsi), %xmm3
+	movaps	-0x33(%rsi), %xmm4
+	movaps	-0x43(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$3, %xmm2, %xmm1
+	palignr	$3, %xmm3, %xmm2
+	palignr	$3, %xmm4, %xmm3
+	palignr	$3, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_3_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_3_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_4):
+	lea	(L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x04(%rsi), %xmm1
+	jb	L(L4_fwd)
+	lea	(L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
+L(L4_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_4_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_4_loop_L1):
+	sub	$64, %rdx
+	movaps	0x0c(%rsi), %xmm2
+	movaps	0x1c(%rsi), %xmm3
+	movaps	0x2c(%rsi), %xmm4
+	movaps	0x3c(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$4, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$4, %xmm3, %xmm4
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$4, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_4_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_4_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_4_bwd):
+	lea	(L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x04(%rsi), %xmm1
+	jb	L(L4_bwd)
+	lea	(L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
+L(L4_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_4_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_4_bwd_loop_L1):
+	movaps	-0x14(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x24(%rsi), %xmm3
+	movaps	-0x34(%rsi), %xmm4
+	movaps	-0x44(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$4, %xmm2, %xmm1
+	palignr	$4, %xmm3, %xmm2
+	palignr	$4, %xmm4, %xmm3
+	palignr	$4, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_4_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_4_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_5):
+	lea	(L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x05(%rsi), %xmm1
+	jb	L(L5_fwd)
+	lea	(L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
+L(L5_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_5_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_5_loop_L1):
+	sub	$64, %rdx
+	movaps	0x0b(%rsi), %xmm2
+	movaps	0x1b(%rsi), %xmm3
+	movaps	0x2b(%rsi), %xmm4
+	movaps	0x3b(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$5, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$5, %xmm3, %xmm4
+	palignr	$5, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$5, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_5_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_5_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_5_bwd):
+	lea	(L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x05(%rsi), %xmm1
+	jb	L(L5_bwd)
+	lea	(L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
+L(L5_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_5_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_5_bwd_loop_L1):
+	movaps	-0x15(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x25(%rsi), %xmm3
+	movaps	-0x35(%rsi), %xmm4
+	movaps	-0x45(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$5, %xmm2, %xmm1
+	palignr	$5, %xmm3, %xmm2
+	palignr	$5, %xmm4, %xmm3
+	palignr	$5, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_5_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_5_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_6):
+	lea	(L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x06(%rsi), %xmm1
+	jb	L(L6_fwd)
+	lea	(L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
+L(L6_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_6_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_6_loop_L1):
+	sub	$64, %rdx
+	movaps	0x0a(%rsi), %xmm2
+	movaps	0x1a(%rsi), %xmm3
+	movaps	0x2a(%rsi), %xmm4
+	movaps	0x3a(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$6, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$6, %xmm3, %xmm4
+	palignr	$6, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$6, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_6_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_6_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_6_bwd):
+	lea	(L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x06(%rsi), %xmm1
+	jb	L(L6_bwd)
+	lea	(L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
+L(L6_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_6_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_6_bwd_loop_L1):
+	movaps	-0x16(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x26(%rsi), %xmm3
+	movaps	-0x36(%rsi), %xmm4
+	movaps	-0x46(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$6, %xmm2, %xmm1
+	palignr	$6, %xmm3, %xmm2
+	palignr	$6, %xmm4, %xmm3
+	palignr	$6, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_6_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_6_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_7):
+	lea	(L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x07(%rsi), %xmm1
+	jb	L(L7_fwd)
+	lea	(L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
+L(L7_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_7_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_7_loop_L1):
+	sub	$64, %rdx
+	movaps	0x09(%rsi), %xmm2
+	movaps	0x19(%rsi), %xmm3
+	movaps	0x29(%rsi), %xmm4
+	movaps	0x39(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$7, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$7, %xmm3, %xmm4
+	palignr	$7, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$7, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_7_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_7_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_7_bwd):
+	lea	(L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x07(%rsi), %xmm1
+	jb	L(L7_bwd)
+	lea	(L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
+L(L7_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_7_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_7_bwd_loop_L1):
+	movaps	-0x17(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x27(%rsi), %xmm3
+	movaps	-0x37(%rsi), %xmm4
+	movaps	-0x47(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$7, %xmm2, %xmm1
+	palignr	$7, %xmm3, %xmm2
+	palignr	$7, %xmm4, %xmm3
+	palignr	$7, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_7_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_7_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_8):
+	lea	(L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x08(%rsi), %xmm1
+	jb	L(L8_fwd)
+	lea	(L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
+L(L8_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+L(shl_8_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_8_loop_L1):
+	sub	$64, %rdx
+	movaps	0x08(%rsi), %xmm2
+	movaps	0x18(%rsi), %xmm3
+	movaps	0x28(%rsi), %xmm4
+	movaps	0x38(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$8, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$8, %xmm3, %xmm4
+	palignr	$8, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$8, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_8_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+	ALIGN (4)
+L(shl_8_end):
+	lea	64(%rdx), %rdx
+	movaps	%xmm4, -0x20(%rdi)
+	add	%rdx, %rsi
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_8_bwd):
+	lea	(L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x08(%rsi), %xmm1
+	jb	L(L8_bwd)
+	lea	(L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
+L(L8_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_8_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_8_bwd_loop_L1):
+	movaps	-0x18(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x28(%rsi), %xmm3
+	movaps	-0x38(%rsi), %xmm4
+	movaps	-0x48(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$8, %xmm2, %xmm1
+	palignr	$8, %xmm3, %xmm2
+	palignr	$8, %xmm4, %xmm3
+	palignr	$8, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_8_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_8_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_9):
+	lea	(L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x09(%rsi), %xmm1
+	jb	L(L9_fwd)
+	lea	(L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
+L(L9_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_9_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_9_loop_L1):
+	sub	$64, %rdx
+	movaps	0x07(%rsi), %xmm2
+	movaps	0x17(%rsi), %xmm3
+	movaps	0x27(%rsi), %xmm4
+	movaps	0x37(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$9, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$9, %xmm3, %xmm4
+	palignr	$9, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$9, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_9_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_9_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_9_bwd):
+	lea	(L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x09(%rsi), %xmm1
+	jb	L(L9_bwd)
+	lea	(L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
+L(L9_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_9_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_9_bwd_loop_L1):
+	movaps	-0x19(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x29(%rsi), %xmm3
+	movaps	-0x39(%rsi), %xmm4
+	movaps	-0x49(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$9, %xmm2, %xmm1
+	palignr	$9, %xmm3, %xmm2
+	palignr	$9, %xmm4, %xmm3
+	palignr	$9, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_9_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_9_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_10):
+	lea	(L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0a(%rsi), %xmm1
+	jb	L(L10_fwd)
+	lea	(L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
+L(L10_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_10_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_10_loop_L1):
+	sub	$64, %rdx
+	movaps	0x06(%rsi), %xmm2
+	movaps	0x16(%rsi), %xmm3
+	movaps	0x26(%rsi), %xmm4
+	movaps	0x36(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$10, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$10, %xmm3, %xmm4
+	palignr	$10, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$10, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_10_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_10_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_10_bwd):
+	lea	(L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0a(%rsi), %xmm1
+	jb	L(L10_bwd)
+	lea	(L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
+L(L10_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_10_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_10_bwd_loop_L1):
+	movaps	-0x1a(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x2a(%rsi), %xmm3
+	movaps	-0x3a(%rsi), %xmm4
+	movaps	-0x4a(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$10, %xmm2, %xmm1
+	palignr	$10, %xmm3, %xmm2
+	palignr	$10, %xmm4, %xmm3
+	palignr	$10, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_10_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_10_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_11):
+	lea	(L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0b(%rsi), %xmm1
+	jb	L(L11_fwd)
+	lea	(L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
+L(L11_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_11_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_11_loop_L1):
+	sub	$64, %rdx
+	movaps	0x05(%rsi), %xmm2
+	movaps	0x15(%rsi), %xmm3
+	movaps	0x25(%rsi), %xmm4
+	movaps	0x35(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$11, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$11, %xmm3, %xmm4
+	palignr	$11, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$11, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_11_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_11_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_11_bwd):
+	lea	(L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0b(%rsi), %xmm1
+	jb	L(L11_bwd)
+	lea	(L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
+L(L11_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_11_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_11_bwd_loop_L1):
+	movaps	-0x1b(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x2b(%rsi), %xmm3
+	movaps	-0x3b(%rsi), %xmm4
+	movaps	-0x4b(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$11, %xmm2, %xmm1
+	palignr	$11, %xmm3, %xmm2
+	palignr	$11, %xmm4, %xmm3
+	palignr	$11, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_11_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_11_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_12):
+	lea	(L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0c(%rsi), %xmm1
+	jb	L(L12_fwd)
+	lea	(L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
+L(L12_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_12_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_12_loop_L1):
+	sub	$64, %rdx
+	movaps	0x04(%rsi), %xmm2
+	movaps	0x14(%rsi), %xmm3
+	movaps	0x24(%rsi), %xmm4
+	movaps	0x34(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$12, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$12, %xmm3, %xmm4
+	palignr	$12, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$12, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_12_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_12_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_12_bwd):
+	lea	(L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0c(%rsi), %xmm1
+	jb	L(L12_bwd)
+	lea	(L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
+L(L12_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_12_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_12_bwd_loop_L1):
+	movaps	-0x1c(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x2c(%rsi), %xmm3
+	movaps	-0x3c(%rsi), %xmm4
+	movaps	-0x4c(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$12, %xmm2, %xmm1
+	palignr	$12, %xmm3, %xmm2
+	palignr	$12, %xmm4, %xmm3
+	palignr	$12, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_12_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_12_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_13):
+	lea	(L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0d(%rsi), %xmm1
+	jb	L(L13_fwd)
+	lea	(L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
+L(L13_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_13_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_13_loop_L1):
+	sub	$64, %rdx
+	movaps	0x03(%rsi), %xmm2
+	movaps	0x13(%rsi), %xmm3
+	movaps	0x23(%rsi), %xmm4
+	movaps	0x33(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$13, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$13, %xmm3, %xmm4
+	palignr	$13, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$13, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_13_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_13_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_13_bwd):
+	lea	(L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0d(%rsi), %xmm1
+	jb	L(L13_bwd)
+	lea	(L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
+L(L13_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_13_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_13_bwd_loop_L1):
+	movaps	-0x1d(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x2d(%rsi), %xmm3
+	movaps	-0x3d(%rsi), %xmm4
+	movaps	-0x4d(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$13, %xmm2, %xmm1
+	palignr	$13, %xmm3, %xmm2
+	palignr	$13, %xmm4, %xmm3
+	palignr	$13, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_13_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_13_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_14):
+	lea	(L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0e(%rsi), %xmm1
+	jb	L(L14_fwd)
+	lea	(L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
+L(L14_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_14_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_14_loop_L1):
+	sub	$64, %rdx
+	movaps	0x02(%rsi), %xmm2
+	movaps	0x12(%rsi), %xmm3
+	movaps	0x22(%rsi), %xmm4
+	movaps	0x32(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$14, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$14, %xmm3, %xmm4
+	palignr	$14, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$14, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_14_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_14_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_14_bwd):
+	lea	(L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0e(%rsi), %xmm1
+	jb	L(L14_bwd)
+	lea	(L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
+L(L14_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_14_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_14_bwd_loop_L1):
+	movaps	-0x1e(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x2e(%rsi), %xmm3
+	movaps	-0x3e(%rsi), %xmm4
+	movaps	-0x4e(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$14, %xmm2, %xmm1
+	palignr	$14, %xmm3, %xmm2
+	palignr	$14, %xmm4, %xmm3
+	palignr	$14, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_14_bwd_end)
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_14_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx
+	movdqu	%xmm0, (%r8)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(shl_15):
+	lea	(L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0f(%rsi), %xmm1
+	jb	L(L15_fwd)
+	lea	(L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
+L(L15_fwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2
+L(shl_15_loop_L2):
+	prefetchnta 0x1c0(%rsi)
+L(shl_15_loop_L1):
+	sub	$64, %rdx
+	movaps	0x01(%rsi), %xmm2
+	movaps	0x11(%rsi), %xmm3
+	movaps	0x21(%rsi), %xmm4
+	movaps	0x31(%rsi), %xmm5
+	movdqa	%xmm5, %xmm6
+	palignr	$15, %xmm4, %xmm5
+	lea	64(%rsi), %rsi
+	palignr	$15, %xmm3, %xmm4
+	palignr	$15, %xmm2, %xmm3
+	lea	64(%rdi), %rdi
+	palignr	$15, %xmm1, %xmm2
+	movdqa	%xmm6, %xmm1
+	movdqa	%xmm2, -0x40(%rdi)
+	movaps	%xmm3, -0x30(%rdi)
+	jb	L(shl_15_end)
+	movaps	%xmm4, -0x20(%rdi)
+	movaps	%xmm5, -0x10(%rdi)
+	jmp	*%r9
+	ud2
+L(shl_15_end):
+	movaps	%xmm4, -0x20(%rdi)
+	lea	64(%rdx), %rdx
+	movaps	%xmm5, -0x10(%rdi)
+	add	%rdx, %rdi
+	movdqu	%xmm0, (%r8)
+	add	%rdx, %rsi
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+/* Backward (descending-address) variant of L(shl_15): %rsi/%rdi point
+   just past the region still to copy; aligned chunks are loaded below
+   %rsi and merged with palignr $15, 64 bytes per iteration.  */
+L(shl_15_bwd):
+	lea	(L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
+	cmp	%rcx, %rdx
+	movaps	-0x0f(%rsi), %xmm1	/* xmm1 = chunk just above the cursor */
+	jb	L(L15_bwd)
+	lea	(L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9	/* big copy: prefetch loop */
+L(L15_bwd):
+	lea	-64(%rdx), %rdx
+	jmp	*%r9
+	ud2	/* not reached */
+L(shl_15_bwd_loop_L2):
+	prefetchnta -0x1c0(%rsi)
+L(shl_15_bwd_loop_L1):
+	movaps	-0x1f(%rsi), %xmm2
+	sub	$0x40, %rdx
+	movaps	-0x2f(%rsi), %xmm3
+	movaps	-0x3f(%rsi), %xmm4
+	movaps	-0x4f(%rsi), %xmm5
+	lea	-0x40(%rsi), %rsi
+	palignr	$15, %xmm2, %xmm1
+	palignr	$15, %xmm3, %xmm2
+	palignr	$15, %xmm4, %xmm3
+	palignr	$15, %xmm5, %xmm4
+
+	movaps	%xmm1, -0x10(%rdi)
+	movaps	%xmm5, %xmm1	/* carry lowest chunk into next iteration */
+
+	movaps	%xmm2, -0x20(%rdi)
+	lea	-0x40(%rdi), %rdi
+
+	movaps	%xmm3, 0x10(%rdi)
+	jb	L(shl_15_bwd_end)	/* sub above left < 64 bytes remaining */
+	movaps	%xmm4, (%rdi)
+	jmp	*%r9
+	ud2
+L(shl_15_bwd_end):
+	movaps	%xmm4, (%rdi)
+	lea	64(%rdx), %rdx	/* undo loop-entry bias; rdx = tail length */
+	movdqu	%xmm0, (%r8)	/* NOTE(review): replay of entry-saved bytes -- confirm */
+	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
+
+	ALIGN (4)
+L(write_72bytes):
+	movdqu	-72(%rsi), %xmm0
+	movdqu	-56(%rsi), %xmm1
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rcx
+	movdqu	 %xmm0, -72(%rdi)
+	movdqu	 %xmm1, -56(%rdi)
+	mov	 %r8, -40(%rdi)
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rcx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_64bytes):
+	movdqu	-64(%rsi), %xmm0
+	mov	-48(%rsi), %rcx
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -64(%rdi)
+	mov	 %rcx, -48(%rdi)
+	mov	 %r8, -40(%rdi)
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_56bytes):
+	movdqu	-56(%rsi), %xmm0
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rcx
+	movdqu	 %xmm0, -56(%rdi)
+	mov	 %r8, -40(%rdi)
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rcx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_48bytes):
+	mov	-48(%rsi), %rcx
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	 %rcx, -48(%rdi)
+	mov	 %r8, -40(%rdi)
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_40bytes):
+	mov	-40(%rsi), %r8
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	 %r8, -40(%rdi)
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_32bytes):
+	mov	-32(%rsi), %r9
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	 %r9, -32(%rdi)
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_24bytes):
+	mov	-24(%rsi), %r10
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	 %r10, -24(%rdi)
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_16bytes):
+	mov	-16(%rsi), %r11
+	mov	-8(%rsi), %rdx
+	mov	 %r11, -16(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_8bytes):
+	mov	-8(%rsi), %rdx
+	mov	 %rdx, -8(%rdi)
+L(write_0bytes):
+	ret
+
+	ALIGN (4)
+L(write_73bytes):
+	movdqu	-73(%rsi), %xmm0
+	movdqu	-57(%rsi), %xmm1
+	mov	-41(%rsi), %rcx
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %r8
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -73(%rdi)
+	movdqu	 %xmm1, -57(%rdi)
+	mov	 %rcx, -41(%rdi)
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %r8, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_65bytes):
+	movdqu	-65(%rsi), %xmm0
+	movdqu	-49(%rsi), %xmm1
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -65(%rdi)
+	movdqu	 %xmm1, -49(%rdi)
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_57bytes):
+	movdqu	-57(%rsi), %xmm0
+	mov	-41(%rsi), %r8
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -57(%rdi)
+	mov	 %r8, -41(%rdi)
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_49bytes):
+	movdqu	-49(%rsi), %xmm0
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -49(%rdi)
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_41bytes):
+	mov	-41(%rsi), %r8
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-1(%rsi), %dl
+	mov	 %r8, -41(%rdi)
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %dl, -1(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_33bytes):
+	mov	-33(%rsi), %r9
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-1(%rsi), %dl
+	mov	 %r9, -33(%rdi)
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %dl, -1(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_25bytes):
+	mov	-25(%rsi), %r10
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-1(%rsi), %dl
+	mov	 %r10, -25(%rdi)
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %dl, -1(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_17bytes):
+	mov	-17(%rsi), %r11
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r11, -17(%rdi)
+	mov	 %rcx, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_9bytes):
+	mov	-9(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %rcx, -9(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_1bytes):
+	mov	-1(%rsi), %dl
+	mov	 %dl, -1(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_74bytes):
+	movdqu	-74(%rsi), %xmm0
+	movdqu	-58(%rsi), %xmm1
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -74(%rdi)
+	movdqu	 %xmm1, -58(%rdi)
+	mov	 %r8, -42(%rdi)
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_66bytes):
+	movdqu	-66(%rsi), %xmm0
+	movdqu	-50(%rsi), %xmm1
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -66(%rdi)
+	movdqu	 %xmm1, -50(%rdi)
+	mov	 %r8, -42(%rdi)
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_58bytes):
+	movdqu	-58(%rsi), %xmm1
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm1, -58(%rdi)
+	mov	 %r8, -42(%rdi)
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_50bytes):
+	movdqu	-50(%rsi), %xmm0
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -50(%rdi)
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_42bytes):
+	mov	-42(%rsi), %r8
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r8, -42(%rdi)
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_34bytes):
+	mov	-34(%rsi), %r9
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r9, -34(%rdi)
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_26bytes):
+	mov	-26(%rsi), %r10
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r10, -26(%rdi)
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_18bytes):
+	mov	-18(%rsi), %r11
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r11, -18(%rdi)
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_10bytes):
+	mov	-10(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %rcx, -10(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_2bytes):
+	mov	-2(%rsi), %dx
+	mov	 %dx, -2(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_75bytes):
+	movdqu	-75(%rsi), %xmm0
+	movdqu	-59(%rsi), %xmm1
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -75(%rdi)
+	movdqu	 %xmm1, -59(%rdi)
+	mov	 %r8, -43(%rdi)
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_67bytes):
+	movdqu	-67(%rsi), %xmm0
+	movdqu	-59(%rsi), %xmm1
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -67(%rdi)
+	movdqu	 %xmm1, -59(%rdi)
+	mov	 %r8, -43(%rdi)
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_59bytes):
+	movdqu	-59(%rsi), %xmm0
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -59(%rdi)
+	mov	 %r8, -43(%rdi)
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_51bytes):
+	movdqu	-51(%rsi), %xmm0
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -51(%rdi)
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_43bytes):
+	mov	-43(%rsi), %r8
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r8, -43(%rdi)
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_35bytes):
+	mov	-35(%rsi), %r9
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r9, -35(%rdi)
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_27bytes):
+	mov	-27(%rsi), %r10
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r10, -27(%rdi)
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_19bytes):
+	mov	-19(%rsi), %r11
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r11, -19(%rdi)
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_11bytes):
+	mov	-11(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %rcx, -11(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_3bytes):
+	mov	-3(%rsi), %dx
+	mov	-2(%rsi), %cx
+	mov	 %dx, -3(%rdi)
+	mov	 %cx, -2(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_76bytes):
+	movdqu	-76(%rsi), %xmm0
+	movdqu	-60(%rsi), %xmm1
+	mov	-44(%rsi), %r8
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -76(%rdi)
+	movdqu	 %xmm1, -60(%rdi)
+	mov	 %r8, -44(%rdi)
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_68bytes):
+	movdqu	-68(%rsi), %xmm0
+	movdqu	-52(%rsi), %xmm1
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -68(%rdi)
+	movdqu	 %xmm1, -52(%rdi)
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_60bytes):
+	movdqu	-60(%rsi), %xmm0
+	mov	-44(%rsi), %r8
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -60(%rdi)
+	mov	 %r8, -44(%rdi)
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_52bytes):
+	movdqu	-52(%rsi), %xmm0
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	movdqu	 %xmm0, -52(%rdi)
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_44bytes):
+	mov	-44(%rsi), %r8
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r8, -44(%rdi)
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_36bytes):
+	mov	-36(%rsi), %r9
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r9, -36(%rdi)
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_28bytes):
+	mov	-28(%rsi), %r10
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r10, -28(%rdi)
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_20bytes):
+	mov	-20(%rsi), %r11
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %r11, -20(%rdi)
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_12bytes):
+	mov	-12(%rsi), %rcx
+	mov	-4(%rsi), %edx
+	mov	 %rcx, -12(%rdi)
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_4bytes):
+	mov	-4(%rsi), %edx
+	mov	 %edx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_77bytes):
+	movdqu	-77(%rsi), %xmm0
+	movdqu	-61(%rsi), %xmm1
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -77(%rdi)
+	movdqu	 %xmm1, -61(%rdi)
+	mov	 %r8, -45(%rdi)
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_69bytes):
+	movdqu	-69(%rsi), %xmm0
+	movdqu	-53(%rsi), %xmm1
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -69(%rdi)
+	movdqu	 %xmm1, -53(%rdi)
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_61bytes):
+	movdqu	-61(%rsi), %xmm0
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -61(%rdi)
+	mov	 %r8, -45(%rdi)
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+/* Copy the trailing 53 bytes ending at %rdi from the 53 bytes ending
+   at %rsi (all loads complete before any store, so src/dst overlap in
+   either direction is safe).  FIX: the original loaded %r8 from
+   -45(%rsi) but never stored it; add the missing "mov %r8, -45(%rdi)"
+   so the stub matches its siblings (cf. L(write_61bytes),
+   L(write_45bytes)) instead of relying on the xmm0 store overlapping
+   that range and leaving a dead load.  */
+L(write_53bytes):
+	movdqu	-53(%rsi), %xmm0
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -53(%rdi)
+	mov	 %r8, -45(%rdi)
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_45bytes):
+	mov	-45(%rsi), %r8
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r8, -45(%rdi)
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_37bytes):
+	mov	-37(%rsi), %r9
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r9, -37(%rdi)
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_29bytes):
+	mov	-29(%rsi), %r10
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r10, -29(%rdi)
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_21bytes):
+	mov	-21(%rsi), %r11
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r11, -21(%rdi)
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_13bytes):
+	mov	-13(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %rcx, -13(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_5bytes):
+	mov	-5(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	 %edx, -5(%rdi)
+	mov	 %ecx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_78bytes):
+	movdqu	-78(%rsi), %xmm0
+	movdqu	-62(%rsi), %xmm1
+	mov	-46(%rsi), %r8
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -78(%rdi)
+	movdqu	 %xmm1, -62(%rdi)
+	mov	 %r8, -46(%rdi)
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_70bytes):
+	movdqu	-70(%rsi), %xmm0
+	movdqu	-54(%rsi), %xmm1
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -70(%rdi)
+	movdqu	 %xmm1, -54(%rdi)
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_62bytes):
+	movdqu	-62(%rsi), %xmm0
+	mov	-46(%rsi), %r8
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -62(%rdi)
+	mov	 %r8, -46(%rdi)
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_54bytes):
+	movdqu	-54(%rsi), %xmm0
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -54(%rdi)
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_46bytes):
+	mov	-46(%rsi), %r8
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r8, -46(%rdi)
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_38bytes):
+	mov	-38(%rsi), %r9
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r9, -38(%rdi)
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_30bytes):
+	mov	-30(%rsi), %r10
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r10, -30(%rdi)
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_22bytes):
+	mov	-22(%rsi), %r11
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r11, -22(%rdi)
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_14bytes):
+	mov	-14(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %rcx, -14(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_6bytes):
+	mov	-6(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	 %edx, -6(%rdi)
+	mov	 %ecx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_79bytes):
+	movdqu	-79(%rsi), %xmm0
+	movdqu	-63(%rsi), %xmm1
+	mov	-47(%rsi), %r8
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -79(%rdi)
+	movdqu	 %xmm1, -63(%rdi)
+	mov	 %r8, -47(%rdi)
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_71bytes):
+	movdqu	-71(%rsi), %xmm0
+	movdqu	-55(%rsi), %xmm1
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -71(%rdi)
+	movdqu	 %xmm1, -55(%rdi)
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_63bytes):
+	movdqu	-63(%rsi), %xmm0
+	mov	-47(%rsi), %r8
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -63(%rdi)
+	mov	 %r8, -47(%rdi)
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_55bytes):
+	movdqu	-55(%rsi), %xmm0
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	movdqu	 %xmm0, -55(%rdi)
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_47bytes):
+	mov	-47(%rsi), %r8
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r8, -47(%rdi)
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_39bytes):
+	mov	-39(%rsi), %r9
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r9, -39(%rdi)
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_31bytes):
+	mov	-31(%rsi), %r10
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r10, -31(%rdi)
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_23bytes):
+	mov	-23(%rsi), %r11
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %r11, -23(%rdi)
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_15bytes):
+	mov	-15(%rsi), %rcx
+	mov	-8(%rsi), %rdx
+	mov	 %rcx, -15(%rdi)
+	mov	 %rdx, -8(%rdi)
+	ret
+
+	ALIGN (4)
+L(write_7bytes):
+	mov	-7(%rsi), %edx
+	mov	-4(%rsi), %ecx
+	mov	 %edx, -7(%rdi)
+	mov	 %ecx, -4(%rdi)
+	ret
+
+	ALIGN (4)
+/* Forward copy for very large sizes: stream 128 bytes per iteration
+   with non-temporal stores (movntdq) to avoid polluting the cache.
+   For memmove, if src and dst overlap closely (forward distance
+   src-dst < remaining length would make NT stores read back stale
+   data is not the issue here -- the cached-copy path below is chosen
+   when the copy fits in last-level cache: rdx < 4*rcx, where rcx
+   appears to be a cache-size figure set by the entry code; confirm).  */
+L(large_page_fwd):
+	movdqu	(%rsi), %xmm1
+	lea	16(%rsi), %rsi
+	movdqu	%xmm0, (%r8)	/* NOTE(review): replay of entry-saved bytes -- confirm */
+	movntdq	%xmm1, (%rdi)
+	lea	16(%rdi), %rdi
+	lea	-0x90(%rdx), %rdx	/* bias count for the 128B loop exit test */
+#ifdef USE_AS_MEMMOVE
+	mov	%rsi, %r9
+	sub	%rdi, %r9	/* r9 = forward distance dst..src */
+	cmp	%rdx, %r9
+	jae	L(memmove_is_memcpy_fwd)	/* no overlap: plain streaming copy */
+	shl	$2, %rcx
+	cmp	%rcx, %rdx
+	jb	L(ll_cache_copy_fwd_start)	/* overlapping and cache-sized: cached copy */
+L(memmove_is_memcpy_fwd):
+#endif
+L(large_page_loop):
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	movdqu	0x40(%rsi), %xmm4
+	movdqu	0x50(%rsi), %xmm5
+	movdqu	0x60(%rsi), %xmm6
+	movdqu	0x70(%rsi), %xmm7
+	lea	0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx
+	movntdq	%xmm0, (%rdi)
+	movntdq	%xmm1, 0x10(%rdi)
+	movntdq	%xmm2, 0x20(%rdi)
+	movntdq	%xmm3, 0x30(%rdi)
+	movntdq	%xmm4, 0x40(%rdi)
+	movntdq	%xmm5, 0x50(%rdi)
+	movntdq	%xmm6, 0x60(%rdi)
+	movntdq	%xmm7, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(large_page_loop)
+	cmp	$-0x40, %rdx	/* >= 64 bytes left (rdx is biased negative)? */
+	lea	0x80(%rdx), %rdx
+	jl	L(large_page_less_64bytes)
+
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	lea	0x40(%rsi), %rsi
+
+	movntdq	%xmm0, (%rdi)
+	movntdq	%xmm1, 0x10(%rdi)
+	movntdq	%xmm2, 0x20(%rdi)
+	movntdq	%xmm3, 0x30(%rdi)
+	lea	0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_less_64bytes):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	sfence	/* order the movntdq stores before the tail's normal stores */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+/* Cached forward copy for overlapping memmove that fits in last-level
+   cache: same 128B/iteration shape but with prefetcht0 and ordinary
+   (write-allocating) movaps stores; dst is 16-byte aligned here.  */
+L(ll_cache_copy_fwd_start):
+	prefetcht0 0x1c0(%rsi)
+	prefetcht0 0x200(%rsi)
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	movdqu	0x40(%rsi), %xmm4
+	movdqu	0x50(%rsi), %xmm5
+	movdqu	0x60(%rsi), %xmm6
+	movdqu	0x70(%rsi), %xmm7
+	lea	0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx
+	movaps	%xmm0, (%rdi)
+	movaps	%xmm1, 0x10(%rdi)
+	movaps	%xmm2, 0x20(%rdi)
+	movaps	%xmm3, 0x30(%rdi)
+	movaps	%xmm4, 0x40(%rdi)
+	movaps	%xmm5, 0x50(%rdi)
+	movaps	%xmm6, 0x60(%rdi)
+	movaps	%xmm7, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+	jae	L(ll_cache_copy_fwd_start)
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(large_page_ll_less_fwd_64bytes)
+
+	movdqu	(%rsi), %xmm0
+	movdqu	0x10(%rsi), %xmm1
+	movdqu	0x20(%rsi), %xmm2
+	movdqu	0x30(%rsi), %xmm3
+	lea	0x40(%rsi), %rsi
+
+	movaps	%xmm0, (%rdi)
+	movaps	%xmm1, 0x10(%rdi)
+	movaps	%xmm2, 0x20(%rdi)
+	movaps	%xmm3, 0x30(%rdi)
+	lea	0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_ll_less_fwd_64bytes):
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+#endif
+	ALIGN (4)
+/* Backward (descending) streaming copy for very large sizes; mirror
+   of L(large_page_fwd).  %rsi/%rdi point one past the end of the
+   regions.  For memmove, a closely overlapping cache-sized copy is
+   redirected to the cached path below.  */
+L(large_page_bwd):
+	movdqu	-0x10(%rsi), %xmm1
+	lea	-16(%rsi), %rsi
+	movdqu	%xmm0, (%r8)	/* NOTE(review): replay of entry-saved bytes -- confirm */
+	movdqa	%xmm1, -0x10(%rdi)
+	lea	-16(%rdi), %rdi
+	lea	-0x90(%rdx), %rdx	/* bias count for the 128B loop exit test */
+#ifdef USE_AS_MEMMOVE
+	mov	%rdi, %r9
+	sub	%rsi, %r9	/* r9 = backward distance src..dst */
+	cmp	%rdx, %r9
+	jae	L(memmove_is_memcpy_bwd)	/* no overlap: plain streaming copy */
+	cmp	%rcx, %r9
+	jb	L(ll_cache_copy_bwd_start)	/* overlapping and cache-sized: cached copy */
+L(memmove_is_memcpy_bwd):
+#endif
+L(large_page_bwd_loop):
+	movdqu	-0x10(%rsi), %xmm0
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	movdqu	-0x50(%rsi), %xmm4
+	movdqu	-0x60(%rsi), %xmm5
+	movdqu	-0x70(%rsi), %xmm6
+	movdqu	-0x80(%rsi), %xmm7
+	lea	-0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx
+	movntdq	%xmm0, -0x10(%rdi)
+	movntdq	%xmm1, -0x20(%rdi)
+	movntdq	%xmm2, -0x30(%rdi)
+	movntdq	%xmm3, -0x40(%rdi)
+	movntdq	%xmm4, -0x50(%rdi)
+	movntdq	%xmm5, -0x60(%rdi)
+	movntdq	%xmm6, -0x70(%rdi)
+	movntdq	%xmm7, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+	jae	L(large_page_bwd_loop)
+	cmp	$-0x40, %rdx	/* >= 64 bytes left (rdx is biased negative)? */
+	lea	0x80(%rdx), %rdx
+	jl	L(large_page_less_bwd_64bytes)
+
+	movdqu	-0x10(%rsi), %xmm0
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	lea	-0x40(%rsi), %rsi
+
+	movntdq	%xmm0, -0x10(%rdi)
+	movntdq	%xmm1, -0x20(%rdi)
+	movntdq	%xmm2, -0x30(%rdi)
+	movntdq	%xmm3, -0x40(%rdi)
+	lea	-0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_less_bwd_64bytes):
+	sfence	/* order the movntdq stores before the tail's normal stores */
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+
+#ifdef USE_AS_MEMMOVE
+	ALIGN (4)
+/* Cached backward copy for overlapping memmove that fits in
+   last-level cache; ordinary movaps stores, dst 16-byte aligned.  */
+L(ll_cache_copy_bwd_start):
+	prefetcht0 -0x1c0(%rsi)
+	prefetcht0 -0x200(%rsi)
+	movdqu	-0x10(%rsi), %xmm0
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	movdqu	-0x50(%rsi), %xmm4
+	movdqu	-0x60(%rsi), %xmm5
+	movdqu	-0x70(%rsi), %xmm6
+	movdqu	-0x80(%rsi), %xmm7
+	lea	-0x80(%rsi), %rsi
+
+	sub	$0x80, %rdx
+	movaps	%xmm0, -0x10(%rdi)
+	movaps	%xmm1, -0x20(%rdi)
+	movaps	%xmm2, -0x30(%rdi)
+	movaps	%xmm3, -0x40(%rdi)
+	movaps	%xmm4, -0x50(%rdi)
+	movaps	%xmm5, -0x60(%rdi)
+	movaps	%xmm6, -0x70(%rdi)
+	movaps	%xmm7, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+	jae	L(ll_cache_copy_bwd_start)
+	cmp	$-0x40, %rdx
+	lea	0x80(%rdx), %rdx
+	jl	L(large_page_ll_less_bwd_64bytes)
+
+	movdqu	-0x10(%rsi), %xmm0
+	movdqu	-0x20(%rsi), %xmm1
+	movdqu	-0x30(%rsi), %xmm2
+	movdqu	-0x40(%rsi), %xmm3
+	lea	-0x40(%rsi), %rsi
+
+	movaps	%xmm0, -0x10(%rdi)
+	movaps	%xmm1, -0x20(%rdi)
+	movaps	%xmm2, -0x30(%rdi)
+	movaps	%xmm3, -0x40(%rdi)
+	lea	-0x40(%rdi), %rdi
+	sub	$0x40, %rdx
+L(large_page_ll_less_bwd_64bytes):
+	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
+#endif
+
+END (MEMCPY)
+
+	.section .rodata.ssse3,"a",@progbits
+	ALIGN (3)
+/* Tail-dispatch table: entry N jumps to L(write_Nbytes), indexed by
+   the remaining byte count (0..79).  Entries are 32-bit table-relative
+   offsets (see JMPTBL/BRANCH_TO_JMPTBL_ENTRY), keeping .rodata
+   position-independent.  */
+L(table_less_80bytes):
+	.int	JMPTBL (L(write_0bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_1bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_2bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_3bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_4bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_5bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_6bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_7bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_8bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_9bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_10bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_11bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_12bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_13bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_14bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_15bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_16bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_17bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_18bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_19bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_20bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_21bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_22bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_23bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_24bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_25bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_26bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_27bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_28bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_29bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_30bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_31bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_32bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_33bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_34bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_35bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_36bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_37bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_38bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_39bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_40bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_41bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_42bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_43bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_44bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_45bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_46bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_47bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_48bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_49bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_50bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_51bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_52bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_53bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_54bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_55bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_56bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_57bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_58bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_59bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_60bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_61bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_62bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_63bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_64bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_65bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_66bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_67bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_68bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_69bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_70bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_71bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_72bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_73bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_74bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_75bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_76bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_77bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_78bytes), L(table_less_80bytes))
+	.int	JMPTBL (L(write_79bytes), L(table_less_80bytes))
+
+	ALIGN (3)
+/* Forward-copy dispatch: entry N jumps to L(shl_N), the palignr
+   variant for a source that is N bytes past 16-byte alignment
+   relative to the destination (N = 0..15).  Table-relative offsets.  */
+L(shl_table):
+	.int	JMPTBL (L(shl_0), L(shl_table))
+	.int	JMPTBL (L(shl_1), L(shl_table))
+	.int	JMPTBL (L(shl_2), L(shl_table))
+	.int	JMPTBL (L(shl_3), L(shl_table))
+	.int	JMPTBL (L(shl_4), L(shl_table))
+	.int	JMPTBL (L(shl_5), L(shl_table))
+	.int	JMPTBL (L(shl_6), L(shl_table))
+	.int	JMPTBL (L(shl_7), L(shl_table))
+	.int	JMPTBL (L(shl_8), L(shl_table))
+	.int	JMPTBL (L(shl_9), L(shl_table))
+	.int	JMPTBL (L(shl_10), L(shl_table))
+	.int	JMPTBL (L(shl_11), L(shl_table))
+	.int	JMPTBL (L(shl_12), L(shl_table))
+	.int	JMPTBL (L(shl_13), L(shl_table))
+	.int	JMPTBL (L(shl_14), L(shl_table))
+	.int	JMPTBL (L(shl_15), L(shl_table))
+
+	ALIGN (3)
+/* Backward-copy dispatch: as L(shl_table) but for the descending
+   L(shl_N_bwd) variants used by overlapping memmove.  */
+L(shl_table_bwd):
+	.int	JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
+	.int	JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
new file mode 100644
index 0000000..8e9fb19
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -0,0 +1,73 @@
+/* Multiple versions of memcpy
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need memcpy before the initialization
+   happened.  */
+#if defined SHARED && !defined NOT_IN_libc
+	.text
+ENTRY(memcpy)
+	.type	memcpy, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__memcpy_sse2(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jz	2f
+	leaq	__memcpy_ssse3(%rip), %rax
+	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	jz	2f
+	leaq	__memcpy_ssse3_back(%rip), %rax
+2:	ret
+END(memcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memcpy_sse2, @function; \
+	.p2align 4; \
+	__memcpy_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __memcpy_chk_sse2, @function; \
+	.globl __memcpy_chk_sse2; \
+	.p2align 4; \
+	__memcpy_chk_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2
+
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memcpy calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_memcpy; __GI_memcpy = __memcpy_sse2
+#endif
+
+#include "../memcpy.S"
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
new file mode 100644
index 0000000..948f61c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -0,0 +1,47 @@
+/* Multiple versions of __memcpy_chk
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch memcpy functions for static binaries.
+ */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.text
+ENTRY(__memcpy_chk)
+	.type	__memcpy_chk, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__memcpy_chk_sse2(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jz	2f
+	leaq	__memcpy_chk_ssse3(%rip), %rax
+	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	jz	2f
+	leaq	__memcpy_chk_ssse3_back(%rip), %rax
+2:	ret
+END(__memcpy_chk)
+# else
+#  include "../memcpy_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
new file mode 100644
index 0000000..f9a4e9a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_ssse3_back
+#define MEMCPY_CHK	__memmove_chk_ssse3_back
+#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
new file mode 100644
index 0000000..295430b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_ssse3
+#define MEMCPY_CHK	__memmove_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
new file mode 100644
index 0000000..bbe9627
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -0,0 +1,24 @@
+#ifndef NOT_IN_libc
+#include "init-arch.h"
+
+#define MEMMOVE __memmove_sse2
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__memmove_sse2, __GI_memmove, __memmove_sse2);
+#endif
+#endif
+
+#include "string/memmove.c"
+
+#ifndef NOT_IN_libc
+extern __typeof (__memmove_sse2) __memmove_sse2 attribute_hidden;
+extern __typeof (__memmove_sse2) __memmove_ssse3 attribute_hidden;
+extern __typeof (__memmove_sse2) __memmove_ssse3_back attribute_hidden;
+
+libc_ifunc (memmove,
+	    HAS_SSSE3
+	    ? (HAS_FAST_COPY_BACKWARD
+	       ? __memmove_ssse3_back : __memmove_ssse3)
+	    : __memmove_sse2);
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
new file mode 100644
index 0000000..a474f5f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -0,0 +1,15 @@
+#include "init-arch.h"
+
+#define MEMMOVE_CHK __memmove_chk_sse2
+
+#include "debug/memmove_chk.c"
+
+extern __typeof (__memmove_chk_sse2) __memmove_chk_sse2 attribute_hidden;
+extern __typeof (__memmove_chk_sse2) __memmove_chk_ssse3 attribute_hidden;
+extern __typeof (__memmove_chk_sse2) __memmove_chk_ssse3_back attribute_hidden;
+
+libc_ifunc (__memmove_chk,
+	    HAS_SSSE3
+	    ? (HAS_FAST_COPY_BACKWARD
+	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
+	    : __memmove_chk_sse2);
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
new file mode 100644
index 0000000..82ffacb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_ssse3_back
+#define MEMCPY_CHK	__mempcpy_chk_ssse3_back
+#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
new file mode 100644
index 0000000..822d98e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_ssse3
+#define MEMCPY_CHK	__mempcpy_chk_ssse3
+#include "memcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
new file mode 100644
index 0000000..e8152d6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -0,0 +1,75 @@
+/* Multiple versions of mempcpy
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  In static binaries we need mempcpy before the initialization
+   happened.  */
+#if defined SHARED && !defined NOT_IN_libc
+ENTRY(__mempcpy)
+	.type	__mempcpy, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__mempcpy_sse2(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jz	2f
+	leaq	__mempcpy_ssse3(%rip), %rax
+	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	jz	2f
+	leaq	__mempcpy_ssse3_back(%rip), %rax
+2:	ret
+END(__mempcpy)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __mempcpy_sse2, @function; \
+	.p2align 4; \
+	__mempcpy_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2
+
+# undef ENTRY_CHK
+# define ENTRY_CHK(name) \
+	.type __mempcpy_chk_sse2, @function; \
+	.globl __mempcpy_chk_sse2; \
+	.p2align 4; \
+	__mempcpy_chk_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END_CHK
+# define END_CHK(name) \
+	cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
+
+# undef libc_hidden_def
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal mempcpy calls through a PLT.
+   The speedup we get from using SSSE3 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_def(name) \
+	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2
+# define libc_hidden_builtin_def(name) \
+	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2
+#endif
+
+#include "../mempcpy.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
new file mode 100644
index 0000000..024c775
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -0,0 +1,47 @@
+/* Multiple versions of __mempcpy_chk
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  There are no multiarch mempcpy functions for static binaries.
+ */
+#ifndef NOT_IN_libc
+# ifdef SHARED
+	.text
+ENTRY(__mempcpy_chk)
+	.type	__mempcpy_chk, @gnu_indirect_function
+	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__mempcpy_chk_sse2(%rip), %rax
+	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	jz	2f
+	leaq	__mempcpy_chk_ssse3(%rip), %rax
+	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	jz	2f
+	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
+2:	ret
+END(__mempcpy_chk)
+# else
+#  include "../mempcpy_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 122270f..1d35f8f 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -58,6 +58,9 @@
   cfi_endproc;								      \
   ASM_SIZE_DIRECTIVE(name)
 
+#define ENTRY_CHK(name) ENTRY (name)
+#define END_CHK(name) END (name)
+
 /* If compiled for profiling, call `mcount' at the start of each function.  */
 #ifdef	PROF
 /* The mcount code relies on a normal frame pointer being on the stack

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                          |   32 +
 debug/memmove_chk.c                                |    6 +-
 string/memmove.c                                   |    5 +-
 sysdeps/x86_64/memcpy.S                            |    4 +-
 sysdeps/x86_64/multiarch/Makefile                  |    4 +-
 sysdeps/x86_64/multiarch/bcopy.S                   |    7 +
 sysdeps/x86_64/multiarch/init-arch.c               |    9 +-
 sysdeps/x86_64/multiarch/init-arch.h               |   16 +-
 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S       | 3169 ++++++++++++++++++++
 sysdeps/x86_64/multiarch/memcpy-ssse3.S            | 3139 +++++++++++++++++++
 sysdeps/x86_64/multiarch/memcpy.S                  |   73 +
 sysdeps/x86_64/multiarch/memcpy_chk.S              |   47 +
 sysdeps/x86_64/multiarch/memmove-ssse3-back.S      |    4 +
 .../i686 => x86_64}/multiarch/memmove-ssse3.S      |    0
 sysdeps/x86_64/multiarch/memmove.c                 |   24 +
 sysdeps/x86_64/multiarch/memmove_chk.c             |   15 +
 sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S      |    4 +
 .../i686 => x86_64}/multiarch/mempcpy-ssse3.S      |    0
 sysdeps/x86_64/multiarch/mempcpy.S                 |   75 +
 sysdeps/x86_64/multiarch/mempcpy_chk.S             |   47 +
 sysdeps/x86_64/sysdep.h                            |    3 +
 21 files changed, 6673 insertions(+), 10 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/bcopy.S
 create mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
 create mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
 create mode 100644 sysdeps/x86_64/multiarch/memcpy.S
 create mode 100644 sysdeps/x86_64/multiarch/memcpy_chk.S
 create mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S
 copy sysdeps/{i386/i686 => x86_64}/multiarch/memmove-ssse3.S (100%)
 create mode 100644 sysdeps/x86_64/multiarch/memmove.c
 create mode 100644 sysdeps/x86_64/multiarch/memmove_chk.c
 create mode 100644 sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
 copy sysdeps/{i386/i686 => x86_64}/multiarch/mempcpy-ssse3.S (100%)
 create mode 100644 sysdeps/x86_64/multiarch/mempcpy.S
 create mode 100644 sysdeps/x86_64/multiarch/mempcpy_chk.S


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]