This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch, master, updated. glibc-2.15-532-g4b43400
- From: drepper at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 30 Mar 2012 20:46:01 -0000
- Subject: GNU C Library master sources branch, master, updated. glibc-2.15-532-g4b43400
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 4b43400f6a710fa3d931a57eaae4cb332fb60edc (commit)
from 48c41d04ee06efc6ec97325ed6697c121b40865f (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4b43400f6a710fa3d931a57eaae4cb332fb60edc
commit 4b43400f6a710fa3d931a57eaae4cb332fb60edc
Author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
Date: Fri Mar 30 16:45:27 2012 -0400
optimize the following memcpy: sysdeps/i386/i686/multiarch/memcpy-ssse3.S
I've improved the following implementation of memcpy:
"sysdeps/i386/i686/multiarch/memcpy-ssse3.S".
The patch includes some minor style fixes, but the important part is
the use of prefetch loops for the case where
DATA_CACHE_SIZE_HALF <= len < SHARED_CACHE_SIZE_HALF and
the src and dst pointers have unequal 16-byte alignments.
This gives a 6% to 50% performance boost on the Atom machine, about
24.73% in geometric mean.
diff --git a/ChangeLog b/ChangeLog
index 2e16f98..61ec1e1 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2012-03-22 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
+
+ * sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Update.
+ Optimize memcpy with prefetch if
+ DATA_CACHE_SIZE_HALF <= len < SHARED_CACHE_SIZE_HALF and
+ src, dst pointers have unequal 16 byte alignments.
+
2012-03-30 Siddhesh Poyarekar <siddhesh@redhat.com>
[BZ #13928]
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
index 3a3ab79..30bdad6 100644
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -17,109 +17,100 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-
#if !defined NOT_IN_libc \
&& (defined SHARED \
|| defined USE_AS_MEMMOVE \
|| !defined USE_MULTIARCH)
-#include "asm-syntax.h"
+# include <sysdep.h>
+# include "asm-syntax.h"
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-#endif
+# ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# endif
-#ifdef USE_AS_BCOPY
-# define SRC PARMS
-# define DEST SRC+4
-# define LEN DEST+4
-#else
-# define DEST PARMS
-# define SRC DEST+4
-# define LEN SRC+4
-#endif
+# ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+# else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+# endif
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
-#ifdef SHARED
-# define PARMS 8 /* Preserve EBX. */
-# define ENTRANCE PUSH (%ebx);
-# define RETURN_END POP (%ebx); ret
-# define RETURN RETURN_END; CFI_PUSH (%ebx)
-# define JMPTBL(I, B) I - B
+# ifdef SHARED
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B) I - B
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register contains the
- index into the jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- /* We first load PC into EBX. */ \
- SETUP_PIC_REG(bx); \
- /* Get the address of the jump table. */ \
- addl $(TABLE - .), %ebx; \
- /* Get the entry and convert the relative offset to the \
- absolute address. */ \
- addl (%ebx,INDEX,SCALE), %ebx; \
- /* We loaded the jump table. Go. */ \
- jmp *%ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
- addl $(TABLE - .), %ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
- addl (%ebx,INDEX,SCALE), %ebx; \
- /* We loaded the jump table. Go. */ \
- jmp *%ebx
-#else
-# define PARMS 4
-# define ENTRANCE
-# define RETURN_END ret
-# define RETURN RETURN_END
-# define JMPTBL(I, B) I
+ jump table with relative offsets. INDEX is a register contains the
+ index into the jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx, INDEX, SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
+# else
-/* Branch to an entry in a jump table. TABLE is a jump table with
- absolute offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- jmp *TABLE(,INDEX,SCALE)
+# define PARMS 4
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define JMPTBL(I, B) I
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. INDEX is a register contains the index into the
+ jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
- jmp *TABLE(,INDEX,SCALE)
-#endif
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(, INDEX, SCALE)
+# endif
.section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_BCOPY
+# if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
movl 12(%esp), %eax
cmpl %eax, 16(%esp)
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
-#endif
+# endif
ENTRY (MEMCPY)
ENTRANCE
movl LEN(%esp), %ecx
movl SRC(%esp), %eax
movl DEST(%esp), %edx
-#ifdef USE_AS_MEMMOVE
+# ifdef USE_AS_MEMMOVE
cmp %eax, %edx
jb L(copy_forward)
je L(fwd_write_0bytes)
cmp $32, %ecx
jae L(memmove_bwd)
jmp L(bk_write_less32bytes_2)
+
+ .p2align 4
L(memmove_bwd):
add %ecx, %eax
cmp %eax, %edx
@@ -127,67 +118,72 @@ L(memmove_bwd):
jb L(copy_backward)
L(copy_forward):
-#endif
+# endif
cmp $48, %ecx
jae L(48bytesormore)
L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
+# ifndef USE_AS_MEMMOVE
cmp %dl, %al
jb L(bk_write)
-#endif
+# endif
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-#ifndef USE_AS_MEMMOVE
+# ifndef USE_AS_MEMMOVE
+ .p2align 4
L(bk_write):
BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-#endif
+# endif
- ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned. */
+ .p2align 4
L(48bytesormore):
+# ifndef USE_AS_MEMMOVE
+ movlpd (%eax), %xmm0
+ movlpd 8(%eax), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+# else
movdqu (%eax), %xmm0
+# endif
PUSH (%edi)
movl %edx, %edi
and $-16, %edx
- PUSH (%esi)
- cfi_remember_state
add $16, %edx
- movl %edi, %esi
sub %edx, %edi
add %edi, %ecx
sub %edi, %eax
-#ifdef SHARED_CACHE_SIZE_HALF
+# ifdef SHARED_CACHE_SIZE_HALF
cmp $SHARED_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
+# else
+# ifdef SHARED
SETUP_PIC_REG(bx)
add $_GLOBAL_OFFSET_TABLE_, %ebx
cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-# else
+# else
cmp __x86_shared_cache_size_half, %ecx
+# endif
# endif
-#endif
mov %eax, %edi
jae L(large_page)
and $0xf, %edi
jz L(shl_0)
-
BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_0):
- movdqu %xmm0, (%esi)
+# ifdef USE_AS_MEMMOVE
+ movl DEST+4(%esp), %edi
+ movdqu %xmm0, (%edi)
+# endif
xor %edi, %edi
- POP (%esi)
cmp $127, %ecx
ja L(shl_0_gobble)
lea -32(%ecx), %ecx
+
+ .p2align 4
L(shl_0_loop):
movdqa (%eax, %edi), %xmm0
movdqa 16(%eax, %edi), %xmm1
@@ -219,6 +215,7 @@ L(shl_0_loop):
movdqa %xmm0, (%edx, %edi)
movdqa %xmm1, 16(%edx, %edi)
lea 32(%edi), %edi
+
L(shl_0_end):
lea 32(%ecx), %ecx
add %ecx, %edi
@@ -228,23 +225,25 @@ L(shl_0_end):
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
CFI_PUSH (%edi)
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
+ .p2align 4
+L(shl_0_gobble):
+# ifdef DATA_CACHE_SIZE_HALF
cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
+# else
+# ifdef SHARED
SETUP_PIC_REG(bx)
add $_GLOBAL_OFFSET_TABLE_, %ebx
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
+# else
cmp __x86_data_cache_size_half, %ecx
+# endif
# endif
-#endif
-
- POP (%edi)
+ POP (%edi)
lea -128(%ecx), %ecx
jae L(shl_0_gobble_mem_loop)
+
+ .p2align 4
L(shl_0_gobble_cache_loop):
movdqa (%eax), %xmm0
movdqa 0x10(%eax), %xmm1
@@ -274,17 +273,15 @@ L(shl_0_gobble_cache_loop):
movdqa (%eax), %xmm0
sub $0x40, %ecx
movdqa 0x10(%eax), %xmm1
-
movdqa %xmm0, (%edx)
movdqa %xmm1, 0x10(%edx)
-
movdqa 0x20(%eax), %xmm0
movdqa 0x30(%eax), %xmm1
add $0x40, %eax
-
movdqa %xmm0, 0x20(%edx)
movdqa %xmm1, 0x30(%edx)
add $0x40, %edx
+
L(shl_0_cache_less_64bytes):
cmp $0x20, %ecx
jb L(shl_0_cache_less_32bytes)
@@ -295,6 +292,7 @@ L(shl_0_cache_less_64bytes):
movdqa %xmm0, (%edx)
movdqa %xmm1, 0x10(%edx)
add $0x20, %edx
+
L(shl_0_cache_less_32bytes):
cmp $0x10, %ecx
jb L(shl_0_cache_less_16bytes)
@@ -303,13 +301,13 @@ L(shl_0_cache_less_32bytes):
add $0x10, %eax
movdqa %xmm0, (%edx)
add $0x10, %edx
+
L(shl_0_cache_less_16bytes):
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble_mem_loop):
prefetcht0 0x1c0(%eax)
prefetcht0 0x280(%eax)
@@ -354,6 +352,7 @@ L(shl_0_gobble_mem_loop):
movdqa %xmm0, 0x20(%edx)
movdqa %xmm1, 0x30(%edx)
add $0x40, %edx
+
L(shl_0_mem_less_64bytes):
cmp $0x20, %ecx
jb L(shl_0_mem_less_32bytes)
@@ -364,6 +363,7 @@ L(shl_0_mem_less_64bytes):
movdqa %xmm0, (%edx)
movdqa %xmm1, 0x10(%edx)
add $0x20, %edx
+
L(shl_0_mem_less_32bytes):
cmp $0x10, %ecx
jb L(shl_0_mem_less_16bytes)
@@ -372,24 +372,84 @@ L(shl_0_mem_less_32bytes):
add $0x10, %eax
movdqa %xmm0, (%edx)
add $0x10, %edx
+
L(shl_0_mem_less_16bytes):
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_1):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -1(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -1(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_1_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl1LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 15(%eax), %xmm2
+ movaps 31(%eax), %xmm3
+ movaps 47(%eax), %xmm4
+ movaps 63(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ palignr $1, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $1, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 15(%eax), %xmm2
+ movaps 31(%eax), %xmm3
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_1_no_prefetch):
+ lea -32(%ecx), %ecx
lea -1(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_1_loop):
+ .p2align 4
+L(sh_1_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -399,8 +459,7 @@ L(shl_1_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_1_end)
+ jb L(sh_1_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -411,30 +470,90 @@ L(shl_1_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_1_no_prefetch_loop)
- jae L(shl_1_loop)
-
-L(shl_1_end):
+L(sh_1_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 1(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_2):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -2(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -2(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_2_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl2LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 14(%eax), %xmm2
+ movaps 30(%eax), %xmm3
+ movaps 46(%eax), %xmm4
+ movaps 62(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ palignr $2, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $2, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 14(%eax), %xmm2
+ movaps 30(%eax), %xmm3
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_2_no_prefetch):
+ lea -32(%ecx), %ecx
lea -2(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_2_loop):
+ .p2align 4
+L(sh_2_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -444,8 +563,7 @@ L(shl_2_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_2_end)
+ jb L(sh_2_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -456,30 +574,90 @@ L(shl_2_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_2_no_prefetch_loop)
- jae L(shl_2_loop)
-
-L(shl_2_end):
+L(sh_2_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 2(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_3):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -3(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -3(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_3_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl3LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 13(%eax), %xmm2
+ movaps 29(%eax), %xmm3
+ movaps 45(%eax), %xmm4
+ movaps 61(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ palignr $3, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $3, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 13(%eax), %xmm2
+ movaps 29(%eax), %xmm3
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_3_no_prefetch):
+ lea -32(%ecx), %ecx
lea -3(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_3_loop):
+ .p2align 4
+L(sh_3_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -490,7 +668,7 @@ L(shl_3_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jb L(shl_3_end)
+ jb L(sh_3_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -502,29 +680,90 @@ L(shl_3_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jae L(shl_3_loop)
+ jae L(sh_3_no_prefetch_loop)
-L(shl_3_end):
+L(sh_3_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 3(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_4):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -4(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -4(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_4_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl4LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 12(%eax), %xmm2
+ movaps 28(%eax), %xmm3
+ movaps 44(%eax), %xmm4
+ movaps 60(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ palignr $4, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $4, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 12(%eax), %xmm2
+ movaps 28(%eax), %xmm3
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_4_no_prefetch):
+ lea -32(%ecx), %ecx
lea -4(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_4_loop):
+ .p2align 4
+L(sh_4_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -535,7 +774,7 @@ L(shl_4_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jb L(shl_4_end)
+ jb L(sh_4_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -547,29 +786,90 @@ L(shl_4_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jae L(shl_4_loop)
+ jae L(sh_4_no_prefetch_loop)
-L(shl_4_end):
+L(sh_4_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 4(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_5):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -5(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -5(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_5_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl5LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 11(%eax), %xmm2
+ movaps 27(%eax), %xmm3
+ movaps 43(%eax), %xmm4
+ movaps 59(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $5, %xmm4, %xmm5
+ palignr $5, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $5, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl5LoopStart)
+
+L(Shl5LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 11(%eax), %xmm2
+ movaps 27(%eax), %xmm3
+ palignr $5, %xmm2, %xmm3
+ palignr $5, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_5_no_prefetch):
+ lea -32(%ecx), %ecx
lea -5(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_5_loop):
+ .p2align 4
+L(sh_5_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -580,7 +880,7 @@ L(shl_5_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jb L(shl_5_end)
+ jb L(sh_5_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -592,29 +892,90 @@ L(shl_5_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jae L(shl_5_loop)
+ jae L(sh_5_no_prefetch_loop)
-L(shl_5_end):
+L(sh_5_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 5(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_6):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -6(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -6(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_6_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl6LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 10(%eax), %xmm2
+ movaps 26(%eax), %xmm3
+ movaps 42(%eax), %xmm4
+ movaps 58(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $6, %xmm4, %xmm5
+ palignr $6, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $6, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl6LoopStart)
+
+L(Shl6LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 10(%eax), %xmm2
+ movaps 26(%eax), %xmm3
+ palignr $6, %xmm2, %xmm3
+ palignr $6, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_6_no_prefetch):
+ lea -32(%ecx), %ecx
lea -6(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_6_loop):
+ .p2align 4
+L(sh_6_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -625,7 +986,7 @@ L(shl_6_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jb L(shl_6_end)
+ jb L(sh_6_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -637,29 +998,90 @@ L(shl_6_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jae L(shl_6_loop)
+ jae L(sh_6_no_prefetch_loop)
-L(shl_6_end):
+L(sh_6_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 6(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_7):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -7(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -7(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_7_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl7LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 9(%eax), %xmm2
+ movaps 25(%eax), %xmm3
+ movaps 41(%eax), %xmm4
+ movaps 57(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $7, %xmm4, %xmm5
+ palignr $7, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $7, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl7LoopStart)
+
+L(Shl7LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 9(%eax), %xmm2
+ movaps 25(%eax), %xmm3
+ palignr $7, %xmm2, %xmm3
+ palignr $7, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_7_no_prefetch):
+ lea -32(%ecx), %ecx
lea -7(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_7_loop):
+ .p2align 4
+L(sh_7_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -669,8 +1091,7 @@ L(shl_7_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_7_end)
+ jb L(sh_7_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -681,30 +1102,90 @@ L(shl_7_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_7_no_prefetch_loop)
- jae L(shl_7_loop)
-
-L(shl_7_end):
+L(sh_7_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 7(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_8):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -8(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -8(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_8_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl8LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 8(%eax), %xmm2
+ movaps 24(%eax), %xmm3
+ movaps 40(%eax), %xmm4
+ movaps 56(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $8, %xmm4, %xmm5
+ palignr $8, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $8, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl8LoopStart)
+
+L(LoopLeave8):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 8(%eax), %xmm2
+ movaps 24(%eax), %xmm3
+ palignr $8, %xmm2, %xmm3
+ palignr $8, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_8_no_prefetch):
+ lea -32(%ecx), %ecx
lea -8(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_8_loop):
+ .p2align 4
+L(sh_8_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -714,8 +1195,7 @@ L(shl_8_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_8_end)
+ jb L(sh_8_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -726,30 +1206,91 @@ L(shl_8_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_8_no_prefetch_loop)
- jae L(shl_8_loop)
-
-L(shl_8_end):
+L(sh_8_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 8(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_9):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -9(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -9(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_9_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl9LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 7(%eax), %xmm2
+ movaps 23(%eax), %xmm3
+ movaps 39(%eax), %xmm4
+ movaps 55(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $9, %xmm4, %xmm5
+ palignr $9, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $9, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $9, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl9LoopStart)
+
+L(Shl9LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 7(%eax), %xmm2
+ movaps 23(%eax), %xmm3
+ palignr $9, %xmm2, %xmm3
+ palignr $9, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_9_no_prefetch):
+ lea -32(%ecx), %ecx
lea -9(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_9_loop):
+ .p2align 4
+L(sh_9_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -759,8 +1300,7 @@ L(shl_9_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_9_end)
+ jb L(sh_9_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -771,30 +1311,91 @@ L(shl_9_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_9_no_prefetch_loop)
+
+L(sh_9_end_no_prefetch_loop):
+ lea 32(%ecx), %ecx
+ add %ecx, %edi
+ add %edi, %edx
+ lea 9(%edi, %eax), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(shl_10):
+# ifndef USE_AS_MEMMOVE
+ movaps -10(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -10(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_10_no_prefetch)
- jae L(shl_9_loop)
+ lea -64(%ecx), %ecx
-L(shl_9_end):
- lea 32(%ecx), %ecx
- add %ecx, %edi
- add %edi, %edx
- lea 9(%edi, %eax), %eax
+ .p2align 4
+L(Shl10LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 6(%eax), %xmm2
+ movaps 22(%eax), %xmm3
+ movaps 38(%eax), %xmm4
+ movaps 54(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $10, %xmm4, %xmm5
+ palignr $10, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $10, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $10, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl10LoopStart)
+
+L(Shl10LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 6(%eax), %xmm2
+ movaps 22(%eax), %xmm3
+ palignr $10, %xmm2, %xmm3
+ palignr $10, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
-L(shl_10):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_10_no_prefetch):
+ lea -32(%ecx), %ecx
lea -10(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_10_loop):
+ .p2align 4
+L(sh_10_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -804,8 +1405,7 @@ L(shl_10_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_10_end)
+ jb L(sh_10_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -816,30 +1416,91 @@ L(shl_10_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_10_no_prefetch_loop)
- jae L(shl_10_loop)
-
-L(shl_10_end):
+L(sh_10_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 10(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_11):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -11(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -11(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_11_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl11LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 5(%eax), %xmm2
+ movaps 21(%eax), %xmm3
+ movaps 37(%eax), %xmm4
+ movaps 53(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $11, %xmm4, %xmm5
+ palignr $11, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $11, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $11, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl11LoopStart)
+
+L(Shl11LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 5(%eax), %xmm2
+ movaps 21(%eax), %xmm3
+ palignr $11, %xmm2, %xmm3
+ palignr $11, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_11_no_prefetch):
+ lea -32(%ecx), %ecx
lea -11(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_11_loop):
+ .p2align 4
+L(sh_11_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -849,8 +1510,7 @@ L(shl_11_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_11_end)
+ jb L(sh_11_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -861,30 +1521,91 @@ L(shl_11_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_11_no_prefetch_loop)
- jae L(shl_11_loop)
-
-L(shl_11_end):
+L(sh_11_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 11(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_12):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -12(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -12(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_12_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl12LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 4(%eax), %xmm2
+ movaps 20(%eax), %xmm3
+ movaps 36(%eax), %xmm4
+ movaps 52(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $12, %xmm4, %xmm5
+ palignr $12, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $12, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $12, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl12LoopStart)
+
+L(Shl12LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 4(%eax), %xmm2
+ movaps 20(%eax), %xmm3
+ palignr $12, %xmm2, %xmm3
+ palignr $12, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_12_no_prefetch):
+ lea -32(%ecx), %ecx
lea -12(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_12_loop):
+ .p2align 4
+L(sh_12_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -894,8 +1615,7 @@ L(shl_12_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_12_end)
+ jb L(sh_12_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -906,30 +1626,91 @@ L(shl_12_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_12_no_prefetch_loop)
- jae L(shl_12_loop)
-
-L(shl_12_end):
+L(sh_12_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 12(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_13):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -13(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -13(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_13_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl13LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 3(%eax), %xmm2
+ movaps 19(%eax), %xmm3
+ movaps 35(%eax), %xmm4
+ movaps 51(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $13, %xmm4, %xmm5
+ palignr $13, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $13, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $13, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl13LoopStart)
+
+L(Shl13LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 3(%eax), %xmm2
+ movaps 19(%eax), %xmm3
+ palignr $13, %xmm2, %xmm3
+ palignr $13, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_13_no_prefetch):
+ lea -32(%ecx), %ecx
lea -13(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_13_loop):
+ .p2align 4
+L(sh_13_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -939,8 +1720,7 @@ L(shl_13_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_13_end)
+ jb L(sh_13_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -951,30 +1731,91 @@ L(shl_13_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_13_no_prefetch_loop)
- jae L(shl_13_loop)
-
-L(shl_13_end):
+L(sh_13_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 13(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_14):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -14(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -14(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_14_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl14LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 2(%eax), %xmm2
+ movaps 18(%eax), %xmm3
+ movaps 34(%eax), %xmm4
+ movaps 50(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $14, %xmm4, %xmm5
+ palignr $14, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $14, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $14, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl14LoopStart)
+
+L(Shl14LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 2(%eax), %xmm2
+ movaps 18(%eax), %xmm3
+ palignr $14, %xmm2, %xmm3
+ palignr $14, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_14_no_prefetch):
+ lea -32(%ecx), %ecx
lea -14(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_14_loop):
+ .p2align 4
+L(sh_14_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -984,8 +1825,7 @@ L(shl_14_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_14_end)
+ jb L(sh_14_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -996,30 +1836,91 @@ L(shl_14_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_14_no_prefetch_loop)
- jae L(shl_14_loop)
-
-L(shl_14_end):
+L(sh_14_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 14(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_15):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -15(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -15(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_15_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl15LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 1(%eax), %xmm2
+ movaps 17(%eax), %xmm3
+ movaps 33(%eax), %xmm4
+ movaps 49(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $15, %xmm4, %xmm5
+ palignr $15, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $15, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $15, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl15LoopStart)
+
+L(Shl15LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 1(%eax), %xmm2
+ movaps 17(%eax), %xmm3
+ palignr $15, %xmm2, %xmm3
+ palignr $15, %xmm1, %xmm2
+
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_15_no_prefetch):
+ lea -32(%ecx), %ecx
lea -15(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_15_loop):
+ .p2align 4
+L(sh_15_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -1029,8 +1930,7 @@ L(shl_15_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_15_end)
+ jb L(sh_15_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -1041,19 +1941,27 @@ L(shl_15_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_15_no_prefetch_loop)
- jae L(shl_15_loop)
-
-L(shl_15_end):
+L(sh_15_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 15(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+ .p2align 4
+L(shl_end_0):
+ lea 32(%ecx), %ecx
+ lea (%edx, %ecx), %edx
+ lea (%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- ALIGN (4)
+ .p2align 4
L(fwd_write_44bytes):
movq -44(%eax), %xmm0
movq %xmm0, -44(%edx)
@@ -1072,16 +1980,16 @@ L(fwd_write_12bytes):
L(fwd_write_4bytes):
movl -4(%eax), %ecx
movl %ecx, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_40bytes):
movq -40(%eax), %xmm0
movq %xmm0, -40(%edx)
@@ -1098,31 +2006,31 @@ L(fwd_write_8bytes):
movq -8(%eax), %xmm0
movq %xmm0, -8(%edx)
L(fwd_write_0bytes):
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_5bytes):
movl -5(%eax), %ecx
movl -4(%eax), %eax
movl %ecx, -5(%edx)
movl %eax, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_45bytes):
movq -45(%eax), %xmm0
movq %xmm0, -45(%edx)
@@ -1142,16 +2050,16 @@ L(fwd_write_13bytes):
movl %ecx, -5(%edx)
movzbl -1(%eax), %ecx
movb %cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_41bytes):
movq -41(%eax), %xmm0
movq %xmm0, -41(%edx)
@@ -1170,16 +2078,16 @@ L(fwd_write_9bytes):
L(fwd_write_1bytes):
movzbl -1(%eax), %ecx
movb %cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_46bytes):
movq -46(%eax), %xmm0
movq %xmm0, -46(%edx)
@@ -1200,16 +2108,16 @@ L(fwd_write_6bytes):
movl %ecx, -6(%edx)
movzwl -2(%eax), %ecx
movw %cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_42bytes):
movq -42(%eax), %xmm0
movq %xmm0, -42(%edx)
@@ -1228,16 +2136,16 @@ L(fwd_write_10bytes):
L(fwd_write_2bytes):
movzwl -2(%eax), %ecx
movw %cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_47bytes):
movq -47(%eax), %xmm0
movq %xmm0, -47(%edx)
@@ -1260,16 +2168,16 @@ L(fwd_write_7bytes):
movzbl -1(%eax), %eax
movw %cx, -3(%edx)
movb %al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_43bytes):
movq -43(%eax), %xmm0
movq %xmm0, -43(%edx)
@@ -1290,16 +2198,16 @@ L(fwd_write_3bytes):
movzbl -1(%eax), %eax
movw %cx, -3(%edx)
movb %al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_40bytes_align):
movdqa -40(%eax), %xmm0
movdqa %xmm0, -40(%edx)
@@ -1310,47 +2218,47 @@ L(fwd_write_8bytes_align):
movq -8(%eax), %xmm0
movq %xmm0, -8(%edx)
L(fwd_write_0bytes_align):
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_32bytes_align):
movdqa -32(%eax), %xmm0
movdqa %xmm0, -32(%edx)
L(fwd_write_16bytes_align):
movdqa -16(%eax), %xmm0
movdqa %xmm0, -16(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_5bytes_align):
movl -5(%eax), %ecx
movl -4(%eax), %eax
movl %ecx, -5(%edx)
movl %eax, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_45bytes_align):
movdqa -45(%eax), %xmm0
movdqa %xmm0, -45(%edx)
@@ -1364,16 +2272,16 @@ L(fwd_write_13bytes_align):
movl %ecx, -5(%edx)
movzbl -1(%eax), %ecx
movb %cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_37bytes_align):
movdqa -37(%eax), %xmm0
movdqa %xmm0, -37(%edx)
@@ -1384,16 +2292,16 @@ L(fwd_write_21bytes_align):
movl %ecx, -5(%edx)
movzbl -1(%eax), %ecx
movb %cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_41bytes_align):
movdqa -41(%eax), %xmm0
movdqa %xmm0, -41(%edx)
@@ -1406,16 +2314,16 @@ L(fwd_write_9bytes_align):
L(fwd_write_1bytes_align):
movzbl -1(%eax), %ecx
movb %cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_33bytes_align):
movdqa -33(%eax), %xmm0
movdqa %xmm0, -33(%edx)
@@ -1424,16 +2332,16 @@ L(fwd_write_17bytes_align):
movdqa %xmm0, -17(%edx)
movzbl -1(%eax), %ecx
movb %cl, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_46bytes_align):
movdqa -46(%eax), %xmm0
movdqa %xmm0, -46(%edx)
@@ -1448,16 +2356,16 @@ L(fwd_write_6bytes_align):
movl %ecx, -6(%edx)
movzwl -2(%eax), %ecx
movw %cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_38bytes_align):
movdqa -38(%eax), %xmm0
movdqa %xmm0, -38(%edx)
@@ -1468,16 +2376,16 @@ L(fwd_write_22bytes_align):
movl %ecx, -6(%edx)
movzwl -2(%eax), %ecx
movw %cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_42bytes_align):
movdqa -42(%eax), %xmm0
movdqa %xmm0, -42(%edx)
@@ -1490,16 +2398,16 @@ L(fwd_write_10bytes_align):
L(fwd_write_2bytes_align):
movzwl -2(%eax), %ecx
movw %cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_34bytes_align):
movdqa -34(%eax), %xmm0
movdqa %xmm0, -34(%edx)
@@ -1508,16 +2416,16 @@ L(fwd_write_18bytes_align):
movdqa %xmm0, -18(%edx)
movzwl -2(%eax), %ecx
movw %cx, -2(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_47bytes_align):
movdqa -47(%eax), %xmm0
movdqa %xmm0, -47(%edx)
@@ -1534,16 +2442,16 @@ L(fwd_write_7bytes_align):
movzbl -1(%eax), %eax
movw %cx, -3(%edx)
movb %al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_39bytes_align):
movdqa -39(%eax), %xmm0
movdqa %xmm0, -39(%edx)
@@ -1556,16 +2464,16 @@ L(fwd_write_23bytes_align):
movzbl -1(%eax), %eax
movw %cx, -3(%edx)
movb %al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_43bytes_align):
movdqa -43(%eax), %xmm0
movdqa %xmm0, -43(%edx)
@@ -1580,16 +2488,16 @@ L(fwd_write_3bytes_align):
movzbl -1(%eax), %eax
movw %cx, -3(%edx)
movb %al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_35bytes_align):
movdqa -35(%eax), %xmm0
movdqa %xmm0, -35(%edx)
@@ -1600,16 +2508,16 @@ L(fwd_write_19bytes_align):
movzbl -1(%eax), %eax
movw %cx, -3(%edx)
movb %al, -1(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_44bytes_align):
movdqa -44(%eax), %xmm0
movdqa %xmm0, -44(%edx)
@@ -1622,16 +2530,16 @@ L(fwd_write_12bytes_align):
L(fwd_write_4bytes_align):
movl -4(%eax), %ecx
movl %ecx, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(fwd_write_36bytes_align):
movdqa -36(%eax), %xmm0
movdqa %xmm0, -36(%edx)
@@ -1640,27 +2548,31 @@ L(fwd_write_20bytes_align):
movdqa %xmm0, -20(%edx)
movl -4(%eax), %ecx
movl %ecx, -4(%edx)
-#ifndef USE_AS_BCOPY
-# ifdef USE_AS_MEMPCPY
+# ifndef USE_AS_BCOPY
+# ifdef USE_AS_MEMPCPY
movl %edx, %eax
-# else
+# else
movl DEST(%esp), %eax
+# endif
# endif
-#endif
RETURN_END
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(large_page):
movdqu (%eax), %xmm1
+# ifdef USE_AS_MEMMOVE
+ movl DEST+4(%esp), %edi
+ movdqu %xmm0, (%edi)
+# endif
lea 16(%eax), %eax
- movdqu %xmm0, (%esi)
movntdq %xmm1, (%edx)
lea 16(%edx), %edx
- POP (%esi)
lea -0x90(%ecx), %ecx
POP (%edi)
+
+ .p2align 4
L(large_page_loop):
movdqu (%eax), %xmm0
movdqu 0x10(%eax), %xmm1
@@ -1715,8 +2627,7 @@ L(large_page_less_32bytes):
sfence
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
- ALIGN (4)
+ .p2align 4
L(bk_write_44bytes):
movq 36(%eax), %xmm0
movq %xmm0, 36(%edx)
@@ -1736,16 +2647,16 @@ L(bk_write_4bytes):
movl (%eax), %ecx
movl %ecx, (%edx)
L(bk_write_0bytes):
-#ifndef USE_AS_BCOPY
+# ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifdef USE_AS_MEMPCPY
movl LEN(%esp), %ecx
add %ecx, %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(bk_write_40bytes):
movq 32(%eax), %xmm0
movq %xmm0, 32(%edx)
@@ -1761,16 +2672,16 @@ L(bk_write_16bytes):
L(bk_write_8bytes):
movq (%eax), %xmm0
movq %xmm0, (%edx)
-#ifndef USE_AS_BCOPY
+# ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifdef USE_AS_MEMPCPY
movl LEN(%esp), %ecx
add %ecx, %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(bk_write_45bytes):
movq 37(%eax), %xmm0
movq %xmm0, 37(%edx)
@@ -1792,16 +2703,16 @@ L(bk_write_5bytes):
L(bk_write_1bytes):
movzbl (%eax), %ecx
movb %cl, (%edx)
-#ifndef USE_AS_BCOPY
+# ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifdef USE_AS_MEMPCPY
movl LEN(%esp), %ecx
add %ecx, %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(bk_write_41bytes):
movq 33(%eax), %xmm0
movq %xmm0, 33(%edx)
@@ -1819,16 +2730,16 @@ L(bk_write_9bytes):
movq %xmm0, 1(%edx)
movzbl (%eax), %ecx
movb %cl, (%edx)
-#ifndef USE_AS_BCOPY
+# ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifdef USE_AS_MEMPCPY
movl LEN(%esp), %ecx
add %ecx, %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(bk_write_46bytes):
movq 38(%eax), %xmm0
movq %xmm0, 38(%edx)
@@ -1849,16 +2760,16 @@ L(bk_write_6bytes):
movl %ecx, 2(%edx)
movzwl (%eax), %ecx
movw %cx, (%edx)
-#ifndef USE_AS_BCOPY
+# ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifdef USE_AS_MEMPCPY
movl LEN(%esp), %ecx
add %ecx, %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(bk_write_42bytes):
movq 34(%eax), %xmm0
movq %xmm0, 34(%edx)
@@ -1877,16 +2788,16 @@ L(bk_write_10bytes):
L(bk_write_2bytes):
movzwl (%eax), %ecx
movw %cx, (%edx)
-#ifndef USE_AS_BCOPY
+# ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifdef USE_AS_MEMPCPY
movl LEN(%esp), %ecx
add %ecx, %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(bk_write_47bytes):
movq 39(%eax), %xmm0
movq %xmm0, 39(%edx)
@@ -1909,16 +2820,16 @@ L(bk_write_7bytes):
movw %cx, 1(%edx)
movzbl (%eax), %eax
movb %al, (%edx)
-#ifndef USE_AS_BCOPY
+# ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifdef USE_AS_MEMPCPY
movl LEN(%esp), %ecx
add %ecx, %eax
+# endif
# endif
-#endif
RETURN
- ALIGN (4)
+ .p2align 4
L(bk_write_43bytes):
movq 35(%eax), %xmm0
movq %xmm0, 35(%edx)
@@ -1939,18 +2850,18 @@ L(bk_write_3bytes):
movw %cx, 1(%edx)
movzbl (%eax), %eax
movb %al, (%edx)
-#ifndef USE_AS_BCOPY
+# ifndef USE_AS_BCOPY
movl DEST(%esp), %eax
-# ifdef USE_AS_MEMPCPY
+# ifdef USE_AS_MEMPCPY
movl LEN(%esp), %ecx
add %ecx, %eax
+# endif
# endif
-#endif
RETURN_END
.pushsection .rodata.ssse3,"a",@progbits
- ALIGN (2)
+ .p2align 2
L(table_48bytes_fwd):
.int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
.int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
@@ -2001,7 +2912,7 @@ L(table_48bytes_fwd):
.int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
.int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
- ALIGN (2)
+ .p2align 2
L(table_48bytes_fwd_align):
.int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
.int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
@@ -2052,7 +2963,7 @@ L(table_48bytes_fwd_align):
.int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
.int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
- ALIGN (2)
+ .p2align 2
L(shl_table):
.int JMPTBL (L(shl_0), L(shl_table))
.int JMPTBL (L(shl_1), L(shl_table))
@@ -2071,7 +2982,7 @@ L(shl_table):
.int JMPTBL (L(shl_14), L(shl_table))
.int JMPTBL (L(shl_15), L(shl_table))
- ALIGN (2)
+ .p2align 2
L(table_48_bytes_bwd):
.int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
.int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
@@ -2124,13 +3035,13 @@ L(table_48_bytes_bwd):
.popsection
-#ifdef USE_AS_MEMMOVE
- ALIGN (4)
+# ifdef USE_AS_MEMMOVE
+ .p2align 4
L(copy_backward):
- PUSH (%esi)
- movl %eax, %esi
+ PUSH (%edi)
+ movl %eax, %edi
lea (%ecx,%edx,1),%edx
- lea (%ecx,%esi,1),%esi
+ lea (%ecx,%edi,1),%edi
testl $0x3, %edx
jnz L(bk_align)
@@ -2145,52 +3056,53 @@ L(bk_write_64bytesless):
L(bk_write_more32bytes):
/* Copy 32 bytes at a time. */
sub $32, %ecx
- movq -8(%esi), %xmm0
+ movq -8(%edi), %xmm0
movq %xmm0, -8(%edx)
- movq -16(%esi), %xmm0
+ movq -16(%edi), %xmm0
movq %xmm0, -16(%edx)
- movq -24(%esi), %xmm0
+ movq -24(%edi), %xmm0
movq %xmm0, -24(%edx)
- movq -32(%esi), %xmm0
+ movq -32(%edi), %xmm0
movq %xmm0, -32(%edx)
sub $32, %edx
- sub $32, %esi
+ sub $32, %edi
L(bk_write_less32bytes):
- movl %esi, %eax
+ movl %edi, %eax
sub %ecx, %edx
sub %ecx, %eax
- POP (%esi)
+ POP (%edi)
L(bk_write_less32bytes_2):
BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
- CFI_PUSH (%esi)
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(bk_align):
cmp $8, %ecx
jbe L(bk_write_less32bytes)
testl $1, %edx
/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
- then (EDX & 2) must be != 0. */
+ then (EDX & 2) must be != 0. */
jz L(bk_got2)
- sub $1, %esi
+ sub $1, %edi
sub $1, %ecx
sub $1, %edx
- movzbl (%esi), %eax
+ movzbl (%edi), %eax
movb %al, (%edx)
testl $2, %edx
jz L(bk_aligned_4)
L(bk_got2):
- sub $2, %esi
+ sub $2, %edi
sub $2, %ecx
sub $2, %edx
- movzwl (%esi), %eax
+ movzwl (%edi), %eax
movw %ax, (%edx)
jmp L(bk_aligned_4)
- ALIGN (4)
+ .p2align 4
L(bk_write_more64bytes):
/* Check alignment of last byte. */
testl $15, %edx
@@ -2198,51 +3110,52 @@ L(bk_write_more64bytes):
/* EDX is aligned 4 bytes, but not 16 bytes. */
L(bk_ssse3_align):
- sub $4, %esi
+ sub $4, %edi
sub $4, %ecx
sub $4, %edx
- movl (%esi), %eax
+ movl (%edi), %eax
movl %eax, (%edx)
testl $15, %edx
jz L(bk_ssse3_cpy_pre)
- sub $4, %esi
+ sub $4, %edi
sub $4, %ecx
sub $4, %edx
- movl (%esi), %eax
+ movl (%edi), %eax
movl %eax, (%edx)
testl $15, %edx
jz L(bk_ssse3_cpy_pre)
- sub $4, %esi
+ sub $4, %edi
sub $4, %ecx
sub $4, %edx
- movl (%esi), %eax
+ movl (%edi), %eax
movl %eax, (%edx)
L(bk_ssse3_cpy_pre):
cmp $64, %ecx
jb L(bk_write_more32bytes)
+ .p2align 4
L(bk_ssse3_cpy):
- sub $64, %esi
+ sub $64, %edi
sub $64, %ecx
sub $64, %edx
- movdqu 0x30(%esi), %xmm3
+ movdqu 0x30(%edi), %xmm3
movdqa %xmm3, 0x30(%edx)
- movdqu 0x20(%esi), %xmm2
+ movdqu 0x20(%edi), %xmm2
movdqa %xmm2, 0x20(%edx)
- movdqu 0x10(%esi), %xmm1
+ movdqu 0x10(%edi), %xmm1
movdqa %xmm1, 0x10(%edx)
- movdqu (%esi), %xmm0
+ movdqu (%edi), %xmm0
movdqa %xmm0, (%edx)
cmp $64, %ecx
jae L(bk_ssse3_cpy)
jmp L(bk_write_64bytesless)
-#endif
+# endif
END (MEMCPY)
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 7 +
sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 1991 ++++++++++++++++++++--------
2 files changed, 1459 insertions(+), 539 deletions(-)
hooks/post-receive
--
GNU C Library master sources