This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
GNU C Library master sources branch, master, updated. glibc-2.11-227-g3093e0c

From: drepper at sourceware dot org
To: glibc-cvs at sourceware dot org
Date: 25 Feb 2010 02:26:53 -0000
Subject: GNU C Library master sources branch, master, updated. glibc-2.11-227-g3093e0c
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master, has been updated
       via  3093e0c713306755b364e59393e2ca18706d8a47 (commit)
       via  a0ac24d98ace90d1ccba6a2f3e7d55600f2fdb6e (commit)
       via  cc50f1a4b458f769ceb72d88bb78c8429361fec1 (commit)
      from  7ca890b88e6ab7624afb1742a9fffb37ad5b3fc3 (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3093e0c713306755b364e59393e2ca18706d8a47

commit 3093e0c713306755b364e59393e2ca18706d8a47
Author: H.J. Lu <hongjiu.lu@intel.com>
Date:   Wed Feb 24 18:26:30 2010 -0800

    Fix issues in x86 memcpy-ssse3-rep.S

diff --git a/ChangeLog b/ChangeLog
index 2932b05..ec890ea 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
 2010-02-24  H.J. Lu  <hongjiu.lu@intel.com>
 
+	* sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
+	(bk_write_less32bytes_2): Renamed to ...
+	(bk_write_less48bytes): This.
+	Use unsigned conditional jumps.
+	Correct unwind info.
+	Use add/sub instead of lea if possible.
+	(shl_0_gobble_cache_loop_tail): Removed.
+	(large_page): Properly adjust ECX.
+
 	* sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Use unsigned
 	conditional jumps.
 	Correct unwind info.
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
index b26037d..48a109c 100644
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S
@@ -127,10 +127,8 @@ ENTRY (MEMCPY)
 	cmp	%eax, %edx
 	jb	L(copy_forward)
 	je	L(fwd_write_0bytes)
-	cmp	$32, %ecx
-	jge	L(memmove_bwd)
-	jmp	L(bk_write_less32bytes_2)
-L(memmove_bwd):
+	cmp	$48, %ecx
+	jb	L(bk_write_less48bytes)
 	add	%ecx, %eax
 	cmp	%eax, %edx
 	movl	SRC(%esp), %eax
@@ -139,12 +137,12 @@ L(memmove_bwd):
 L(copy_forward):
 #endif
 	cmp	$48, %ecx
-	jge	L(48bytesormore)
+	jae	L(48bytesormore)
 
 L(fwd_write_less32bytes):
 #ifndef USE_AS_MEMMOVE
 	cmp	%dl, %al
-	jl	L(bk_write)
+	jb	L(bk_write)
 #endif
 	add	%ecx, %edx
 	add	%ecx, %eax
@@ -162,6 +160,7 @@ L(48bytesormore):
 	movl	%edx, %edi
 	and	$-16, %edx
 	PUSH (%esi)
+	cfi_remember_state
 	add	$16, %edx
 	movl	%edi, %esi
 	sub	%edx, %edi
@@ -181,7 +180,7 @@ L(48bytesormore):
 #endif
 
 	mov	%eax, %edi
-	jge	L(large_page)
+	jae	L(large_page)
 	and	$0xf, %edi
 	jz	L(shl_0)
 
@@ -201,7 +200,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -209,7 +208,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -217,7 +216,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -234,6 +233,8 @@ L(shl_0_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 L(shl_0_gobble):
 
 #ifdef DATA_CACHE_SIZE_HALF
@@ -251,8 +252,8 @@ L(shl_0_gobble):
 	shr	$3, %esi
 	sub	%esi, %edi
 	cmp	%edi, %ecx
-	jge	L(shl_0_gobble_mem_start)
-	lea	-128(%ecx), %ecx
+	jae	L(shl_0_gobble_mem_start)
+	sub	$128, %ecx
 	ALIGN (4)
 L(shl_0_gobble_cache_loop):
 	movdqa	(%eax), %xmm0
@@ -275,11 +276,10 @@ L(shl_0_gobble_cache_loop):
 	movaps	%xmm7, 0x70(%edx)
 	lea	0x80(%edx), %edx
 
-	jge	L(shl_0_gobble_cache_loop)
-L(shl_0_gobble_cache_loop_tail):
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_cache_less_64bytes)
+	jae	L(shl_0_gobble_cache_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_cache_less_64bytes)
 
 	movdqa	(%eax), %xmm0
 	sub	$0x40, %ecx
@@ -297,7 +297,7 @@ L(shl_0_gobble_cache_loop_tail):
 	add	$0x40, %edx
 L(shl_0_cache_less_64bytes):
 	cmp	$0x20, %ecx
-	jl	L(shl_0_cache_less_32bytes)
+	jb	L(shl_0_cache_less_32bytes)
 	movdqa	(%eax), %xmm0
 	sub	$0x20, %ecx
 	movdqa	0x10(%eax), %xmm1
@@ -307,7 +307,7 @@ L(shl_0_cache_less_64bytes):
 	add	$0x20, %edx
 L(shl_0_cache_less_32bytes):
 	cmp	$0x10, %ecx
-	jl	L(shl_0_cache_less_16bytes)
+	jb	L(shl_0_cache_less_16bytes)
 	sub	$0x10, %ecx
 	movdqa	(%eax), %xmm0
 	add	$0x10, %eax
@@ -320,12 +320,13 @@ L(shl_0_cache_less_16bytes):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_0_gobble_mem_start):
 	cmp	%al, %dl
 	je	L(copy_page_by_rep)
-	lea	-128(%ecx), %ecx
+	sub	$128, %ecx
 L(shl_0_gobble_mem_loop):
 	prefetchnta 0x1c0(%eax)
 	prefetchnta 0x280(%eax)
@@ -352,10 +353,10 @@ L(shl_0_gobble_mem_loop):
 	movaps	%xmm7, 0x70(%edx)
 	lea	0x80(%edx), %edx
 
-	jge	L(shl_0_gobble_mem_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(shl_0_mem_less_64bytes)
+	jae	L(shl_0_gobble_mem_loop)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(shl_0_mem_less_64bytes)
 
 	movdqa	(%eax), %xmm0
 	sub	$0x40, %ecx
@@ -373,7 +374,7 @@ L(shl_0_gobble_mem_loop):
 	add	$0x40, %edx
 L(shl_0_mem_less_64bytes):
 	cmp	$0x20, %ecx
-	jl	L(shl_0_mem_less_32bytes)
+	jb	L(shl_0_mem_less_32bytes)
 	movdqa	(%eax), %xmm0
 	sub	$0x20, %ecx
 	movdqa	0x10(%eax), %xmm1
@@ -383,7 +384,7 @@ L(shl_0_mem_less_64bytes):
 	add	$0x20, %edx
 L(shl_0_mem_less_32bytes):
 	cmp	$0x10, %ecx
-	jl	L(shl_0_mem_less_16bytes)
+	jb	L(shl_0_mem_less_16bytes)
 	sub	$0x10, %ecx
 	movdqa	(%eax), %xmm0
 	add	$0x10, %eax
@@ -396,14 +397,15 @@ L(shl_0_mem_less_16bytes):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_1):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-1(%eax), %eax
+	sub	$1, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_1_loop):
@@ -418,7 +420,7 @@ L(shl_1_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_1_end)
+	jb	L(shl_1_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -433,20 +435,22 @@ L(shl_1_loop):
 	jae	L(shl_1_loop)
 
 L(shl_1_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	1(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_2):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-2(%eax), %eax
+	sub	$2, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_2_loop):
@@ -461,7 +465,7 @@ L(shl_2_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_2_end)
+	jb	L(shl_2_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -476,20 +480,22 @@ L(shl_2_loop):
 	jae	L(shl_2_loop)
 
 L(shl_2_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	2(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_3):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-3(%eax), %eax
+	sub	$3, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_3_loop):
@@ -504,7 +510,7 @@ L(shl_3_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_3_end)
+	jb	L(shl_3_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -519,20 +525,22 @@ L(shl_3_loop):
 	jae	L(shl_3_loop)
 
 L(shl_3_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	3(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_4):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-4(%eax), %eax
+	sub	$4, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_4_loop):
@@ -547,7 +555,7 @@ L(shl_4_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_4_end)
+	jb	L(shl_4_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -562,20 +570,22 @@ L(shl_4_loop):
 	jae	L(shl_4_loop)
 
 L(shl_4_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	4(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_5):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-5(%eax), %eax
+	sub	$5, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_5_loop):
@@ -590,7 +600,7 @@ L(shl_5_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_5_end)
+	jb	L(shl_5_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -605,21 +615,22 @@ L(shl_5_loop):
 	jae	L(shl_5_loop)
 
 L(shl_5_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	5(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_6):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-6(%eax), %eax
+	sub	$6, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_6_loop):
@@ -634,7 +645,7 @@ L(shl_6_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_6_end)
+	jb	L(shl_6_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -649,20 +660,22 @@ L(shl_6_loop):
 	jae	L(shl_6_loop)
 
 L(shl_6_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	6(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_7):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-7(%eax), %eax
+	sub	$7, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_7_loop):
@@ -677,7 +690,7 @@ L(shl_7_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_7_end)
+	jb	L(shl_7_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -692,20 +705,22 @@ L(shl_7_loop):
 	jae	L(shl_7_loop)
 
 L(shl_7_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	7(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_8):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-8(%eax), %eax
+	sub	$8, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_8_loop):
@@ -720,7 +735,7 @@ L(shl_8_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_8_end)
+	jb	L(shl_8_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -735,20 +750,22 @@ L(shl_8_loop):
 	jae	L(shl_8_loop)
 
 L(shl_8_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	8(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_9):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-9(%eax), %eax
+	sub	$9, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_9_loop):
@@ -763,7 +780,7 @@ L(shl_9_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_9_end)
+	jb	L(shl_9_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -778,20 +795,22 @@ L(shl_9_loop):
 	jae	L(shl_9_loop)
 
 L(shl_9_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	9(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_10):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-10(%eax), %eax
+	sub	$10, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_10_loop):
@@ -806,7 +825,7 @@ L(shl_10_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_10_end)
+	jb	L(shl_10_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -821,20 +840,22 @@ L(shl_10_loop):
 	jae	L(shl_10_loop)
 
 L(shl_10_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	10(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_11):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-11(%eax), %eax
+	sub	$11, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_11_loop):
@@ -849,7 +870,7 @@ L(shl_11_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_11_end)
+	jb	L(shl_11_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -864,20 +885,22 @@ L(shl_11_loop):
 	jae	L(shl_11_loop)
 
 L(shl_11_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	11(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_12):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-12(%eax), %eax
+	sub	$12, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_12_loop):
@@ -892,7 +915,7 @@ L(shl_12_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_12_end)
+	jb	L(shl_12_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -907,20 +930,22 @@ L(shl_12_loop):
 	jae	L(shl_12_loop)
 
 L(shl_12_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	12(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_13):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-13(%eax), %eax
+	sub	$13, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_13_loop):
@@ -935,7 +960,7 @@ L(shl_13_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_13_end)
+	jb	L(shl_13_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -950,20 +975,22 @@ L(shl_13_loop):
 	jae	L(shl_13_loop)
 
 L(shl_13_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	13(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_14):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-14(%eax), %eax
+	sub	$14, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_14_loop):
@@ -978,7 +1005,7 @@ L(shl_14_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_14_end)
+	jb	L(shl_14_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -993,21 +1020,22 @@ L(shl_14_loop):
 	jae	L(shl_14_loop)
 
 L(shl_14_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	14(%edi, %eax), %eax
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_15):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
-	lea	-15(%eax), %eax
+	sub	$15, %eax
 	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
+	sub	$32, %ecx
 	movdqu	%xmm0, (%esi)
 	POP (%esi)
 L(shl_15_loop):
@@ -1022,7 +1050,7 @@ L(shl_15_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_15_end)
+	jb	L(shl_15_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -1037,7 +1065,7 @@ L(shl_15_loop):
 	jae	L(shl_15_loop)
 
 L(shl_15_end):
-	lea	32(%ecx), %ecx
+	add	$32, %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	15(%edi, %eax), %eax
@@ -1241,20 +1269,23 @@ L(fwd_write_3bytes):
 	movl	DEST(%esp), %eax
 # endif
 #endif
-	RETURN
+	RETURN_END
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(large_page):
 	movdqu	(%eax), %xmm1
-	lea	16(%eax), %eax
 	movdqu	%xmm0, (%esi)
 	movntdq	%xmm1, (%edx)
-	lea	16(%edx), %edx
+	add	$0x10, %eax
+	add	$0x10, %edx
+	sub	$0x10, %ecx
 	cmp	%al, %dl
 	je	L(copy_page_by_rep)
 L(large_page_loop_init):
 	POP (%esi)
-	lea	-0x90(%ecx), %ecx
+	sub	$0x80, %ecx
 	POP (%edi)
 L(large_page_loop):
 	prefetchnta	0x1c0(%eax)
@@ -1280,9 +1311,9 @@ L(large_page_loop):
 	movntdq	%xmm7, 0x70(%edx)
 	lea	0x80(%edx), %edx
 	jae	L(large_page_loop)
-	cmp	$-0x40, %ecx
-	lea	0x80(%ecx), %ecx
-	jl	L(large_page_less_64bytes)
+	add	$0x80, %ecx
+	cmp	$0x40, %ecx
+	jb	L(large_page_less_64bytes)
 
 	movdqu	(%eax), %xmm0
 	movdqu	0x10(%eax), %xmm1
@@ -1298,7 +1329,7 @@ L(large_page_loop):
 	sub	$0x40, %ecx
 L(large_page_less_64bytes):
 	cmp	$32, %ecx
-	jl	L(large_page_less_32bytes)
+	jb	L(large_page_less_32bytes)
 	movdqu	(%eax), %xmm0
 	movdqu	0x10(%eax), %xmm1
 	lea	0x20(%eax), %eax
@@ -1312,6 +1343,8 @@ L(large_page_less_32bytes):
 	sfence
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(copy_page_by_rep):
 	mov	%eax, %esi
@@ -1658,18 +1691,18 @@ L(table_48_bytes_bwd):
 L(copy_backward):
 	PUSH (%esi)
 	movl	%eax, %esi
-	lea	(%ecx,%edx,1),%edx
-	lea	(%ecx,%esi,1),%esi
+	add	%ecx, %edx
+	add	%ecx, %esi
 	testl	$0x3, %edx
 	jnz	L(bk_align)
 
 L(bk_aligned_4):
 	cmp	$64, %ecx
-	jge	L(bk_write_more64bytes)
+	jae	L(bk_write_more64bytes)
 
 L(bk_write_64bytesless):
 	cmp	$32, %ecx
-	jl	L(bk_write_less32bytes)
+	jb	L(bk_write_less32bytes)
 
 L(bk_write_more32bytes):
 	/* Copy 32 bytes at a time.  */
@@ -1698,13 +1731,14 @@ L(bk_write_less32bytes):
 	sub	%ecx, %edx
 	sub	%ecx, %eax
 	POP (%esi)
-L(bk_write_less32bytes_2):
+L(bk_write_less48bytes):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
 
+	CFI_PUSH (%esi)
 	ALIGN (4)
 L(bk_align):
 	cmp	$8, %ecx
-	jle	L(bk_write_less32bytes)
+	jbe	L(bk_write_less32bytes)
 	testl	$1, %edx
 	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
 	   then (EDX & 2) must be != 0.  */
@@ -1760,7 +1794,7 @@ L(bk_ssse3_align):
 
 L(bk_ssse3_cpy_pre):
 	cmp	$64, %ecx
-	jl	L(bk_write_more32bytes)
+	jb	L(bk_write_more32bytes)
 
 L(bk_ssse3_cpy):
 	sub	$64, %esi
@@ -1775,7 +1809,7 @@ L(bk_ssse3_cpy):
 	movdqu	(%esi), %xmm0
 	movdqa	%xmm0, (%edx)
 	cmp	$64, %ecx
-	jge	L(bk_ssse3_cpy)
+	jae	L(bk_ssse3_cpy)
 	jmp	L(bk_write_64bytesless)
 
 #endif

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a0ac24d98ace90d1ccba6a2f3e7d55600f2fdb6e

commit a0ac24d98ace90d1ccba6a2f3e7d55600f2fdb6e
Author: H.J. Lu <hongjiu.lu@intel.com>
Date:   Wed Feb 24 18:20:57 2010 -0800

    Fix issues in x86 memcpy-ssse3.S

diff --git a/ChangeLog b/ChangeLog
index 26429c6..2932b05 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2010-02-24  H.J. Lu  <hongjiu.lu@intel.com>
 
+	* sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Use unsigned
+	conditional jumps.
+	Correct unwind info.
+
 	* sysdeps/i386/i686/multiarch/memset-sse2-rep.S: Remove redundant
 	punpcklbw.
 	Use unsigned conditional jumps.
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
index 749c82d..ec9eeb9 100644
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -128,7 +128,7 @@ ENTRY (MEMCPY)
 	jb	L(copy_forward)
 	je	L(fwd_write_0bytes)
 	cmp	$32, %ecx
-	jge	L(memmove_bwd)
+	jae	L(memmove_bwd)
 	jmp	L(bk_write_less32bytes_2)
 L(memmove_bwd):
 	add	%ecx, %eax
@@ -139,12 +139,12 @@ L(memmove_bwd):
 L(copy_forward):
 #endif
 	cmp	$48, %ecx
-	jge	L(48bytesormore)
+	jae	L(48bytesormore)
 
 L(fwd_write_less32bytes):
 #ifndef USE_AS_MEMMOVE
 	cmp	%dl, %al
-	jl	L(bk_write)
+	jb	L(bk_write)
 #endif
 	add	%ecx, %edx
 	add	%ecx, %eax
@@ -162,6 +162,7 @@ L(48bytesormore):
 	movl	%edx, %edi
 	and	$-16, %edx
 	PUSH (%esi)
+	cfi_remember_state
 	add	$16, %edx
 	movl	%edi, %esi
 	sub	%edx, %edi
@@ -181,12 +182,14 @@ L(48bytesormore):
 #endif
 
 	mov	%eax, %edi
-	jge	L(large_page)
+	jae	L(large_page)
 	and	$0xf, %edi
 	jz	L(shl_0)
 
 	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_0):
 	movdqu	%xmm0, (%esi)
@@ -202,7 +205,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -210,7 +213,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -218,7 +221,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
-	jl	L(shl_0_end)
+	jb	L(shl_0_end)
 
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -234,6 +237,7 @@ L(shl_0_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
+	CFI_PUSH (%edi)
 L(shl_0_gobble):
 
 #ifdef DATA_CACHE_SIZE_HALF
@@ -250,7 +254,7 @@ L(shl_0_gobble):
 
 	POP (%edi)
 	lea	-128(%ecx), %ecx
-	jge	L(shl_0_gobble_mem_loop)
+	jae	L(shl_0_gobble_mem_loop)
 L(shl_0_gobble_cache_loop):
 	movdqa	(%eax), %xmm0
 	movdqa	0x10(%eax), %xmm1
@@ -272,8 +276,7 @@ L(shl_0_gobble_cache_loop):
 	movdqa	%xmm7, 0x70(%edx)
 	lea	0x80(%edx), %edx
 
-	jge	L(shl_0_gobble_cache_loop)
-L(shl_0_gobble_cache_loop_tail):
+	jae	L(shl_0_gobble_cache_loop)
 	cmp	$-0x40, %ecx
 	lea	0x80(%ecx), %ecx
 	jl	L(shl_0_cache_less_64bytes)
@@ -294,7 +297,7 @@ L(shl_0_gobble_cache_loop_tail):
 	add	$0x40, %edx
 L(shl_0_cache_less_64bytes):
 	cmp	$0x20, %ecx
-	jl	L(shl_0_cache_less_32bytes)
+	jb	L(shl_0_cache_less_32bytes)
 	movdqa	(%eax), %xmm0
 	sub	$0x20, %ecx
 	movdqa	0x10(%eax), %xmm1
@@ -304,7 +307,7 @@ L(shl_0_cache_less_64bytes):
 	add	$0x20, %edx
 L(shl_0_cache_less_32bytes):
 	cmp	$0x10, %ecx
-	jl	L(shl_0_cache_less_16bytes)
+	jb	L(shl_0_cache_less_16bytes)
 	sub	$0x10, %ecx
 	movdqa	(%eax), %xmm0
 	add	$0x10, %eax
@@ -342,7 +345,7 @@ L(shl_0_gobble_mem_loop):
 	movdqa	%xmm7, 0x70(%edx)
 	lea	0x80(%edx), %edx
 
-	jge	L(shl_0_gobble_mem_loop)
+	jae	L(shl_0_gobble_mem_loop)
 	cmp	$-0x40, %ecx
 	lea	0x80(%ecx), %ecx
 	jl	L(shl_0_mem_less_64bytes)
@@ -363,7 +366,7 @@ L(shl_0_gobble_mem_loop):
 	add	$0x40, %edx
 L(shl_0_mem_less_64bytes):
 	cmp	$0x20, %ecx
-	jl	L(shl_0_mem_less_32bytes)
+	jb	L(shl_0_mem_less_32bytes)
 	movdqa	(%eax), %xmm0
 	sub	$0x20, %ecx
 	movdqa	0x10(%eax), %xmm1
@@ -373,7 +376,7 @@ L(shl_0_mem_less_64bytes):
 	add	$0x20, %edx
 L(shl_0_mem_less_32bytes):
 	cmp	$0x10, %ecx
-	jl	L(shl_0_mem_less_16bytes)
+	jb	L(shl_0_mem_less_16bytes)
 	sub	$0x10, %ecx
 	movdqa	(%eax), %xmm0
 	add	$0x10, %eax
@@ -384,7 +387,8 @@ L(shl_0_mem_less_16bytes):
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_1):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -406,7 +410,7 @@ L(shl_1_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_1_end)
+	jb	L(shl_1_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -428,6 +432,8 @@ L(shl_1_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_2):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -449,7 +455,7 @@ L(shl_2_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_2_end)
+	jb	L(shl_2_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -471,6 +477,8 @@ L(shl_2_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_3):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -492,7 +500,7 @@ L(shl_3_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_3_end)
+	jb	L(shl_3_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -514,6 +522,8 @@ L(shl_3_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_4):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -535,7 +545,7 @@ L(shl_4_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_4_end)
+	jb	L(shl_4_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -557,6 +567,8 @@ L(shl_4_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_5):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -578,7 +590,7 @@ L(shl_5_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_5_end)
+	jb	L(shl_5_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -600,7 +612,8 @@ L(shl_5_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_6):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -622,7 +635,7 @@ L(shl_6_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_6_end)
+	jb	L(shl_6_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -644,6 +657,8 @@ L(shl_6_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_7):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -665,7 +680,7 @@ L(shl_7_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_7_end)
+	jb	L(shl_7_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -687,6 +702,8 @@ L(shl_7_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_8):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -708,7 +725,7 @@ L(shl_8_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_8_end)
+	jb	L(shl_8_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -730,6 +747,8 @@ L(shl_8_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_9):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -751,7 +770,7 @@ L(shl_9_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_9_end)
+	jb	L(shl_9_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -773,6 +792,8 @@ L(shl_9_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_10):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -794,7 +815,7 @@ L(shl_10_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_10_end)
+	jb	L(shl_10_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -816,6 +837,8 @@ L(shl_10_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_11):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -837,7 +860,7 @@ L(shl_11_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_11_end)
+	jb	L(shl_11_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -859,6 +882,8 @@ L(shl_11_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_12):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -880,7 +905,7 @@ L(shl_12_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_12_end)
+	jb	L(shl_12_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -902,6 +927,8 @@ L(shl_12_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_13):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -923,7 +950,7 @@ L(shl_13_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_13_end)
+	jb	L(shl_13_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -945,6 +972,8 @@ L(shl_13_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_14):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -966,7 +995,7 @@ L(shl_14_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_14_end)
+	jb	L(shl_14_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -988,7 +1017,8 @@ L(shl_14_end):
 	POP (%edi)
 	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
 
-
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(shl_15):
 	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
@@ -1010,7 +1040,7 @@ L(shl_15_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jl	L(shl_15_end)
+	jb	L(shl_15_end)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -1229,8 +1259,10 @@ L(fwd_write_3bytes):
 	movl	DEST(%esp), %eax
 # endif
 #endif
-	RETURN
+	RETURN_END
 
+	cfi_restore_state
+	cfi_remember_state
 	ALIGN (4)
 L(large_page):
 	movdqu	(%eax), %xmm1
@@ -1281,7 +1313,7 @@ L(large_page_loop):
 	sub	$0x40, %ecx
 L(large_page_less_64bytes):
 	cmp	$32, %ecx
-	jl	L(large_page_less_32bytes)
+	jb	L(large_page_less_32bytes)
 	movdqu	(%eax), %xmm0
 	movdqu	0x10(%eax), %xmm1
 	lea	0x20(%eax), %eax
@@ -1617,11 +1649,11 @@ L(copy_backward):
 
 L(bk_aligned_4):
 	cmp	$64, %ecx
-	jge	L(bk_write_more64bytes)
+	jae	L(bk_write_more64bytes)
 
 L(bk_write_64bytesless):
 	cmp	$32, %ecx
-	jl	L(bk_write_less32bytes)
+	jb	L(bk_write_less32bytes)
 
 L(bk_write_more32bytes):
 	/* Copy 32 bytes at a time.  */
@@ -1653,10 +1685,11 @@ L(bk_write_less32bytes):
 L(bk_write_less32bytes_2):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
 
+	CFI_PUSH (%esi)
 	ALIGN (4)
 L(bk_align):
 	cmp	$8, %ecx
-	jle	L(bk_write_less32bytes)
+	jbe	L(bk_write_less32bytes)
 	testl	$1, %edx
 	/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
 	   then (EDX & 2) must be != 0.  */
@@ -1712,7 +1745,7 @@ L(bk_ssse3_align):
 
 L(bk_ssse3_cpy_pre):
 	cmp	$64, %ecx
-	jl	L(bk_write_more32bytes)
+	jb	L(bk_write_more32bytes)
 
 L(bk_ssse3_cpy):
 	sub	$64, %esi
@@ -1727,7 +1760,7 @@ L(bk_ssse3_cpy):
 	movdqu	(%esi), %xmm0
 	movdqa	%xmm0, (%edx)
 	cmp	$64, %ecx
-	jge	L(bk_ssse3_cpy)
+	jae	L(bk_ssse3_cpy)
 	jmp	L(bk_write_64bytesless)
 
 #endif

http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cc50f1a4b458f769ceb72d88bb78c8429361fec1

commit cc50f1a4b458f769ceb72d88bb78c8429361fec1
Author: H.J. Lu <hongjiu.lu@intel.com>
Date:   Wed Feb 24 18:11:35 2010 -0800

    Fix issues in x86 memset-sse2.S/memset-sse2-rep.S

diff --git a/ChangeLog b/ChangeLog
index 25691cd..26429c6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2010-02-24  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* sysdeps/i386/i686/multiarch/memset-sse2-rep.S: Remove redundant
+	punpcklbw.
+	Use unsigned conditional jumps.
+	(128bytesormore_nt): Renamed to ...
+	(128bytesormore_endof_L1): This.
+	Use add instead of lea if possible.
+	Correct unwind info.
+	* sysdeps/i386/i686/multiarch/memset-sse2.S: Remove redundant
+	punpcklbw.
+	Use unsigned conditional jumps.
+	Use add instead of lea if possible.
+	Correct unwind info.
+
 2010-02-24  Ulrich Drepper  <drepper@redhat.com>
 
 	[BZ #11319]
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
index 84afffe..f9a0b13 100644
--- a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
+++ b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S
@@ -243,7 +243,6 @@ L(32bytesormore):
 	pxor	%xmm0, %xmm0
 #else
 	movd	%eax, %xmm0
-	punpcklbw %xmm0, %xmm0
 	pshufd	$0, %xmm0, %xmm0
 #endif
 	testl	$0xf, %edx
@@ -261,7 +260,7 @@ L(not_aligned_16):
 	ALIGN (4)
 L(aligned_16):
 	cmp	$128, %ecx
-	jge	L(128bytesormore)
+	jae	L(128bytesormore)
 
 L(aligned_16_less128bytes):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
@@ -293,7 +292,7 @@ L(128bytesormore):
  * fast string will prefetch and combine data efficiently.
  */
 	cmp	%edi, %ecx
-	jae	L(128bytesormore_nt)
+	jae	L(128bytesormore_endof_L1)
 	subl	$128, %ecx
 L(128bytesormore_normal):
 	sub	$128, %ecx
@@ -306,7 +305,7 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jl	L(128bytesless_normal)
+	jb	L(128bytesless_normal)
 
 
 	sub	$128, %ecx
@@ -319,15 +318,16 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jge	L(128bytesormore_normal)
+	jae	L(128bytesormore_normal)
 
 L(128bytesless_normal):
 	POP (%edi)
-	lea	128(%ecx), %ecx
+	add	$128, %ecx
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
+	CFI_PUSH (%edi)
 	ALIGN (4)
-L(128bytesormore_nt):
+L(128bytesormore_endof_L1):
 	mov	%edx, %edi
 	mov	%ecx, %edx
 	shr	$2, %ecx
diff --git a/sysdeps/i386/i686/multiarch/memset-sse2.S b/sysdeps/i386/i686/multiarch/memset-sse2.S
index b2b9791..92ad601 100644
--- a/sysdeps/i386/i686/multiarch/memset-sse2.S
+++ b/sysdeps/i386/i686/multiarch/memset-sse2.S
@@ -243,7 +243,6 @@ L(32bytesormore):
 	pxor	%xmm0, %xmm0
 #else
 	movd	%eax, %xmm0
-	punpcklbw %xmm0, %xmm0
 	pshufd	$0, %xmm0, %xmm0
 #endif
 	testl	$0xf, %edx
@@ -261,7 +260,7 @@ L(not_aligned_16):
 	ALIGN (4)
 L(aligned_16):
 	cmp	$128, %ecx
-	jge	L(128bytesormore)
+	jae	L(128bytesormore)
 
 L(aligned_16_less128bytes):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
@@ -287,14 +286,17 @@ L(128bytesormore):
 
 #ifdef DATA_CACHE_SIZE
 	POP (%ebx)
+# define RESTORE_EBX_STATE CFI_PUSH (%ebx)
 	cmp	$DATA_CACHE_SIZE, %ecx
 #else
 # ifdef SHARED
+#  define RESTORE_EBX_STATE
 	call	__i686.get_pc_thunk.bx
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_data_cache_size@GOTOFF(%ebx), %ecx
 # else
 	POP (%ebx)
+#  define RESTORE_EBX_STATE CFI_PUSH (%ebx)
 	cmp	__x86_data_cache_size, %ecx
 # endif
 #endif
@@ -312,7 +314,7 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jl	L(128bytesless_normal)
+	jb	L(128bytesless_normal)
 
 
 	sub	$128, %ecx
@@ -325,10 +327,10 @@ L(128bytesormore_normal):
 	movdqa	%xmm0, 0x60(%edx)
 	movdqa	%xmm0, 0x70(%edx)
 	lea	128(%edx), %edx
-	jge	L(128bytesormore_normal)
+	jae	L(128bytesormore_normal)
 
 L(128bytesless_normal):
-	lea	128(%ecx), %ecx
+	add	$128, %ecx
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
 	ALIGN (4)
@@ -346,11 +348,12 @@ L(128bytes_L2_normal):
 	movaps	%xmm0, 0x70(%edx)
 	add	$128, %edx
 	cmp	$128, %ecx
-	jge	L(128bytes_L2_normal)
+	jae	L(128bytes_L2_normal)
 
 L(128bytesless_L2_normal):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes))
 
+	RESTORE_EBX_STATE
 L(128bytesormore_nt_start):
 	sub	%ebx, %ecx
 	ALIGN (4)
@@ -368,7 +371,7 @@ L(128bytesormore_shared_cache_loop):
 	movdqa	%xmm0, 0x70(%edx)
 	add	$0x80, %edx
 	cmp	$0x80, %ebx
-	jge	L(128bytesormore_shared_cache_loop)
+	jae	L(128bytesormore_shared_cache_loop)
 	cmp	$0x80, %ecx
 	jb	L(shared_cache_loop_end)
 	ALIGN (4)
@@ -384,7 +387,7 @@ L(128bytesormore_nt):
 	movntdq	%xmm0, 0x70(%edx)
 	add	$0x80, %edx
 	cmp	$0x80, %ecx
-	jge	L(128bytesormore_nt)
+	jae	L(128bytesormore_nt)
 	sfence
 L(shared_cache_loop_end):
 #if defined DATA_CACHE_SIZE || !defined SHARED

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                      |   28 +++
 sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S |  246 ++++++++++++++----------
 sysdeps/i386/i686/multiarch/memcpy-ssse3.S     |  113 +++++++----
 sysdeps/i386/i686/multiarch/memset-sse2-rep.S  |   14 +-
 sysdeps/i386/i686/multiarch/memset-sse2.S      |   19 +-
 5 files changed, 259 insertions(+), 161 deletions(-)


hooks/post-receive
-- 
GNU C Library master sources
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]