This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/erms/hybrid created. glibc-2.23-117-g0debc67
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 18 Mar 2016 16:34:30 -0000
- Subject: GNU C Library master sources branch hjl/erms/hybrid created. glibc-2.23-117-g0debc67
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/erms/hybrid has been created
at 0debc67b0128dab2a524b44387ff848bc01480bb (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0debc67b0128dab2a524b44387ff848bc01480bb
commit 0debc67b0128dab2a524b44387ff848bc01480bb
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 09:22:56 2016 -0700
Add __memset_sse2_erms and __memset_chk_sse2_erms
* sysdeps/x86_64/memset.S (__memset_chk_sse2_erms): New
function.
(__memset_sse2_erms): Likewise.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memset_chk_sse2_erms and
__memset_sse2_erms.
* sysdeps/x86_64/sysdep.h (REP_STOSB_THRESHOLD): New.
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 4cf0da0..71796a7 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -42,6 +42,26 @@ ENTRY(__memset_tail)
END(__memset_tail)
#endif
+#ifdef USE_MULTIARCH
+# if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memset_chk_sse2_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk_sse2_erms)
+# endif
+
+ENTRY (__memset_sse2_erms)
+ cmpq $REP_STOSB_THRESHOLD, %rdx
+ jbe L(memset_entry)
+ movq %rdx, %rcx
+ movzbl %sil, %eax
+ movq %rdi, %rdx
+ rep stosb
+ movq %rdx, %rax
+ ret
+END (__memset_sse2_erms)
+#endif
+
#if defined PIC && IS_IN (libc)
ENTRY_CHK (__memset_chk)
cmpq %rdx, %rcx
@@ -50,6 +70,7 @@ END_CHK (__memset_chk)
#endif
ENTRY (memset)
+L(memset_entry):
movd %esi, %xmm0
movq %rdi, %rax
punpcklbw %xmm0, %xmm0
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index ba3202e..543c637 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -89,6 +89,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_sse2)
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_sse2_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
@@ -103,6 +105,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memset.S. */
IFUNC_IMPL (i, name, memset,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2_erms)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX2_Usable),
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 2444d63..38d3a32 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -133,6 +133,9 @@ lose: \
/* Threshold to use Enhanced REP MOVSB. */
#define REP_MOVSB_THRESHOLD 2048
+/* Threshold to use Enhanced REP STOSB. */
+#define REP_STOSB_THRESHOLD 1024
+
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f47ec52f2506c944b514c7da14b54d6aee485350
commit f47ec52f2506c944b514c7da14b54d6aee485350
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 08:32:05 2016 -0700
Add sse2_unaligned_erms versions of memcpy/mempcpy
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memcpy_chk_sse2_unaligned_erms,
__memcpy_sse2_unaligned_erms, __mempcpy_chk_sse2_unaligned_erms
and __mempcpy_sse2_unaligned_erms.
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
(__mempcpy_chk_sse2_unaligned_erms): New function.
(__mempcpy_sse2_unaligned_erms): Likewise.
(__memcpy_chk_sse2_unaligned_erms): Likewise.
(__memcpy_sse2_unaligned_erms): Likewise.
* sysdeps/x86_64/sysdep.h (REP_MOVSB_THRESHOLD): New.
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 26b4137..ba3202e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -286,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
@@ -305,6 +307,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_avx512_no_vzeroupper)
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
@@ -327,6 +331,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
@@ -347,6 +353,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 947c50f..53e9464 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -23,6 +23,51 @@
#include "asm-syntax.h"
# ifdef SHARED
+ENTRY (__mempcpy_chk_sse2_unaligned_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_sse2_unaligned_erms)
+# endif
+
+ENTRY (__mempcpy_sse2_unaligned_erms)
+ mov %rdi, %rax
+ add %rdx, %rax
+ jmp L(start_erms)
+END (__mempcpy_sse2_unaligned_erms)
+
+# ifdef SHARED
+ENTRY (__memcpy_chk_sse2_unaligned_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk_sse2_unaligned_erms)
+# endif
+
+ENTRY(__memcpy_sse2_unaligned_erms)
+ movq %rdi, %rax
+L(start_erms):
+ testq %rdx, %rdx
+ je L(return)
+ cmpq $16, %rdx
+ jbe L(less_16)
+ cmpq $REP_MOVSB_THRESHOLD, %rdx
+ ja .Lerms
+ movdqu (%rsi), %xmm8
+ cmpq $32, %rdx
+ movdqu %xmm8, (%rdi)
+ movdqu -16(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -16(%rdi,%rdx)
+ ja .L31
+ ret
+
+ .p2align 4,,10
+ .p2align 4
+.Lerms:
+ mov %rdx, %rcx
+ rep movsb
+ ret
+END (__memcpy_sse2_unaligned_erms)
+
+# ifdef SHARED
ENTRY (__mempcpy_chk_sse2_unaligned)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index fbe3560..2444d63 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -130,6 +130,9 @@ lose: \
#define R14_LP r14
#define R15_LP r15
+/* Threshold to use Enhanced REP MOVSB. */
+#define REP_MOVSB_THRESHOLD 2048
+
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=53363d1b76a45543f1ac9c1854a17d0a90bb3cba
commit 53363d1b76a45543f1ac9c1854a17d0a90bb3cba
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Mar 7 05:47:26 2016 -0800
Enable __memcpy_chk_sse2_unaligned
Check Fast_Unaligned_Load for __memcpy_chk_sse2_unaligned. The new
selection order is:
1. __memcpy_chk_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
2. __memcpy_chk_sse2_unaligned if Fast_Unaligned_Load bit is set.
3. __memcpy_chk_sse2 if SSSE3 isn't available.
4. __memcpy_chk_ssse3_back if Fast_Copy_Backward bit is set.
5. __memcpy_chk_ssse3
[BZ #19776]
* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Check
Fast_Unaligned_Load to enable __memcpy_chk_sse2_unaligned.
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 0e21c09..9e218c0 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -35,22 +35,25 @@ ENTRY(__memcpy_chk)
jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
+ jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jz 1f
- leaq __memcpy_chk_avx512_no_vzeroupper(%rip), %rax
+ jz 1f
+ lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __memcpy_chk_sse2(%rip), %rax
+1: lea __memcpy_chk_avx_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ lea __memcpy_chk_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ lea __memcpy_chk_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __memcpy_chk_ssse3(%rip), %rax
+ lea __memcpy_chk_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __memcpy_chk_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __memcpy_chk_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __memcpy_chk_ssse3(%rip), %RAX_LP
2: ret
END(__memcpy_chk)
# else
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9615edc8835cb28b09d2941df19919ac7da29a38
commit 9615edc8835cb28b09d2941df19919ac7da29a38
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Mar 7 05:44:58 2016 -0800
Enable __mempcpy_chk_sse2_unaligned
Check Fast_Unaligned_Load for __mempcpy_chk_sse2_unaligned. The new
selection order is:
1. __mempcpy_chk_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
2. __mempcpy_chk_sse2_unaligned if Fast_Unaligned_Load bit is set.
3. __mempcpy_chk_sse2 if SSSE3 isn't available.
4. __mempcpy_chk_ssse3_back if Fast_Copy_Backward bit is set.
5. __mempcpy_chk_ssse3
[BZ #19776]
* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Check
Fast_Unaligned_Load to enable __mempcpy_chk_sse2_unaligned.
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index de888f3..7c888d3 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -38,19 +38,22 @@ ENTRY(__mempcpy_chk)
jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
jz 1f
- leaq __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+ lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __mempcpy_chk_sse2(%rip), %rax
+1: lea __mempcpy_chk_avx_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_chk_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_chk_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __mempcpy_chk_ssse3(%rip), %rax
+ lea __mempcpy_chk_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __mempcpy_chk_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __mempcpy_chk_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __mempcpy_chk_ssse3(%rip), %RAX_LP
2: ret
END(__mempcpy_chk)
# else
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e497a68bd68d08a2fffadbd9a5ed0c082cdc62e9
commit e497a68bd68d08a2fffadbd9a5ed0c082cdc62e9
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Mar 7 05:42:46 2016 -0800
Enable __mempcpy_sse2_unaligned
Check Fast_Unaligned_Load for __mempcpy_sse2_unaligned. The new
selection order is:
1. __mempcpy_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
2. __mempcpy_sse2_unaligned if Fast_Unaligned_Load bit is set.
3. __mempcpy_sse2 if SSSE3 isn't available.
4. __mempcpy_ssse3_back if Fast_Copy_Backward bit is set.
5. __mempcpy_ssse3
[BZ #19776]
* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Check
Fast_Unaligned_Load to enable __mempcpy_sse2_unaligned.
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b85cf27..05c1fc8 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -36,19 +36,22 @@ ENTRY(__mempcpy)
jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
jz 1f
- leaq __mempcpy_avx512_no_vzeroupper(%rip), %rax
+ lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __mempcpy_sse2(%rip), %rax
+1: lea __mempcpy_avx_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __mempcpy_ssse3(%rip), %rax
+ lea __mempcpy_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __mempcpy_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __mempcpy_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __mempcpy_ssse3(%rip), %RAX_LP
2: ret
END(__mempcpy)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f3397461af3990d1b147c7110b3fba6449b36400
commit f3397461af3990d1b147c7110b3fba6449b36400
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 17:06:41 2016 -0800
Add entry points for __mempcpy_sse2_unaligned and _chk functions
Add entry points for __mempcpy_chk_sse2_unaligned,
__mempcpy_sse2_unaligned and __memcpy_chk_sse2_unaligned.
[BZ #19776]
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memcpy_chk_sse2_unaligned,
__mempcpy_chk_sse2_unaligned and __mempcpy_sse2_unaligned.
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
(__mempcpy_chk_sse2_unaligned): New.
(__mempcpy_sse2_unaligned): Likewise.
(__memcpy_chk_sse2_unaligned): Likewise.
(L(start)): New label.
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b0d300d..26b4137 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -284,6 +284,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
@@ -323,6 +325,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
@@ -341,6 +345,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 335a498..947c50f 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -22,9 +22,29 @@
#include "asm-syntax.h"
+# ifdef SHARED
+ENTRY (__mempcpy_chk_sse2_unaligned)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_sse2_unaligned)
+# endif
+
+ENTRY (__mempcpy_sse2_unaligned)
+ mov %rdi, %rax
+ add %rdx, %rax
+ jmp L(start)
+END (__mempcpy_sse2_unaligned)
+
+# ifdef SHARED
+ENTRY (__memcpy_chk_sse2_unaligned)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk_sse2_unaligned)
+# endif
ENTRY(__memcpy_sse2_unaligned)
movq %rdi, %rax
+L(start):
testq %rdx, %rdx
je L(return)
cmpq $16, %rdx
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e923b259dfd17705954d098370b399c24dcef2cf
commit e923b259dfd17705954d098370b399c24dcef2cf
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 16:52:53 2016 -0800
Remove L(overlapping) from memcpy-sse2-unaligned.S
Since memcpy doesn't need to check overlapping source and destination,
we can remove L(overlapping).
[BZ #19776]
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
(L(overlapping)): Removed.
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 19d8aa6..335a498 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -25,12 +25,8 @@
ENTRY(__memcpy_sse2_unaligned)
movq %rdi, %rax
- movq %rsi, %r11
- leaq (%rdx,%rdx), %rcx
- subq %rdi, %r11
- subq %rdx, %r11
- cmpq %rcx, %r11
- jb L(overlapping)
+ testq %rdx, %rdx
+ je L(return)
cmpq $16, %rdx
jbe L(less_16)
movdqu (%rsi), %xmm8
@@ -89,45 +85,6 @@ L(loop):
cmpq %rcx, %rdx
jne L(loop)
ret
-L(overlapping):
- testq %rdx, %rdx
- .p2align 4,,5
- je L(return)
- movq %rdx, %r9
- leaq 16(%rsi), %rcx
- leaq 16(%rdi), %r8
- shrq $4, %r9
- movq %r9, %r11
- salq $4, %r11
- cmpq %rcx, %rdi
- setae %cl
- cmpq %r8, %rsi
- setae %r8b
- orl %r8d, %ecx
- cmpq $15, %rdx
- seta %r8b
- testb %r8b, %cl
- je .L21
- testq %r11, %r11
- je .L21
- xorl %ecx, %ecx
- xorl %r8d, %r8d
-.L7:
- movdqu (%rsi,%rcx), %xmm8
- addq $1, %r8
- movdqu %xmm8, (%rdi,%rcx)
- addq $16, %rcx
- cmpq %r8, %r9
- ja .L7
- cmpq %r11, %rdx
- je L(return)
-.L21:
- movzbl (%rsi,%r11), %ecx
- movb %cl, (%rdi,%r11)
- addq $1, %r11
- cmpq %r11, %rdx
- ja .L21
- ret
L(less_16):
testb $24, %dl
jne L(between_9_16)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=af82e9a0269e61b459aaf071d3f93b35ecb11e9e
commit af82e9a0269e61b459aaf071d3f93b35ecb11e9e
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 13:46:54 2016 -0800
Don't use RAX as scratch register
To prepare sharing code with mempcpy, don't use RAX as scratch register
so that RAX can be set to the return value at entrance.
[BZ #19776]
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Don't use
RAX as scratch register.
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index 7207753..19d8aa6 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -24,11 +24,12 @@
ENTRY(__memcpy_sse2_unaligned)
- movq %rsi, %rax
+ movq %rdi, %rax
+ movq %rsi, %r11
leaq (%rdx,%rdx), %rcx
- subq %rdi, %rax
- subq %rdx, %rax
- cmpq %rcx, %rax
+ subq %rdi, %r11
+ subq %rdx, %r11
+ cmpq %rcx, %r11
jb L(overlapping)
cmpq $16, %rdx
jbe L(less_16)
@@ -39,7 +40,6 @@ ENTRY(__memcpy_sse2_unaligned)
movdqu %xmm8, -16(%rdi,%rdx)
ja .L31
L(return):
- movq %rdi, %rax
ret
.p2align 4,,10
.p2align 4
@@ -64,16 +64,16 @@ L(return):
addq %rdi, %rdx
andq $-64, %rdx
andq $-64, %rcx
- movq %rcx, %rax
- subq %rdi, %rax
- addq %rax, %rsi
+ movq %rcx, %r11
+ subq %rdi, %r11
+ addq %r11, %rsi
cmpq %rdx, %rcx
je L(return)
movq %rsi, %r10
subq %rcx, %r10
leaq 16(%r10), %r9
leaq 32(%r10), %r8
- leaq 48(%r10), %rax
+ leaq 48(%r10), %r11
.p2align 4,,10
.p2align 4
L(loop):
@@ -83,12 +83,12 @@ L(loop):
movdqa %xmm8, 16(%rcx)
movdqu (%rcx,%r8), %xmm8
movdqa %xmm8, 32(%rcx)
- movdqu (%rcx,%rax), %xmm8
+ movdqu (%rcx,%r11), %xmm8
movdqa %xmm8, 48(%rcx)
addq $64, %rcx
cmpq %rcx, %rdx
jne L(loop)
- jmp L(return)
+ ret
L(overlapping):
testq %rdx, %rdx
.p2align 4,,5
@@ -97,8 +97,8 @@ L(overlapping):
leaq 16(%rsi), %rcx
leaq 16(%rdi), %r8
shrq $4, %r9
- movq %r9, %rax
- salq $4, %rax
+ movq %r9, %r11
+ salq $4, %r11
cmpq %rcx, %rdi
setae %cl
cmpq %r8, %rsi
@@ -107,9 +107,9 @@ L(overlapping):
cmpq $15, %rdx
seta %r8b
testb %r8b, %cl
- je .L16
- testq %rax, %rax
- je .L16
+ je .L21
+ testq %r11, %r11
+ je .L21
xorl %ecx, %ecx
xorl %r8d, %r8d
.L7:
@@ -119,15 +119,15 @@ L(overlapping):
addq $16, %rcx
cmpq %r8, %r9
ja .L7
- cmpq %rax, %rdx
+ cmpq %r11, %rdx
je L(return)
.L21:
- movzbl (%rsi,%rax), %ecx
- movb %cl, (%rdi,%rax)
- addq $1, %rax
- cmpq %rax, %rdx
+ movzbl (%rsi,%r11), %ecx
+ movb %cl, (%rdi,%r11)
+ addq $1, %r11
+ cmpq %r11, %rdx
ja .L21
- jmp L(return)
+ ret
L(less_16):
testb $24, %dl
jne L(between_9_16)
@@ -137,28 +137,25 @@ L(less_16):
testq %rdx, %rdx
.p2align 4,,2
je L(return)
- movzbl (%rsi), %eax
+ movzbl (%rsi), %ecx
testb $2, %dl
- movb %al, (%rdi)
+ movb %cl, (%rdi)
je L(return)
- movzwl -2(%rsi,%rdx), %eax
- movw %ax, -2(%rdi,%rdx)
- jmp L(return)
+ movzwl -2(%rsi,%rdx), %ecx
+ movw %cx, -2(%rdi,%rdx)
+ ret
L(between_9_16):
- movq (%rsi), %rax
- movq %rax, (%rdi)
- movq -8(%rsi,%rdx), %rax
- movq %rax, -8(%rdi,%rdx)
- jmp L(return)
-.L16:
- xorl %eax, %eax
- jmp .L21
+ movq (%rsi), %rcx
+ movq %rcx, (%rdi)
+ movq -8(%rsi,%rdx), %rcx
+ movq %rcx, -8(%rdi,%rdx)
+ ret
L(between_5_8):
- movl (%rsi), %eax
- movl %eax, (%rdi)
- movl -4(%rsi,%rdx), %eax
- movl %eax, -4(%rdi,%rdx)
- jmp L(return)
+ movl (%rsi), %ecx
+ movl %ecx, (%rdi)
+ movl -4(%rsi,%rdx), %ecx
+ movl %ecx, -4(%rdi,%rdx)
+ ret
END(__memcpy_sse2_unaligned)
#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d700853c5270817df932f319917467931f433c41
commit d700853c5270817df932f319917467931f433c41
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 14:16:32 2016 -0800
Remove dead code from memcpy-sse2-unaligned.S
There are
ENTRY(__memcpy_sse2_unaligned)
movq %rsi, %rax
leaq (%rdx,%rdx), %rcx
subq %rdi, %rax
subq %rdx, %rax
cmpq %rcx, %rax
jb L(overlapping)
When the branch to L(overlapping) is taken,
cmpq %rsi, %rdi
jae .L3
will never be taken. We can remove the dead code.
[BZ #19776]
* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S (.L3): Removed.
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index c450983..7207753 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -90,8 +90,6 @@ L(loop):
jne L(loop)
jmp L(return)
L(overlapping):
- cmpq %rsi, %rdi
- jae .L3
testq %rdx, %rdx
.p2align 4,,5
je L(return)
@@ -146,15 +144,6 @@ L(less_16):
movzwl -2(%rsi,%rdx), %eax
movw %ax, -2(%rdi,%rdx)
jmp L(return)
-.L3:
- leaq -1(%rdx), %rax
- .p2align 4,,10
- .p2align 4
-.L11:
- movzbl (%rsi,%rax), %edx
- movb %dl, (%rdi,%rax)
- subq $1, %rax
- jmp .L11
L(between_9_16):
movq (%rsi), %rax
movq %rax, (%rdi)
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources