This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/erms/hybrid created. glibc-2.23-132-gcbb91f9
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 24 Mar 2016 22:41:21 -0000
- Subject: GNU C Library master sources branch hjl/erms/hybrid created. glibc-2.23-132-gcbb91f9
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/erms/hybrid has been created
at cbb91f949f7e1560c299e7ad4727c6dfe976bb0c (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cbb91f949f7e1560c299e7ad4727c6dfe976bb0c
commit cbb91f949f7e1560c299e7ad4727c6dfe976bb0c
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Mar 24 15:26:59 2016 -0700
Add memcpy-avx512-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 7d5b1a8..89894e0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -23,6 +23,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memset-avx512-no-vzeroupper \
memcpy-sse2-unaligned-erms \
memcpy-avx-unaligned-erms \
+ memcpy-avx512-unaligned-erms \
memcpy-erms mempcpy-erms memmove-erms \
memset-erms
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index ee50de5..23d4057 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -52,6 +52,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_chk_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_chk_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_ARCH_FEATURE (AVX_Usable),
@@ -86,6 +89,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memmove_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memmove_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3_back)
@@ -296,6 +302,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_chk_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_chk_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
@@ -334,6 +343,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__memcpy_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __memcpy_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1,
@@ -349,6 +361,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_chk_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_chk_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_ARCH_FEATURE (AVX_Usable),
@@ -377,6 +392,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX512F_Usable),
__mempcpy_avx512_no_vzeroupper)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX512F_Usable),
+ __mempcpy_avx512_unaligned_erms)
#endif
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memcpy-avx512-unaligned-erms.S
new file mode 100644
index 0000000..6078619
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-unaligned-erms.S
@@ -0,0 +1,13 @@
+#define VEC_SIZE 64
+#define VEC(i) zmm##i
+#define VMOVU vmovdqu64
+#define VMOVA vmovdqa64
+
+#define MEMCPY_ERMS __memcpy_avx512_unaligned_erms
+#define MEMPCPY_ERMS __mempcpy_avx512_unaligned_erms
+#define MEMMOVE_ERMS __memmove_avx512_unaligned_erms
+#define MEMCPY_CHK_ERMS __memcpy_chk_avx512_unaligned_erms
+#define MEMPCPY_CHK_ERMS __mempcpy_chk_avx512_unaligned_erms
+#define MEMMOVE_CHK_ERMS __memmove_chk_avx512_unaligned_erms
+
+#include "memcpy-vec-unaligned-erms.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=85b5d5a77c7ecd88daffa902a0e6535222f373bc
commit 85b5d5a77c7ecd88daffa902a0e6535222f373bc
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Mar 24 15:23:33 2016 -0700
Extend it to VEC_SIZE == 64
diff --git a/sysdeps/x86_64/multiarch/memcpy-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memcpy-vec-unaligned-erms.S
index cff5c4c..3bf4780 100644
--- a/sysdeps/x86_64/multiarch/memcpy-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memcpy-vec-unaligned-erms.S
@@ -207,13 +207,18 @@ L(loop):
ret
L(less_vec):
/* Less than 1 VEC. */
-# if VEC_SIZE != 16 && VEC_SIZE != 32
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
# endif
+# if VEC_SIZE > 32
+ cmpb $32, %dl
+ je L(last_32)
+ ja L(between_33_63)
+# endif
# if VEC_SIZE > 16
cmpb $16, %dl
je L(last_16)
- ja L(between_15_31)
+ ja L(between_17_31)
# endif
cmpb $8, %dl
je L(last_8)
@@ -236,8 +241,22 @@ L(between_0_1):
movb %cl, (%rdi)
1:
ret
+# if VEC_SIZE > 32
+L(between_33_63):
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -32(%rsi,%rdx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -32(%rdi,%rdx)
+ VZEROUPPER
+ ret
+L(last_32):
+ vmovdqu (%rsi), %ymm0
+ vmovdqu %ymm0, (%rdi)
+ VZEROUPPER
+ ret
+# endif
# if VEC_SIZE > 16
-L(between_15_31):
+L(between_17_31):
vmovdqu (%rsi), %xmm0
vmovdqu -16(%rsi,%rdx), %xmm1
vmovdqu %xmm0, (%rdi)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b12a3dc8134a8632e803213a169b4f0d9fa765a0
commit b12a3dc8134a8632e803213a169b4f0d9fa765a0
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 14:16:32 2016 -0800
Add entry points for __mempcpy_sse2_unaligned and _chk functions
Add entry points for __mempcpy_chk_sse2_unaligned,
__mempcpy_sse2_unaligned and __memcpy_chk_sse2_unaligned.
Add sse2_unaligned_erms versions of memcpy/mempcpy
[BZ #19776]
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memcpy_chk_sse2_unaligned,
__mempcpy_chk_sse2_unaligned and __mempcpy_sse2_unaligned.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memcpy_chk_sse2_unaligned_erms,
__memcpy_sse2_unaligned_erms, __mempcpy_chk_sse2_unaligned_erms
and __mempcpy_sse2_unaligned_erms.
* sysdeps/x86_64/sysdep.h (REP_MOVSB_THRESHOLD): New.
Enable __mempcpy_sse2_unaligned
Check Fast_Unaligned_Load for __mempcpy_sse2_unaligned. The new
selection order is:
1. __mempcpy_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
2. __mempcpy_sse2_unaligned if Fast_Unaligned_Load bit is set.
3. __mempcpy_sse2 if SSSE3 isn't available.
4. __mempcpy_ssse3_back if Fast_Copy_Backward bit is set.
5. __mempcpy_ssse3
[BZ #19776]
* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Check
Fast_Unaligned_Load to enable __mempcpy_sse2_unaligned.
Enable __mempcpy_chk_sse2_unaligned
Check Fast_Unaligned_Load for __mempcpy_chk_sse2_unaligned. The new
selection order is:
1. __mempcpy_chk_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
2. __mempcpy_chk_sse2_unaligned if Fast_Unaligned_Load bit is set.
3. __mempcpy_chk_sse2 if SSSE3 isn't available.
4. __mempcpy_chk_ssse3_back if Fast_Copy_Backward bit is set.
5. __mempcpy_chk_ssse3
[BZ #19776]
* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Check
Fast_Unaligned_Load to enable __mempcpy_chk_sse2_unaligned.
Enable __memcpy_chk_sse2_unaligned
Check Fast_Unaligned_Load for __memcpy_chk_sse2_unaligned. The new
selection order is:
1. __memcpy_chk_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
2. __memcpy_chk_sse2_unaligned if Fast_Unaligned_Load bit is set.
3. __memcpy_chk_sse2 if SSSE3 isn't available.
4. __memcpy_chk_ssse3_back if Fast_Copy_Backward bit is set.
5. __memcpy_chk_ssse3
[BZ #19776]
* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Check
Fast_Unaligned_Load to enable __memcpy_chk_sse2_unaligned.
Use Hybrid_ERMS in memcpy.S
Use Hybrid_ERMS in mempcpy.S
Add memcpy-sse2-unaligned-erms.S
Add initial memcpy-vec-unaligned-erms.S
Add memcpy-avx-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index fdb8448..7d5b1a8 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -21,6 +21,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
memset-avx512-no-vzeroupper \
+ memcpy-sse2-unaligned-erms \
+ memcpy-avx-unaligned-erms \
memcpy-erms mempcpy-erms memmove-erms \
memset-erms
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 336ff1a..ee50de5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -57,6 +57,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memmove_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
@@ -65,6 +68,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2))
/* Support sysdeps/x86_64/multiarch/memmove.S. */
@@ -72,6 +79,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX_Usable),
__memmove_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memmove_avx_unaligned_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, memmove,
HAS_ARCH_FEATURE (AVX512F_Usable),
@@ -82,6 +92,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
+ IFUNC_IMPL_ADD (array, i, memmove, 1,
+ __memmove_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove, 1,
+ __memmove_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
@@ -287,12 +301,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
@@ -302,6 +323,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__memcpy_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __memcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -312,6 +336,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_avx512_no_vzeroupper)
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_2)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
@@ -326,12 +354,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_chk_avx_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
@@ -346,10 +381,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
HAS_ARCH_FEATURE (AVX_Usable),
__mempcpy_avx_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ HAS_ARCH_FEATURE (AVX_Usable),
+ __mempcpy_avx_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+ __mempcpy_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned-erms.S
new file mode 100644
index 0000000..0267763
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned-erms.S
@@ -0,0 +1,13 @@
+#define VEC_SIZE 32
+#define VEC(i) ymm##i
+#define VMOVU vmovdqu
+#define VMOVA vmovdqa
+
+#define MEMCPY_ERMS __memcpy_avx_unaligned_erms
+#define MEMPCPY_ERMS __mempcpy_avx_unaligned_erms
+#define MEMMOVE_ERMS __memmove_avx_unaligned_erms
+#define MEMCPY_CHK_ERMS __memcpy_chk_avx_unaligned_erms
+#define MEMPCPY_CHK_ERMS __mempcpy_chk_avx_unaligned_erms
+#define MEMMOVE_CHK_ERMS __memmove_chk_avx_unaligned_erms
+
+#include "memcpy-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned-erms.S
new file mode 100644
index 0000000..8574a4c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned-erms.S
@@ -0,0 +1,19 @@
+#define VEC_SIZE 16
+#define VEC(i) xmm##i
+#define VMOVU movdqu
+#define VMOVA movdqa
+
+#define MEMCPY __memcpy_sse2_unaligned_2
+#define MEMCPY_ERMS __memcpy_sse2_unaligned_erms
+#define MEMPCPY __mempcpy_sse2_unaligned
+#define MEMPCPY_ERMS __mempcpy_sse2_unaligned_erms
+#define MEMMOVE __memmove_sse2_unaligned
+#define MEMMOVE_ERMS __memmove_sse2_unaligned_erms
+#define MEMCPY_CHK __memcpy_chk_sse2_unaligned
+#define MEMCPY_CHK_ERMS __memcpy_chk_sse2_unaligned_erms
+#define MEMPCPY_CHK __mempcpy_chk_sse2_unaligned
+#define MEMPCPY_CHK_ERMS __mempcpy_chk_sse2_unaligned_erms
+#define MEMMOVE_CHK __memmove_chk_sse2_unaligned
+#define MEMMOVE_CHK_ERMS __memmove_chk_sse2_unaligned_erms
+
+#include "memcpy-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memcpy-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memcpy-vec-unaligned-erms.S
new file mode 100644
index 0000000..cff5c4c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-vec-unaligned-erms.S
@@ -0,0 +1,408 @@
+/* memcpy/mempcpy/memmove with vector unaligned loads and rep movsb
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* memcpy/mempcpy/memmove is implemented as:
+ 1. If size is 2 * VEC_SIZE or below, load all sources into registers
+ first and copy them to destination together.
+ 2. If there is no overlap, copy from both ends with 4 * VEC_SIZE
+ at a time.
+ 3. If size is less than 8 * VEC_SIZE, load all sources into registers
+ first and copy them to destination together.
+ 4. If address of destination > address of source, copy 8 * VEC_SIZE
+ at a time backward.
+ 5. Otherwise, copy 4 * VEC_SIZE at a time forward.
+ */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef VZEROUPPER
+# if VEC_SIZE > 16
+# define VZEROUPPER vzeroupper
+# else
+# define VZEROUPPER
+# endif
+# endif
+
+# ifdef MEMPCPY
+# ifdef SHARED
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+# endif
+
+ENTRY (MEMPCPY)
+ mov %rdi, %rax
+ add %rdx, %rax
+ jmp L(start)
+END (MEMPCPY)
+# endif
+
+# ifdef MEMCPY
+# ifdef SHARED
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+ movq %rdi, %rax
+L(start):
+ cmpq $VEC_SIZE, %rdx
+ je L(last_vec)
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+ VMOVU (%rsi), %VEC(0)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
+ VZEROUPPER
+ ret
+END (MEMCPY)
+# endif
+
+# ifdef SHARED
+ENTRY (MEMPCPY_CHK_ERMS)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK_ERMS)
+
+ENTRY (MEMPCPY_ERMS)
+ mov %rdi, %rax
+ add %rdx, %rax
+ jmp L(start_erms)
+END (MEMPCPY_ERMS)
+
+ENTRY (MEMCPY_CHK_ERMS)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK_ERMS)
+# endif
+
+ENTRY(MEMCPY_ERMS)
+ movq %rdi, %rax
+L(start_erms):
+ cmpq $VEC_SIZE, %rdx
+ je L(last_vec)
+ jb L(less_vec)
+ cmpq $REP_MOVSB_THRESHOLD, %rdx
+ ja L(movsb)
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+L(last_2x_vec):
+ VMOVU (%rsi), %VEC(0)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
+ VZEROUPPER
+ ret
+L(movsb):
+ movq %rdx, %rcx
+ cmpq %rsi, %rdi
+ jbe L(movsb_forward)
+ leaq (%rsi,%rcx), %rdx
+ cmpq %rdx, %rdi
+ jb L(movsb_backward)
+L(movsb_forward):
+ rep movsb
+ ret
+L(movsb_backward):
+ leaq -1(%rdi,%rcx), %rdi
+ leaq -1(%rsi,%rcx), %rsi
+ std
+ rep movsb
+ cld
+ ret
+
+L(last_vec):
+ /* Last VEC. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU %VEC(0), (%rdi)
+L(return):
+ VZEROUPPER
+ ret
+
+ .p2align 4,,10
+ .p2align 4
+L(more_2x_vec):
+ /* More than 2 * VEC. */
+ cmpq %rsi, %rdi
+ jbe L(copy_forward)
+ leaq (%rsi,%rdx), %rcx
+ cmpq %rcx, %rdi
+ jb L(more_2x_vec_overlap)
+L(copy_forward):
+ leaq (%rdi,%rdx), %rcx
+ cmpq %rcx, %rsi
+ jb L(more_2x_vec_overlap)
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(return)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
+ VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
+ VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(1), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
+ cmpq $(VEC_SIZE * 8), %rdx
+ jbe L(return)
+ leaq (VEC_SIZE * 4)(%rdi), %rcx
+ addq %rdi, %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+ andq $-(VEC_SIZE * 4), %rcx
+ movq %rcx, %r11
+ subq %rdi, %r11
+ addq %r11, %rsi
+ cmpq %rdx, %rcx
+ je L(return)
+ movq %rsi, %r10
+ subq %rcx, %r10
+ leaq VEC_SIZE(%r10), %r9
+ leaq (VEC_SIZE * 2)(%r10), %r8
+ leaq (VEC_SIZE * 3)(%r10), %r11
+ .p2align 4,,10
+ .p2align 4
+L(loop):
+ VMOVU (%rcx,%r10), %VEC(0)
+ VMOVU (%rcx,%r9), %VEC(1)
+ VMOVU (%rcx,%r8), %VEC(2)
+ VMOVU (%rcx,%r11), %VEC(3)
+ VMOVA %VEC(0), (%rcx)
+ VMOVA %VEC(1), VEC_SIZE(%rcx)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rcx)
+ addq $(VEC_SIZE * 4), %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+ VZEROUPPER
+ ret
+L(less_vec):
+ /* Less than 1 VEC. */
+# if VEC_SIZE != 16 && VEC_SIZE != 32
+# error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 16
+ cmpb $16, %dl
+ je L(last_16)
+ ja L(between_15_31)
+# endif
+ cmpb $8, %dl
+ je L(last_8)
+ ja L(between_9_15)
+ cmpb $4, %dl
+ je L(last_4)
+ ja L(between_5_7)
+ cmpb $2, %dl
+ je L(last_2)
+ jb L(between_0_1)
+ movzwl -2(%rsi,%rdx), %ecx
+ movzbl (%rsi), %esi
+ movw %cx, -2(%rdi,%rdx)
+ movb %sil, (%rdi)
+ ret
+L(between_0_1):
+ testb %dl, %dl
+ je 1f
+ movzbl (%rsi), %ecx
+ movb %cl, (%rdi)
+1:
+ ret
+# if VEC_SIZE > 16
+L(between_15_31):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu -16(%rsi,%rdx), %xmm1
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu %xmm1, -16(%rdi,%rdx)
+ ret
+L(last_16):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu %xmm0, (%rdi)
+ ret
+# endif
+L(between_9_15):
+ movq -8(%rsi,%rdx), %rcx
+ movq (%rsi), %rsi
+ movq %rcx, -8(%rdi,%rdx)
+ movq %rsi, (%rdi)
+ ret
+L(last_8):
+ movq (%rsi), %rcx
+ movq %rcx, (%rdi)
+ ret
+L(between_5_7):
+ movl -4(%rsi,%rdx), %ecx
+ movl (%rsi), %esi
+ movl %ecx, -4(%rdi,%rdx)
+ movl %esi, (%rdi)
+ ret
+L(last_4):
+ movl (%rsi), %ecx
+ movl %ecx, (%rdi)
+ ret
+L(last_2):
+ movzwl (%rsi), %ecx
+ movw %cx, (%rdi)
+ ret
+
+L(more_2x_vec_overlap):
+ /* More than 2 * VEC and there is overlap between destination
+ and source. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+L(between_4x_vec_and_8x_vec):
+ /* Copy from 4 * VEC to 8 * VEC, inclusively. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
+ VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
+ VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+ VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+ VZEROUPPER
+ ret
+L(last_4x_vec):
+ /* Copy from 2 * VEC to 4 * VEC. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
+ VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VZEROUPPER
+ ret
+L(between_0_and_4x_vec):
+ /* Copy from 0 to 4 * VEC. */
+ cmpl $(VEC_SIZE * 2), %edx
+ jae L(last_4x_vec)
+ /* Copy from 0 to 2 * VEC. */
+ cmpl $VEC_SIZE, %edx
+ je L(last_vec)
+ ja L(last_2x_vec)
+ /* Copy from 0 to VEC. */
+ VZEROUPPER
+ jmp L(less_vec)
+L(more_8x_vec):
+ cmpq %rsi, %rdi
+ ja L(more_8x_vec_backward)
+
+ .p2align 4,,10
+ .p2align 4
+L(loop_8x_vec_forward):
+ /* Copy 8 * VEC a time forward. */
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ VMOVU (VEC_SIZE * 4)(%rsi), %VEC(4)
+ VMOVU (VEC_SIZE * 5)(%rsi), %VEC(5)
+ VMOVU (VEC_SIZE * 6)(%rsi), %VEC(6)
+ VMOVU (VEC_SIZE * 7)(%rsi), %VEC(7)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
+ VMOVU %VEC(4), (VEC_SIZE * 4)(%rdi)
+ VMOVU %VEC(5), (VEC_SIZE * 5)(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 6)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 7)(%rdi)
+ addq $(VEC_SIZE * 8), %rdi
+ addq $(VEC_SIZE * 8), %rsi
+ subq $(VEC_SIZE * 8), %rdx
+ cmpq $(VEC_SIZE * 8), %rdx
+ je L(between_4x_vec_and_8x_vec)
+ ja L(loop_8x_vec_forward)
+ /* Less than 8 * VEC to copy. */
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(between_0_and_4x_vec)
+ jmp L(between_4x_vec_and_8x_vec)
+
+L(more_8x_vec_backward):
+ leaq -VEC_SIZE(%rsi, %rdx), %rcx
+ leaq -VEC_SIZE(%rdi, %rdx), %r9
+
+ .p2align 4,,10
+ .p2align 4
+L(loop_8x_vec_backward):
+ /* Copy 8 * VEC a time backward. */
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
+ VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
+ VMOVU -(VEC_SIZE * 4)(%rcx), %VEC(4)
+ VMOVU -(VEC_SIZE * 5)(%rcx), %VEC(5)
+ VMOVU -(VEC_SIZE * 6)(%rcx), %VEC(6)
+ VMOVU -(VEC_SIZE * 7)(%rcx), %VEC(7)
+ VMOVU %VEC(0), (%r9)
+ VMOVU %VEC(1), -VEC_SIZE(%r9)
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVU %VEC(3), -(VEC_SIZE * 3)(%r9)
+ VMOVU %VEC(4), -(VEC_SIZE * 4)(%r9)
+ VMOVU %VEC(5), -(VEC_SIZE * 5)(%r9)
+ VMOVU %VEC(6), -(VEC_SIZE * 6)(%r9)
+ VMOVU %VEC(7), -(VEC_SIZE * 7)(%r9)
+ subq $(VEC_SIZE * 8), %rcx
+ subq $(VEC_SIZE * 8), %r9
+ subq $(VEC_SIZE * 8), %rdx
+ cmpq $(VEC_SIZE * 8), %rdx
+ je L(between_4x_vec_and_8x_vec)
+ ja L(loop_8x_vec_backward)
+ /* Less than 8 * VEC to copy. */
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(between_0_and_4x_vec)
+ jmp L(between_4x_vec_and_8x_vec)
+END (MEMCPY_ERMS)
+
+strong_alias (MEMCPY_ERMS, MEMMOVE_ERMS)
+# ifdef SHARED
+strong_alias (MEMCPY_CHK_ERMS, MEMMOVE_CHK_ERMS)
+# endif
+# ifdef MEMCPY
+strong_alias (MEMCPY, MEMMOVE)
+# ifdef SHARED
+strong_alias (MEMCPY_CHK, MEMMOVE_CHK)
+# endif
+# endif
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 58d9223..2eefb4d 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -41,12 +41,22 @@ ENTRY(__new_memcpy)
lea __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: lea __memcpy_avx_unaligned(%rip), %RAX_LP
+1: lea __memcpy_avx_unaligned_erms(%rip), %RAX_LP
HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz L(Fast_Unaligned_Load)
+ HAS_ARCH_FEATURE (Hybrid_ERMS)
jnz 2f
- lea __memcpy_sse2_unaligned(%rip), %RAX_LP
+ lea __memcpy_avx_unaligned(%rip), %RAX_LP
+ ret
+L(Fast_Unaligned_Load):
+ lea __memcpy_sse2_unaligned_erms(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jz L(SSE2)
+ HAS_ARCH_FEATURE (Hybrid_ERMS)
jnz 2f
+ lea __memcpy_sse2_unaligned(%rip), %RAX_LP
+ ret
+L(SSE2):
lea __memcpy_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 0e21c09..9e218c0 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -35,22 +35,25 @@ ENTRY(__memcpy_chk)
jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
- jz 1f
+ jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
- jz 1f
- leaq __memcpy_chk_avx512_no_vzeroupper(%rip), %rax
+ jz 1f
+ lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __memcpy_chk_sse2(%rip), %rax
+1: lea __memcpy_chk_avx_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ lea __memcpy_chk_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ lea __memcpy_chk_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __memcpy_chk_ssse3(%rip), %rax
+ lea __memcpy_chk_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __memcpy_chk_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __memcpy_chk_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __memcpy_chk_ssse3(%rip), %RAX_LP
2: ret
END(__memcpy_chk)
# else
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index b85cf27..ccd640c 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -36,19 +36,32 @@ ENTRY(__mempcpy)
jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
jz 1f
- leaq __mempcpy_avx512_no_vzeroupper(%rip), %rax
+ lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __mempcpy_sse2(%rip), %rax
+1: lea __mempcpy_avx_unaligned_erms(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz L(Fast_Unaligned_Load)
+ HAS_ARCH_FEATURE (Hybrid_ERMS)
+ jnz 2f
+ lea __mempcpy_avx_unaligned(%rip), %RAX_LP
+ ret
+L(Fast_Unaligned_Load):
+ lea __mempcpy_sse2_unaligned_erms(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jz L(SSE2)
+ HAS_ARCH_FEATURE (Hybrid_ERMS)
+ jnz 2f
+ lea __mempcpy_sse2_unaligned(%rip), %RAX_LP
+ ret
+L(SSE2):
+ lea __mempcpy_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __mempcpy_ssse3(%rip), %rax
+ lea __mempcpy_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __mempcpy_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __mempcpy_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __mempcpy_ssse3(%rip), %RAX_LP
2: ret
END(__mempcpy)
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index de888f3..7c888d3 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -38,19 +38,22 @@ ENTRY(__mempcpy_chk)
jz 1f
HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
jz 1f
- leaq __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+ lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
ret
#endif
-1: leaq __mempcpy_chk_sse2(%rip), %rax
+1: lea __mempcpy_chk_avx_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_chk_sse2_unaligned(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+ jnz 2f
+ lea __mempcpy_chk_sse2(%rip), %RAX_LP
HAS_CPU_FEATURE (SSSE3)
jz 2f
- leaq __mempcpy_chk_ssse3(%rip), %rax
+ lea __mempcpy_chk_ssse3_back(%rip), %RAX_LP
HAS_ARCH_FEATURE (Fast_Copy_Backward)
- jz 2f
- leaq __mempcpy_chk_ssse3_back(%rip), %rax
- HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
- jz 2f
- leaq __mempcpy_chk_avx_unaligned(%rip), %rax
+ jnz 2f
+ lea __mempcpy_chk_ssse3(%rip), %RAX_LP
2: ret
END(__mempcpy_chk)
# else
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index dee5403..111f91e 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -133,6 +133,9 @@ lose: \
/* Threshold to use Enhanced REP STOSB. */
#define REP_STOSB_THRESHOLD 1024
+/* Threshold to use Enhanced REP MOVSB. */
+#define REP_MOVSB_THRESHOLD 2048
+
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=649521e48efafd5fccac8f405c2166749b6d59f4
commit 649521e48efafd5fccac8f405c2166749b6d59f4
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 12:36:03 2016 -0700
Add Hybrid_ERMS and use it in memset.S
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c8f81ef..b8f0b82 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -192,6 +192,12 @@ init_cpu_features (struct cpu_features *cpu_features)
}
}
+ /* Enable optimization of hybrid Enhanced REP MOVSB/STOSB with
+ SSE/AVX. */
+ if (CPU_FEATURES_CPU_P (cpu_features, ERMS))
+ cpu_features->feature[index_arch_Hybrid_ERMS]
+ |= bit_arch_Hybrid_ERMS;
+
/* Unaligned load with 256-bit AVX registers are faster on
Intel processors with AVX2. */
if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 1c09712..f1df917 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -36,6 +36,7 @@
#define bit_arch_Prefer_MAP_32BIT_EXEC (1 << 16)
#define bit_arch_Prefer_No_VZEROUPPER (1 << 17)
#define bit_arch_Fast_ERMS (1 << 18)
+#define bit_arch_Hybrid_ERMS (1 << 19)
/* CPUID Feature flags. */
@@ -105,6 +106,7 @@
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Fast_ERMS FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Hybrid_ERMS FEATURE_INDEX_1*FEATURE_SIZE
# if defined (_LIBC) && !IS_IN (nonlib)
@@ -272,6 +274,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
# define index_arch_Fast_ERMS FEATURE_INDEX_1
+# define index_arch_Hybrid_ERMS FEATURE_INDEX_1
#endif /* !__ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index dda8185..1e0883c 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -29,10 +29,18 @@ ENTRY(memset)
leaq __memset_erms(%rip), %rax
HAS_ARCH_FEATURE (Fast_ERMS)
jnz 2f
+ leaq __memset_sse2_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Hybrid_ERMS)
+ jnz 1f
leaq __memset_sse2(%rip), %rax
+1:
HAS_ARCH_FEATURE (AVX2_Usable)
jz 2f
+ leaq __memset_avx2_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Hybrid_ERMS)
+ jnz L(AVX512F)
leaq __memset_avx2(%rip), %rax
+L(AVX512F):
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 2f
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8c36a7002d4a2956e7b3701e4094382148b22eed
commit 8c36a7002d4a2956e7b3701e4094382148b22eed
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 10:34:07 2016 -0700
Add __memset_avx2_erms and __memset_chk_avx2_erms
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 57f8bb3..336ff1a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -95,6 +95,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_chk_avx2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_chk_avx2_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX512F_Usable),
@@ -110,6 +113,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_avx2)
+ IFUNC_IMPL_ADD (array, i, memset,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memset_avx2_erms)
#ifdef HAVE_AVX512_ASM_SUPPORT
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX512F_Usable),
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
index df63472..9a565a9 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2.S
@@ -28,6 +28,24 @@
.section .text.avx2,"ax",@progbits
#if defined PIC
+ENTRY_CHK (__memset_chk_avx2_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk_avx2_erms)
+# endif
+
+ENTRY (__memset_avx2_erms)
+ cmpq $REP_STOSB_THRESHOLD, %rdx
+ jbe L(start)
+ movq %rdx, %rcx
+ movzbl %sil, %eax
+ movq %rdi, %rdx
+ rep stosb
+ movq %rdx, %rax
+ ret
+END (__memset_avx2_erms)
+
+#if defined PIC
ENTRY (MEMSET_CHK)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
@@ -35,6 +53,7 @@ END (MEMSET_CHK)
#endif
ENTRY (MEMSET)
+L(start):
vpxor %xmm0, %xmm0, %xmm0
vmovd %esi, %xmm1
lea (%rdi, %rdx), %rsi
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cb894908a84c04d7fe3fbf164613fb5b5bac0737
commit cb894908a84c04d7fe3fbf164613fb5b5bac0737
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 10:07:48 2016 -0700
Remove mempcpy-*.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2f29a2a..fdb8448 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,10 +8,10 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
- memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
- memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
- memcpy-avx-unaligned mempcpy-avx-unaligned \
- mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
+ memcpy-avx512-no-vzeroupper memmove-ssse3 \
+ memcpy-ssse3-back memmove-avx-unaligned \
+ memcpy-avx-unaligned \
+ memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index 910148e..a273f29 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -27,8 +27,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_avx_unaligned
# define MEMCPY_CHK __memcpy_chk_avx_unaligned
-# define MEMPCPY __mempcpy_avx_unaligned_1
-# define MEMPCPY_CHK __mempcpy_chk_avx_unaligned_1
+# define MEMPCPY __mempcpy_avx_unaligned
+# define MEMPCPY_CHK __mempcpy_chk_avx_unaligned
#endif
.section .text.avx,"ax",@progbits
@@ -39,12 +39,8 @@ ENTRY (MEMPCPY_CHK)
END (MEMPCPY_CHK)
ENTRY (MEMPCPY)
-#if 0
- lea (%rdi, %rdx), %rax
-#else
mov %rdi, %rax
add %rdx, %rax
-#endif
jmp L(start)
END (MEMPCPY)
#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
index 3a57b73..7babb47 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -27,8 +27,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_avx512_no_vzeroupper
# define MEMCPY_CHK __memcpy_chk_avx512_no_vzeroupper
-# define MEMPCPY __mempcpy_avx512_no_vzeroupper_1
-# define MEMPCPY_CHK __mempcpy_chk_avx512_no_vzeroupper_1
+# define MEMPCPY __mempcpy_avx512_no_vzeroupper
+# define MEMPCPY_CHK __mempcpy_chk_avx512_no_vzeroupper
#endif
.section .text.avx512,"ax",@progbits
@@ -39,12 +39,8 @@ ENTRY (MEMPCPY_CHK)
END (MEMPCPY_CHK)
ENTRY (MEMPCPY)
-#if 0
- lea (%rdi, %rdx), %rax
-#else
mov %rdi, %rax
add %rdx, %rax
-#endif
jmp L(start)
END (MEMPCPY)
#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 6184e4e..9a872d3 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -29,8 +29,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3_back
# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back_1
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back_1
+# define MEMPCPY __mempcpy_ssse3_back
+# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
#endif
#define JMPTBL(I, B) I - B
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 127afaa..643f322 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -29,8 +29,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3
# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3_1
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_1
+# define MEMPCPY __mempcpy_ssse3
+# define MEMPCPY_CHK __mempcpy_chk_ssse3
#endif
#define JMPTBL(I, B) I - B
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
deleted file mode 100644
index 241378e..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* mempcpy with AVX
- Copyright (C) 2014-2016 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_avx_unaligned
-#define MEMCPY_CHK __mempcpy_chk_avx_unaligned
-#include "memcpy-avx-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
deleted file mode 100644
index fcc0945..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* mempcpy optimized with AVX512 for KNL hardware.
- Copyright (C) 2016 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_avx512_no_vzeroupper
-#define MEMCPY_CHK __mempcpy_chk_avx512_no_vzeroupper
-#include "memcpy-avx512-no-vzeroupper.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
deleted file mode 100644
index 82ffacb..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_ssse3_back
-#define MEMCPY_CHK __mempcpy_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
deleted file mode 100644
index 822d98e..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY __mempcpy_ssse3
-#define MEMCPY_CHK __mempcpy_chk_ssse3
-#include "memcpy-ssse3.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=10e38ad556275d8d8a4bb2ea1423f1e160f259ee
commit 10e38ad556275d8d8a4bb2ea1423f1e160f259ee
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Mar 6 13:37:31 2016 -0800
Merge memcpy with mempcpy
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index b615d06..910148e 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -25,11 +25,30 @@
#include "asm-syntax.h"
#ifndef MEMCPY
-# define MEMCPY __memcpy_avx_unaligned
+# define MEMCPY __memcpy_avx_unaligned
# define MEMCPY_CHK __memcpy_chk_avx_unaligned
+# define MEMPCPY __mempcpy_avx_unaligned_1
+# define MEMPCPY_CHK __mempcpy_chk_avx_unaligned_1
#endif
.section .text.avx,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+ lea (%rdi, %rdx), %rax
+#else
+ mov %rdi, %rax
+ add %rdx, %rax
+#endif
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -42,6 +61,7 @@ ENTRY (MEMCPY)
#ifdef USE_AS_MEMPCPY
add %rdx, %rax
#endif
+L(start):
cmp $256, %rdx
jae L(256bytesormore)
cmp $16, %dl
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
index 3d567fc..3a57b73 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -27,9 +27,28 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_avx512_no_vzeroupper
# define MEMCPY_CHK __memcpy_chk_avx512_no_vzeroupper
+# define MEMPCPY __mempcpy_avx512_no_vzeroupper_1
+# define MEMPCPY_CHK __mempcpy_chk_avx512_no_vzeroupper_1
#endif
.section .text.avx512,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+ lea (%rdi, %rdx), %rax
+#else
+ mov %rdi, %rax
+ add %rdx, %rax
+#endif
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -42,6 +61,7 @@ ENTRY (MEMCPY)
#ifdef USE_AS_MEMPCPY
add %rdx, %rax
#endif
+L(start):
lea (%rsi, %rdx), %rcx
lea (%rdi, %rdx), %r9
cmp $512, %rdx
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 08b41e9..6184e4e 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -29,6 +29,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3_back
# define MEMCPY_CHK __memcpy_chk_ssse3_back
+# define MEMPCPY __mempcpy_ssse3_back_1
+# define MEMPCPY_CHK __mempcpy_chk_ssse3_back_1
#endif
#define JMPTBL(I, B) I - B
@@ -44,6 +46,23 @@
ud2
.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+ lea (%rdi, %rdx), %rax
+#else
+ mov %rdi, %rax
+ add %rdx, %rax
+#endif
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -66,6 +85,7 @@ ENTRY (MEMCPY)
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
L(copy_forward):
#endif
+L(start):
cmp $144, %rdx
jae L(144bytesormore)
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 95de969..127afaa 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -29,6 +29,8 @@
#ifndef MEMCPY
# define MEMCPY __memcpy_ssse3
# define MEMCPY_CHK __memcpy_chk_ssse3
+# define MEMPCPY __mempcpy_ssse3_1
+# define MEMPCPY_CHK __mempcpy_chk_ssse3_1
#endif
#define JMPTBL(I, B) I - B
@@ -44,6 +46,23 @@
ud2
.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+#if 0
+ lea (%rdi, %rdx), %rax
+#else
+ mov %rdi, %rax
+ add %rdx, %rax
+#endif
+ jmp L(start)
+END (MEMPCPY)
+#endif
+
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
cmpq %rdx, %rcx
@@ -66,6 +85,7 @@ ENTRY (MEMCPY)
jmp L(copy_backward)
L(copy_forward):
#endif
+L(start):
cmp $79, %rdx
lea L(table_less_80bytes)(%rip), %r11
ja L(80bytesormore)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0c8870d243dd0139ef9954cad6e399327c2406e1
commit 0c8870d243dd0139ef9954cad6e399327c2406e1
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 18 09:22:56 2016 -0700
Add __memset_sse2_erms and __memset_chk_sse2_erms
* sysdeps/x86_64/memset.S (__memset_chk_sse2_erms): New
function.
(__memset_sse2_erms): Likewise.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memset_chk_sse2_erms and
__memset_sse2_erms.
* sysdeps/x86_64/sysdep.h (REP_STOSB_THRESHOLD): New.
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 4cf0da0..71796a7 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -42,6 +42,26 @@ ENTRY(__memset_tail)
END(__memset_tail)
#endif
+#ifdef USE_MULTIARCH
+# if defined PIC && IS_IN (libc)
+ENTRY_CHK (__memset_chk_sse2_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (__memset_chk_sse2_erms)
+# endif
+
+ENTRY (__memset_sse2_erms)
+ cmpq $REP_STOSB_THRESHOLD, %rdx
+ jbe L(memset_entry)
+ movq %rdx, %rcx
+ movzbl %sil, %eax
+ movq %rdi, %rdx
+ rep stosb
+ movq %rdx, %rax
+ ret
+END (__memset_sse2_erms)
+#endif
+
#if defined PIC && IS_IN (libc)
ENTRY_CHK (__memset_chk)
cmpq %rdx, %rcx
@@ -50,6 +70,7 @@ END_CHK (__memset_chk)
#endif
ENTRY (memset)
+L(memset_entry):
movd %esi, %xmm0
movq %rdi, %rax
punpcklbw %xmm0, %xmm0
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b0d300d..57f8bb3 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -89,6 +89,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_sse2)
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_sse2_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
@@ -103,6 +105,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memset.S. */
IFUNC_IMPL (i, name, memset,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2_erms)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX2_Usable),
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index fbe3560..dee5403 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -130,6 +130,9 @@ lose: \
#define R14_LP r14
#define R15_LP r15
+/* Threshold to use Enhanced REP STOSB. */
+#define REP_STOSB_THRESHOLD 1024
+
#else /* __ASSEMBLER__ */
/* Long and pointer size in bytes. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=84e04147d1e70e8ba00c8f46923228cebbf0a64b
commit 84e04147d1e70e8ba00c8f46923228cebbf0a64b
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Apr 11 08:51:16 2014 -0700
Test 32-bit ERMS memcpy/memset
* sysdeps/i386/i686/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __bcopy_erms, __bzero_erms,
__memmove_chk_erms, __memmove_erms, __memset_chk_erms,
__memset_erms, __memcpy_chk_erms, __memcpy_erms,
__mempcpy_chk_erms and __mempcpy_erms.
diff --git a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
index ef30a95..f3cbca0 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
+++ b/sysdeps/i386/i686/multiarch/ifunc-impl-list.c
@@ -44,6 +44,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__bcopy_ssse3)
IFUNC_IMPL_ADD (array, i, bcopy, HAS_CPU_FEATURE (SSE2),
__bcopy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_erms)
IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ia32))
/* Support sysdeps/i386/i686/multiarch/bzero.S. */
@@ -52,6 +53,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__bzero_sse2_rep)
IFUNC_IMPL_ADD (array, i, bzero, HAS_CPU_FEATURE (SSE2),
__bzero_sse2)
+ IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_erms)
IFUNC_IMPL_ADD (array, i, bzero, 1, __bzero_ia32))
/* Support sysdeps/i386/i686/multiarch/memchr.S. */
@@ -82,6 +84,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSE2),
__memmove_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_ia32))
/* Support sysdeps/i386/i686/multiarch/memmove.S. */
@@ -92,6 +96,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSE2),
__memmove_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ia32))
/* Support sysdeps/i386/i686/multiarch/memrchr.S. */
@@ -111,6 +116,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSE2),
__memset_chk_sse2)
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_ia32))
/* Support sysdeps/i386/i686/multiarch/memset.S. */
@@ -119,6 +126,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memset_sse2_rep)
IFUNC_IMPL_ADD (array, i, memset, HAS_CPU_FEATURE (SSE2),
__memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ia32))
/* Support sysdeps/i386/i686/multiarch/rawmemchr.S. */
@@ -319,6 +327,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSE2),
__memcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_ia32))
/* Support sysdeps/i386/i686/multiarch/memcpy.S. */
@@ -329,6 +339,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_ssse3)
IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSE2),
__memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_ia32))
/* Support sysdeps/i386/i686/multiarch/mempcpy_chk.S. */
@@ -343,6 +354,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSE2),
__mempcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_ia32))
/* Support sysdeps/i386/i686/multiarch/mempcpy.S. */
@@ -353,6 +366,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSE2),
__mempcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_ia32))
/* Support sysdeps/i386/i686/multiarch/strlen.S. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f77aa0315f706aff7516af601a04568390422dab
commit f77aa0315f706aff7516af601a04568390422dab
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Apr 11 08:25:17 2014 -0700
Test 64-bit ERMS memcpy/memset
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __memmove_chk_erms,
__memmove_erms, __memset_erms, __memset_chk_erms,
__memcpy_chk_erms, __memcpy_erms, __mempcpy_chk_erms and
__mempcpy_erms.
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 188b6d3..b0d300d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -63,6 +63,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSSE3),
__memmove_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+ __memmove_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2))
/* Support sysdeps/x86_64/multiarch/memmove.S. */
@@ -79,12 +81,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
__memmove_ssse3)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
IFUNC_IMPL (i, name, __memset_chk,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
__memset_chk_sse2)
+ IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+ __memset_chk_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_chk_avx2)
@@ -98,6 +103,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memset.S. */
IFUNC_IMPL (i, name, memset,
IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
IFUNC_IMPL_ADD (array, i, memset,
HAS_ARCH_FEATURE (AVX2_Usable),
__memset_avx2)
@@ -278,6 +284,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSSE3),
__memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+ __memcpy_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2))
/* Support sysdeps/x86_64/multiarch/memcpy.S. */
@@ -295,6 +303,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_avx512_no_vzeroupper)
#endif
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
@@ -314,6 +323,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
HAS_CPU_FEATURE (SSSE3),
__mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+ __mempcpy_chk_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2))
/* Support sysdeps/x86_64/multiarch/mempcpy.S. */
@@ -330,6 +341,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
__mempcpy_ssse3)
+ IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=eade49cad4007c094dacce2967a7f13166a46dae
commit eade49cad4007c094dacce2967a7f13166a46dae
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Sep 21 15:21:28 2011 -0700
Add 32-bit ERMS memcpy/memset
* sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
bcopy-erms, memcpy-erms, memmove-erms, mempcpy-erms, bzero-erms
and memset-erms.
* sysdeps/i386/i686/multiarch/bcopy-erms.S: New file.
* sysdeps/i386/i686/multiarch/bzero-erms.S: Likewise.
* sysdeps/i386/i686/multiarch/memcpy-erms.S: Likewise.
* sysdeps/i386/i686/multiarch/memmove-erms.S: Likewise.
* sysdeps/i386/i686/multiarch/mempcpy-erms.S: Likewise.
* sysdeps/i386/i686/multiarch/memset-erms.S: Likewise.
* sysdeps/i386/i686/multiarch/ifunc-defines.sym: Add
COMMON_CPUID_INDEX_7.
* sysdeps/i386/i686/multiarch/bcopy.S: Enable ERMS optimization
for Fast_ERMS.
* sysdeps/i386/i686/multiarch/bzero.S: Likewise.
* sysdeps/i386/i686/multiarch/memcpy.S: Likewise.
* sysdeps/i386/i686/multiarch/memcpy_chk.S: Likewise.
* sysdeps/i386/i686/multiarch/memmove.S: Likewise.
* sysdeps/i386/i686/multiarch/memmove_chk.S: Likewise.
* sysdeps/i386/i686/multiarch/mempcpy.S: Likewise.
* sysdeps/i386/i686/multiarch/mempcpy_chk.S: Likewise.
* sysdeps/i386/i686/multiarch/memset.S: Likewise.
* sysdeps/i386/i686/multiarch/memset_chk.S: Likewise.
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 700010d..6bcef4c 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -25,7 +25,9 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
strcasecmp_l-sse4 strncase_l-sse4 \
bcopy-sse2-unaligned memcpy-sse2-unaligned \
mempcpy-sse2-unaligned memmove-sse2-unaligned \
- strcspn-c strpbrk-c strspn-c
+ strcspn-c strpbrk-c strspn-c \
+ bcopy-erms memcpy-erms memmove-erms mempcpy-erms \
+ bzero-erms memset-erms
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/bcopy-erms.S b/sysdeps/i386/i686/multiarch/bcopy-erms.S
new file mode 100644
index 0000000..da9e160
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bcopy-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define USE_AS_BCOPY
+#define MEMCPY __bcopy_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S
index ce6661b..04f5a3a 100644
--- a/sysdeps/i386/i686/multiarch/bcopy.S
+++ b/sysdeps/i386/i686/multiarch/bcopy.S
@@ -27,6 +27,9 @@
ENTRY(bcopy)
.type bcopy, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__bcopy_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__bcopy_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/bzero-erms.S b/sysdeps/i386/i686/multiarch/bzero-erms.S
new file mode 100644
index 0000000..2c3bed6
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/bzero-erms.S
@@ -0,0 +1,3 @@
+#define USE_AS_BZERO
+#define __memset_erms __bzero_erms
+#include "memset-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/bzero.S b/sysdeps/i386/i686/multiarch/bzero.S
index 738ca69..a61b5d2 100644
--- a/sysdeps/i386/i686/multiarch/bzero.S
+++ b/sysdeps/i386/i686/multiarch/bzero.S
@@ -27,6 +27,9 @@
ENTRY(__bzero)
.type __bzero, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__bzero_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__bzero_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/ifunc-defines.sym b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
index 96e9cfa..3df946f 100644
--- a/sysdeps/i386/i686/multiarch/ifunc-defines.sym
+++ b/sysdeps/i386/i686/multiarch/ifunc-defines.sym
@@ -16,4 +16,5 @@ FEATURE_OFFSET offsetof (struct cpu_features, feature)
FEATURE_SIZE sizeof (unsigned int)
COMMON_CPUID_INDEX_1
+COMMON_CPUID_INDEX_7
FEATURE_INDEX_1
diff --git a/sysdeps/i386/i686/multiarch/memcpy-erms.S b/sysdeps/i386/i686/multiarch/memcpy-erms.S
new file mode 100644
index 0000000..f134e79
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memcpy-erms.S
@@ -0,0 +1,102 @@
+/* memcpy with Enhanced REP MOVSB/STOSB
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_erms
+# define MEMCPY_CHK __memcpy_chk_erms
+#endif
+
+#ifdef USE_AS_BCOPY
+# define STR2 12
+# define STR1 STR2+4
+# define N STR1+4
+#else
+# define STR1 12
+# define STR2 STR1+4
+# define N STR2+4
+#endif
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+ .section .text.erms,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+ENTRY (MEMCPY)
+ PUSH (%esi)
+ PUSH (%edi)
+ movl N(%esp), %ecx
+ movl STR1(%esp), %edi
+ movl STR2(%esp), %esi
+ mov %edi, %eax
+#ifdef USE_AS_MEMPCPY
+ add %ecx, %eax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+ cmp %esi, %edi
+ jbe L(copy_forward)
+ lea (%esi,%ecx), %edx
+ cmp %edx, %edi
+ jb L(copy_backward)
+L(copy_forward):
+#endif
+
+ rep movsb
+ POP (%edi)
+ POP (%esi)
+ ret
+
+#ifdef USE_AS_MEMMOVE
+L(copy_backward):
+ lea -1(%edi,%ecx), %edi
+ lea -1(%esi,%ecx), %esi
+ std
+ rep movsb
+ cld
+ POP (%edi)
+ POP (%esi)
+ ret
+#endif
+
+END (MEMCPY)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
index 652b5a2..79ae41f 100644
--- a/sysdeps/i386/i686/multiarch/memcpy.S
+++ b/sysdeps/i386/i686/multiarch/memcpy.S
@@ -29,6 +29,9 @@
ENTRY(memcpy)
.type memcpy, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memcpy_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memcpy_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
index 0eee32c..dd1d38a 100644
--- a/sysdeps/i386/i686/multiarch/memcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -30,6 +30,9 @@
ENTRY(__memcpy_chk)
.type __memcpy_chk, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memcpy_chk_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/memmove-erms.S b/sysdeps/i386/i686/multiarch/memmove-erms.S
new file mode 100644
index 0000000..357289a
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memmove-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_erms
+#define MEMCPY_CHK __memmove_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
index 725a421..13223b3 100644
--- a/sysdeps/i386/i686/multiarch/memmove.S
+++ b/sysdeps/i386/i686/multiarch/memmove.S
@@ -27,6 +27,9 @@
ENTRY(memmove)
.type memmove, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memmove_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memmove_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
index a29bbc9..ed000ee 100644
--- a/sysdeps/i386/i686/multiarch/memmove_chk.S
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -27,6 +27,9 @@
ENTRY(__memmove_chk)
.type __memmove_chk, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memmove_chk_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
@@ -90,5 +93,17 @@ __memmove_chk_ia32:
jmp __memmove_ia32
cfi_endproc
.size __memmove_chk_ia32, .-__memmove_chk_ia32
+
+ .type __memmove_chk_erms, @function
+ .p2align 4;
+__memmove_chk_erms:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memmove_erms
+ cfi_endproc
+ .size __memmove_chk_erms, .-__memmove_chk_erms
# endif
#endif
diff --git a/sysdeps/i386/i686/multiarch/mempcpy-erms.S b/sysdeps/i386/i686/multiarch/mempcpy-erms.S
new file mode 100644
index 0000000..01d3bf8
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/mempcpy-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_erms
+#define MEMCPY_CHK __mempcpy_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
index b46f3fc..cceae9b 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -29,6 +29,9 @@
ENTRY(__mempcpy)
.type __mempcpy, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__mempcpy_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__mempcpy_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
index 30f3629..97d5179 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -30,6 +30,9 @@
ENTRY(__mempcpy_chk)
.type __mempcpy_chk, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__mempcpy_chk_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/memset-erms.S b/sysdeps/i386/i686/multiarch/memset-erms.S
new file mode 100644
index 0000000..807a6e4
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/memset-erms.S
@@ -0,0 +1,69 @@
+/* memset with Enhanced REP MOVSB/STOSB
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#ifndef NOT_IN_libc
+
+#include <sysdep.h>
+
+#define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0)
+
+#define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG)
+
+#define PUSH(REG) pushl REG; CFI_PUSH (REG)
+#define POP(REG) popl REG; CFI_POP (REG)
+
+#define STR1 8
+#ifdef USE_AS_BZERO
+#define N STR1+4
+#else
+#define STR2 STR1+4
+#define N STR2+4
+#endif
+
+ .section .text.erms,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc && !defined USE_AS_BZERO
+ENTRY (__memset_chk_erms)
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+#endif
+ENTRY (__memset_erms)
+ PUSH (%edi)
+ movl N(%esp), %ecx
+ movl STR1(%esp), %edi
+#ifdef USE_AS_BZERO
+ xor %eax, %eax
+#else
+ movzbl STR2(%esp), %eax
+ mov %edi, %edx
+#endif
+ rep stosb
+#ifndef USE_AS_BZERO
+ mov %edx, %eax
+#endif
+ POP (%edi)
+ ret
+END (__memset_erms)
+
+#endif
diff --git a/sysdeps/i386/i686/multiarch/memset.S b/sysdeps/i386/i686/multiarch/memset.S
index 14180e4..3c11b91 100644
--- a/sysdeps/i386/i686/multiarch/memset.S
+++ b/sysdeps/i386/i686/multiarch/memset.S
@@ -27,6 +27,9 @@
ENTRY(memset)
.type memset, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memset_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memset_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
diff --git a/sysdeps/i386/i686/multiarch/memset_chk.S b/sysdeps/i386/i686/multiarch/memset_chk.S
index d73f202..fa1c5fb 100644
--- a/sysdeps/i386/i686/multiarch/memset_chk.S
+++ b/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -27,6 +27,9 @@
ENTRY(__memset_chk)
.type __memset_chk, @gnu_indirect_function
LOAD_GOT_AND_RTLD_GLOBAL_RO
+ LOAD_FUNC_GOT_EAX (__memset_chk_erms)
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
LOAD_FUNC_GOT_EAX (__memset_chk_ia32)
HAS_CPU_FEATURE (SSE2)
jz 2f
@@ -78,5 +81,17 @@ __memset_chk_ia32:
jmp __memset_ia32
cfi_endproc
.size __memset_chk_ia32, .-__memset_chk_ia32
+
+ .type __memset_chk_erms, @function
+ .p2align 4;
+__memset_chk_erms:
+ cfi_startproc
+ CALL_MCOUNT
+ movl 12(%esp), %eax
+ cmpl %eax, 16(%esp)
+ jb __chk_fail
+ jmp __memset_erms
+ cfi_endproc
+ .size __memset_chk_erms, .-__memset_chk_erms
# endif
#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=312879aedef40f9e1eb30c517d516c0a29cd030c
commit 312879aedef40f9e1eb30c517d516c0a29cd030c
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Sep 15 16:16:10 2011 -0700
Add 64-bit ERMS memcpy and memset
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memcpy-erms, mempcpy-erms, memmove-erms and memset-erms.
* sysdeps/x86_64/multiarch/memcpy-erms.S: New.
* sysdeps/x86_64/multiarch/memmove-erms.S: Likewise.
* sysdeps/x86_64/multiarch/mempcpy-erms.S: Likewise.
* sysdeps/x86_64/multiarch/memset-erms.S: Likewise.
* sysdeps/x86_64/multiarch/memcpy.S: Enable ERMS optimization
for Fast_ERMS.
* sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
* sysdeps/x86_64/multiarch/memmove.c: Likewise.
* sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
* sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
* sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
* sysdeps/x86_64/multiarch/memset.S: Likewise.
* sysdeps/x86_64/multiarch/memset_chk.S: Likewise.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d234f4a..2f29a2a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -20,7 +20,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcat-sse2-unaligned strncat-sse2-unaligned \
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
- memset-avx512-no-vzeroupper
+ memset-avx512-no-vzeroupper \
+ memcpy-erms mempcpy-erms memmove-erms \
+ memset-erms
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memcpy-erms.S b/sysdeps/x86_64/multiarch/memcpy-erms.S
new file mode 100644
index 0000000..07f4843
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-erms.S
@@ -0,0 +1,73 @@
+/* memcpy with Enhanced REP MOVSB/STOSB
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+# include "asm-syntax.h"
+
+# ifndef MEMCPY
+# define MEMCPY __memcpy_erms
+# define MEMCPY_CHK __memcpy_chk_erms
+# endif
+
+ .section .text.erms,"ax",@progbits
+# if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+# endif
+
+ENTRY (MEMCPY)
+ mov %rdi, %rax
+ mov %rdx, %rcx
+# ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+# endif
+
+# ifdef USE_AS_MEMMOVE
+ cmp %rsi, %rdi
+ jbe L(copy_forward)
+ lea (%rsi,%rcx), %rdx
+ cmp %rdx, %rdi
+ jb L(copy_backward)
+L(copy_forward):
+# endif
+
+ rep movsb
+ ret
+
+# ifdef USE_AS_MEMMOVE
+L(copy_backward):
+ lea -1(%rdi,%rcx), %rdi
+ lea -1(%rsi,%rcx), %rsi
+ std
+ rep movsb
+ cld
+ ret
+# endif
+
+END (MEMCPY)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 8882590..58d9223 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -30,6 +30,9 @@
ENTRY(__new_memcpy)
.type __new_memcpy, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ lea __memcpy_erms(%rip), %RAX_LP
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 648217e..0e21c09 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -30,6 +30,9 @@
ENTRY(__memcpy_chk)
.type __memcpy_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __memcpy_chk_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/memmove-erms.S b/sysdeps/x86_64/multiarch/memmove-erms.S
new file mode 100644
index 0000000..357289a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY __memmove_erms
+#define MEMCPY_CHK __memmove_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index 8da5640..3777bea 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -35,6 +35,7 @@
extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
+extern __typeof (__redirect_memmove) __memmove_erms attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
# ifdef HAVE_AVX512_ASM_SUPPORT
extern __typeof (__redirect_memmove) __memmove_avx512_no_vzeroupper attribute_hidden;
@@ -52,6 +53,9 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
ifunc symbol properly. */
extern __typeof (__redirect_memmove) __libc_memmove;
libc_ifunc (__libc_memmove,
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ ? __memmove_erms
+ : (
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
&& HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
@@ -63,7 +67,7 @@ libc_ifunc (__libc_memmove,
: (HAS_CPU_FEATURE (SSSE3)
? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
? __memmove_ssse3_back : __memmove_ssse3)
- : __memmove_sse2)));
+ : __memmove_sse2))));
strong_alias (__libc_memmove, memmove)
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index f64da63..4cd360a 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -25,6 +25,7 @@
extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
+extern __typeof (__memmove_chk) __memmove_chk_erms attribute_hidden;
extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
# ifdef HAVE_AVX512_ASM_SUPPORT
extern __typeof (__memmove_chk) __memmove_chk_avx512_no_vzeroupper attribute_hidden;
@@ -33,6 +34,9 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
#include "debug/memmove_chk.c"
libc_ifunc (__memmove_chk,
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ ? __memmove_chk_erms
+ : (
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
&& HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
@@ -43,4 +47,4 @@ libc_ifunc (__memmove_chk,
(HAS_CPU_FEATURE (SSSE3)
? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
- : __memmove_chk_sse2));
+ : __memmove_chk_sse2)));
diff --git a/sysdeps/x86_64/multiarch/mempcpy-erms.S b/sysdeps/x86_64/multiarch/mempcpy-erms.S
new file mode 100644
index 0000000..01d3bf8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-erms.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY __mempcpy_erms
+#define MEMCPY_CHK __mempcpy_chk_erms
+#include "memcpy-erms.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index ed78623..b85cf27 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -28,6 +28,9 @@
ENTRY(__mempcpy)
.type __mempcpy, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __mempcpy_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 6e8a89d..de888f3 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -30,6 +30,9 @@
ENTRY(__mempcpy_chk)
.type __mempcpy_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __mempcpy_chk_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
#ifdef HAVE_AVX512_ASM_SUPPORT
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
diff --git a/sysdeps/x86_64/multiarch/memset-erms.S b/sysdeps/x86_64/multiarch/memset-erms.S
new file mode 100644
index 0000000..af9f80b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-erms.S
@@ -0,0 +1,40 @@
+/* memset with Enhanced REP MOVSB/STOSB
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#ifndef NOT_IN_libc
+
+ .text
+# ifdef SHARED
+ENTRY (__memset_chk_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+# endif
+
+ENTRY (__memset_erms)
+ mov %rdx, %rcx
+ movzbl %sil, %eax
+ mov %rdi, %rdx
+ rep stosb
+ mov %rdx, %rax
+ ret
+END (__memset_erms)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 8e3b9b9..dda8185 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -26,6 +26,9 @@
ENTRY(memset)
.type memset, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __memset_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
leaq __memset_sse2(%rip), %rax
HAS_ARCH_FEATURE (AVX2_Usable)
jz 2f
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
index 9a7b270..b8c9940 100644
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -26,6 +26,9 @@
ENTRY(__memset_chk)
.type __memset_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ leaq __memset_chk_erms(%rip), %rax
+ HAS_ARCH_FEATURE (Fast_ERMS)
+ jnz 2f
leaq __memset_chk_sse2(%rip), %rax
HAS_ARCH_FEATURE (AVX2_Usable)
jz 2f
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=487dc028667e43ed55f407fda3c07fc31ecd1554
commit 487dc028667e43ed55f407fda3c07fc31ecd1554
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Sep 15 15:47:01 2011 -0700
Initial ERMS support
* sysdeps/x86/cpu-features.h (bit_arch_Fast_ERMS): New.
(bit_cpu_ERMS): Likewise.
(index_cpu_ERMS): Likewise.
(index_arch_Fast_ERMS): Likewise.
(reg_ERMS): Likewise.
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index e06eb7e..1c09712 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -35,6 +35,7 @@
#define bit_arch_I686 (1 << 15)
#define bit_arch_Prefer_MAP_32BIT_EXEC (1 << 16)
#define bit_arch_Prefer_No_VZEROUPPER (1 << 17)
+#define bit_arch_Fast_ERMS (1 << 18)
/* CPUID Feature flags. */
@@ -52,6 +53,7 @@
#define bit_cpu_FMA4 (1 << 16)
/* COMMON_CPUID_INDEX_7. */
+#define bit_cpu_ERMS (1 << 9)
#define bit_cpu_RTM (1 << 11)
#define bit_cpu_AVX2 (1 << 5)
#define bit_cpu_AVX512F (1 << 16)
@@ -83,6 +85,7 @@
# define index_cpu_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_cpu_AVX COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_cpu_AVX2 COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
+# define index_cpu_ERMS COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
# define index_arch_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
@@ -101,6 +104,7 @@
# define index_arch_I686 FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Fast_ERMS FEATURE_INDEX_1*FEATURE_SIZE
# if defined (_LIBC) && !IS_IN (nonlib)
@@ -226,6 +230,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_cpu_AVX2 COMMON_CPUID_INDEX_7
# define index_cpu_AVX512F COMMON_CPUID_INDEX_7
# define index_cpu_AVX512DQ COMMON_CPUID_INDEX_7
+# define index_cpu_ERMS COMMON_CPUID_INDEX_7
# define index_cpu_RTM COMMON_CPUID_INDEX_7
# define index_cpu_FMA COMMON_CPUID_INDEX_1
# define index_cpu_FMA4 COMMON_CPUID_INDEX_80000001
@@ -242,6 +247,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define reg_AVX2 ebx
# define reg_AVX512F ebx
# define reg_AVX512DQ ebx
+# define reg_ERMS ebx
# define reg_RTM ebx
# define reg_FMA ecx
# define reg_FMA4 ecx
@@ -265,6 +271,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_arch_I686 FEATURE_INDEX_1
# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
+# define index_arch_Fast_ERMS FEATURE_INDEX_1
#endif /* !__ASSEMBLER__ */
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources