This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
GNU C Library master sources branch master updated. glibc-2.23-138-g88b57b8

From: hjl at sourceware dot org
To: glibc-cvs at sourceware dot org
Date: 31 Mar 2016 17:04:57 -0000
Subject: GNU C Library master sources branch master updated. glibc-2.23-138-g88b57b8
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  88b57b8ed41d5ecf2e1bdfc19556f9246a665ebb (commit)
      from  5cdd1989d1d2f135d02e66250f37ba8e767f9772 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=88b57b8ed41d5ecf2e1bdfc19556f9246a665ebb

commit 88b57b8ed41d5ecf2e1bdfc19556f9246a665ebb
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Mar 31 10:04:26 2016 -0700

    Add x86-64 memmove with unaligned load/store and rep movsb
    
    Implement x86-64 memmove with unaligned load/store and rep movsb.
    Support 16-byte, 32-byte and 64-byte vector register sizes.  When
    size <= 8 times of vector register size, there is no check for
    address overlap bewteen source and destination.  Since overhead for
    overlap check is small when size > 8 times of vector register size,
    memcpy is an alias of memmove.
    
    A single file provides 2 implementations of memmove, one with rep movsb
    and the other without rep movsb.  They share the same codes when size is
    between 2 times of vector register size and REP_MOVSB_THRESHOLD which
    is 2KB for 16-byte vector register size and scaled up by large vector
    register size.
    
    Key features:
    
    1. Use overlapping load and store to avoid branch.
    2. For size <= 8 times of vector register size, load  all sources into
    registers and store them together.
    3. If there is no address overlap bewteen source and destination, copy
    from both ends with 4 times of vector register size at a time.
    4. If address of destination > address of source, backward copy 8 times
    of vector register size at a time.
    5. Otherwise, forward copy 8 times of vector register size at a time.
    6. Use rep movsb only for forward copy.  Avoid slow backward rep movsb
    by fallbacking to backward copy 8 times of vector register size at a
    time.
    7. Skip when address of destination == address of source.
    
    	[BZ #19776]
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
    	memmove-avx512-unaligned-erms.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Test
    	__memmove_chk_avx512_unaligned_2,
    	__memmove_chk_avx512_unaligned_erms,
    	__memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
    	__memmove_chk_sse2_unaligned_2,
    	__memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
    	__memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
    	__memmove_avx512_unaligned_erms, __memmove_erms,
    	__memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
    	__memcpy_chk_avx512_unaligned_2,
    	__memcpy_chk_avx512_unaligned_erms,
    	__memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
    	__memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
    	__memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
    	__memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
    	__memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
    	__memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
    	__mempcpy_chk_avx512_unaligned_erms,
    	__mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
    	__mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms,
    	__mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms,
    	__mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms,
    	__mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms and
    	__mempcpy_erms.
    	* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
    	file.
    	* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
    	Likwise.
    	* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
    	Likwise.
    	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
    	Likwise.

diff --git a/ChangeLog b/ChangeLog
index 632da3c..100764f 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,43 @@
+2016-03-31   H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #19776]
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
+	memmove-avx512-unaligned-erms.
+	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Test
+	__memmove_chk_avx512_unaligned_2,
+	__memmove_chk_avx512_unaligned_erms,
+	__memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
+	__memmove_chk_sse2_unaligned_2,
+	__memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
+	__memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
+	__memmove_avx512_unaligned_erms, __memmove_erms,
+	__memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
+	__memcpy_chk_avx512_unaligned_2,
+	__memcpy_chk_avx512_unaligned_erms,
+	__memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
+	__memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
+	__memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
+	__memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
+	__memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
+	__memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
+	__mempcpy_chk_avx512_unaligned_erms,
+	__mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
+	__mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms,
+	__mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms,
+	__mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms,
+	__mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms and
+	__mempcpy_erms.
+	* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
+	file.
+	* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
+	Likwise.
+	* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
+	Likwise.
+	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
+	Likwise.
+
 2016-03-31  Stefan Liebler  <stli@linux.vnet.ibm.com>
 
 	* sysdeps/s390/bits/link.h: (La_s390_vr) New typedef.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 7fc89c2..ef4dbc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -20,7 +20,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
 		   strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
-		   memset-avx512-no-vzeroupper
+		   memset-avx512-no-vzeroupper \
+		   memmove-sse2-unaligned-erms \
+		   memmove-avx-unaligned-erms \
+		   memmove-avx512-unaligned-erms
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 188b6d3..9204da4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -52,17 +52,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memmove_chk_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_chk_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_chk_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_chk_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memmove.S.  */
@@ -70,15 +86,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_avx_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memmove_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1,
+			      __memmove_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1,
+			      __memmove_sse2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
@@ -267,17 +300,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memcpy_chk_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_chk_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_chk_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_chk_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
@@ -285,6 +334,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -293,8 +348,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memcpy_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+			      __memcpy_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+			      __memcpy_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
@@ -303,17 +369,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __mempcpy_chk_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_chk_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_chk_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_chk_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
@@ -322,14 +404,31 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __mempcpy_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+			      __mempcpy_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+			      __mempcpy_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
new file mode 100644
index 0000000..3a72c7e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -0,0 +1,9 @@
+#define VEC_SIZE	32
+#define VEC(i)		ymm##i
+#define VMOVU		vmovdqu
+#define VMOVA		vmovdqa
+
+#define SECTION(p)		p##.avx
+#define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
+
+#include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
new file mode 100644
index 0000000..38358fa
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -0,0 +1,11 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE	64
+# define VEC(i)		zmm##i
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
+# define SECTION(p)		p##.avx512
+# define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
new file mode 100644
index 0000000..52b9ae0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -0,0 +1,9 @@
+#define VEC_SIZE	16
+#define VEC(i)		xmm##i
+#define VMOVU		movdqu
+#define VMOVA		movdqa
+
+#define SECTION(p)		p
+#define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
+
+#include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
new file mode 100644
index 0000000..cf645dd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -0,0 +1,462 @@
+/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* memmove/memcpy/mempcpy is implemented as:
+   1. Use overlapping load and store to avoid branch.
+   2. Use 8-bit or 32-bit displacements for branches and nop paddings
+      to avoid long nop between instructions.
+   3. Load all sources into registers and store them together to avoid
+      possible address overflap between source and destination.
+   4. If size is 2 * VEC_SIZE or less, load all sources into registers
+      and store them together.
+   5. If there is no address overflap, copy from both ends with
+      4 * VEC_SIZE at a time.
+   6. If size is 8 * VEC_SIZE or less, load all sources into registers
+      and store them together.
+   7. If address of destination > address of source, backward copy
+      8 * VEC_SIZE at a time.
+   8. Otherwise, forward copy 8 * VEC_SIZE at a time.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef VZEROUPPER
+#  if VEC_SIZE > 16
+#   define VZEROUPPER vzeroupper
+#  else
+#   define VZEROUPPER
+#  endif
+# endif
+
+/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
+   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
+   memcpy micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP MOVSB becomes faster than SSE2 optimization
+   on processors with Enhanced REP MOVSB.  Since larger register size
+   can move more data with a single load and store, the threshold is
+   higher with larger register size.  */
+# ifndef REP_MOVSB_THRESHOLD
+#  define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
+# endif
+
+# ifndef SECTION
+#  error SECTION is not defined!
+# endif
+	.section SECTION(.text),"ax",@progbits
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+	movq	%rdi, %rax
+L(start):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VZEROUPPER
+	ret
+END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start_erms)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+# endif
+
+# if VEC_SIZE == 16
+/* Only used to measure performance of REP MOVSB.  */
+#  ifdef SHARED
+ENTRY (__mempcpy_erms)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(movsb)
+END (__mempcpy_erms)
+#  endif
+
+ENTRY (__memmove_erms)
+	movq	%rdi, %rax
+	movq	%rdx, %rcx
+	cmpq	%rsi, %rdi
+	jbe	1f
+	leaq	(%rsi,%rcx), %rdx
+	cmpq	%rdx, %rdi
+	jb	L(movsb_backward)
+1:
+	rep movsb
+	ret
+L(movsb_backward):
+	leaq	-1(%rdi,%rcx), %rdi
+	leaq	-1(%rsi,%rcx), %rsi
+	std
+	rep movsb
+	cld
+	ret
+END (__memmove_erms)
+strong_alias (__memmove_erms, __memcpy_erms)
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+	movq	%rdi, %rax
+L(start_erms):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(movsb_more_2x_vec)
+L(last_2x_vec):
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+L(return):
+	VZEROUPPER
+	ret
+
+L(movsb):
+	cmpq	%rsi, %rdi
+	je	L(nop)
+	jb	1f
+	leaq	(%rsi,%rdx), %r9
+	cmpq	%r9, %rdi
+	/* Avoid slow backward REP MOVSB.  */
+# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
+#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
+# endif
+	jb	L(more_8x_vec_backward)
+1:
+	movq	%rdx, %rcx
+	rep movsb
+L(nop):
+	ret
+
+	.p2align 4
+L(movsb_more_2x_vec):
+	cmpq	$REP_MOVSB_THRESHOLD, %rdx
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	ja.d32	L(movsb)
+	.p2align 4
+L(more_2x_vec):
+	/* More than 2 * VEC.  */
+	cmpq	%rsi, %rdi
+	je	L(nop)
+	jb	L(copy_forward)
+	leaq	(%rsi,%rdx), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(more_2x_vec_overlap)
+L(copy_forward):
+	leaq	(%rdi,%rdx), %rcx
+	cmpq	%rcx, %rsi
+	jb	L(more_2x_vec_overlap)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	jbe.d32	L(return)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(0)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(1), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 8), %rdx
+# if  VEC_SIZE == 16
+	jbe	L(return)
+# else
+	/* Use 8-bit displacement to avoid long nop between
+	   instructions.  */
+	jbe	L(return_disp8)
+# endif
+	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+	addq	%rdi, %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+	andq	$-(VEC_SIZE * 4), %rcx
+	movq	%rcx, %r11
+	subq	%rdi, %r11
+	addq	%r11, %rsi
+	cmpq	%rdx, %rcx
+	/* Use 8-bit displacement to avoid long nop between
+	   instructions.  */
+	je	L(return_disp8)
+	movq	%rsi, %r10
+	subq	%rcx, %r10
+	leaq	VEC_SIZE(%r10), %r9
+	leaq	(VEC_SIZE * 2)(%r10), %r8
+	leaq	(VEC_SIZE * 3)(%r10), %r11
+	.p2align 4
+L(loop):
+	VMOVU	(%rcx,%r10), %VEC(0)
+	VMOVU	(%rcx,%r9), %VEC(1)
+	VMOVU	(%rcx,%r8), %VEC(2)
+	VMOVU	(%rcx,%r11), %VEC(3)
+	VMOVA	%VEC(0), (%rcx)
+	VMOVA	%VEC(1), VEC_SIZE(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rcx)
+	addq	$(VEC_SIZE * 4), %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+L(return_disp8):
+	VZEROUPPER
+	ret
+L(less_vec):
+	/* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+	cmpb	$32, %dl
+	jae	L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+# endif
+	cmpb	$8, %dl
+	jae	L(between_8_15)
+	cmpb	$4, %dl
+	jae	L(between_4_7)
+	cmpb	$1, %dl
+	ja	L(between_2_3)
+	jb	1f
+	movzbl	(%rsi), %ecx
+	movb	%cl, (%rdi)
+1:
+	ret
+# if VEC_SIZE > 32
+L(between_32_63):
+	/* From 32 to 63.  No branch when size == 32.  */
+	vmovdqu	(%rsi), %ymm0
+	vmovdqu	-32(%rsi,%rdx), %ymm1
+	vmovdqu	%ymm0, (%rdi)
+	vmovdqu	%ymm1, -32(%rdi,%rdx)
+	VZEROUPPER
+	ret
+# endif
+# if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi,%rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi,%rdx)
+	ret
+# endif
+L(between_8_15):
+	/* From 8 to 15.  No branch when size == 8.  */
+	movq	-8(%rsi,%rdx), %rcx
+	movq	(%rsi), %rsi
+	movq	%rcx, -8(%rdi,%rdx)
+	movq	%rsi, (%rdi)
+	ret
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi,%rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi,%rdx)
+	movl	%esi, (%rdi)
+	ret
+L(between_2_3):
+	/* From 2 to 3.  No branch when size == 2.  */
+	movzwl	-2(%rsi,%rdx), %ecx
+	movzwl	(%rsi), %esi
+	movw	%cx, -2(%rdi,%rdx)
+	movw	%si, (%rdi)
+	ret
+
+# if VEC_SIZE > 16
+	/* Align to 16 bytes to avoid long nop between instructions.  */
+	.p2align 4
+# endif
+L(more_2x_vec_overlap):
+	/* More than 2 * VEC and there is overlap bewteen destination
+	   and source.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+L(between_4x_vec_and_8x_vec):
+	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+	VZEROUPPER
+	ret
+L(last_4x_vec):
+	/* Copy from 2 * VEC to 4 * VEC. */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VZEROUPPER
+	ret
+L(between_0_and_4x_vec):
+	/* Copy from 0 to 4 * VEC. */
+	cmpl	$(VEC_SIZE * 2), %edx
+	jae	L(last_4x_vec)
+	/* Copy from 0 to 2 * VEC. */
+	cmpl	$VEC_SIZE, %edx
+	jae	L(last_2x_vec)
+	/* Copy from 0 to VEC. */
+	VZEROUPPER
+	jmp	L(less_vec)
+L(more_8x_vec):
+	cmpq	%rsi, %rdi
+	ja	L(more_8x_vec_backward)
+
+	.p2align 4
+L(loop_8x_vec_forward):
+	/* Copy 8 * VEC a time forward.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 4)(%rsi), %VEC(4)
+	VMOVU	(VEC_SIZE * 5)(%rsi), %VEC(5)
+	VMOVU	(VEC_SIZE * 6)(%rsi), %VEC(6)
+	VMOVU	(VEC_SIZE * 7)(%rsi), %VEC(7)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(4), (VEC_SIZE * 4)(%rdi)
+	VMOVU	%VEC(5), (VEC_SIZE * 5)(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 6)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 7)(%rdi)
+	addq	$(VEC_SIZE * 8), %rdi
+	addq	$(VEC_SIZE * 8), %rsi
+	subq	$(VEC_SIZE * 8), %rdx
+	cmpq	$(VEC_SIZE * 8), %rdx
+	je	L(between_4x_vec_and_8x_vec)
+	ja	L(loop_8x_vec_forward)
+	/* Less than 8 * VEC to copy.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(between_0_and_4x_vec)
+	jmp	L(between_4x_vec_and_8x_vec)
+
+	.p2align 4
+L(more_8x_vec_backward):
+	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
+	leaq	-VEC_SIZE(%rdi, %rdx), %r9
+
+	.p2align 4
+L(loop_8x_vec_backward):
+	/* Copy 8 * VEC a time backward.  */
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+	VMOVU	-(VEC_SIZE * 4)(%rcx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 5)(%rcx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 6)(%rcx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 7)(%rcx), %VEC(7)
+	VMOVU	%VEC(0), (%r9)
+	VMOVU	%VEC(1), -VEC_SIZE(%r9)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	VMOVU	%VEC(3), -(VEC_SIZE * 3)(%r9)
+	VMOVU	%VEC(4), -(VEC_SIZE * 4)(%r9)
+	VMOVU	%VEC(5), -(VEC_SIZE * 5)(%r9)
+	VMOVU	%VEC(6), -(VEC_SIZE * 6)(%r9)
+	VMOVU	%VEC(7), -(VEC_SIZE * 7)(%r9)
+	subq	$(VEC_SIZE * 8), %rcx
+	subq	$(VEC_SIZE * 8), %r9
+	subq	$(VEC_SIZE * 8), %rdx
+	cmpq	$(VEC_SIZE * 8), %rdx
+	je	L(between_4x_vec_and_8x_vec)
+	ja	L(loop_8x_vec_backward)
+	/* Less than 8 * VEC to copy.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(between_0_and_4x_vec)
+	jmp	L(between_4x_vec_and_8x_vec)
+END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+
+# ifdef SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
+	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
+	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
+	      MEMMOVE_SYMBOL (__memcpy, unaligned_2))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
+	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
+# endif
+
+#endif

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                          |   40 ++
 sysdeps/x86_64/multiarch/Makefile                  |    5 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c         |   99 +++++
 .../x86_64/multiarch/memmove-avx-unaligned-erms.S  |    9 +
 .../multiarch/memmove-avx512-unaligned-erms.S      |   11 +
 .../x86_64/multiarch/memmove-sse2-unaligned-erms.S |    9 +
 .../x86_64/multiarch/memmove-vec-unaligned-erms.S  |  462 ++++++++++++++++++++
 7 files changed, 634 insertions(+), 1 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
 create mode 100644 sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
 create mode 100644 sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
 create mode 100644 sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S


hooks/post-receive
-- 
GNU C Library master sources
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]