[PATCH RFC] Improve 64-bit memcpy/memmove for Core i7 with unaligned AVX2 instructions


From: Ma Ling <ling.ml@alibaba-inc.com>

In this version we avoid branch instructions and force the destination to be
aligned, using unaligned AVX2 instructions.  We modified gcc.403 so that only
the memcpy function is measured; the gcc.403 benchmarks indicate this version
improves performance by 4% to 16%, depending on the case.
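
For readers who want the idea in one place before diving into the assembly, the
sketch below (plain C with AVX2 intrinsics; illustrative names, not part of the
patch) shows the forward-copy strategy used for mid-sized blocks: load the
possibly unaligned head and tail once up front, bump the destination to the next
32-byte boundary, then run a main loop whose stores are always aligned, so there
are no per-iteration alignment or remainder branches.  The real code additionally
handles small sizes, overlapping memmove, a rep movsb path, and non-temporal
stores for very large copies.

/* Illustrative sketch only, not part of the patch.  Assumes n >= 32,
   non-overlapping buffers and AVX2 support; compile with -mavx2.
   (The patch itself only takes this path for larger sizes.)  */
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static void *
copy_forward_sketch (void *dst, const void *src, size_t n)
{
  char *d = dst;
  const char *s = src;

  /* Load the possibly unaligned head and tail up front; storing them
     last covers every byte the aligned loop below skips, so no
     remainder branches are needed.  */
  __m256i head = _mm256_loadu_si256 ((const __m256i *) s);
  __m256i tail = _mm256_loadu_si256 ((const __m256i *) (s + n - 32));

  /* Bump the destination to the next 32-byte boundary and shift the
     source and remaining length by the same skew.  */
  size_t skew = 32 - ((uintptr_t) d & 31);
  const char *sp = s + skew;
  char *dp = d + skew;		/* 32-byte aligned from here on.  */
  size_t left = n - skew;

  while (left >= 32)
    {
      __m256i v = _mm256_loadu_si256 ((const __m256i *) sp);
      _mm256_store_si256 ((__m256i *) dp, v);	/* Aligned store.  */
      sp += 32;
      dp += 32;
      left -= 32;
    }

  /* The saved head and tail cover everything the loop did not.  */
  _mm256_storeu_si256 ((__m256i *) d, head);
  _mm256_storeu_si256 ((__m256i *) (d + n - 32), tail);
  return dst;
}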

Ondra, I will send out results from your memcpy_profile.
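
(Until then, for anyone who wants a rough local comparison, a crude
self-contained timer along the following lines is enough to see the trend;
it is not the memcpy_profile tool, and the sizes, iteration counts and the
barrier trick are only illustrative.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

static double
time_copy (char *dst, const char *src, size_t size, long iters)
{
  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (long i = 0; i < iters; i++)
    {
      memcpy (dst, src, size);
      /* Keep the compiler from optimising the copies away.  */
      __asm__ __volatile__ ("" : : "r" (dst) : "memory");
    }
  clock_gettime (CLOCK_MONOTONIC, &t1);
  return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) * 1e-9;
}

int
main (void)
{
  static const size_t sizes[] = { 64, 256, 2048, 1 << 20 };
  const long iters = 100000;
  size_t max = 1 << 20;
  char *src = malloc (max), *dst = malloc (max);
  if (src == NULL || dst == NULL)
    return 1;
  memset (src, 1, max);

  for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
    {
      double secs = time_copy (dst, src, sizes[i], iters);
      printf ("%8zu bytes: %6.2f GB/s\n", sizes[i],
	      (double) sizes[i] * iters / secs / 1e9);
    }
  free (src);
  free (dst);
  return 0;
}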

Best Regards
Ling
---
 sysdeps/x86_64/multiarch/Makefile                 |   5 +-
 sysdeps/x86_64/multiarch/ifunc-defines.sym        |   2 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c        |  11 +
 sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S  | 438 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S |   4 +
 sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S |   4 +
 6 files changed, 462 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S
 create mode 100644 sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S
 create mode 100644 sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..02c0a2a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,8 +8,9 @@ ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
-		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
-		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back memmove-ssse3-back \
+		   memcpy-avx2-unaligned mempcpy-avx2-unaligned memmove-avx2-unaligned \
+		   strcasestr-nonascii strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
index eb1538a..448b8c4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -17,4 +17,6 @@ FEATURE_OFFSET		offsetof (struct cpu_features, feature)
 FEATURE_SIZE		sizeof (unsigned int)
 
 COMMON_CPUID_INDEX_1
+COMMON_CPUID_INDEX_7
 FEATURE_INDEX_1
+FEATURE_INDEX_7
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 332a60d..5fb5663 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -50,6 +50,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_SSSE3,
 			      __memmove_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_AVX2,
+			      __memmove_chk_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_sse2))
 
@@ -59,6 +61,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_SSSE3,
 			      __memmove_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memmove, HAS_AVX2,
+			      __memmove_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
@@ -235,6 +239,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_SSSE3,
 			      __memcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_AVX2,
+			      __memcpy_chk_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2))
 
@@ -243,6 +249,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3,
 			      __memcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_SSSE3, __memcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_AVX2, __memcpy_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
@@ -251,6 +258,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_SSSE3,
 			      __mempcpy_chk_ssse3)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_AVX2,
+			      __mempcpy_chk_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2))
 
@@ -260,6 +269,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_SSSE3,
 			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_AVX2,
+			      __mempcpy_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strlen.S.  */
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S
new file mode 100644
index 0000000..d32cfad
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx2-unaligned.S
@@ -0,0 +1,438 @@
+/* memcpy with AVX2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+    && (defined SHARED \
+        || defined USE_AS_MEMMOVE \
+	|| !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY	__memcpy_avx2_unaligned
+# define MEMCPY_CHK	__memcpy_chk_avx2_unaligned
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name,  @function;	\
+	.globl name;	\
+	ALIGN(4);	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+#endif
+
+	.section .text.avx2,"ax",@progbits
+#if !defined USE_AS_BCOPY
+ENTRY (MEMCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+	vzeroupper
+	mov	%rdi, %rax
+
+#ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+#endif
+
+	lea	(%rsi, %rdx), %r8
+	lea	(%rdi, %rdx), %r9
+	cmp	$256, %rdx
+	ja	L(256bytesormore)
+	cmp	$128, %edx
+	jb	L(less_128bytes)
+	vmovups (%rsi), %xmm0
+	vmovups 0x10(%rsi), %xmm1
+	vmovups 0x20(%rsi), %xmm2
+	vmovups 0x30(%rsi), %xmm3
+	vmovups 0x40(%rsi), %xmm4
+	vmovups 0x50(%rsi), %xmm5
+	vmovups 0x60(%rsi), %xmm6
+	vmovups 0x70(%rsi), %xmm7
+	vmovups -0x80(%r8), %xmm8
+	vmovups -0x70(%r8), %xmm9
+	vmovups -0x60(%r8), %xmm10
+	vmovups -0x50(%r8), %xmm11
+	vmovups -0x40(%r8), %xmm12
+	vmovups -0x30(%r8), %xmm13
+	vmovups -0x20(%r8), %xmm14
+	vmovups -0x10(%r8), %xmm15
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm1, 0x10(%rdi)
+	vmovups %xmm2, 0x20(%rdi)
+	vmovups %xmm3, 0x30(%rdi)
+	vmovups %xmm4, 0x40(%rdi)
+	vmovups %xmm5, 0x50(%rdi)
+	vmovups %xmm6, 0x60(%rdi)
+	vmovups %xmm7, 0x70(%rdi)
+	vmovups %xmm8, -0x80(%r9)
+	vmovups %xmm9, -0x70(%r9)
+	vmovups %xmm10, -0x60(%r9)
+	vmovups %xmm11, -0x50(%r9)
+	vmovups %xmm12, -0x40(%r9)
+	vmovups %xmm13, -0x30(%r9)
+	vmovups %xmm14, -0x20(%r9)
+	vmovups %xmm15, -0x10(%r9)
+	ret
+	ALIGN(4)
+L(less_128bytes):
+	cmp	$64, %edx
+	jb	L(less_64bytes)
+	vmovups (%rsi), %xmm0
+	vmovups 0x10(%rsi), %xmm1
+	vmovups 0x20(%rsi), %xmm2
+	vmovups 0x30(%rsi), %xmm3
+	vmovups -0x40(%r8), %xmm4
+	vmovups -0x30(%r8), %xmm5
+	vmovups -0x20(%r8), %xmm6
+	vmovups -0x10(%r8), %xmm7
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm1, 0x10(%rdi)
+	vmovups %xmm2, 0x20(%rdi)
+	vmovups %xmm3, 0x30(%rdi)
+	vmovups %xmm4, -0x40(%r9)
+	vmovups %xmm5, -0x30(%r9)
+	vmovups %xmm6, -0x20(%r9)
+	vmovups %xmm7, -0x10(%r9)
+	ret
+	ALIGN(4)
+L(less_64bytes):
+	cmp	$32, %edx
+	jb	L(less_32bytes)
+	vmovups (%rsi), %xmm0
+	vmovups 0x10(%rsi), %xmm1
+	vmovups -0x20(%r8), %xmm6
+	vmovups -0x10(%r8), %xmm7
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm1, 0x10(%rdi)
+	vmovups %xmm6, -0x20(%r9)
+	vmovups %xmm7, -0x10(%r9)
+	retq
+	ALIGN(4)
+L(less_32bytes):
+	cmp	$16, %edx
+	jb	L(less_16bytes)
+	vmovups (%rsi), %xmm0
+	vmovups -0x10(%r8), %xmm7
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm7, -0x10(%r9)
+	retq
+	ALIGN(4)
+L(less_16bytes):
+	cmp	$8, %edx
+	jb	L(less_8bytes)
+	movq (%rsi),	%rcx
+	movq -0x08(%r8),	%r10
+	movq %rcx, (%rdi)
+	movq %r10, -0x08(%r9)
+	retq
+	ALIGN(4)
+L(less_8bytes):
+	cmp	$4, %edx
+	jb	L(less_4bytes)
+	mov (%rsi),	%ecx
+	mov -0x04(%r8), %edx
+	mov %ecx, (%rdi)
+	mov %edx, -0x04(%r9)
+	ret
+	ALIGN(4)
+L(less_4bytes):
+	cmp	$2, %edx
+	jb	L(less_2bytes)
+	mov (%rsi),	%cx
+	mov -0x02(%r8),	%dx
+	mov %cx, (%rdi)
+	mov %dx, -0x02(%r9)
+	ret
+	ALIGN(4)
+L(less_2bytes):
+	cmp	$1, %rdx
+	jb	L(less_0bytes)
+	mov	(%rsi), %cl
+	mov	%cl,	(%rdi)
+L(less_0bytes):
+	retq
+
+	ALIGN(4)
+L(256bytesormore):
+
+#ifdef USE_AS_MEMMOVE
+	cmp	%rsi, %rdi
+	jae	L(copy_backward)
+#endif
+	cmp	$2048, %rdx
+	jae	L(gobble_data_movsb)
+
+	vmovups -0x80(%r8), %xmm8
+	vmovups -0x70(%r8), %xmm9
+	vmovups -0x60(%r8), %xmm10
+	vmovups -0x50(%r8), %xmm11
+	vmovups -0x40(%r8), %xmm12
+	vmovups -0x30(%r8), %xmm13
+	vmovups -0x20(%r8), %xmm14
+	vmovups -0x10(%r8), %xmm15
+	vmovups	(%rsi), %ymm4
+	mov	%rdi, %r10
+	and	$-32, %rdi
+	add	$32, %rdi
+	mov	%rdi, %r11
+	sub	%r10, %r11
+	sub	%r11, %rdx
+	add	%r11, %rsi
+	sub	$0x80, %rdx
+L(gobble_128_loop):
+	vmovups (%rsi), %ymm0
+	vmovups 0x20(%rsi), %ymm1
+	vmovups 0x40(%rsi), %ymm2
+	vmovups 0x60(%rsi), %ymm3
+	lea	0x80(%rsi), %rsi
+	vmovaps %ymm0, (%rdi)
+	vmovaps %ymm1, 0x20(%rdi)
+	vmovaps %ymm2, 0x40(%rdi)
+	vmovaps %ymm3, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_128_loop)
+	vmovups	%ymm4, (%r10)
+	vzeroupper
+	vmovups %xmm8, -0x80(%r9)
+	vmovups %xmm9, -0x70(%r9)
+	vmovups %xmm10, -0x60(%r9)
+	vmovups %xmm11, -0x50(%r9)
+	vmovups %xmm12, -0x40(%r9)
+	vmovups %xmm13, -0x30(%r9)
+	vmovups %xmm14, -0x20(%r9)
+	vmovups %xmm15, -0x10(%r9)
+	ret
+
+L(gobble_data_movsb):
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov	__x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+	shl	$3, %rcx
+
+#ifdef USE_AS_MEMMOVE
+	mov	%rsi, %r10
+	sub	%rdi, %r10
+	cmp	%rdx, %r10
+	jae	L(memmove_use_memcpy_fwd)
+	cmp	%rcx, %r10
+	jae	L(memmove_use_memcpy_fwd)
+	jmp L(gobble_mem_fwd_llc_start)
+L(memmove_use_memcpy_fwd):
+#endif
+
+	cmp	%rcx, %rdx
+	ja	L(gobble_big_data_fwd)
+
+#ifdef USE_AS_MEMMOVE
+L(gobble_mem_fwd_llc_start):
+#endif
+	mov	%rdx, %rcx
+	rep	movsb
+	ret
+
+L(gobble_big_data_fwd):
+	vmovups	(%rsi), %ymm4
+	vmovups -0x80(%r8), %xmm5
+	vmovups -0x70(%r8), %xmm6
+	vmovups -0x60(%r8), %xmm7
+	vmovups -0x50(%r8), %xmm8
+	vmovups -0x40(%r8), %xmm9
+	vmovups -0x30(%r8), %xmm10
+	vmovups -0x20(%r8), %xmm11
+	vmovups -0x10(%r8), %xmm12
+	mov	%rdi, %r8
+	and	$-32, %rdi
+	add	$32, %rdi
+	mov	%rdi, %r10
+	sub	%r8, %r10
+	sub	%r10, %rdx
+	add	%r10, %rsi
+	sub	$0x80, %rdx
+L(gobble_mem_fwd_loop):
+	prefetcht0 0x1c0(%rsi)
+	prefetcht0 0x280(%rsi)
+	vmovups	(%rsi), %xmm0
+	vmovups	0x10(%rsi), %xmm1
+	vmovups	0x20(%rsi), %xmm2
+	vmovups	0x30(%rsi), %xmm3
+	vmovntdq	%xmm0, (%rdi)
+	vmovntdq	%xmm1, 0x10(%rdi)
+	vmovntdq	%xmm2, 0x20(%rdi)
+	vmovntdq	%xmm3, 0x30(%rdi)
+	vmovups	0x40(%rsi), %xmm0
+	vmovups	0x50(%rsi), %xmm1
+	vmovups	0x60(%rsi), %xmm2
+	vmovups	0x70(%rsi), %xmm3
+	lea	0x80(%rsi), %rsi
+	vmovntdq	%xmm0, 0x40(%rdi)
+	vmovntdq	%xmm1, 0x50(%rdi)
+	vmovntdq	%xmm2, 0x60(%rdi)
+	vmovntdq	%xmm3, 0x70(%rdi)
+	lea	0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_mem_fwd_loop)
+	sfence
+	vmovups	%ymm4, (%r8)
+	vzeroupper
+	vmovups %xmm5, -0x80(%r9)
+	vmovups %xmm6, -0x70(%r9)
+	vmovups %xmm7, -0x60(%r9)
+	vmovups %xmm8, -0x50(%r9)
+	vmovups %xmm9, -0x40(%r9)
+	vmovups %xmm10, -0x30(%r9)
+	vmovups %xmm11, -0x20(%r9)
+	vmovups %xmm12, -0x10(%r9)
+	ret
+
+	ALIGN (4)
+L(copy_backward):
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %rcx
+#else
+	mov	__x86_64_shared_cache_size_half(%rip), %rcx
+#endif
+	shl	$3, %rcx
+	vmovups (%rsi), %xmm8
+	vmovups 0x10(%rsi), %xmm9
+	vmovups 0x20(%rsi), %xmm10
+	vmovups 0x30(%rsi), %xmm11
+	vmovups 0x40(%rsi), %xmm12
+	vmovups 0x50(%rsi), %xmm13
+	vmovups 0x60(%rsi), %xmm14
+	vmovups 0x70(%rsi), %xmm15
+	mov	%rdi, %r9
+	add	%rdx, %rsi
+	add	%rdx, %rdi
+	vmovups	-0x20(%rsi), %ymm4
+	lea	-0x20(%rdi), %r10
+	mov %rdi, %r11
+	and	$0x1f, %r11
+	xor	%r11, %rdi
+	sub	%r11, %rsi
+	sub	%r11, %rdx
+#ifdef USE_AS_MEMMOVE
+	mov	%rdi, %r11
+	sub	%rsi, %r11
+	cmp	%rdx, %r11
+	jae	L(memmove_use_memcpy_bwd)
+	cmp	%rcx, %r11
+	jae	L(memmove_use_memcpy_bwd)
+	jmp L(gobble_mem_bwd_llc_start)
+#endif
+L(memmove_use_memcpy_bwd):
+	cmp	%rcx, %rdx
+	ja	L(gobble_big_data_bwd)
+L(gobble_mem_bwd_llc_start):
+	sub	$0x80, %rdx
+L(gobble_mem_bwd_llc):
+	vmovups	-0x20(%rsi), %ymm0
+	vmovups	-0x40(%rsi), %ymm1
+	vmovups	-0x60(%rsi), %ymm2
+	vmovups	-0x80(%rsi), %ymm3
+	lea	-0x80(%rsi), %rsi
+	vmovaps	%ymm0, -0x20(%rdi)
+	vmovaps	%ymm1, -0x40(%rdi)
+	vmovaps	%ymm2, -0x60(%rdi)
+	vmovaps	%ymm3, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_mem_bwd_llc)
+	vmovups	%ymm4, (%r10)
+	vzeroupper
+	vmovups %xmm8, (%r9)
+	vmovups %xmm9, 0x10(%r9)
+	vmovups %xmm10, 0x20(%r9)
+	vmovups %xmm11, 0x30(%r9)
+	vmovups %xmm12, 0x40(%r9)
+	vmovups %xmm13, 0x50(%r9)
+	vmovups %xmm14, 0x60(%r9)
+	vmovups %xmm15, 0x70(%r9)
+	ret
+
+L(gobble_big_data_bwd):
+	sub	$0x80, %rdx
+L(gobble_mem_bwd_loop):
+	prefetcht0 -0x1c0(%rsi)
+	prefetcht0 -0x280(%rsi)
+	vmovups	-0x10(%rsi), %xmm0
+	vmovups	-0x20(%rsi), %xmm1
+	vmovups	-0x30(%rsi), %xmm2
+	vmovups	-0x40(%rsi), %xmm3
+	vmovntdq	%xmm0, -0x10(%rdi)
+	vmovntdq	%xmm1, -0x20(%rdi)
+	vmovntdq	%xmm2, -0x30(%rdi)
+	vmovntdq	%xmm3, -0x40(%rdi)
+	vmovups	-0x50(%rsi), %xmm0
+	vmovups	-0x60(%rsi), %xmm1
+	vmovups	-0x70(%rsi), %xmm2
+	vmovups	-0x80(%rsi), %xmm3
+	lea	-0x80(%rsi), %rsi
+	vmovntdq	%xmm0, -0x50(%rdi)
+	vmovntdq	%xmm1, -0x60(%rdi)
+	vmovntdq	%xmm2, -0x70(%rdi)
+	vmovntdq	%xmm3, -0x80(%rdi)
+	lea	-0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_mem_bwd_loop)
+	sfence
+	vmovups	%ymm4, (%r10)
+	vzeroupper
+	vmovups %xmm8, (%r9)
+	vmovups %xmm9, 0x10(%r9)
+	vmovups %xmm10, 0x20(%r9)
+	vmovups %xmm11, 0x30(%r9)
+	vmovups %xmm12, 0x40(%r9)
+	vmovups %xmm13, 0x50(%r9)
+	vmovups %xmm14, 0x60(%r9)
+	vmovups %xmm15, 0x70(%r9)
+	ret
+END (MEMCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S
new file mode 100644
index 0000000..ddb2090
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMMOVE
+#define MEMCPY		__memmove_avx2_unaligned
+#define MEMCPY_CHK	__memmove_chk_avx2_unaligned
+#include "memcpy-avx2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S
new file mode 100644
index 0000000..a2f4af9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/mempcpy-avx2-unaligned.S
@@ -0,0 +1,4 @@
+#define USE_AS_MEMPCPY
+#define MEMCPY		__mempcpy_avx2_unaligned
+#define MEMCPY_CHK	__mempcpy_chk_avx2_unaligned
+#include "memcpy-avx2-unaligned.S"
-- 
1.8.1.4

