This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
GNU C Library master sources branch hjl/erms/ifunc created. glibc-2.23-163-g25d8757

From: hjl at sourceware dot org
To: glibc-cvs at sourceware dot org
Date: 7 Apr 2016 19:08:06 -0000
Subject: GNU C Library master sources branch hjl/erms/ifunc created. glibc-2.23-163-g25d8757
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/erms/ifunc has been created
        at  25d87576122689b22db9929271bdd7cb403aec1c (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=25d87576122689b22db9929271bdd7cb403aec1c

commit 25d87576122689b22db9929271bdd7cb403aec1c
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Apr 1 14:01:24 2016 -0700

    X86-64: Add dummy memcopy.h and wordcopy.c
    
    Since x86-64 doesn't use memory copy functions, add dummy memcopy.h and
    wordcopy.c to reduce code size.  It reduces the size of libc.so by about
    1 KB.
    
    	* sysdeps/x86_64/memcopy.h: New file.
    	* sysdeps/x86_64/wordcopy.c: Likewise.

diff --git a/sysdeps/x86_64/memcopy.h b/sysdeps/x86_64/memcopy.h
new file mode 100644
index 0000000..590b6cb
--- /dev/null
+++ b/sysdeps/x86_64/memcopy.h
@@ -0,0 +1 @@
+/* X86-64 doesn't use memory copy functions.  */
diff --git a/sysdeps/x86_64/wordcopy.c b/sysdeps/x86_64/wordcopy.c
new file mode 100644
index 0000000..590b6cb
--- /dev/null
+++ b/sysdeps/x86_64/wordcopy.c
@@ -0,0 +1 @@
+/* X86-64 doesn't use memory copy functions.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a5d79693ef1c91aed0e12662ec84d5d9b597f283

commit a5d79693ef1c91aed0e12662ec84d5d9b597f283
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Mar 31 12:46:57 2016 -0700

    X86-64: Remove previous default/SSE2/AVX2 memcpy/memmove
    
    Since the new SSE2/AVX2 memcpy/memmove are faster than the previous ones,
    we can remove the previous SSE2/AVX2 memcpy/memmove and replace them with
    the new ones.
    
    No change in IFUNC selection if SSE2 and AVX2 memcpy/memmove weren't used
    before.  If SSE2 or AVX2 memcpy/memmove were used, the new SSE2 or AVX2
    memcpy/memmove optimized with Enhanced REP MOVSB will be used for
    processors with ERMS.  The new AVX512 memcpy/memmove will be used for
    processors with AVX512 which prefer vzeroupper.
    
    Since the new SSE2 memcpy/memmove are faster than the previous default
    memcpy/memmove used in libc.a and ld.so, we also remove the previous
    default memcpy/memmove and make them the default memcpy/memmove.
    
    Together, it reduces the size of libc.so by about 6 KB and the size of
    ld.so by about 2 KB.
    
    	[BZ #19776]
    	* sysdeps/x86_64/memcpy.S: Make it dummy.
    	* sysdeps/x86_64/mempcpy.S: Likewise.
    	* sysdeps/x86_64/memmove.S: New file.
    	* sysdeps/x86_64/memmove_chk.S: Likewise.
    	* sysdeps/x86_64/multiarch/memmove.S: Likewise.
    	* sysdeps/x86_64/multiarch/memmove_chk.S: Likewise.
    	* sysdeps/x86_64/memmove.c: Removed.
    	* sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: Likewise.
    	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Likewise.
    	* sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: Likewise.
    	* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/memmove.c: Likewise.
    	* sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove
    	memcpy-sse2-unaligned, memmove-avx-unaligned,
    	memcpy-avx-unaligned and memmove-sse2-unaligned-erms.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Replace
    	__memmove_chk_avx512_unaligned_2 with
    	__memmove_chk_avx512_unaligned.  Remove
    	__memmove_chk_avx_unaligned_2.  Replace
    	__memmove_chk_sse2_unaligned_2 with
    	__memmove_chk_sse2_unaligned.  Remove __memmove_chk_sse2 and
    	__memmove_avx_unaligned_2.  Replace __memmove_avx512_unaligned_2
    	with __memmove_avx512_unaligned.  Replace
    	__memmove_sse2_unaligned_2 with __memmove_sse2_unaligned.
    	Remove __memmove_sse2.  Replace __memcpy_chk_avx512_unaligned_2
    	with __memcpy_chk_avx512_unaligned.  Remove
    	__memcpy_chk_avx_unaligned_2.  Replace
    	__memcpy_chk_sse2_unaligned_2 with __memcpy_chk_sse2_unaligned.
    	Remove __memcpy_chk_sse2.  Remove __memcpy_avx_unaligned_2.
    	Replace __memcpy_avx512_unaligned_2 with
    	__memcpy_avx512_unaligned.  Remove __memcpy_sse2_unaligned_2
    	and __memcpy_sse2.  Replace __mempcpy_chk_avx512_unaligned_2
    	with __mempcpy_chk_avx512_unaligned.  Remove
    	__mempcpy_chk_avx_unaligned_2.  Replace
    	__mempcpy_chk_sse2_unaligned_2 with
    	__mempcpy_chk_sse2_unaligned.  Remove __mempcpy_chk_sse2.
    	Replace __mempcpy_avx512_unaligned_2 with
    	__mempcpy_avx512_unaligned.  Remove __mempcpy_avx_unaligned_2.
    	Replace __mempcpy_sse2_unaligned_2 with
    	__mempcpy_sse2_unaligned.  Remove __mempcpy_sse2.
    	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Support
    	__memcpy_avx512_unaligned_erms and __memcpy_avx512_unaligned.
    	Use __memcpy_avx_unaligned_erms and __memcpy_sse2_unaligned_erms
    	if processor has ERMS.  Default to __memcpy_sse2_unaligned.
    	(ENTRY): Removed.
    	(END): Likewise.
    	(ENTRY_CHK): Likewise.
    	(libc_hidden_builtin_def): Likewise.
    	Don't include ../memcpy.S.
    	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Support
    	__memcpy_chk_avx512_unaligned_erms and
    	__memcpy_chk_avx512_unaligned.  Use
    	__memcpy_chk_avx_unaligned_erms and
    	__memcpy_chk_sse2_unaligned_erms if if processor has ERMS.
    	Default to __memcpy_chk_sse2_unaligned.
    	* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: Skip if
    	not in libc.
    	* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
    	Change function suffix from unaligned_2 to unaligned.
    	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Support
    	__mempcpy_avx512_unaligned_erms and __mempcpy_avx512_unaligned.
    	Use __mempcpy_avx_unaligned_erms and __mempcpy_sse2_unaligned_erms
    	if processor has ERMS.  Default to __mempcpy_sse2_unaligned.
    	(ENTRY): Removed.
    	(END): Likewise.
    	(ENTRY_CHK): Likewise.
    	(libc_hidden_builtin_def): Likewise.
    	Don't include ../mempcpy.S.
    	(mempcpy): New.  Add a weak alias.
    	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Support
    	__mempcpy_chk_avx512_unaligned_erms and
    	__mempcpy_chk_avx512_unaligned.  Use
    	__mempcpy_chk_avx_unaligned_erms and
    	__mempcpy_chk_sse2_unaligned_erms if if processor has ERMS.
    	Default to __mempcpy_chk_sse2_unaligned.

diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index f6e3d93..d98500a 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -1,584 +1 @@
-/*
-   Optimized memcpy for x86-64.
-
-   Copyright (C) 2007-2016 Free Software Foundation, Inc.
-   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
-
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-/* Stack slots in the red-zone. */
-
-#ifdef USE_AS_MEMPCPY
-#  define RETVAL	(0)
-#else
-#  define RETVAL	(-8)
-#  if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
-#    define memcpy	__memcpy
-#    undef libc_hidden_builtin_def
-#    define libc_hidden_builtin_def(name) \
-	.globl __GI_memcpy; __GI_memcpy = __memcpy
-#  endif
-#endif
-#define SAVE0	(RETVAL - 8)
-#define SAVE1	(SAVE0	- 8)
-#define SAVE2	(SAVE1	- 8)
-#define SAVE3	(SAVE2	- 8)
-
-        .text
-
-#if defined PIC && IS_IN (libc)
-ENTRY_CHK (__memcpy_chk)
-
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-
-END_CHK (__memcpy_chk)
-#endif
-
-ENTRY(memcpy)				/* (void *, const void*, size_t) */
-
-/* Handle tiny blocks. */
-
-L(1try):				/* up to 32B */
-	cmpq	$32, %rdx
-#ifndef USE_AS_MEMPCPY
-	movq	%rdi, %rax		/* save return value */
-#endif
-	jae	L(1after)
-
-L(1):					/* 1-byte once */
-	testb	$1, %dl
-	jz	L(1a)
-
-	movzbl	(%rsi),	%ecx
-	movb	%cl, (%rdi)
-
-	incq	%rsi
-	incq	%rdi
-
-	.p2align 4,, 4
-
-L(1a):					/* 2-byte once */
-	testb	$2, %dl
-	jz	L(1b)
-
-	movzwl	(%rsi),	%ecx
-	movw	%cx, (%rdi)
-
-	addq	$2, %rsi
-	addq	$2, %rdi
-
-	.p2align 4,, 4
-
-L(1b):					/* 4-byte once */
-	testb	$4, %dl
-	jz	L(1c)
-
-	movl	(%rsi),	%ecx
-	movl	%ecx, (%rdi)
-
-	addq	$4, %rsi
-	addq	$4, %rdi
-
-	.p2align 4,, 4
-
-L(1c):					/* 8-byte once */
-	testb	$8, %dl
-	jz	L(1d)
-
-	movq	(%rsi), %rcx
-	movq	%rcx, (%rdi)
-
-	addq	$8, %rsi
-	addq	$8, %rdi
-
-	.p2align 4,, 4
-
-L(1d):					/* 16-byte loop */
-	andl	$0xf0, %edx
-	jz	L(exit)
-
-	.p2align 4
-
-L(1loop):
-	movq	 (%rsi), %rcx
-	movq	8(%rsi), %r8
-	movq	%rcx,  (%rdi)
-	movq	 %r8, 8(%rdi)
-
-	subl	$16, %edx
-
-	leaq	16(%rsi), %rsi
-	leaq	16(%rdi), %rdi
-
-	jnz	L(1loop)
-
-	.p2align 4,, 4
-
-L(exit):				/* exit */
-#ifdef USE_AS_MEMPCPY
-	movq	%rdi, %rax		/* return value */
-#else
-	rep
-#endif
-	retq
-
-	.p2align 4
-
-L(1after):
-#ifndef USE_AS_MEMPCPY
-	movq	%rax, RETVAL(%rsp)	/* save return value */
-#endif
-
-/* Align to the natural word size. */
-
-L(aligntry):
-	movl	%esi, %ecx      	/* align by source */
-
-	andl	$7, %ecx
-	jz	L(alignafter)  		/* already aligned */
-
-L(align):		      		/* align */
-	leaq	-8(%rcx, %rdx), %rdx	/* calculate remaining bytes */
-	subl	$8, %ecx
-
-	.p2align 4
-
-L(alignloop):				/* 1-byte alignment loop */
-	movzbl	(%rsi), %eax
-	movb	%al, (%rdi)
-
-	incl	%ecx
-
-	leaq	1(%rsi), %rsi
-	leaq	1(%rdi), %rdi
-
-	jnz	L(alignloop)
-
-	.p2align 4
-
-L(alignafter):
-
-/* Handle mid-sized blocks. */
-
-L(32try):				/* up to 1KB */
-	cmpq	$1024, %rdx
-	ja	L(32after)
-
-L(32):					/* 32-byte loop */
-	movl	%edx, %ecx
-	shrl	$5, %ecx
-	jz	L(32skip)
-
-	.p2align 4
-
-L(32loop):
-	decl	%ecx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %r8
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-
-	movq	%rax,   (%rdi)
-	movq	 %r8,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-
-	leaq	32(%rsi), %rsi
-	leaq	32(%rdi), %rdi
-
-	jz	L(32skip)		/* help out smaller blocks */
-
-	decl	%ecx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %r8
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-
-	movq	%rax,   (%rdi)
-	movq	 %r8,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-
-	leaq	32(%rsi), %rsi
-	leaq	32(%rdi), %rdi
-
-	jnz	L(32loop)
-
-	.p2align 4
-
-L(32skip):
-	andl	$31, %edx		/* check for left overs */
-#ifdef USE_AS_MEMPCPY
-	jnz	L(1)
-
-	movq	%rdi, %rax
-#else
-	movq	RETVAL(%rsp), %rax
-	jnz	L(1)
-
-	rep
-#endif
-	retq				/* exit */
-
-	.p2align 4
-
-L(32after):
-
-/*
-	In order to minimize code-size in RTLD, algorithms specific for
-	larger blocks are excluded when building for RTLD.
-*/
-
-/* Handle blocks smaller than 1/2 L1. */
-
-L(fasttry):				/* first 1/2 L1 */
-#if IS_IN (libc)			/* only up to this algorithm outside of libc.so */
-	mov	__x86_data_cache_size_half(%rip), %R11_LP
-	cmpq	%rdx, %r11		/* calculate the smaller of */
-	cmovaq	%rdx, %r11		/* remaining bytes and 1/2 L1 */
-#endif
-
-L(fast):				/* good ol' MOVS */
-#if IS_IN (libc)
-	movq	%r11, %rcx
-	andq	$-8, %r11
-#else
-	movq	%rdx, %rcx
-#endif
-	shrq	$3, %rcx
-	jz	L(fastskip)
-
-	rep
-	movsq
-
-	.p2align 4,, 4
-
-L(fastskip):
-#if IS_IN (libc)
-	subq	%r11, %rdx		/* check for more */
-	testq	$-8, %rdx
-	jnz	L(fastafter)
-#endif
-
-	andl	$7, %edx		/* check for left overs */
-#ifdef USE_AS_MEMPCPY
-	jnz	L(1)
-
-	movq	%rdi, %rax
-#else
-	movq	RETVAL(%rsp), %rax
-	jnz	L(1)
-
-	rep
-#endif
-	retq				/* exit */
-
-#if IS_IN (libc)			/* none of the algorithms below for RTLD */
-
-	.p2align 4
-
-L(fastafter):
-
-/* Handle large blocks smaller than 1/2 L2. */
-
-L(pretry):				/* first 1/2 L2 */
-	mov	__x86_shared_cache_size_half (%rip), %R8_LP
-	cmpq	%rdx, %r8		/* calculate the lesser of */
-	cmovaq	%rdx, %r8		/* remaining bytes and 1/2 L2 */
-
-L(pre):					/* 64-byte with prefetching */
-	movq	%r8, %rcx
-	andq	$-64, %r8
-	shrq	$6, %rcx
-	jz	L(preskip)
-
-	movq	%r14, SAVE0(%rsp)
-	cfi_rel_offset (%r14, SAVE0)
-	movq	%r13, SAVE1(%rsp)
-	cfi_rel_offset (%r13, SAVE1)
-	movq	%r12, SAVE2(%rsp)
-	cfi_rel_offset (%r12, SAVE2)
-	movq	%rbx, SAVE3(%rsp)
-	cfi_rel_offset (%rbx, SAVE3)
-
-	cmpl	$0, __x86_prefetchw(%rip)
-	jz	L(preloop)		/* check if PREFETCHW OK */
-
-	.p2align 4
-
-/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */
-
-L(prewloop):				/* cache-line in state M */
-	decq	%rcx
-
-	movq	   (%rsi), %rax
-	movq	 8 (%rsi), %rbx
-	movq	16 (%rsi), %r9
-	movq	24 (%rsi), %r10
-	movq	32 (%rsi), %r11
-	movq	40 (%rsi), %r12
-	movq	48 (%rsi), %r13
-	movq	56 (%rsi), %r14
-
-	prefetcht0	 0 + 896 (%rsi)
-	prefetcht0	64 + 896 (%rsi)
-
-	movq	%rax,   (%rdi)
-	movq	%rbx,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-	movq	%r11, 32(%rdi)
-	movq	%r12, 40(%rdi)
-	movq	%r13, 48(%rdi)
-	movq	%r14, 56(%rdi)
-
-	leaq	64(%rsi), %rsi
-	leaq	64(%rdi), %rdi
-
-	jz	L(prebail)
-
-	decq	%rcx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %rbx
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-	movq	32(%rsi), %r11
-	movq	40(%rsi), %r12
-	movq	48(%rsi), %r13
-	movq	56(%rsi), %r14
-
-	movq	%rax,   (%rdi)
-	movq	%rbx,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-	movq	%r11, 32(%rdi)
-	movq	%r12, 40(%rdi)
-	movq	%r13, 48(%rdi)
-	movq	%r14, 56(%rdi)
-
-	prefetchw	896 - 64(%rdi)
-	prefetchw	896 -  0(%rdi)
-
-	leaq	64(%rsi), %rsi
-	leaq	64(%rdi), %rdi
-
-	jnz	L(prewloop)
-	jmp	L(prebail)
-
-	.p2align 4
-
-/* ... when PREFETCHW is not available. */
-
-L(preloop):				/* cache-line in state E */
-	decq	%rcx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %rbx
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-	movq	32(%rsi), %r11
-	movq	40(%rsi), %r12
-	movq	48(%rsi), %r13
-	movq	56(%rsi), %r14
-
-	prefetcht0	896 +  0(%rsi)
-	prefetcht0	896 + 64(%rsi)
-
-	movq	%rax,   (%rdi)
-	movq	%rbx,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-	movq	%r11, 32(%rdi)
-	movq	%r12, 40(%rdi)
-	movq	%r13, 48(%rdi)
-	movq	%r14, 56(%rdi)
-
-	leaq	64 (%rsi), %rsi
-	leaq	64 (%rdi), %rdi
-
-	jz	L(prebail)
-
-	decq	%rcx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %rbx
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-	movq	32(%rsi), %r11
-	movq	40(%rsi), %r12
-	movq	48(%rsi), %r13
-	movq	56(%rsi), %r14
-
-	prefetcht0	896 - 64(%rdi)
-	prefetcht0	896 -  0(%rdi)
-
-	movq	%rax,   (%rdi)
-	movq	%rbx,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-	movq	%r11, 32(%rdi)
-	movq	%r12, 40(%rdi)
-	movq	%r13, 48(%rdi)
-	movq	%r14, 56(%rdi)
-
-	leaq	64(%rsi), %rsi
-	leaq	64(%rdi), %rdi
-
-	jnz	L(preloop)
-
-L(prebail):
-	movq	SAVE3(%rsp), %rbx
-	cfi_restore (%rbx)
-	movq	SAVE2(%rsp), %r12
-	cfi_restore (%r12)
-	movq	SAVE1(%rsp), %r13
-	cfi_restore (%r13)
-	movq	SAVE0(%rsp), %r14
-	cfi_restore (%r14)
-
-/*       .p2align 4 */
-
-L(preskip):
-	subq	%r8, %rdx		/* check for more */
-	testq	$-64, %rdx
-	jnz	L(preafter)
-
-	andl	$63, %edx		/* check for left overs */
-#ifdef USE_AS_MEMPCPY
-	jnz	L(1)
-
-	movq	%rdi, %rax
-#else
-	movq	RETVAL(%rsp), %rax
-	jnz	L(1)
-
-	rep
-#endif
-	retq				/* exit */
-
-	.p2align 4
-
-L(preafter):
-
-/* Handle huge blocks. */
-
-L(NTtry):
-
-L(NT):					/* non-temporal 128-byte */
-	movq	%rdx, %rcx
-	shrq	$7, %rcx
-	jz	L(NTskip)
-
-	movq	%r14, SAVE0(%rsp)
-	cfi_rel_offset (%r14, SAVE0)
-	movq	%r13, SAVE1(%rsp)
-	cfi_rel_offset (%r13, SAVE1)
-	movq	%r12, SAVE2(%rsp)
-	cfi_rel_offset (%r12, SAVE2)
-
-       .p2align 4
-
-L(NTloop):
-	prefetchnta	768(%rsi)
-	prefetchnta	832(%rsi)
-
-	decq	%rcx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %r8
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-	movq	32(%rsi), %r11
-	movq	40(%rsi), %r12
-	movq	48(%rsi), %r13
-	movq	56(%rsi), %r14
-
-	movntiq	%rax,   (%rdi)
-	movntiq	 %r8,  8(%rdi)
-	movntiq	 %r9, 16(%rdi)
-	movntiq	%r10, 24(%rdi)
-	movntiq	%r11, 32(%rdi)
-	movntiq	%r12, 40(%rdi)
-	movntiq	%r13, 48(%rdi)
-	movntiq	%r14, 56(%rdi)
-
-	movq	 64(%rsi), %rax
-	movq	 72(%rsi), %r8
-	movq	 80(%rsi), %r9
-	movq	 88(%rsi), %r10
-	movq	 96(%rsi), %r11
-	movq	104(%rsi), %r12
-	movq	112(%rsi), %r13
-	movq	120(%rsi), %r14
-
-	movntiq	%rax,  64(%rdi)
-	movntiq	 %r8,  72(%rdi)
-	movntiq	 %r9,  80(%rdi)
-	movntiq	%r10,  88(%rdi)
-	movntiq	%r11,  96(%rdi)
-	movntiq	%r12, 104(%rdi)
-	movntiq	%r13, 112(%rdi)
-	movntiq	%r14, 120(%rdi)
-
-	leaq	128(%rsi), %rsi
-	leaq	128(%rdi), %rdi
-
-	jnz	L(NTloop)
-
-	sfence				/* serialize memory stores */
-
-	movq	SAVE2(%rsp), %r12
-	cfi_restore (%r12)
-	movq	SAVE1(%rsp), %r13
-	cfi_restore (%r13)
-	movq	SAVE0(%rsp), %r14
-	cfi_restore (%r14)
-
-L(NTskip):
-	andl	$127, %edx		/* check for left overs */
-#ifdef USE_AS_MEMPCPY
-	jnz	L(1)
-
-	movq	%rdi, %rax
-#else
-	movq	RETVAL(%rsp), %rax
-	jnz	L(1)
-
-	rep
-#endif
-	retq				/* exit */
-
-#endif /* IS_IN (libc) */
-
-END(memcpy)
-
-#ifndef USE_AS_MEMPCPY
-libc_hidden_builtin_def (memcpy)
-# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
-#  undef memcpy
-#  include <shlib-compat.h>
-versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
-# endif
-#endif
+/* Implemented in memcpy.S.  */
diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
new file mode 100644
index 0000000..a7ae453
--- /dev/null
+++ b/sysdeps/x86_64/memmove.S
@@ -0,0 +1,71 @@
+/* Optimized memmove for x86-64.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define VEC_SIZE	16
+#define VEC(i)		xmm##i
+#define PREFETCHNT	prefetchnta
+#define VMOVNT		movntdq
+/* Use movups and movaps for smaller code sizes.  */
+#define VMOVU		movups
+#define VMOVA		movaps
+
+#define SECTION(p)		p
+
+#ifdef USE_MULTIARCH
+# if !defined SHARED || !IS_IN (libc)
+#  define MEMCPY_SYMBOL(p,s)		memcpy
+# endif
+#else
+# if defined SHARED && IS_IN (libc)
+#  define MEMCPY_SYMBOL(p,s)		__memcpy
+# else
+#  define MEMCPY_SYMBOL(p,s)		memcpy
+# endif
+#endif
+#if !defined SHARED || !defined USE_MULTIARCH || !IS_IN (libc)
+# define MEMPCPY_SYMBOL(p,s)		__mempcpy
+#endif
+#ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_CHK_SYMBOL(p,s)	p
+# define MEMMOVE_SYMBOL(p,s)		memmove
+#endif
+
+#include "multiarch/memmove-vec-unaligned-erms.S"
+
+#ifndef USE_MULTIARCH
+libc_hidden_builtin_def (memmove)
+# if defined SHARED && IS_IN (libc)
+strong_alias (memmove, __memcpy)
+libc_hidden_ver (memmove, memcpy)
+# endif
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
+
+# if defined SHARED && IS_IN (libc)
+#  undef memcpy
+#  include <shlib-compat.h>
+versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
+
+#  if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
+#  endif
+# endif
+#endif
diff --git a/sysdeps/x86_64/memmove.c b/sysdeps/x86_64/memmove_chk.S
similarity index 64%
rename from sysdeps/x86_64/memmove.c
rename to sysdeps/x86_64/memmove_chk.S
index 07f8185..ee154f1 100644
--- a/sysdeps/x86_64/memmove.c
+++ b/sysdeps/x86_64/memmove_chk.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Checking memmove for x86-64.
+   Copyright (C) 2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -15,12 +16,18 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include "string/memmove.c"
+#include <sysdep.h>
+#include "asm-syntax.h"
 
-#if !defined memmove && IS_IN (libc)
-#include <shlib-compat.h>
-
-#if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
-compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
-#endif
+#ifndef SHARED
+	/* For libc.so this is defined in memmove.S.
+	   For libc.a, this is a separate source to avoid
+	   memmove bringing in __chk_fail and all routines
+	   it calls.  */
+        .text
+ENTRY (__memmove_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	memmove
+END (__memmove_chk)
 #endif
diff --git a/sysdeps/x86_64/mempcpy.S b/sysdeps/x86_64/mempcpy.S
index acee5e5..d98500a 100644
--- a/sysdeps/x86_64/mempcpy.S
+++ b/sysdeps/x86_64/mempcpy.S
@@ -1,8 +1 @@
-#define USE_AS_MEMPCPY
-#define memcpy __mempcpy
-#define __memcpy_chk __mempcpy_chk
-#include <sysdeps/x86_64/memcpy.S>
-
-libc_hidden_def (__mempcpy)
-weak_alias (__mempcpy, mempcpy)
-libc_hidden_builtin_def (mempcpy)
+/* Implemented in memcpy.S.  */
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 35752a8..d6636cb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,10 +7,9 @@ ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
-		   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
+		   memcmp-sse4 memcpy-ssse3 \
 		   memmove-ssse3 \
-		   memcpy-ssse3-back memmove-avx-unaligned \
-		   memcpy-avx-unaligned \
+		   memcpy-ssse3-back \
 		   memmove-ssse3-back \
 		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
@@ -21,7 +20,6 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
 		   strcspn-c strpbrk-c strspn-c varshift \
 		   memset-avx512-no-vzeroupper \
-		   memmove-sse2-unaligned-erms \
 		   memmove-avx-unaligned-erms \
 		   memmove-avx512-unaligned-erms \
 		   memset-avx2-unaligned-erms \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index ca05ff6..449b046 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -54,7 +54,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __memmove_chk_avx512_unaligned_2)
+			      __memmove_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memmove_chk_avx512_unaligned_erms)
@@ -64,9 +64,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __memmove_chk_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_chk_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_CPU_FEATURE (SSSE3),
@@ -75,11 +72,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
-			      __memmove_chk_sse2_unaligned_2)
+			      __memmove_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
-			      __memmove_chk_sse2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
-			      __memmove_chk_sse2))
+			      __memmove_chk_sse2_unaligned_erms))
 
   /* Support sysdeps/x86_64/multiarch/memmove.S.  */
   IFUNC_IMPL (i, name, memmove,
@@ -88,9 +83,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __memmove_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, memmove,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_avx_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	      IFUNC_IMPL_ADD (array, i, memmove,
@@ -98,7 +90,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __memmove_avx512_unaligned_2)
+			      __memmove_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memmove_avx512_unaligned_erms)
@@ -109,10 +101,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1,
-			      __memmove_sse2_unaligned_2)
+			      __memmove_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1,
-			      __memmove_sse2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
+			      __memmove_sse2_unaligned_erms))
 
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
@@ -326,7 +317,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __memcpy_chk_avx512_unaligned_2)
+			      __memcpy_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memcpy_chk_avx512_unaligned_erms)
@@ -336,9 +327,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __memcpy_chk_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_chk_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
@@ -347,11 +335,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
-			      __memcpy_chk_sse2_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
-			      __memcpy_chk_sse2_unaligned_erms)
+			      __memcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
-			      __memcpy_chk_sse2))
+			      __memcpy_chk_sse2_unaligned_erms))
 
   /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
   IFUNC_IMPL (i, name, memcpy,
@@ -360,9 +346,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __memcpy_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, memcpy,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_ssse3_back)
@@ -374,18 +357,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __memcpy_avx512_unaligned_2)
+			      __memcpy_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memcpy_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
-			      __memcpy_sse2_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
 			      __memcpy_sse2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
   IFUNC_IMPL (i, name, __mempcpy_chk,
@@ -395,7 +375,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __mempcpy_chk_avx512_unaligned_2)
+			      __mempcpy_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __mempcpy_chk_avx512_unaligned_erms)
@@ -405,9 +385,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __mempcpy_chk_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_chk_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
@@ -416,11 +393,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
-			      __mempcpy_chk_sse2_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
-			      __mempcpy_chk_sse2_unaligned_erms)
+			      __mempcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
-			      __mempcpy_chk_sse2))
+			      __mempcpy_chk_sse2_unaligned_erms))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
   IFUNC_IMPL (i, name, mempcpy,
@@ -430,7 +405,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __mempcpy_avx512_unaligned_2)
+			      __mempcpy_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __mempcpy_avx512_unaligned_erms)
@@ -440,20 +415,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __mempcpy_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, mempcpy,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
-			      __mempcpy_sse2_unaligned_2)
+			      __mempcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
 			      __mempcpy_sse2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms))
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
deleted file mode 100644
index dd4187f..0000000
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ /dev/null
@@ -1,391 +0,0 @@
-/* memcpy with AVX
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#if IS_IN (libc) \
-    && (defined SHARED \
-        || defined USE_AS_MEMMOVE \
-	|| !defined USE_MULTIARCH)
-
-#include "asm-syntax.h"
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_avx_unaligned
-# define MEMCPY_CHK	__memcpy_chk_avx_unaligned
-# define MEMPCPY	__mempcpy_avx_unaligned
-# define MEMPCPY_CHK	__mempcpy_chk_avx_unaligned
-#endif
-
-	.section .text.avx,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
-	movq	%rdi, %rax
-	addq	%rdx, %rax
-	jmp	L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
-	mov	%rdi, %rax
-#ifdef USE_AS_MEMPCPY
-	add	%rdx, %rax
-#endif
-L(start):
-	cmp	$256, %rdx
-	jae	L(256bytesormore)
-	cmp	$16, %dl
-	jb	L(less_16bytes)
-	cmp	$128, %dl
-	jb	L(less_128bytes)
-	vmovdqu (%rsi), %xmm0
-	lea	(%rsi, %rdx), %rcx
-	vmovdqu 0x10(%rsi), %xmm1
-	vmovdqu 0x20(%rsi), %xmm2
-	vmovdqu 0x30(%rsi), %xmm3
-	vmovdqu 0x40(%rsi), %xmm4
-	vmovdqu 0x50(%rsi), %xmm5
-	vmovdqu 0x60(%rsi), %xmm6
-	vmovdqu 0x70(%rsi), %xmm7
-	vmovdqu -0x80(%rcx), %xmm8
-	vmovdqu -0x70(%rcx), %xmm9
-	vmovdqu -0x60(%rcx), %xmm10
-	vmovdqu -0x50(%rcx), %xmm11
-	vmovdqu -0x40(%rcx), %xmm12
-	vmovdqu -0x30(%rcx), %xmm13
-	vmovdqu -0x20(%rcx), %xmm14
-	vmovdqu -0x10(%rcx), %xmm15
-	lea	(%rdi, %rdx), %rdx
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm1, 0x10(%rdi)
-	vmovdqu %xmm2, 0x20(%rdi)
-	vmovdqu %xmm3, 0x30(%rdi)
-	vmovdqu %xmm4, 0x40(%rdi)
-	vmovdqu %xmm5, 0x50(%rdi)
-	vmovdqu %xmm6, 0x60(%rdi)
-	vmovdqu %xmm7, 0x70(%rdi)
-	vmovdqu %xmm8, -0x80(%rdx)
-	vmovdqu %xmm9, -0x70(%rdx)
-	vmovdqu %xmm10, -0x60(%rdx)
-	vmovdqu %xmm11, -0x50(%rdx)
-	vmovdqu %xmm12, -0x40(%rdx)
-	vmovdqu %xmm13, -0x30(%rdx)
-	vmovdqu %xmm14, -0x20(%rdx)
-	vmovdqu %xmm15, -0x10(%rdx)
-	ret
-	.p2align 4
-L(less_128bytes):
-	cmp	$64, %dl
-	jb	L(less_64bytes)
-	vmovdqu (%rsi), %xmm0
-	lea	(%rsi, %rdx), %rcx
-	vmovdqu 0x10(%rsi), %xmm1
-	vmovdqu 0x20(%rsi), %xmm2
-	lea	(%rdi, %rdx), %rdx
-	vmovdqu 0x30(%rsi), %xmm3
-	vmovdqu -0x40(%rcx), %xmm4
-	vmovdqu -0x30(%rcx), %xmm5
-	vmovdqu -0x20(%rcx), %xmm6
-	vmovdqu -0x10(%rcx), %xmm7
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm1, 0x10(%rdi)
-	vmovdqu %xmm2, 0x20(%rdi)
-	vmovdqu %xmm3, 0x30(%rdi)
-	vmovdqu %xmm4, -0x40(%rdx)
-	vmovdqu %xmm5, -0x30(%rdx)
-	vmovdqu %xmm6, -0x20(%rdx)
-	vmovdqu %xmm7, -0x10(%rdx)
-	ret
-
-	.p2align 4
-L(less_64bytes):
-	cmp	$32, %dl
-	jb	L(less_32bytes)
-	vmovdqu (%rsi), %xmm0
-	vmovdqu 0x10(%rsi), %xmm1
-	vmovdqu -0x20(%rsi, %rdx), %xmm6
-	vmovdqu -0x10(%rsi, %rdx), %xmm7
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm1, 0x10(%rdi)
-	vmovdqu %xmm6, -0x20(%rdi, %rdx)
-	vmovdqu %xmm7, -0x10(%rdi, %rdx)
-	ret
-
-	.p2align 4
-L(less_32bytes):
-	vmovdqu (%rsi), %xmm0
-	vmovdqu -0x10(%rsi, %rdx), %xmm7
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm7, -0x10(%rdi, %rdx)
-	ret
-
-	.p2align 4
-L(less_16bytes):
-	cmp	$8, %dl
-	jb	L(less_8bytes)
-	movq -0x08(%rsi, %rdx),	%rcx
-	movq (%rsi),	%rsi
-	movq %rsi, (%rdi)
-	movq %rcx, -0x08(%rdi, %rdx)
-	ret
-
-	.p2align 4
-L(less_8bytes):
-	cmp	$4, %dl
-	jb	L(less_4bytes)
-	mov -0x04(%rsi, %rdx), %ecx
-	mov (%rsi),	%esi
-	mov %esi, (%rdi)
-	mov %ecx, -0x04(%rdi, %rdx)
-	ret
-
-L(less_4bytes):
-	cmp	$1, %dl
-	jbe	L(less_2bytes)
-	mov -0x02(%rsi, %rdx),	%cx
-	mov (%rsi),	%si
-	mov %si, (%rdi)
-	mov %cx, -0x02(%rdi, %rdx)
-	ret
-
-L(less_2bytes):
-	jb	L(less_0bytes)
-	mov	(%rsi), %cl
-	mov	%cl,	(%rdi)
-L(less_0bytes):
-	ret
-
-	.p2align 4
-L(256bytesormore):
-#ifdef USE_AS_MEMMOVE
-	mov	%rdi, %rcx
-	sub	%rsi, %rcx
-	cmp	%rdx, %rcx
-	jc	L(copy_backward)
-#endif
-	cmp	$2048, %rdx
-	jae	L(gobble_data_movsb)
-	mov	%rax, %r8
-	lea	(%rsi, %rdx), %rcx
-	mov	%rdi, %r10
-	vmovdqu -0x80(%rcx), %xmm5
-	vmovdqu -0x70(%rcx), %xmm6
-	mov	$0x80, %rax
-	and	$-32, %rdi
-	add	$32, %rdi
-	vmovdqu -0x60(%rcx), %xmm7
-	vmovdqu -0x50(%rcx), %xmm8
-	mov	%rdi, %r11
-	sub	%r10, %r11
-	vmovdqu -0x40(%rcx), %xmm9
-	vmovdqu -0x30(%rcx), %xmm10
-	sub	%r11, %rdx
-	vmovdqu -0x20(%rcx), %xmm11
-	vmovdqu -0x10(%rcx), %xmm12
-	vmovdqu	(%rsi), %ymm4
-	add	%r11, %rsi
-	sub	%eax, %edx
-L(goble_128_loop):
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 0x20(%rsi), %ymm1
-	vmovdqu 0x40(%rsi), %ymm2
-	vmovdqu 0x60(%rsi), %ymm3
-	add	%rax, %rsi
-	vmovdqa %ymm0, (%rdi)
-	vmovdqa %ymm1, 0x20(%rdi)
-	vmovdqa %ymm2, 0x40(%rdi)
-	vmovdqa %ymm3, 0x60(%rdi)
-	add	%rax, %rdi
-	sub	%eax, %edx
-	jae	L(goble_128_loop)
-	add	%eax, %edx
-	add	%rdi, %rdx
-	vmovdqu	%ymm4, (%r10)
-	vzeroupper
-	vmovdqu %xmm5, -0x80(%rdx)
-	vmovdqu %xmm6, -0x70(%rdx)
-	vmovdqu %xmm7, -0x60(%rdx)
-	vmovdqu %xmm8, -0x50(%rdx)
-	vmovdqu %xmm9, -0x40(%rdx)
-	vmovdqu %xmm10, -0x30(%rdx)
-	vmovdqu %xmm11, -0x20(%rdx)
-	vmovdqu %xmm12, -0x10(%rdx)
-	mov	%r8, %rax
-	ret
-
-	.p2align 4
-L(gobble_data_movsb):
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %rcx
-#else
-	mov	__x86_shared_cache_size_half(%rip), %rcx
-#endif
-	shl	$3, %rcx
-	cmp	%rcx, %rdx
-	jae	L(gobble_big_data_fwd)
-	mov	%rdx, %rcx
-	rep	movsb
-	ret
-
-	.p2align 4
-L(gobble_big_data_fwd):
-	lea	(%rsi, %rdx), %rcx
-	vmovdqu	(%rsi), %ymm4
-	vmovdqu -0x80(%rsi,%rdx), %xmm5
-	vmovdqu -0x70(%rcx), %xmm6
-	vmovdqu -0x60(%rcx), %xmm7
-	vmovdqu -0x50(%rcx), %xmm8
-	vmovdqu -0x40(%rcx), %xmm9
-	vmovdqu -0x30(%rcx), %xmm10
-	vmovdqu -0x20(%rcx), %xmm11
-	vmovdqu -0x10(%rcx), %xmm12
-	mov	%rdi, %r8
-	and	$-32, %rdi
-	add	$32, %rdi
-	mov	%rdi, %r10
-	sub	%r8, %r10
-	sub	%r10, %rdx
-	add	%r10, %rsi
-	lea	(%rdi, %rdx), %rcx
-	add	$-0x80, %rdx
-L(gobble_mem_fwd_loop):
-	prefetchnta 0x1c0(%rsi)
-	prefetchnta 0x280(%rsi)
-	vmovdqu	(%rsi), %ymm0
-	vmovdqu	0x20(%rsi), %ymm1
-	vmovdqu	0x40(%rsi), %ymm2
-	vmovdqu	0x60(%rsi), %ymm3
-	sub	$-0x80, %rsi
-	vmovntdq	%ymm0, (%rdi)
-	vmovntdq	%ymm1, 0x20(%rdi)
-	vmovntdq	%ymm2, 0x40(%rdi)
-	vmovntdq	%ymm3, 0x60(%rdi)
-	sub	$-0x80, %rdi
-	add	$-0x80, %rdx
-	jb	L(gobble_mem_fwd_loop)
-	sfence
-	vmovdqu	%ymm4, (%r8)
-	vzeroupper
-	vmovdqu %xmm5, -0x80(%rcx)
-	vmovdqu %xmm6, -0x70(%rcx)
-	vmovdqu %xmm7, -0x60(%rcx)
-	vmovdqu %xmm8, -0x50(%rcx)
-	vmovdqu %xmm9, -0x40(%rcx)
-	vmovdqu %xmm10, -0x30(%rcx)
-	vmovdqu %xmm11, -0x20(%rcx)
-	vmovdqu %xmm12, -0x10(%rcx)
-	ret
-
-#ifdef USE_AS_MEMMOVE
-	.p2align 4
-L(copy_backward):
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %rcx
-#else
-	mov	__x86_shared_cache_size_half(%rip), %rcx
-#endif
-	shl	$3, %rcx
-	vmovdqu (%rsi), %xmm5
-	vmovdqu 0x10(%rsi), %xmm6
-	add	%rdx, %rdi
-	vmovdqu 0x20(%rsi), %xmm7
-	vmovdqu 0x30(%rsi), %xmm8
-	lea	-0x20(%rdi), %r10
-	mov %rdi, %r11
-	vmovdqu 0x40(%rsi), %xmm9
-	vmovdqu 0x50(%rsi), %xmm10
-	and	$0x1f, %r11
-	vmovdqu 0x60(%rsi), %xmm11
-	vmovdqu 0x70(%rsi), %xmm12
-	xor	%r11, %rdi
-	add	%rdx, %rsi
-	vmovdqu	-0x20(%rsi), %ymm4
-	sub	%r11, %rsi
-	sub	%r11, %rdx
-	cmp	%rcx, %rdx
-	ja	L(gobble_big_data_bwd)
-	add	$-0x80, %rdx
-L(gobble_mem_bwd_llc):
-	vmovdqu	-0x20(%rsi), %ymm0
-	vmovdqu	-0x40(%rsi), %ymm1
-	vmovdqu	-0x60(%rsi), %ymm2
-	vmovdqu	-0x80(%rsi), %ymm3
-	lea	-0x80(%rsi), %rsi
-	vmovdqa	%ymm0, -0x20(%rdi)
-	vmovdqa	%ymm1, -0x40(%rdi)
-	vmovdqa	%ymm2, -0x60(%rdi)
-	vmovdqa	%ymm3, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-	add	$-0x80, %rdx
-	jb	L(gobble_mem_bwd_llc)
-	vmovdqu	%ymm4, (%r10)
-	vzeroupper
-	vmovdqu %xmm5, (%rax)
-	vmovdqu %xmm6, 0x10(%rax)
-	vmovdqu %xmm7, 0x20(%rax)
-	vmovdqu %xmm8, 0x30(%rax)
-	vmovdqu %xmm9, 0x40(%rax)
-	vmovdqu %xmm10, 0x50(%rax)
-	vmovdqu %xmm11, 0x60(%rax)
-	vmovdqu %xmm12, 0x70(%rax)
-	ret
-
-	.p2align 4
-L(gobble_big_data_bwd):
-	add	$-0x80, %rdx
-L(gobble_mem_bwd_loop):
-	prefetchnta -0x1c0(%rsi)
-	prefetchnta -0x280(%rsi)
-	vmovdqu	-0x20(%rsi), %ymm0
-	vmovdqu	-0x40(%rsi), %ymm1
-	vmovdqu	-0x60(%rsi), %ymm2
-	vmovdqu	-0x80(%rsi), %ymm3
-	lea	-0x80(%rsi), %rsi
-	vmovntdq	%ymm0, -0x20(%rdi)
-	vmovntdq	%ymm1, -0x40(%rdi)
-	vmovntdq	%ymm2, -0x60(%rdi)
-	vmovntdq	%ymm3, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-	add	$-0x80, %rdx
-	jb	L(gobble_mem_bwd_loop)
-	sfence
-	vmovdqu	%ymm4, (%r10)
-	vzeroupper
-	vmovdqu %xmm5, (%rax)
-	vmovdqu %xmm6, 0x10(%rax)
-	vmovdqu %xmm7, 0x20(%rax)
-	vmovdqu %xmm8, 0x30(%rax)
-	vmovdqu %xmm9, 0x40(%rax)
-	vmovdqu %xmm10, 0x50(%rax)
-	vmovdqu %xmm11, 0x60(%rax)
-	vmovdqu %xmm12, 0x70(%rax)
-	ret
-#endif
-END (MEMCPY)
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
deleted file mode 100644
index c450983..0000000
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ /dev/null
@@ -1,175 +0,0 @@
-/* memcpy with unaliged loads
-   Copyright (C) 2013-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-#include <sysdep.h>
-
-#include "asm-syntax.h"
-
-
-ENTRY(__memcpy_sse2_unaligned)
-	movq	%rsi, %rax
-	leaq	(%rdx,%rdx), %rcx
-	subq	%rdi, %rax
-	subq	%rdx, %rax
-	cmpq	%rcx, %rax
-	jb	L(overlapping)
-	cmpq	$16, %rdx
-	jbe	L(less_16)
-	movdqu	(%rsi), %xmm8
-	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	-16(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	ja	.L31
-L(return):
-	movq	%rdi, %rax
-	ret
-	.p2align 4,,10
-	.p2align 4
-.L31:
-	movdqu	16(%rsi), %xmm8
-	cmpq	$64, %rdx
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	-32(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	jbe	L(return)
-	movdqu	32(%rsi), %xmm8
-	cmpq	$128, %rdx
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	-48(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	48(%rsi), %xmm8
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	-64(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -64(%rdi,%rdx)
-	jbe	L(return)
-	leaq	64(%rdi), %rcx
-	addq	%rdi, %rdx
-	andq	$-64, %rdx
-	andq	$-64, %rcx
-	movq	%rcx, %rax
-	subq	%rdi, %rax
-	addq	%rax, %rsi
-	cmpq	%rdx, %rcx
-	je	L(return)
-	movq	%rsi, %r10
-	subq	%rcx, %r10
-	leaq	16(%r10), %r9
-	leaq	32(%r10), %r8
-	leaq	48(%r10), %rax
-	.p2align 4,,10
-	.p2align 4
-L(loop):
-	movdqu	(%rcx,%r10), %xmm8
-	movdqa	%xmm8, (%rcx)
-	movdqu	(%rcx,%r9), %xmm8
-	movdqa	%xmm8, 16(%rcx)
-	movdqu	(%rcx,%r8), %xmm8
-	movdqa	%xmm8, 32(%rcx)
-	movdqu	(%rcx,%rax), %xmm8
-	movdqa	%xmm8, 48(%rcx)
-	addq	$64, %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
-	jmp	L(return)
-L(overlapping):
-	cmpq	%rsi, %rdi
-	jae	.L3
-	testq	%rdx, %rdx
-	.p2align 4,,5
-	je	L(return)
-	movq	%rdx, %r9
-	leaq	16(%rsi), %rcx
-	leaq	16(%rdi), %r8
-	shrq	$4, %r9
-	movq	%r9, %rax
-	salq	$4, %rax
-	cmpq	%rcx, %rdi
-	setae	%cl
-	cmpq	%r8, %rsi
-	setae	%r8b
-	orl	%r8d, %ecx
-	cmpq	$15, %rdx
-	seta	%r8b
-	testb	%r8b, %cl
-	je	.L16
-	testq	%rax, %rax
-	je	.L16
-	xorl	%ecx, %ecx
-	xorl	%r8d, %r8d
-.L7:
-	movdqu	(%rsi,%rcx), %xmm8
-	addq	$1, %r8
-	movdqu	%xmm8, (%rdi,%rcx)
-	addq	$16, %rcx
-	cmpq	%r8, %r9
-	ja	.L7
-	cmpq	%rax, %rdx
-	je	L(return)
-.L21:
-	movzbl	(%rsi,%rax), %ecx
-	movb	%cl, (%rdi,%rax)
-	addq	$1, %rax
-	cmpq	%rax, %rdx
-	ja	.L21
-	jmp	L(return)
-L(less_16):
-	testb	$24, %dl
-	jne	L(between_9_16)
-	testb	$4, %dl
-	.p2align 4,,5
-	jne	L(between_5_8)
-	testq	%rdx, %rdx
-	.p2align 4,,2
-	je	L(return)
-	movzbl	(%rsi), %eax
-	testb	$2, %dl
-	movb	%al, (%rdi)
-	je	L(return)
-	movzwl	-2(%rsi,%rdx), %eax
-	movw	%ax, -2(%rdi,%rdx)
-	jmp	L(return)
-.L3:
-	leaq	-1(%rdx), %rax
-	.p2align 4,,10
-	.p2align 4
-.L11:
-	movzbl	(%rsi,%rax), %edx
-	movb	%dl, (%rdi,%rax)
-	subq	$1, %rax
-	jmp	.L11
-L(between_9_16):
-	movq	(%rsi), %rax
-	movq	%rax, (%rdi)
-	movq	-8(%rsi,%rdx), %rax
-	movq	%rax, -8(%rdi,%rdx)
-	jmp	L(return)
-.L16:
-	xorl	%eax, %eax
-	jmp	.L21
-L(between_5_8):
-	movl	(%rsi), %eax
-	movl	%eax, (%rdi)
-	movl	-4(%rsi,%rdx), %eax
-	movl	%eax, -4(%rdi,%rdx)
-	jmp	L(return)
-END(__memcpy_sse2_unaligned)
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 5b045d7..f6771a4 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -19,7 +19,6 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <shlib-compat.h>
 #include <init-arch.h>
 
 /* Define multiple versions only for the definition in lib and for
@@ -30,21 +29,34 @@
 ENTRY(__new_memcpy)
 	.type	__new_memcpy, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
+	lea	__memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	1f
-	lea    __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
+	jnz	2f
+	lea	__memcpy_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memcpy_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
+# endif
 1:	lea	__memcpy_avx_unaligned(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jnz	2f
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memcpy_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
 	lea	__memcpy_sse2_unaligned(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
-	jnz	2f
-	lea	__memcpy_sse2(%rip), %RAX_LP
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memcpy_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
 	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	lea    __memcpy_ssse3_back(%rip), %RAX_LP
@@ -54,37 +66,7 @@ ENTRY(__new_memcpy)
 2:	ret
 END(__new_memcpy)
 
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memcpy_sse2, @function; \
-	.globl __memcpy_sse2; \
-	.hidden __memcpy_sse2; \
-	.p2align 4; \
-	__memcpy_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2
-
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
-	.type __memcpy_chk_sse2, @function; \
-	.globl __memcpy_chk_sse2; \
-	.p2align 4; \
-	__memcpy_chk_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
-	cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2
-
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memcpy calls through a PLT.
-   The speedup we get from using SSSE3 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_memcpy; __GI_memcpy = __memcpy_sse2
-
+# undef memcpy
+# include <shlib-compat.h>
 versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14);
 #endif
-
-#include "../memcpy.S"
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 648217e..11f1310 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -30,24 +30,40 @@
 ENTRY(__memcpy_chk)
 	.type	__memcpy_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
-	jz      1f
+	jz	1f
+	lea	__memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz      1f
-	leaq    __memcpy_chk_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__memcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memcpy_chk_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
-1:	leaq	__memcpy_chk_sse2(%rip), %rax
+# endif
+1:	lea	__memcpy_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memcpy_chk_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__memcpy_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
 	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__memcpy_chk_ssse3(%rip), %rax
+	lea    __memcpy_chk_ssse3_back(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Fast_Copy_Backward)
-	jz	2f
-	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz  2f
-	leaq    __memcpy_chk_avx_unaligned(%rip), %rax
+	jnz	2f
+	lea	__memcpy_chk_ssse3(%rip), %RAX_LP
 2:	ret
 END(__memcpy_chk)
 # else
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
deleted file mode 100644
index 75e35f2..0000000
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* memmove with AVX
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_avx_unaligned
-#define MEMCPY_CHK	__memmove_chk_avx_unaligned
-#include "memcpy-avx-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
deleted file mode 100644
index d7edb18..0000000
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ /dev/null
@@ -1,13 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE	16
-# define VEC(i)		xmm##i
-# define VMOVNT		movntdq
-/* Use movups and movaps for smaller code sizes.  */
-# define VMOVU		movups
-# define VMOVA		movaps
-
-# define SECTION(p)		p
-# define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
-
-# include "memmove-vec-unaligned-erms.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 4e13b8f..511b2f7 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -105,28 +105,28 @@
 
 	.section SECTION(.text),"ax",@progbits
 #if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
 #endif
 
 #if VEC_SIZE == 16 || defined SHARED
-ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
+ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
 	movq	%rdi, %rax
 	addq	%rdx, %rax
 	jmp	L(start)
-END (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
+END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
 #endif
 
 #if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 #endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
 	movq	%rdi, %rax
 L(start):
 	cmpq	$VEC_SIZE, %rdx
@@ -147,7 +147,7 @@ L(nop):
 #endif
 	ret
 #if defined USE_MULTIARCH && IS_IN (libc)
-END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+END (MEMMOVE_SYMBOL (__memmove, unaligned))
 
 # if VEC_SIZE == 16 && defined SHARED
 /* Only used to measure performance of REP MOVSB.  */
@@ -520,11 +520,11 @@ strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
 strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
 	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
 #  endif
-strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2),
-	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned_2))
+strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
+	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
 # endif
 #endif
 #if VEC_SIZE == 16 || defined SHARED
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
-	      MEMCPY_SYMBOL (__memcpy, unaligned_2))
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
+	      MEMCPY_SYMBOL (__memcpy, unaligned))
 #endif
diff --git a/sysdeps/x86_64/multiarch/memmove.S b/sysdeps/x86_64/multiarch/memmove.S
new file mode 100644
index 0000000..25c3586
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove.S
@@ -0,0 +1,98 @@
+/* Multiple versions of memmove
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__libc_memmove)
+	.type	__libc_memmove, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+# ifdef HAVE_AVX512_ASM_SUPPORT
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	lea	__memmove_avx512_no_vzeroupper(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	2f
+	lea	__memmove_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memmove_avx512_unaligned(%rip), %RAX_LP
+	ret
+# endif
+1:	lea	__memmove_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memmove_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__memmove_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memmove_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	lea    __memmove_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__memmove_ssse3(%rip), %RAX_LP
+2:	ret
+END(__libc_memmove)
+#endif
+
+#if IS_IN (libc)
+# define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
+
+# ifdef SHARED
+libc_hidden_ver (__memmove_sse2_unaligned, memmove)
+libc_hidden_ver (__memcpy_sse2_unaligned, memcpy)
+libc_hidden_ver (__mempcpy_sse2_unaligned, mempcpy)
+libc_hidden_ver (__mempcpy_sse2_unaligned, __mempcpy)
+
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memmove calls through a PLT.
+   The speedup we get from using SSE2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def
+# endif
+strong_alias (__libc_memmove, memmove)
+#endif
+
+#if !defined SHARED || !IS_IN (libc)
+weak_alias (__mempcpy, mempcpy)
+#endif
+
+#include "../memmove.S"
+
+#if defined SHARED && IS_IN (libc)
+# include <shlib-compat.h>
+# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+/* Use __memmove_sse2_unaligned to support overlapping addresses.  */
+compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
deleted file mode 100644
index 8da5640..0000000
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Multiple versions of memmove.
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-# define MEMMOVE __memmove_sse2
-# ifdef SHARED
-#  undef libc_hidden_builtin_def
-#  define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__memmove_sse2, __GI_memmove, __memmove_sse2);
-# endif
-
-/* Redefine memmove so that the compiler won't complain about the type
-   mismatch with the IFUNC selector in strong_alias, below.  */
-# undef memmove
-# define memmove __redirect_memmove
-# include <string.h>
-# undef memmove
-
-extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
-# ifdef HAVE_AVX512_ASM_SUPPORT
-  extern __typeof (__redirect_memmove) __memmove_avx512_no_vzeroupper attribute_hidden;
-# endif
-
-#endif
-
-#include "string/memmove.c"
-
-#if IS_IN (libc)
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-   ifunc symbol properly.  */
-extern __typeof (__redirect_memmove) __libc_memmove;
-libc_ifunc (__libc_memmove,
-#ifdef HAVE_AVX512_ASM_SUPPORT
-	    HAS_ARCH_FEATURE (AVX512F_Usable)
-	      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	    ? __memmove_avx512_no_vzeroupper
-	    :
-#endif
-	    (HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	    ? __memmove_avx_unaligned
-	    : (HAS_CPU_FEATURE (SSSE3)
-	       ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
-	          ? __memmove_ssse3_back : __memmove_ssse3)
-	       : __memmove_sse2)));
-
-strong_alias (__libc_memmove, memmove)
-
-# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
-compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/memmove_chk.S
similarity index 55%
copy from sysdeps/x86_64/multiarch/mempcpy_chk.S
copy to sysdeps/x86_64/multiarch/memmove_chk.S
index 6e8a89d..cd639b8 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memmove_chk.S
@@ -1,7 +1,6 @@
-/* Multiple versions of __mempcpy_chk
+/* Multiple versions of __memmove_chk
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2016 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
+   Copyright (C) 2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,35 +21,51 @@
 #include <init-arch.h>
 
 /* Define multiple versions only for the definition in lib and for
-   DSO.  There are no multiarch mempcpy functions for static binaries.
+   DSO.  There are no multiarch memmove functions for static binaries.
  */
 #if IS_IN (libc)
 # ifdef SHARED
 	.text
-ENTRY(__mempcpy_chk)
-	.type	__mempcpy_chk, @gnu_indirect_function
+ENTRY(__memmove_chk)
+	.type	__memmove_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
+	lea	__memmove_chk_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	1f
-	leaq    __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__memmove_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memmove_chk_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
-1:	leaq	__mempcpy_chk_sse2(%rip), %rax
-	HAS_CPU_FEATURE (SSSE3)
+# endif
+1:	lea	__memmove_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_chk_ssse3(%rip), %rax
-	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	lea	__memmove_chk_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__memmove_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	lea	__memmove_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
+	lea    __memmove_chk_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__memmove_chk_ssse3(%rip), %RAX_LP
 2:	ret
-END(__mempcpy_chk)
+END(__memmove_chk)
 # else
-#  include "../mempcpy_chk.S"
+#  include "../memmove_chk.S"
 # endif
 #endif
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
deleted file mode 100644
index f64da63..0000000
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Multiple versions of __memmove_chk.
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-#include "init-arch.h"
-
-#define MEMMOVE_CHK __memmove_chk_sse2
-
-extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
-extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
-extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
-extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
-# ifdef HAVE_AVX512_ASM_SUPPORT
-  extern __typeof (__memmove_chk) __memmove_chk_avx512_no_vzeroupper attribute_hidden;
-# endif
-
-#include "debug/memmove_chk.c"
-
-libc_ifunc (__memmove_chk,
-#ifdef HAVE_AVX512_ASM_SUPPORT
-	    HAS_ARCH_FEATURE (AVX512F_Usable)
-	      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	    ? __memmove_chk_avx512_no_vzeroupper
-	    :
-#endif
-	    HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) ? __memmove_chk_avx_unaligned :
-	    (HAS_CPU_FEATURE (SSSE3)
-	    ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
-	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
-	    : __memmove_chk_sse2));
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index ed78623..f9c6df3 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -25,62 +25,46 @@
    DSO.  In static binaries we need mempcpy before the initialization
    happened.  */
 #if defined SHARED && IS_IN (libc)
+	.text
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
+	lea	__mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	1f
-	leaq    __mempcpy_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__mempcpy_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__mempcpy_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
-1:	leaq	__mempcpy_sse2(%rip), %rax
-	HAS_CPU_FEATURE (SSSE3)
+# endif
+1:	lea	__mempcpy_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_ssse3(%rip), %rax
-	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	lea	__mempcpy_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__mempcpy_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	lea	__mempcpy_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__mempcpy_avx_unaligned(%rip), %rax
+	lea    __mempcpy_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__mempcpy_ssse3(%rip), %RAX_LP
 2:	ret
 END(__mempcpy)
 
-# undef ENTRY
-# define ENTRY(name) \
-	.type __mempcpy_sse2, @function; \
-	.p2align 4; \
-	.globl __mempcpy_sse2; \
-	.hidden __mempcpy_sse2; \
-	__mempcpy_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2
-
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
-	.type __mempcpy_chk_sse2, @function; \
-	.globl __mempcpy_chk_sse2; \
-	.p2align 4; \
-	__mempcpy_chk_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
-	cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
-
-# undef libc_hidden_def
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal mempcpy calls through a PLT.
-   The speedup we get from using SSSE3 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_def(name) \
-	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2
-# define libc_hidden_builtin_def(name) \
-	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2
+weak_alias (__mempcpy, mempcpy)
 #endif
-
-#include "../mempcpy.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 6e8a89d..80f460f 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -30,24 +30,40 @@
 ENTRY(__mempcpy_chk)
 	.type	__mempcpy_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
+	lea	__mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	1f
-	leaq    __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__mempcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__mempcpy_chk_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
-1:	leaq	__mempcpy_chk_sse2(%rip), %rax
-	HAS_CPU_FEATURE (SSSE3)
+# endif
+1:	lea	__mempcpy_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_chk_ssse3(%rip), %rax
-	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	lea	__mempcpy_chk_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__mempcpy_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	lea	__mempcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
+	lea    __mempcpy_chk_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__mempcpy_chk_ssse3(%rip), %RAX_LP
 2:	ret
 END(__mempcpy_chk)
 # else

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6e7b4bf19131e1fb304c5d5429ceb5d4ef17a6b9

commit 6e7b4bf19131e1fb304c5d5429ceb5d4ef17a6b9
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Mar 31 10:42:30 2016 -0700

    X86-64: Remove the previous SSE2/AVX2 memsets
    
    Since the new SSE2/AVX2 memsets are faster than the previous ones, we
    can remove the previous SSE2/AVX2 memsets and replace them with the
    new ones.  This reduces the size of libc.so by about 900 bytes.
    
    No change in IFUNC selection if SSE2 and AVX2 memsets weren't used
    before.  If SSE2 or AVX2 memset was used, the new SSE2 or AVX2 memset
    optimized with Enhanced REP STOSB will be used for processors with
    ERMS.  The new AVX512 memset will be used for processors with AVX512
    which prefer vzeroupper.
    
    	[BZ #19881]
    	* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S: Folded
    	into ...
    	* sysdeps/x86_64/memset.S: This.
    	(__bzero): Removed.
    	(__memset_tail): Likewise.
    	(__memset_chk): Likewise.
    	(memset): Likewise.
    	(MEMSET_CHK_SYMBOL): New. Define only if MEMSET_SYMBOL isn't
    	defined.
    	(MEMSET_SYMBOL): Define only if MEMSET_SYMBOL isn't defined.
    	* sysdeps/x86_64/multiarch/memset-avx2.S: Removed.
    	(__memset_zero_constant_len_parameter): Check SHARED instead of
    	PIC.
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove
    	memset-avx2 and memset-sse2-unaligned-erms.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Remove __memset_chk_sse2,
    	__memset_chk_avx2, __memset_sse2 and __memset_avx2_unaligned.
    	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: Skip
    	if not in libc.
    	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
    	(__bzero): Enabled.
    	* sysdeps/x86_64/multiarch/memset.S (memset): Replace
    	__memset_sse2 and __memset_avx2 with __memset_sse2_unaligned
    	and __memset_avx2_unaligned.  Use __memset_sse2_unaligned_erms
    	or __memset_avx2_unaligned_erms if processor has ERMS.  Support
    	__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
    	(memset): Removed.
    	(__memset_chk): Likewise.
    	(MEMSET_SYMBOL): New.
    	(libc_hidden_builtin_def): Replace __memset_sse2 with
    	__memset_sse2_unaligned.
    	* sysdeps/x86_64/multiarch/memset_chk.S (__memset_chk): Replace
    	__memset_chk_sse2 and __memset_chk_avx2 with
    	__memset_chk_sse2_unaligned and __memset_chk_avx2_unaligned_erms.
    	Use __memset_chk_sse2_unaligned_erms or
    	__memset_chk_avx2_unaligned_erms if processor has ERMS.  Support
    	__memset_chk_avx512_unaligned_erms and
    	__memset_chk_avx512_unaligned.

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 4cf0da0..62b85c3 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -19,113 +19,32 @@
 
 #include <sysdep.h>
 
-	.text
-#if IS_IN (libc)
-ENTRY(__bzero)
-	movq	%rdi, %rax /* Set return value.  */
-	movq	%rsi, %rdx /* Set n.  */
-	pxor	%xmm0, %xmm0
-	jmp	L(entry_from_bzero)
-END(__bzero)
-weak_alias (__bzero, bzero)
-
-/* Like memset but takes additional parameter with return value.  */
-ENTRY(__memset_tail)
-	movq	%rcx, %rax /* Set return value.  */
-
-	movd	%esi, %xmm0
-	punpcklbw	%xmm0, %xmm0
-	punpcklwd	%xmm0, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-
-	jmp	L(entry_from_bzero)
-END(__memset_tail)
-#endif
-
-#if defined PIC && IS_IN (libc)
-ENTRY_CHK (__memset_chk)
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (__memset_chk)
+#define VEC_SIZE	16
+#define VEC(i)		xmm##i
+/* Don't use movups and movaps since it will get larger nop paddings for
+   alignment.  */
+#define VMOVU		movdqu
+#define VMOVA		movdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movd d, %xmm0; \
+  movq r, %rax; \
+  punpcklbw %xmm0, %xmm0; \
+  punpcklwd %xmm0, %xmm0; \
+  pshufd $0, %xmm0, %xmm0
+
+#define SECTION(p)		p
+
+#ifndef MEMSET_SYMBOL
+# define MEMSET_CHK_SYMBOL(p,s)	p
+# define MEMSET_SYMBOL(p,s)	memset
 #endif
 
-ENTRY (memset)
-	movd	%esi, %xmm0
-	movq	%rdi, %rax
-	punpcklbw	%xmm0, %xmm0
-	punpcklwd	%xmm0, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-L(entry_from_bzero):
-	cmpq	$64, %rdx
-	ja	L(loop_start)
-	cmpq	$16, %rdx
-	jbe	L(less_16_bytes)
-	cmpq	$32, %rdx
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm0, -16(%rdi,%rdx)
-	ja	L(between_32_64_bytes)
-L(return):
-	rep
-	ret
-	.p2align 4
-L(between_32_64_bytes):
-	movdqu	%xmm0, 16(%rdi)
-	movdqu	%xmm0, -32(%rdi,%rdx)
-	ret
-	.p2align 4
-L(loop_start):
-	leaq	64(%rdi), %rcx
-	movdqu	%xmm0, (%rdi)
-	andq	$-64, %rcx
-	movdqu	%xmm0, -16(%rdi,%rdx)
-	movdqu	%xmm0, 16(%rdi)
-	movdqu	%xmm0, -32(%rdi,%rdx)
-	movdqu	%xmm0, 32(%rdi)
-	movdqu	%xmm0, -48(%rdi,%rdx)
-	movdqu	%xmm0, 48(%rdi)
-	movdqu	%xmm0, -64(%rdi,%rdx)
-	addq	%rdi, %rdx
-	andq	$-64, %rdx
-	cmpq	%rdx, %rcx
-	je	L(return)
-	.p2align 4
-L(loop):
-	movdqa	%xmm0, (%rcx)
-	movdqa	%xmm0, 16(%rcx)
-	movdqa	%xmm0, 32(%rcx)
-	movdqa	%xmm0, 48(%rcx)
-	addq	$64, %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
-	rep
-	ret
-L(less_16_bytes):
-	movq %xmm0, %rcx
-	testb	$24, %dl
-	jne	L(between8_16bytes)
-	testb	$4, %dl
-	jne	L(between4_7bytes)
-	testb	$1, %dl
-	je	L(odd_byte)
-	movb	%cl, (%rdi)
-L(odd_byte):
-	testb	$2, %dl
-	je	L(return)
-	movw	%cx, -2(%rax,%rdx)
-	ret
-L(between4_7bytes):
-	movl	%ecx, (%rdi)
-	movl	%ecx, -4(%rdi,%rdx)
-	ret
-L(between8_16bytes):
-	movq	%rcx, (%rdi)
-	movq	%rcx, -8(%rdi,%rdx)
-	ret
+#include "multiarch/memset-vec-unaligned-erms.S"
 
-END (memset)
 libc_hidden_builtin_def (memset)
 
-#if defined PIC && IS_IN (libc) && !defined USE_MULTIARCH
+#if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
 strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
 	.section .gnu.warning.__memset_zero_constant_len_parameter
 	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 8878efb..35752a8 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -19,12 +19,11 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
-		   strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
+		   strcspn-c strpbrk-c strspn-c varshift \
 		   memset-avx512-no-vzeroupper \
 		   memmove-sse2-unaligned-erms \
 		   memmove-avx-unaligned-erms \
 		   memmove-avx512-unaligned-erms \
-		   memset-sse2-unaligned-erms \
 		   memset-avx2-unaligned-erms \
 		   memset-avx512-unaligned-erms
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1e880f6..ca05ff6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -117,16 +117,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
-			      __memset_chk_sse2)
-	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_sse2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
-			      __memset_chk_avx2)
-	      IFUNC_IMPL_ADD (array, i, __memset_chk,
-			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_chk_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
@@ -146,7 +141,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
-	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
 	      IFUNC_IMPL_ADD (array, i, memset, 1,
 			      __memset_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memset, 1,
@@ -154,9 +148,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
-			      __memset_avx2)
-	      IFUNC_IMPL_ADD (array, i, memset,
-			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
deleted file mode 100644
index df63472..0000000
--- a/sysdeps/x86_64/multiarch/memset-avx2.S
+++ /dev/null
@@ -1,168 +0,0 @@
-/* memset with AVX2
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-#ifndef MEMSET
-# define MEMSET	__memset_avx2
-# define MEMSET_CHK	__memset_chk_avx2
-#endif
-
-	.section .text.avx2,"ax",@progbits
-#if defined PIC
-ENTRY (MEMSET_CHK)
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMSET_CHK)
-#endif
-
-ENTRY (MEMSET)
-	vpxor	%xmm0, %xmm0, %xmm0
-	vmovd	%esi, %xmm1
-	lea	(%rdi, %rdx), %rsi
-	mov	%rdi, %rax
-	vpshufb	%xmm0, %xmm1, %xmm0
-	cmp	$16, %rdx
-	jb	L(less_16bytes)
-	cmp	$256, %rdx
-	jae	L(256bytesormore)
-	cmp	$128, %dl
-	jb	L(less_128bytes)
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu %xmm0, 0x10(%rdi)
-	vmovdqu %xmm0, 0x20(%rdi)
-	vmovdqu %xmm0, 0x30(%rdi)
-	vmovdqu %xmm0, 0x40(%rdi)
-	vmovdqu %xmm0, 0x50(%rdi)
-	vmovdqu %xmm0, 0x60(%rdi)
-	vmovdqu %xmm0, 0x70(%rdi)
-	vmovdqu %xmm0, -0x80(%rsi)
-	vmovdqu %xmm0, -0x70(%rsi)
-	vmovdqu %xmm0, -0x60(%rsi)
-	vmovdqu %xmm0, -0x50(%rsi)
-	vmovdqu %xmm0, -0x40(%rsi)
-	vmovdqu %xmm0, -0x30(%rsi)
-	vmovdqu %xmm0, -0x20(%rsi)
-	vmovdqu %xmm0, -0x10(%rsi)
-	ret
-
-	.p2align 4
-L(less_128bytes):
-	cmp	$64, %dl
-	jb	L(less_64bytes)
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm0, 0x10(%rdi)
-	vmovdqu %xmm0, 0x20(%rdi)
-	vmovdqu %xmm0, 0x30(%rdi)
-	vmovdqu %xmm0, -0x40(%rsi)
-	vmovdqu %xmm0, -0x30(%rsi)
-	vmovdqu %xmm0, -0x20(%rsi)
-	vmovdqu %xmm0, -0x10(%rsi)
-	ret
-
-	.p2align 4
-L(less_64bytes):
-	cmp	$32, %dl
-	jb	L(less_32bytes)
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm0, 0x10(%rdi)
-	vmovdqu %xmm0, -0x20(%rsi)
-	vmovdqu %xmm0, -0x10(%rsi)
-	ret
-
-	.p2align 4
-L(less_32bytes):
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm0, -0x10(%rsi)
-	ret
-
-	.p2align 4
-L(less_16bytes):
-	cmp	$8, %dl
-	jb	L(less_8bytes)
-	vmovq %xmm0, (%rdi)
-	vmovq %xmm0, -0x08(%rsi)
-	ret
-
-	.p2align 4
-L(less_8bytes):
-	vmovd	%xmm0, %ecx
-	cmp	$4, %dl
-	jb	L(less_4bytes)
-	mov	%ecx, (%rdi)
-	mov	%ecx, -0x04(%rsi)
-	ret
-
-	.p2align 4
-L(less_4bytes):
-	cmp	$2, %dl
-	jb	L(less_2bytes)
-	mov	%cx, (%rdi)
-	mov	%cx, -0x02(%rsi)
-	ret
-
-	.p2align 4
-L(less_2bytes):
-	cmp	$1, %dl
-	jb	L(less_1bytes)
-	mov	%cl, (%rdi)
-L(less_1bytes):
-	ret
-
-	.p2align 4
-L(256bytesormore):
-	vinserti128 $1, %xmm0, %ymm0, %ymm0
-	and	$-0x20, %rdi
-	add	$0x20, %rdi
-	vmovdqu	%ymm0, (%rax)
-	sub	%rdi, %rax
-	lea	-0x80(%rax, %rdx), %rcx
-	cmp	$4096, %rcx
-	ja	L(gobble_data)
-L(gobble_128_loop):
-	vmovdqa	%ymm0, (%rdi)
-	vmovdqa	%ymm0, 0x20(%rdi)
-	vmovdqa	%ymm0, 0x40(%rdi)
-	vmovdqa	%ymm0, 0x60(%rdi)
-	sub	$-0x80, %rdi
-	add	$-0x80, %ecx
-	jb	L(gobble_128_loop)
-	mov	%rsi, %rax
-	vmovdqu	%ymm0, -0x80(%rsi)
-	vmovdqu	%ymm0, -0x60(%rsi)
-	vmovdqu	%ymm0, -0x40(%rsi)
-	vmovdqu	%ymm0, -0x20(%rsi)
-	sub	%rdx, %rax
-	vzeroupper
-	ret
-
-	.p2align 4
-L(gobble_data):
-	sub	$-0x80, %rcx
-	vmovd	%xmm0, %eax
-	rep	stosb
-	mov	%rsi, %rax
-	sub	%rdx, %rax
-	vzeroupper
-	ret
-
-END (MEMSET)
-#endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
deleted file mode 100644
index 4bf3d36..0000000
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE	16
-# define VEC(i)		xmm##i
-/* Don't use movups and movaps since it will get larger nop paddings
-   for alignment.  */
-# define VMOVU		movdqu
-# define VMOVA		movdqa
-
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
-
-# define SECTION(p)		p
-# define MEMSET_SYMBOL(p,s)	p##_sse2_##s
-
-# include "memset-vec-unaligned-erms.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 578a5ae..89f6886 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -71,7 +71,7 @@
 #endif
 
 	.section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc) && 0
+#if VEC_SIZE == 16 && IS_IN (libc)
 ENTRY (__bzero)
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 8e3b9b9..4e52d8f 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -26,35 +26,43 @@
 ENTRY(memset)
 	.type	memset, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-	leaq	__memset_sse2(%rip), %rax
+	lea	__memset_sse2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	1f
+	lea	__memset_sse2_unaligned(%rip), %RAX_LP
+1:
 	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f
-	leaq	__memset_avx2(%rip), %rax
-#ifdef HAVE_AVX512_ASM_SUPPORT
+	lea	__memset_avx2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	L(AVX512F)
+	lea	__memset_avx2_unaligned(%rip), %RAX_LP
+L(AVX512F):
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	2f
+	lea	__memset_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	2f
-	leaq	__memset_avx512_no_vzeroupper(%rip), %rax
-#endif
+	jnz	2f
+	lea	__memset_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memset_avx512_unaligned(%rip), %RAX_LP
+# endif
 2:	ret
 END(memset)
 #endif
 
 #if IS_IN (libc)
-# undef memset
-# define memset __memset_sse2
-
-# undef __memset_chk
-# define __memset_chk __memset_chk_sse2
+# define MEMSET_SYMBOL(p,s)	p##_sse2_##s
 
 # ifdef SHARED
 # undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal memset calls through a PLT.
-   The speedup we get from using GPR instruction is likely eaten away
+   The speedup we get from using SSE2 instructions is likely eaten away
    by the indirect call in the PLT.  */
 # define libc_hidden_builtin_def(name) \
-	.globl __GI_memset; __GI_memset = __memset_sse2
+	.globl __GI_memset; __GI_memset = __memset_sse2_unaligned
 # endif
 
 # undef strong_alias
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
index 9a7b270..8517cfc 100644
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -26,16 +26,28 @@
 ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-	leaq	__memset_chk_sse2(%rip), %rax
+	lea	__memset_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	1f
+	lea	__memset_chk_sse2_unaligned(%rip), %RAX_LP
+1:
 	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f
-	leaq	__memset_chk_avx2(%rip), %rax
+	lea	__memset_chk_avx2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	L(AVX512F)
+	lea	__memset_chk_avx2_unaligned(%rip), %RAX_LP
+L(AVX512F):
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	2f
+	lea	__memset_chk_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	2f
-	leaq	__memset_chk_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__memset_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memset_chk_avx512_unaligned(%rip), %RAX_LP
 #endif
 2:	ret
 END(__memset_chk)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b80cc56d7d3a54c16d33d147438b67c715906675

commit b80cc56d7d3a54c16d33d147438b67c715906675
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Apr 3 17:21:45 2016 -0700

    X86-64: Use non-temporal store in memcpy on large data
    
    The large memcpy micro benchmark in glibc shows that there is a
    regression with large data on Haswell machine.  non-temporal store in
    memcpy on large data can improve performance significantly.  This
    patch adds a threshold to use non temporal store which is 6 times of
    shared cache size.  When size is above the threshold, non temporal
    store will be used.
    
    For size below 8 vector register width, we load all data into registers
    and store them together.  Only forward and backward loops, which move 4
    vector registers at a time, are used to support overlapping addresses.
    For forward loop, we load the last 4 vector register width of data and
    the first vector register width of data into vector registers before the
    loop and store them after the loop.  For backward loop, we load the first
    4 vector register width of data and the last vector register width of
    data into vector registers before the loop and store them after the loop.
    
    	* sysdeps/x86_64/cacheinfo.c (__x86_shared_non_temporal_threshold):
    	New.
    	(init_cacheinfo): Set __x86_shared_non_temporal_threshold to
    	6 times of shared cache size.
    	* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
    	(VMOVNT): New.
    	* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
    	(VMOVNT): Likewise.
    	* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
    	(VMOVNT): Likewise.
    	(VMOVU): Changed to movups for smaller code sizes.
    	(VMOVA): Changed to movaps for smaller code sizes.
    	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: Update
    	comments.
    	(PREFETCH): New.
    	(PREFETCH_SIZE): Likewise.
    	(PREFETCHED_LOAD_SIZE): Likewise.
    	(PREFETCH_ONE_SET): Likewise.
    	Rewrite to use forward and backward loops, which move 4 vector
    	registers at a time, to support overlapping addresses and use
    	non temporal store if size is above the threshold.

diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index 96463df..143b333 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -464,6 +464,9 @@ long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
 /* Similar to __x86_shared_cache_size, but not rounded.  */
 long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
 
+/* Threshold to use non temporal store.  */
+long int __x86_shared_non_temporal_threshold attribute_hidden;
+
 #ifndef DISABLE_PREFETCHW
 /* PREFETCHW support flag for use in memory and string routines.  */
 int __x86_prefetchw attribute_hidden;
@@ -662,4 +665,9 @@ init_cacheinfo (void)
       __x86_shared_cache_size_half = shared / 2;
       __x86_shared_cache_size = shared;
     }
+
+  /* The large memcpy micro benchmark in glibc shows that 6 times of
+     shared cache size is the approximate value above which non-temporal
+     store becomes faster.  */
+  __x86_shared_non_temporal_threshold = __x86_shared_cache_size * 6;
 }
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index 44711c3..e195e93 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -1,6 +1,7 @@
 #if IS_IN (libc)
 # define VEC_SIZE	32
 # define VEC(i)		ymm##i
+# define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index c2c5293..f9af6fd 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,6 +1,7 @@
 #if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
 # define VEC_SIZE	64
 # define VEC(i)		zmm##i
+# define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 85214fe..d7edb18 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -1,8 +1,10 @@
 #if IS_IN (libc)
 # define VEC_SIZE	16
 # define VEC(i)		xmm##i
-# define VMOVU		movdqu
-# define VMOVA		movdqa
+# define VMOVNT		movntdq
+/* Use movups and movaps for smaller code sizes.  */
+# define VMOVU		movups
+# define VMOVA		movaps
 
 # define SECTION(p)		p
 # define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 8a60d0f..4e13b8f 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -18,19 +18,20 @@
 
 /* memmove/memcpy/mempcpy is implemented as:
    1. Use overlapping load and store to avoid branch.
-   2. Use 8-bit or 32-bit displacements for branches and nop paddings
-      to avoid long nop between instructions.
-   3. Load all sources into registers and store them together to avoid
+   2. Load all sources into registers and store them together to avoid
       possible address overflap between source and destination.
-   4. If size is 2 * VEC_SIZE or less, load all sources into registers
+   3. If size is 8 * VEC_SIZE or less, load all sources into registers
       and store them together.
-   5. If there is no address overflap, copy from both ends with
-      4 * VEC_SIZE at a time.
-   6. If size is 8 * VEC_SIZE or less, load all sources into registers
-      and store them together.
-   7. If address of destination > address of source, backward copy
-      8 * VEC_SIZE at a time.
-   8. Otherwise, forward copy 8 * VEC_SIZE at a time.  */
+   4. If address of destination > address of source, backward copy
+      4 * VEC_SIZE at a time with unaligned load and aligned store.
+      Load the first 4 * VEC and last VEC before the loop and store
+      them after the loop to support overlapping addresses.
+   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
+      load and aligned store.  Load the last 4 * VEC and first VEC
+      before the loop and store them after the loop to support
+      overlapping addresses.
+   6. If size >= __x86_shared_non_temporal_threshold, use non-temporal
+      store instead of aligned store.  */
 
 #include <sysdep.h>
 
@@ -65,6 +66,39 @@
 # define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
 #endif
 
+#ifndef PREFETCH
+# define PREFETCH(addr) prefetcht0 addr
+#endif
+
+/* Assume 64-byte prefetch size.  */
+#ifndef PREFETCH_SIZE
+# define PREFETCH_SIZE 64
+#endif
+
+#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
+
+#if PREFETCH_SIZE == 64
+# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
+#  define PREFETCH_ONE_SET(dir, base, offset) \
+	PREFETCH ((offset)base)
+# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
+#  define PREFETCH_ONE_SET(dir, base, offset) \
+	PREFETCH ((offset)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
+# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
+#  define PREFETCH_ONE_SET(dir, base, offset) \
+	PREFETCH ((offset)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
+# else
+#   error Unsupported PREFETCHED_LOAD_SIZE!
+# endif
+#else
+# error Unsupported PREFETCH_SIZE!
+#endif
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -185,6 +219,8 @@ L(return):
 	ret
 
 L(movsb):
+	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	jae	L(more_8x_vec)
 	cmpq	%rsi, %rdi
 	jb	1f
 	/* Source == destination is less common.  */
@@ -201,97 +237,8 @@ L(movsb):
 	rep movsb
 L(nop):
 	ret
-
-	.p2align 4
-L(movsb_more_2x_vec):
-	cmpq	$REP_MOVSB_THRESHOLD, %rdx
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	ja.d32	L(movsb)
 #endif
-	.p2align 4
-L(more_2x_vec):
-	/* More than 2 * VEC.  */
-	cmpq	%rsi, %rdi
-	jb	L(copy_forward)
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %rcx
-	cmpq	%rcx, %rdi
-	jb	L(more_2x_vec_overlap)
-L(copy_forward):
-	leaq	(%rdi,%rdx), %rcx
-	cmpq	%rcx, %rsi
-	jb	L(more_2x_vec_overlap)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	jbe.d32	L(return)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(0)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(1), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
-	cmpq	$(VEC_SIZE * 8), %rdx
-#if  VEC_SIZE == 16
-# if defined USE_MULTIARCH && IS_IN (libc)
-	jbe	L(return)
-# else
-	/* Use 32-bit displacement to avoid long nop between
-	   instructions.  */
-	jbe.d32	L(return)
-# endif
-#else
-	/* Use 8-bit displacement to avoid long nop between
-	   instructions.  */
-	jbe	L(return_disp8)
-#endif
-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
-	addq	%rdi, %rdx
-	andq	$-(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rcx
-	movq	%rcx, %r11
-	subq	%rdi, %r11
-	addq	%r11, %rsi
-	cmpq	%rdx, %rcx
-	/* Use 8-bit displacement to avoid long nop between
-	   instructions.  */
-	je	L(return_disp8)
-	movq	%rsi, %r10
-	subq	%rcx, %r10
-	leaq	VEC_SIZE(%r10), %r9
-	leaq	(VEC_SIZE * 2)(%r10), %r8
-	leaq	(VEC_SIZE * 3)(%r10), %r11
-	.p2align 4
-L(loop):
-	VMOVU	(%rcx,%r10), %VEC(0)
-	VMOVU	(%rcx,%r9), %VEC(1)
-	VMOVU	(%rcx,%r8), %VEC(2)
-	VMOVU	(%rcx,%r11), %VEC(3)
-	VMOVA	%VEC(0), (%rcx)
-	VMOVA	%VEC(1), VEC_SIZE(%rcx)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rcx)
-	addq	$(VEC_SIZE * 4), %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(return):
-#endif
-L(return_disp8):
-	VZEROUPPER
-	ret
+
 L(less_vec):
 	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -357,18 +304,18 @@ L(between_2_3):
 	movw	%si, (%rdi)
 	ret
 
-#if VEC_SIZE > 16
-	/* Align to 16 bytes to avoid long nop between instructions.  */
-	.p2align 4
+#if defined USE_MULTIARCH && IS_IN (libc)
+L(movsb_more_2x_vec):
+	cmpq	$REP_MOVSB_THRESHOLD, %rdx
+	ja	L(movsb)
 #endif
-L(more_2x_vec_overlap):
-	/* More than 2 * VEC and there is overlap bewteen destination
+L(more_2x_vec):
+	/* More than 2 * VEC and there may be overlap bewteen destination
 	   and source.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jb	L(last_4x_vec)
-L(between_4x_vec_and_8x_vec):
 	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
@@ -400,84 +347,169 @@ L(last_4x_vec):
 	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VZEROUPPER
 	ret
-L(between_0_and_4x_vec):
-	/* Copy from 0 to 4 * VEC. */
-	cmpl	$(VEC_SIZE * 2), %edx
-	jae	L(last_4x_vec)
-	/* Copy from 0 to 2 * VEC. */
-	cmpl	$VEC_SIZE, %edx
-	jae	L(last_2x_vec)
-	/* Copy from 0 to VEC. */
-	VZEROUPPER
-	jmp	L(less_vec)
+
 L(more_8x_vec):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
-
-	.p2align 4
-L(loop_8x_vec_forward):
-	/* Copy 8 * VEC a time forward.  */
+	/* Source == destination is less common.  */
+	je	L(nop)
+	/* Load the first VEC and last 4 * VEC to support overlapping
+	   addresses.  */
+	VMOVU	(%rsi), %VEC(4)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+	/* Save start and stop of the destination buffer.  */
+	movq	%rdi, %r11
+	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
+	/* Align destination for aligned stores in the loop.  Compute
+	   how much destination is misaligned.  */
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE - 1), %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$VEC_SIZE, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+	/* Check non-temporal store threshold.  */
+	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	ja	L(loop_large_forward)
+L(loop_4x_vec_forward):
+	/* Copy 4 * VEC a time forward.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 4)(%rsi), %VEC(4)
-	VMOVU	(VEC_SIZE * 5)(%rsi), %VEC(5)
-	VMOVU	(VEC_SIZE * 6)(%rsi), %VEC(6)
-	VMOVU	(VEC_SIZE * 7)(%rsi), %VEC(7)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), (VEC_SIZE * 4)(%rdi)
-	VMOVU	%VEC(5), (VEC_SIZE * 5)(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 6)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 7)(%rdi)
-	addq	$(VEC_SIZE * 8), %rdi
-	addq	$(VEC_SIZE * 8), %rsi
-	subq	$(VEC_SIZE * 8), %rdx
-	cmpq	$(VEC_SIZE * 8), %rdx
-	je	L(between_4x_vec_and_8x_vec)
-	ja	L(loop_8x_vec_forward)
-	/* Less than 8 * VEC to copy.  */
+	addq	$(VEC_SIZE * 4), %rsi
+	subq	$(VEC_SIZE * 4), %rdx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$(VEC_SIZE * 4), %rdi
 	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(between_0_and_4x_vec)
-	jmp	L(between_4x_vec_and_8x_vec)
+	ja	L(loop_4x_vec_forward)
+	/* Store the last 4 * VEC.  */
+	VMOVU	%VEC(5), (%rcx)
+	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	/* Store the first VEC.  */
+	VMOVU	%VEC(4), (%r11)
+	VZEROUPPER
+	ret
 
-	.p2align 4
 L(more_8x_vec_backward):
+	/* Load the first 4 * VEC and last VEC to support overlapping
+	   addresses.  */
+	VMOVU	(%rsi), %VEC(4)
+	VMOVU	VEC_SIZE(%rsi), %VEC(5)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
+	/* Save stop of the destination buffer.  */
+	leaq	-VEC_SIZE(%rdi, %rdx), %r11
+	/* Align destination end for aligned stores in the loop.  Compute
+	   how much destination end is misaligned.  */
 	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	leaq	-VEC_SIZE(%rdi, %rdx), %r9
+	movq	%r11, %r9
+	movq	%r11, %r8
+	andq	$(VEC_SIZE - 1), %r8
+	/* Adjust source.  */
+	subq	%r8, %rcx
+	/* Adjust the end of destination which should be aligned now.  */
+	subq	%r8, %r9
+	/* Adjust length.  */
+	subq	%r8, %rdx
+	/* Check non-temporal store threshold.  */
+	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	ja	L(loop_large_backward)
+L(loop_4x_vec_backward):
+	/* Copy 4 * VEC a time backward.  */
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+	subq	$(VEC_SIZE * 4), %rcx
+	subq	$(VEC_SIZE * 4), %rdx
+	VMOVA	%VEC(0), (%r9)
+	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
+	subq	$(VEC_SIZE * 4), %r9
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec_backward)
+	/* Store the first 4 * VEC.  */
+	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	/* Store the last VEC.  */
+	VMOVU	%VEC(8), (%r11)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(loop_large_forward):
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	addq	$PREFETCHED_LOAD_SIZE, %rsi
+	subq	$PREFETCHED_LOAD_SIZE, %rdx
+	VMOVNT	%VEC(0), (%rdi)
+	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
+	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$PREFETCHED_LOAD_SIZE, %rdi
+	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+	ja	L(loop_large_forward)
+	sfence
+	/* Store the last 4 * VEC.  */
+	VMOVU	%VEC(5), (%rcx)
+	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	/* Store the first VEC.  */
+	VMOVU	%VEC(4), (%r11)
+	VZEROUPPER
+	ret
 
 	.p2align 4
-L(loop_8x_vec_backward):
-	/* Copy 8 * VEC a time backward.  */
+L(loop_large_backward):
+	/* Copy 4 * VEC a time backward with non-temporal stores.  */
+	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
 	VMOVU	(%rcx), %VEC(0)
 	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
 	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
 	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	VMOVU	-(VEC_SIZE * 4)(%rcx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 5)(%rcx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 6)(%rcx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 7)(%rcx), %VEC(7)
-	VMOVU	%VEC(0), (%r9)
-	VMOVU	%VEC(1), -VEC_SIZE(%r9)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVU	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	VMOVU	%VEC(4), -(VEC_SIZE * 4)(%r9)
-	VMOVU	%VEC(5), -(VEC_SIZE * 5)(%r9)
-	VMOVU	%VEC(6), -(VEC_SIZE * 6)(%r9)
-	VMOVU	%VEC(7), -(VEC_SIZE * 7)(%r9)
-	subq	$(VEC_SIZE * 8), %rcx
-	subq	$(VEC_SIZE * 8), %r9
-	subq	$(VEC_SIZE * 8), %rdx
-	cmpq	$(VEC_SIZE * 8), %rdx
-	je	L(between_4x_vec_and_8x_vec)
-	ja	L(loop_8x_vec_backward)
-	/* Less than 8 * VEC to copy.  */
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(between_0_and_4x_vec)
-	jmp	L(between_4x_vec_and_8x_vec)
+	subq	$PREFETCHED_LOAD_SIZE, %rcx
+	subq	$PREFETCHED_LOAD_SIZE, %rdx
+	VMOVNT	%VEC(0), (%r9)
+	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
+	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
+	subq	$PREFETCHED_LOAD_SIZE, %r9
+	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+	ja	L(loop_large_backward)
+	sfence
+	/* Store the first 4 * VEC.  */
+	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	/* Store the last VEC.  */
+	VMOVU	%VEC(8), (%r11)
+	VZEROUPPER
+	ret
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 
 #ifdef SHARED

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]