This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch hjl/erms/2.23 created. glibc-2.23-45-g2a1cca3


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/erms/2.23 has been created
        at  2a1cca399be415d6c5a556af2018e5fb726d9a37 (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2a1cca399be415d6c5a556af2018e5fb726d9a37

commit 2a1cca399be415d6c5a556af2018e5fb726d9a37
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Apr 1 14:01:24 2016 -0700

    X86-64: Add dummy memcopy.h and wordcopy.c
    
    Since x86-64 doesn't use memory copy functions, add dummy memcopy.h and
    wordcopy.c to reduce code size.  It reduces the size of libc.so by about
    1 KB.
    
    	* sysdeps/x86_64/memcopy.h: New file.
    	* sysdeps/x86_64/wordcopy.c: Likewise.

diff --git a/sysdeps/x86_64/memcopy.h b/sysdeps/x86_64/memcopy.h
new file mode 100644
index 0000000..590b6cb
--- /dev/null
+++ b/sysdeps/x86_64/memcopy.h
@@ -0,0 +1 @@
+/* X86-64 doesn't use memory copy functions.  */
diff --git a/sysdeps/x86_64/wordcopy.c b/sysdeps/x86_64/wordcopy.c
new file mode 100644
index 0000000..590b6cb
--- /dev/null
+++ b/sysdeps/x86_64/wordcopy.c
@@ -0,0 +1 @@
+/* X86-64 doesn't use memory copy functions.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b361e72f264a06e856d97cbbf1cedbf2f7dd73bf

commit b361e72f264a06e856d97cbbf1cedbf2f7dd73bf
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Mar 31 12:46:57 2016 -0700

    X86-64: Remove previous default/SSE2/AVX2 memcpy/memmove
    
    Since the new SSE2/AVX2 memcpy/memmove are faster than the previous ones,
    we can remove the previous SSE2/AVX2 memcpy/memmove and replace them with
    the new ones.
    
    No change in IFUNC selection if SSE2 and AVX2 memcpy/memmove weren't used
    before.  If SSE2 or AVX2 memcpy/memmove were used, the new SSE2 or AVX2
    memcpy/memmove optimized with Enhanced REP MOVSB will be used for
    processors with ERMS.  The new AVX512 memcpy/memmove will be used for
    processors with AVX512 which prefer vzeroupper.
    
    Since the new SSE2 memcpy/memmove are faster than the previous default
    memcpy/memmove used in libc.a and ld.so, we also remove the previous
    default memcpy/memmove and make them the default memcpy/memmove.
    
    Together, it reduces the size of libc.so by about 6 KB and the size of
    ld.so by about 2 KB.
    
    	[BZ #19776]
    	* sysdeps/x86_64/memcpy.S: Make it dummy.
    	* sysdeps/x86_64/mempcpy.S: Likewise.
    	* sysdeps/x86_64/memmove.S: New file.
    	* sysdeps/x86_64/memmove_chk.S: Likewise.
    	* sysdeps/x86_64/multiarch/memmove.S: Likewise.
    	* sysdeps/x86_64/multiarch/memmove_chk.S: Likewise.
    	* sysdeps/x86_64/memmove.c: Removed.
    	* sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: Likewise.
    	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Likewise.
    	* sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: Likewise.
    	* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/memmove.c: Likewise.
    	* sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove
    	memcpy-sse2-unaligned, memmove-avx-unaligned,
    	memcpy-avx-unaligned and memmove-sse2-unaligned-erms.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Replace
    	__memmove_chk_avx512_unaligned_2 with
    	__memmove_chk_avx512_unaligned.  Remove
    	__memmove_chk_avx_unaligned_2.  Replace
    	__memmove_chk_sse2_unaligned_2 with
    	__memmove_chk_sse2_unaligned.  Remove __memmove_chk_sse2 and
    	__memmove_avx_unaligned_2.  Replace __memmove_avx512_unaligned_2
    	with __memmove_avx512_unaligned.  Replace
    	__memmove_sse2_unaligned_2 with __memmove_sse2_unaligned.
    	Remove __memmove_sse2.  Replace __memcpy_chk_avx512_unaligned_2
    	with __memcpy_chk_avx512_unaligned.  Remove
    	__memcpy_chk_avx_unaligned_2.  Replace
    	__memcpy_chk_sse2_unaligned_2 with __memcpy_chk_sse2_unaligned.
    	Remove __memcpy_chk_sse2.  Remove __memcpy_avx_unaligned_2.
    	Replace __memcpy_avx512_unaligned_2 with
    	__memcpy_avx512_unaligned.  Remove __memcpy_sse2_unaligned_2
    	and __memcpy_sse2.  Replace __mempcpy_chk_avx512_unaligned_2
    	with __mempcpy_chk_avx512_unaligned.  Remove
    	__mempcpy_chk_avx_unaligned_2.  Replace
    	__mempcpy_chk_sse2_unaligned_2 with
    	__mempcpy_chk_sse2_unaligned.  Remove __mempcpy_chk_sse2.
    	Replace __mempcpy_avx512_unaligned_2 with
    	__mempcpy_avx512_unaligned.  Remove __mempcpy_avx_unaligned_2.
    	Replace __mempcpy_sse2_unaligned_2 with
    	__mempcpy_sse2_unaligned.  Remove __mempcpy_sse2.
    	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Support
    	__memcpy_avx512_unaligned_erms and __memcpy_avx512_unaligned.
    	Use __memcpy_avx_unaligned_erms and __memcpy_sse2_unaligned_erms
    	if processor has ERMS.  Default to __memcpy_sse2_unaligned.
    	(ENTRY): Removed.
    	(END): Likewise.
    	(ENTRY_CHK): Likewise.
    	(libc_hidden_builtin_def): Likewise.
    	Don't include ../memcpy.S.
    	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Support
    	__memcpy_chk_avx512_unaligned_erms and
    	__memcpy_chk_avx512_unaligned.  Use
    	__memcpy_chk_avx_unaligned_erms and
    	__memcpy_chk_sse2_unaligned_erms if if processor has ERMS.
    	Default to __memcpy_chk_sse2_unaligned.
    	* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: Skip if
    	not in libc.
    	* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
    	Change function suffix from unaligned_2 to unaligned.
    	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Support
    	__mempcpy_avx512_unaligned_erms and __mempcpy_avx512_unaligned.
    	Use __mempcpy_avx_unaligned_erms and __mempcpy_sse2_unaligned_erms
    	if processor has ERMS.  Default to __mempcpy_sse2_unaligned.
    	(ENTRY): Removed.
    	(END): Likewise.
    	(ENTRY_CHK): Likewise.
    	(libc_hidden_builtin_def): Likewise.
    	Don't include ../mempcpy.S.
    	(mempcpy): New.  Add a weak alias.
    	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Support
    	__mempcpy_chk_avx512_unaligned_erms and
    	__mempcpy_chk_avx512_unaligned.  Use
    	__mempcpy_chk_avx_unaligned_erms and
    	__mempcpy_chk_sse2_unaligned_erms if if processor has ERMS.
    	Default to __mempcpy_chk_sse2_unaligned.

diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index f6e3d93..d98500a 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -1,584 +1 @@
-/*
-   Optimized memcpy for x86-64.
-
-   Copyright (C) 2007-2016 Free Software Foundation, Inc.
-   Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
-
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.
-*/
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-/* Stack slots in the red-zone. */
-
-#ifdef USE_AS_MEMPCPY
-#  define RETVAL	(0)
-#else
-#  define RETVAL	(-8)
-#  if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
-#    define memcpy	__memcpy
-#    undef libc_hidden_builtin_def
-#    define libc_hidden_builtin_def(name) \
-	.globl __GI_memcpy; __GI_memcpy = __memcpy
-#  endif
-#endif
-#define SAVE0	(RETVAL - 8)
-#define SAVE1	(SAVE0	- 8)
-#define SAVE2	(SAVE1	- 8)
-#define SAVE3	(SAVE2	- 8)
-
-        .text
-
-#if defined PIC && IS_IN (libc)
-ENTRY_CHK (__memcpy_chk)
-
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-
-END_CHK (__memcpy_chk)
-#endif
-
-ENTRY(memcpy)				/* (void *, const void*, size_t) */
-
-/* Handle tiny blocks. */
-
-L(1try):				/* up to 32B */
-	cmpq	$32, %rdx
-#ifndef USE_AS_MEMPCPY
-	movq	%rdi, %rax		/* save return value */
-#endif
-	jae	L(1after)
-
-L(1):					/* 1-byte once */
-	testb	$1, %dl
-	jz	L(1a)
-
-	movzbl	(%rsi),	%ecx
-	movb	%cl, (%rdi)
-
-	incq	%rsi
-	incq	%rdi
-
-	.p2align 4,, 4
-
-L(1a):					/* 2-byte once */
-	testb	$2, %dl
-	jz	L(1b)
-
-	movzwl	(%rsi),	%ecx
-	movw	%cx, (%rdi)
-
-	addq	$2, %rsi
-	addq	$2, %rdi
-
-	.p2align 4,, 4
-
-L(1b):					/* 4-byte once */
-	testb	$4, %dl
-	jz	L(1c)
-
-	movl	(%rsi),	%ecx
-	movl	%ecx, (%rdi)
-
-	addq	$4, %rsi
-	addq	$4, %rdi
-
-	.p2align 4,, 4
-
-L(1c):					/* 8-byte once */
-	testb	$8, %dl
-	jz	L(1d)
-
-	movq	(%rsi), %rcx
-	movq	%rcx, (%rdi)
-
-	addq	$8, %rsi
-	addq	$8, %rdi
-
-	.p2align 4,, 4
-
-L(1d):					/* 16-byte loop */
-	andl	$0xf0, %edx
-	jz	L(exit)
-
-	.p2align 4
-
-L(1loop):
-	movq	 (%rsi), %rcx
-	movq	8(%rsi), %r8
-	movq	%rcx,  (%rdi)
-	movq	 %r8, 8(%rdi)
-
-	subl	$16, %edx
-
-	leaq	16(%rsi), %rsi
-	leaq	16(%rdi), %rdi
-
-	jnz	L(1loop)
-
-	.p2align 4,, 4
-
-L(exit):				/* exit */
-#ifdef USE_AS_MEMPCPY
-	movq	%rdi, %rax		/* return value */
-#else
-	rep
-#endif
-	retq
-
-	.p2align 4
-
-L(1after):
-#ifndef USE_AS_MEMPCPY
-	movq	%rax, RETVAL(%rsp)	/* save return value */
-#endif
-
-/* Align to the natural word size. */
-
-L(aligntry):
-	movl	%esi, %ecx      	/* align by source */
-
-	andl	$7, %ecx
-	jz	L(alignafter)  		/* already aligned */
-
-L(align):		      		/* align */
-	leaq	-8(%rcx, %rdx), %rdx	/* calculate remaining bytes */
-	subl	$8, %ecx
-
-	.p2align 4
-
-L(alignloop):				/* 1-byte alignment loop */
-	movzbl	(%rsi), %eax
-	movb	%al, (%rdi)
-
-	incl	%ecx
-
-	leaq	1(%rsi), %rsi
-	leaq	1(%rdi), %rdi
-
-	jnz	L(alignloop)
-
-	.p2align 4
-
-L(alignafter):
-
-/* Handle mid-sized blocks. */
-
-L(32try):				/* up to 1KB */
-	cmpq	$1024, %rdx
-	ja	L(32after)
-
-L(32):					/* 32-byte loop */
-	movl	%edx, %ecx
-	shrl	$5, %ecx
-	jz	L(32skip)
-
-	.p2align 4
-
-L(32loop):
-	decl	%ecx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %r8
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-
-	movq	%rax,   (%rdi)
-	movq	 %r8,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-
-	leaq	32(%rsi), %rsi
-	leaq	32(%rdi), %rdi
-
-	jz	L(32skip)		/* help out smaller blocks */
-
-	decl	%ecx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %r8
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-
-	movq	%rax,   (%rdi)
-	movq	 %r8,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-
-	leaq	32(%rsi), %rsi
-	leaq	32(%rdi), %rdi
-
-	jnz	L(32loop)
-
-	.p2align 4
-
-L(32skip):
-	andl	$31, %edx		/* check for left overs */
-#ifdef USE_AS_MEMPCPY
-	jnz	L(1)
-
-	movq	%rdi, %rax
-#else
-	movq	RETVAL(%rsp), %rax
-	jnz	L(1)
-
-	rep
-#endif
-	retq				/* exit */
-
-	.p2align 4
-
-L(32after):
-
-/*
-	In order to minimize code-size in RTLD, algorithms specific for
-	larger blocks are excluded when building for RTLD.
-*/
-
-/* Handle blocks smaller than 1/2 L1. */
-
-L(fasttry):				/* first 1/2 L1 */
-#if IS_IN (libc)			/* only up to this algorithm outside of libc.so */
-	mov	__x86_data_cache_size_half(%rip), %R11_LP
-	cmpq	%rdx, %r11		/* calculate the smaller of */
-	cmovaq	%rdx, %r11		/* remaining bytes and 1/2 L1 */
-#endif
-
-L(fast):				/* good ol' MOVS */
-#if IS_IN (libc)
-	movq	%r11, %rcx
-	andq	$-8, %r11
-#else
-	movq	%rdx, %rcx
-#endif
-	shrq	$3, %rcx
-	jz	L(fastskip)
-
-	rep
-	movsq
-
-	.p2align 4,, 4
-
-L(fastskip):
-#if IS_IN (libc)
-	subq	%r11, %rdx		/* check for more */
-	testq	$-8, %rdx
-	jnz	L(fastafter)
-#endif
-
-	andl	$7, %edx		/* check for left overs */
-#ifdef USE_AS_MEMPCPY
-	jnz	L(1)
-
-	movq	%rdi, %rax
-#else
-	movq	RETVAL(%rsp), %rax
-	jnz	L(1)
-
-	rep
-#endif
-	retq				/* exit */
-
-#if IS_IN (libc)			/* none of the algorithms below for RTLD */
-
-	.p2align 4
-
-L(fastafter):
-
-/* Handle large blocks smaller than 1/2 L2. */
-
-L(pretry):				/* first 1/2 L2 */
-	mov	__x86_shared_cache_size_half (%rip), %R8_LP
-	cmpq	%rdx, %r8		/* calculate the lesser of */
-	cmovaq	%rdx, %r8		/* remaining bytes and 1/2 L2 */
-
-L(pre):					/* 64-byte with prefetching */
-	movq	%r8, %rcx
-	andq	$-64, %r8
-	shrq	$6, %rcx
-	jz	L(preskip)
-
-	movq	%r14, SAVE0(%rsp)
-	cfi_rel_offset (%r14, SAVE0)
-	movq	%r13, SAVE1(%rsp)
-	cfi_rel_offset (%r13, SAVE1)
-	movq	%r12, SAVE2(%rsp)
-	cfi_rel_offset (%r12, SAVE2)
-	movq	%rbx, SAVE3(%rsp)
-	cfi_rel_offset (%rbx, SAVE3)
-
-	cmpl	$0, __x86_prefetchw(%rip)
-	jz	L(preloop)		/* check if PREFETCHW OK */
-
-	.p2align 4
-
-/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */
-
-L(prewloop):				/* cache-line in state M */
-	decq	%rcx
-
-	movq	   (%rsi), %rax
-	movq	 8 (%rsi), %rbx
-	movq	16 (%rsi), %r9
-	movq	24 (%rsi), %r10
-	movq	32 (%rsi), %r11
-	movq	40 (%rsi), %r12
-	movq	48 (%rsi), %r13
-	movq	56 (%rsi), %r14
-
-	prefetcht0	 0 + 896 (%rsi)
-	prefetcht0	64 + 896 (%rsi)
-
-	movq	%rax,   (%rdi)
-	movq	%rbx,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-	movq	%r11, 32(%rdi)
-	movq	%r12, 40(%rdi)
-	movq	%r13, 48(%rdi)
-	movq	%r14, 56(%rdi)
-
-	leaq	64(%rsi), %rsi
-	leaq	64(%rdi), %rdi
-
-	jz	L(prebail)
-
-	decq	%rcx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %rbx
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-	movq	32(%rsi), %r11
-	movq	40(%rsi), %r12
-	movq	48(%rsi), %r13
-	movq	56(%rsi), %r14
-
-	movq	%rax,   (%rdi)
-	movq	%rbx,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-	movq	%r11, 32(%rdi)
-	movq	%r12, 40(%rdi)
-	movq	%r13, 48(%rdi)
-	movq	%r14, 56(%rdi)
-
-	prefetchw	896 - 64(%rdi)
-	prefetchw	896 -  0(%rdi)
-
-	leaq	64(%rsi), %rsi
-	leaq	64(%rdi), %rdi
-
-	jnz	L(prewloop)
-	jmp	L(prebail)
-
-	.p2align 4
-
-/* ... when PREFETCHW is not available. */
-
-L(preloop):				/* cache-line in state E */
-	decq	%rcx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %rbx
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-	movq	32(%rsi), %r11
-	movq	40(%rsi), %r12
-	movq	48(%rsi), %r13
-	movq	56(%rsi), %r14
-
-	prefetcht0	896 +  0(%rsi)
-	prefetcht0	896 + 64(%rsi)
-
-	movq	%rax,   (%rdi)
-	movq	%rbx,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-	movq	%r11, 32(%rdi)
-	movq	%r12, 40(%rdi)
-	movq	%r13, 48(%rdi)
-	movq	%r14, 56(%rdi)
-
-	leaq	64 (%rsi), %rsi
-	leaq	64 (%rdi), %rdi
-
-	jz	L(prebail)
-
-	decq	%rcx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %rbx
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-	movq	32(%rsi), %r11
-	movq	40(%rsi), %r12
-	movq	48(%rsi), %r13
-	movq	56(%rsi), %r14
-
-	prefetcht0	896 - 64(%rdi)
-	prefetcht0	896 -  0(%rdi)
-
-	movq	%rax,   (%rdi)
-	movq	%rbx,  8(%rdi)
-	movq	 %r9, 16(%rdi)
-	movq	%r10, 24(%rdi)
-	movq	%r11, 32(%rdi)
-	movq	%r12, 40(%rdi)
-	movq	%r13, 48(%rdi)
-	movq	%r14, 56(%rdi)
-
-	leaq	64(%rsi), %rsi
-	leaq	64(%rdi), %rdi
-
-	jnz	L(preloop)
-
-L(prebail):
-	movq	SAVE3(%rsp), %rbx
-	cfi_restore (%rbx)
-	movq	SAVE2(%rsp), %r12
-	cfi_restore (%r12)
-	movq	SAVE1(%rsp), %r13
-	cfi_restore (%r13)
-	movq	SAVE0(%rsp), %r14
-	cfi_restore (%r14)
-
-/*       .p2align 4 */
-
-L(preskip):
-	subq	%r8, %rdx		/* check for more */
-	testq	$-64, %rdx
-	jnz	L(preafter)
-
-	andl	$63, %edx		/* check for left overs */
-#ifdef USE_AS_MEMPCPY
-	jnz	L(1)
-
-	movq	%rdi, %rax
-#else
-	movq	RETVAL(%rsp), %rax
-	jnz	L(1)
-
-	rep
-#endif
-	retq				/* exit */
-
-	.p2align 4
-
-L(preafter):
-
-/* Handle huge blocks. */
-
-L(NTtry):
-
-L(NT):					/* non-temporal 128-byte */
-	movq	%rdx, %rcx
-	shrq	$7, %rcx
-	jz	L(NTskip)
-
-	movq	%r14, SAVE0(%rsp)
-	cfi_rel_offset (%r14, SAVE0)
-	movq	%r13, SAVE1(%rsp)
-	cfi_rel_offset (%r13, SAVE1)
-	movq	%r12, SAVE2(%rsp)
-	cfi_rel_offset (%r12, SAVE2)
-
-       .p2align 4
-
-L(NTloop):
-	prefetchnta	768(%rsi)
-	prefetchnta	832(%rsi)
-
-	decq	%rcx
-
-	movq	  (%rsi), %rax
-	movq	 8(%rsi), %r8
-	movq	16(%rsi), %r9
-	movq	24(%rsi), %r10
-	movq	32(%rsi), %r11
-	movq	40(%rsi), %r12
-	movq	48(%rsi), %r13
-	movq	56(%rsi), %r14
-
-	movntiq	%rax,   (%rdi)
-	movntiq	 %r8,  8(%rdi)
-	movntiq	 %r9, 16(%rdi)
-	movntiq	%r10, 24(%rdi)
-	movntiq	%r11, 32(%rdi)
-	movntiq	%r12, 40(%rdi)
-	movntiq	%r13, 48(%rdi)
-	movntiq	%r14, 56(%rdi)
-
-	movq	 64(%rsi), %rax
-	movq	 72(%rsi), %r8
-	movq	 80(%rsi), %r9
-	movq	 88(%rsi), %r10
-	movq	 96(%rsi), %r11
-	movq	104(%rsi), %r12
-	movq	112(%rsi), %r13
-	movq	120(%rsi), %r14
-
-	movntiq	%rax,  64(%rdi)
-	movntiq	 %r8,  72(%rdi)
-	movntiq	 %r9,  80(%rdi)
-	movntiq	%r10,  88(%rdi)
-	movntiq	%r11,  96(%rdi)
-	movntiq	%r12, 104(%rdi)
-	movntiq	%r13, 112(%rdi)
-	movntiq	%r14, 120(%rdi)
-
-	leaq	128(%rsi), %rsi
-	leaq	128(%rdi), %rdi
-
-	jnz	L(NTloop)
-
-	sfence				/* serialize memory stores */
-
-	movq	SAVE2(%rsp), %r12
-	cfi_restore (%r12)
-	movq	SAVE1(%rsp), %r13
-	cfi_restore (%r13)
-	movq	SAVE0(%rsp), %r14
-	cfi_restore (%r14)
-
-L(NTskip):
-	andl	$127, %edx		/* check for left overs */
-#ifdef USE_AS_MEMPCPY
-	jnz	L(1)
-
-	movq	%rdi, %rax
-#else
-	movq	RETVAL(%rsp), %rax
-	jnz	L(1)
-
-	rep
-#endif
-	retq				/* exit */
-
-#endif /* IS_IN (libc) */
-
-END(memcpy)
-
-#ifndef USE_AS_MEMPCPY
-libc_hidden_builtin_def (memcpy)
-# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
-#  undef memcpy
-#  include <shlib-compat.h>
-versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
-# endif
-#endif
+/* Implemented in memcpy.S.  */
diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
new file mode 100644
index 0000000..a7ae453
--- /dev/null
+++ b/sysdeps/x86_64/memmove.S
@@ -0,0 +1,71 @@
+/* Optimized memmove for x86-64.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#define VEC_SIZE	16
+#define VEC(i)		xmm##i
+#define PREFETCHNT	prefetchnta
+#define VMOVNT		movntdq
+/* Use movups and movaps for smaller code sizes.  */
+#define VMOVU		movups
+#define VMOVA		movaps
+
+#define SECTION(p)		p
+
+#ifdef USE_MULTIARCH
+# if !defined SHARED || !IS_IN (libc)
+#  define MEMCPY_SYMBOL(p,s)		memcpy
+# endif
+#else
+# if defined SHARED && IS_IN (libc)
+#  define MEMCPY_SYMBOL(p,s)		__memcpy
+# else
+#  define MEMCPY_SYMBOL(p,s)		memcpy
+# endif
+#endif
+#if !defined SHARED || !defined USE_MULTIARCH || !IS_IN (libc)
+# define MEMPCPY_SYMBOL(p,s)		__mempcpy
+#endif
+#ifndef MEMMOVE_SYMBOL
+# define MEMMOVE_CHK_SYMBOL(p,s)	p
+# define MEMMOVE_SYMBOL(p,s)		memmove
+#endif
+
+#include "multiarch/memmove-vec-unaligned-erms.S"
+
+#ifndef USE_MULTIARCH
+libc_hidden_builtin_def (memmove)
+# if defined SHARED && IS_IN (libc)
+strong_alias (memmove, __memcpy)
+libc_hidden_ver (memmove, memcpy)
+# endif
+libc_hidden_def (__mempcpy)
+weak_alias (__mempcpy, mempcpy)
+libc_hidden_builtin_def (mempcpy)
+
+# if defined SHARED && IS_IN (libc)
+#  undef memcpy
+#  include <shlib-compat.h>
+versioned_symbol (libc, __memcpy, memcpy, GLIBC_2_14);
+
+#  if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
+#  endif
+# endif
+#endif
diff --git a/sysdeps/x86_64/memmove.c b/sysdeps/x86_64/memmove_chk.S
similarity index 64%
rename from sysdeps/x86_64/memmove.c
rename to sysdeps/x86_64/memmove_chk.S
index 07f8185..ee154f1 100644
--- a/sysdeps/x86_64/memmove.c
+++ b/sysdeps/x86_64/memmove_chk.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2011-2016 Free Software Foundation, Inc.
+/* Checking memmove for x86-64.
+   Copyright (C) 2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -15,12 +16,18 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include "string/memmove.c"
+#include <sysdep.h>
+#include "asm-syntax.h"
 
-#if !defined memmove && IS_IN (libc)
-#include <shlib-compat.h>
-
-#if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
-compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
-#endif
+#ifndef SHARED
+	/* For libc.so this is defined in memmove.S.
+	   For libc.a, this is a separate source to avoid
+	   memmove bringing in __chk_fail and all routines
+	   it calls.  */
+        .text
+ENTRY (__memmove_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	memmove
+END (__memmove_chk)
 #endif
diff --git a/sysdeps/x86_64/mempcpy.S b/sysdeps/x86_64/mempcpy.S
index acee5e5..d98500a 100644
--- a/sysdeps/x86_64/mempcpy.S
+++ b/sysdeps/x86_64/mempcpy.S
@@ -1,8 +1 @@
-#define USE_AS_MEMPCPY
-#define memcpy __mempcpy
-#define __memcpy_chk __mempcpy_chk
-#include <sysdeps/x86_64/memcpy.S>
-
-libc_hidden_def (__mempcpy)
-weak_alias (__mempcpy, mempcpy)
-libc_hidden_builtin_def (mempcpy)
+/* Implemented in memcpy.S.  */
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 35752a8..d6636cb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,10 +7,9 @@ ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
-		   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
+		   memcmp-sse4 memcpy-ssse3 \
 		   memmove-ssse3 \
-		   memcpy-ssse3-back memmove-avx-unaligned \
-		   memcpy-avx-unaligned \
+		   memcpy-ssse3-back \
 		   memmove-ssse3-back \
 		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
@@ -21,7 +20,6 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
 		   strcspn-c strpbrk-c strspn-c varshift \
 		   memset-avx512-no-vzeroupper \
-		   memmove-sse2-unaligned-erms \
 		   memmove-avx-unaligned-erms \
 		   memmove-avx512-unaligned-erms \
 		   memset-avx2-unaligned-erms \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index ca05ff6..449b046 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -54,7 +54,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __memmove_chk_avx512_unaligned_2)
+			      __memmove_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memmove_chk_avx512_unaligned_erms)
@@ -64,9 +64,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __memmove_chk_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_chk_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_CPU_FEATURE (SSSE3),
@@ -75,11 +72,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
-			      __memmove_chk_sse2_unaligned_2)
+			      __memmove_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
-			      __memmove_chk_sse2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
-			      __memmove_chk_sse2))
+			      __memmove_chk_sse2_unaligned_erms))
 
   /* Support sysdeps/x86_64/multiarch/memmove.S.  */
   IFUNC_IMPL (i, name, memmove,
@@ -88,9 +83,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __memmove_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, memmove,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_avx_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	      IFUNC_IMPL_ADD (array, i, memmove,
@@ -98,7 +90,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __memmove_avx512_unaligned_2)
+			      __memmove_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memmove_avx512_unaligned_erms)
@@ -109,10 +101,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1,
-			      __memmove_sse2_unaligned_2)
+			      __memmove_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1,
-			      __memmove_sse2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
+			      __memmove_sse2_unaligned_erms))
 
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
@@ -326,7 +317,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __memcpy_chk_avx512_unaligned_2)
+			      __memcpy_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memcpy_chk_avx512_unaligned_erms)
@@ -336,9 +327,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __memcpy_chk_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_chk_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
@@ -347,11 +335,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
-			      __memcpy_chk_sse2_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
-			      __memcpy_chk_sse2_unaligned_erms)
+			      __memcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
-			      __memcpy_chk_sse2))
+			      __memcpy_chk_sse2_unaligned_erms))
 
   /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
   IFUNC_IMPL (i, name, memcpy,
@@ -360,9 +346,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __memcpy_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, memcpy,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_ssse3_back)
@@ -374,18 +357,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcpy_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __memcpy_avx512_unaligned_2)
+			      __memcpy_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memcpy_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
-			      __memcpy_sse2_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
 			      __memcpy_sse2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
-	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
   IFUNC_IMPL (i, name, __mempcpy_chk,
@@ -395,7 +375,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __mempcpy_chk_avx512_unaligned_2)
+			      __mempcpy_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __mempcpy_chk_avx512_unaligned_erms)
@@ -405,9 +385,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __mempcpy_chk_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_chk_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
@@ -416,11 +393,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
-			      __mempcpy_chk_sse2_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
-			      __mempcpy_chk_sse2_unaligned_erms)
+			      __mempcpy_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
-			      __mempcpy_chk_sse2))
+			      __mempcpy_chk_sse2_unaligned_erms))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
   IFUNC_IMPL (i, name, mempcpy,
@@ -430,7 +405,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
-			      __mempcpy_avx512_unaligned_2)
+			      __mempcpy_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __mempcpy_avx512_unaligned_erms)
@@ -440,20 +415,16 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX_Usable),
-			      __mempcpy_avx_unaligned_2)
-	      IFUNC_IMPL_ADD (array, i, mempcpy,
-			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
-			      __mempcpy_sse2_unaligned_2)
+			      __mempcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
 			      __mempcpy_sse2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
-	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms))
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
deleted file mode 100644
index dd4187f..0000000
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ /dev/null
@@ -1,391 +0,0 @@
-/* memcpy with AVX
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#if IS_IN (libc) \
-    && (defined SHARED \
-        || defined USE_AS_MEMMOVE \
-	|| !defined USE_MULTIARCH)
-
-#include "asm-syntax.h"
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_avx_unaligned
-# define MEMCPY_CHK	__memcpy_chk_avx_unaligned
-# define MEMPCPY	__mempcpy_avx_unaligned
-# define MEMPCPY_CHK	__mempcpy_chk_avx_unaligned
-#endif
-
-	.section .text.avx,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
-	movq	%rdi, %rax
-	addq	%rdx, %rax
-	jmp	L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
-	mov	%rdi, %rax
-#ifdef USE_AS_MEMPCPY
-	add	%rdx, %rax
-#endif
-L(start):
-	cmp	$256, %rdx
-	jae	L(256bytesormore)
-	cmp	$16, %dl
-	jb	L(less_16bytes)
-	cmp	$128, %dl
-	jb	L(less_128bytes)
-	vmovdqu (%rsi), %xmm0
-	lea	(%rsi, %rdx), %rcx
-	vmovdqu 0x10(%rsi), %xmm1
-	vmovdqu 0x20(%rsi), %xmm2
-	vmovdqu 0x30(%rsi), %xmm3
-	vmovdqu 0x40(%rsi), %xmm4
-	vmovdqu 0x50(%rsi), %xmm5
-	vmovdqu 0x60(%rsi), %xmm6
-	vmovdqu 0x70(%rsi), %xmm7
-	vmovdqu -0x80(%rcx), %xmm8
-	vmovdqu -0x70(%rcx), %xmm9
-	vmovdqu -0x60(%rcx), %xmm10
-	vmovdqu -0x50(%rcx), %xmm11
-	vmovdqu -0x40(%rcx), %xmm12
-	vmovdqu -0x30(%rcx), %xmm13
-	vmovdqu -0x20(%rcx), %xmm14
-	vmovdqu -0x10(%rcx), %xmm15
-	lea	(%rdi, %rdx), %rdx
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm1, 0x10(%rdi)
-	vmovdqu %xmm2, 0x20(%rdi)
-	vmovdqu %xmm3, 0x30(%rdi)
-	vmovdqu %xmm4, 0x40(%rdi)
-	vmovdqu %xmm5, 0x50(%rdi)
-	vmovdqu %xmm6, 0x60(%rdi)
-	vmovdqu %xmm7, 0x70(%rdi)
-	vmovdqu %xmm8, -0x80(%rdx)
-	vmovdqu %xmm9, -0x70(%rdx)
-	vmovdqu %xmm10, -0x60(%rdx)
-	vmovdqu %xmm11, -0x50(%rdx)
-	vmovdqu %xmm12, -0x40(%rdx)
-	vmovdqu %xmm13, -0x30(%rdx)
-	vmovdqu %xmm14, -0x20(%rdx)
-	vmovdqu %xmm15, -0x10(%rdx)
-	ret
-	.p2align 4
-L(less_128bytes):
-	cmp	$64, %dl
-	jb	L(less_64bytes)
-	vmovdqu (%rsi), %xmm0
-	lea	(%rsi, %rdx), %rcx
-	vmovdqu 0x10(%rsi), %xmm1
-	vmovdqu 0x20(%rsi), %xmm2
-	lea	(%rdi, %rdx), %rdx
-	vmovdqu 0x30(%rsi), %xmm3
-	vmovdqu -0x40(%rcx), %xmm4
-	vmovdqu -0x30(%rcx), %xmm5
-	vmovdqu -0x20(%rcx), %xmm6
-	vmovdqu -0x10(%rcx), %xmm7
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm1, 0x10(%rdi)
-	vmovdqu %xmm2, 0x20(%rdi)
-	vmovdqu %xmm3, 0x30(%rdi)
-	vmovdqu %xmm4, -0x40(%rdx)
-	vmovdqu %xmm5, -0x30(%rdx)
-	vmovdqu %xmm6, -0x20(%rdx)
-	vmovdqu %xmm7, -0x10(%rdx)
-	ret
-
-	.p2align 4
-L(less_64bytes):
-	cmp	$32, %dl
-	jb	L(less_32bytes)
-	vmovdqu (%rsi), %xmm0
-	vmovdqu 0x10(%rsi), %xmm1
-	vmovdqu -0x20(%rsi, %rdx), %xmm6
-	vmovdqu -0x10(%rsi, %rdx), %xmm7
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm1, 0x10(%rdi)
-	vmovdqu %xmm6, -0x20(%rdi, %rdx)
-	vmovdqu %xmm7, -0x10(%rdi, %rdx)
-	ret
-
-	.p2align 4
-L(less_32bytes):
-	vmovdqu (%rsi), %xmm0
-	vmovdqu -0x10(%rsi, %rdx), %xmm7
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm7, -0x10(%rdi, %rdx)
-	ret
-
-	.p2align 4
-L(less_16bytes):
-	cmp	$8, %dl
-	jb	L(less_8bytes)
-	movq -0x08(%rsi, %rdx),	%rcx
-	movq (%rsi),	%rsi
-	movq %rsi, (%rdi)
-	movq %rcx, -0x08(%rdi, %rdx)
-	ret
-
-	.p2align 4
-L(less_8bytes):
-	cmp	$4, %dl
-	jb	L(less_4bytes)
-	mov -0x04(%rsi, %rdx), %ecx
-	mov (%rsi),	%esi
-	mov %esi, (%rdi)
-	mov %ecx, -0x04(%rdi, %rdx)
-	ret
-
-L(less_4bytes):
-	cmp	$1, %dl
-	jbe	L(less_2bytes)
-	mov -0x02(%rsi, %rdx),	%cx
-	mov (%rsi),	%si
-	mov %si, (%rdi)
-	mov %cx, -0x02(%rdi, %rdx)
-	ret
-
-L(less_2bytes):
-	jb	L(less_0bytes)
-	mov	(%rsi), %cl
-	mov	%cl,	(%rdi)
-L(less_0bytes):
-	ret
-
-	.p2align 4
-L(256bytesormore):
-#ifdef USE_AS_MEMMOVE
-	mov	%rdi, %rcx
-	sub	%rsi, %rcx
-	cmp	%rdx, %rcx
-	jc	L(copy_backward)
-#endif
-	cmp	$2048, %rdx
-	jae	L(gobble_data_movsb)
-	mov	%rax, %r8
-	lea	(%rsi, %rdx), %rcx
-	mov	%rdi, %r10
-	vmovdqu -0x80(%rcx), %xmm5
-	vmovdqu -0x70(%rcx), %xmm6
-	mov	$0x80, %rax
-	and	$-32, %rdi
-	add	$32, %rdi
-	vmovdqu -0x60(%rcx), %xmm7
-	vmovdqu -0x50(%rcx), %xmm8
-	mov	%rdi, %r11
-	sub	%r10, %r11
-	vmovdqu -0x40(%rcx), %xmm9
-	vmovdqu -0x30(%rcx), %xmm10
-	sub	%r11, %rdx
-	vmovdqu -0x20(%rcx), %xmm11
-	vmovdqu -0x10(%rcx), %xmm12
-	vmovdqu	(%rsi), %ymm4
-	add	%r11, %rsi
-	sub	%eax, %edx
-L(goble_128_loop):
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 0x20(%rsi), %ymm1
-	vmovdqu 0x40(%rsi), %ymm2
-	vmovdqu 0x60(%rsi), %ymm3
-	add	%rax, %rsi
-	vmovdqa %ymm0, (%rdi)
-	vmovdqa %ymm1, 0x20(%rdi)
-	vmovdqa %ymm2, 0x40(%rdi)
-	vmovdqa %ymm3, 0x60(%rdi)
-	add	%rax, %rdi
-	sub	%eax, %edx
-	jae	L(goble_128_loop)
-	add	%eax, %edx
-	add	%rdi, %rdx
-	vmovdqu	%ymm4, (%r10)
-	vzeroupper
-	vmovdqu %xmm5, -0x80(%rdx)
-	vmovdqu %xmm6, -0x70(%rdx)
-	vmovdqu %xmm7, -0x60(%rdx)
-	vmovdqu %xmm8, -0x50(%rdx)
-	vmovdqu %xmm9, -0x40(%rdx)
-	vmovdqu %xmm10, -0x30(%rdx)
-	vmovdqu %xmm11, -0x20(%rdx)
-	vmovdqu %xmm12, -0x10(%rdx)
-	mov	%r8, %rax
-	ret
-
-	.p2align 4
-L(gobble_data_movsb):
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %rcx
-#else
-	mov	__x86_shared_cache_size_half(%rip), %rcx
-#endif
-	shl	$3, %rcx
-	cmp	%rcx, %rdx
-	jae	L(gobble_big_data_fwd)
-	mov	%rdx, %rcx
-	rep	movsb
-	ret
-
-	.p2align 4
-L(gobble_big_data_fwd):
-	lea	(%rsi, %rdx), %rcx
-	vmovdqu	(%rsi), %ymm4
-	vmovdqu -0x80(%rsi,%rdx), %xmm5
-	vmovdqu -0x70(%rcx), %xmm6
-	vmovdqu -0x60(%rcx), %xmm7
-	vmovdqu -0x50(%rcx), %xmm8
-	vmovdqu -0x40(%rcx), %xmm9
-	vmovdqu -0x30(%rcx), %xmm10
-	vmovdqu -0x20(%rcx), %xmm11
-	vmovdqu -0x10(%rcx), %xmm12
-	mov	%rdi, %r8
-	and	$-32, %rdi
-	add	$32, %rdi
-	mov	%rdi, %r10
-	sub	%r8, %r10
-	sub	%r10, %rdx
-	add	%r10, %rsi
-	lea	(%rdi, %rdx), %rcx
-	add	$-0x80, %rdx
-L(gobble_mem_fwd_loop):
-	prefetchnta 0x1c0(%rsi)
-	prefetchnta 0x280(%rsi)
-	vmovdqu	(%rsi), %ymm0
-	vmovdqu	0x20(%rsi), %ymm1
-	vmovdqu	0x40(%rsi), %ymm2
-	vmovdqu	0x60(%rsi), %ymm3
-	sub	$-0x80, %rsi
-	vmovntdq	%ymm0, (%rdi)
-	vmovntdq	%ymm1, 0x20(%rdi)
-	vmovntdq	%ymm2, 0x40(%rdi)
-	vmovntdq	%ymm3, 0x60(%rdi)
-	sub	$-0x80, %rdi
-	add	$-0x80, %rdx
-	jb	L(gobble_mem_fwd_loop)
-	sfence
-	vmovdqu	%ymm4, (%r8)
-	vzeroupper
-	vmovdqu %xmm5, -0x80(%rcx)
-	vmovdqu %xmm6, -0x70(%rcx)
-	vmovdqu %xmm7, -0x60(%rcx)
-	vmovdqu %xmm8, -0x50(%rcx)
-	vmovdqu %xmm9, -0x40(%rcx)
-	vmovdqu %xmm10, -0x30(%rcx)
-	vmovdqu %xmm11, -0x20(%rcx)
-	vmovdqu %xmm12, -0x10(%rcx)
-	ret
-
-#ifdef USE_AS_MEMMOVE
-	.p2align 4
-L(copy_backward):
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %rcx
-#else
-	mov	__x86_shared_cache_size_half(%rip), %rcx
-#endif
-	shl	$3, %rcx
-	vmovdqu (%rsi), %xmm5
-	vmovdqu 0x10(%rsi), %xmm6
-	add	%rdx, %rdi
-	vmovdqu 0x20(%rsi), %xmm7
-	vmovdqu 0x30(%rsi), %xmm8
-	lea	-0x20(%rdi), %r10
-	mov %rdi, %r11
-	vmovdqu 0x40(%rsi), %xmm9
-	vmovdqu 0x50(%rsi), %xmm10
-	and	$0x1f, %r11
-	vmovdqu 0x60(%rsi), %xmm11
-	vmovdqu 0x70(%rsi), %xmm12
-	xor	%r11, %rdi
-	add	%rdx, %rsi
-	vmovdqu	-0x20(%rsi), %ymm4
-	sub	%r11, %rsi
-	sub	%r11, %rdx
-	cmp	%rcx, %rdx
-	ja	L(gobble_big_data_bwd)
-	add	$-0x80, %rdx
-L(gobble_mem_bwd_llc):
-	vmovdqu	-0x20(%rsi), %ymm0
-	vmovdqu	-0x40(%rsi), %ymm1
-	vmovdqu	-0x60(%rsi), %ymm2
-	vmovdqu	-0x80(%rsi), %ymm3
-	lea	-0x80(%rsi), %rsi
-	vmovdqa	%ymm0, -0x20(%rdi)
-	vmovdqa	%ymm1, -0x40(%rdi)
-	vmovdqa	%ymm2, -0x60(%rdi)
-	vmovdqa	%ymm3, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-	add	$-0x80, %rdx
-	jb	L(gobble_mem_bwd_llc)
-	vmovdqu	%ymm4, (%r10)
-	vzeroupper
-	vmovdqu %xmm5, (%rax)
-	vmovdqu %xmm6, 0x10(%rax)
-	vmovdqu %xmm7, 0x20(%rax)
-	vmovdqu %xmm8, 0x30(%rax)
-	vmovdqu %xmm9, 0x40(%rax)
-	vmovdqu %xmm10, 0x50(%rax)
-	vmovdqu %xmm11, 0x60(%rax)
-	vmovdqu %xmm12, 0x70(%rax)
-	ret
-
-	.p2align 4
-L(gobble_big_data_bwd):
-	add	$-0x80, %rdx
-L(gobble_mem_bwd_loop):
-	prefetchnta -0x1c0(%rsi)
-	prefetchnta -0x280(%rsi)
-	vmovdqu	-0x20(%rsi), %ymm0
-	vmovdqu	-0x40(%rsi), %ymm1
-	vmovdqu	-0x60(%rsi), %ymm2
-	vmovdqu	-0x80(%rsi), %ymm3
-	lea	-0x80(%rsi), %rsi
-	vmovntdq	%ymm0, -0x20(%rdi)
-	vmovntdq	%ymm1, -0x40(%rdi)
-	vmovntdq	%ymm2, -0x60(%rdi)
-	vmovntdq	%ymm3, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-	add	$-0x80, %rdx
-	jb	L(gobble_mem_bwd_loop)
-	sfence
-	vmovdqu	%ymm4, (%r10)
-	vzeroupper
-	vmovdqu %xmm5, (%rax)
-	vmovdqu %xmm6, 0x10(%rax)
-	vmovdqu %xmm7, 0x20(%rax)
-	vmovdqu %xmm8, 0x30(%rax)
-	vmovdqu %xmm9, 0x40(%rax)
-	vmovdqu %xmm10, 0x50(%rax)
-	vmovdqu %xmm11, 0x60(%rax)
-	vmovdqu %xmm12, 0x70(%rax)
-	ret
-#endif
-END (MEMCPY)
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
deleted file mode 100644
index c450983..0000000
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ /dev/null
@@ -1,175 +0,0 @@
-/* memcpy with unaliged loads
-   Copyright (C) 2013-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-#include <sysdep.h>
-
-#include "asm-syntax.h"
-
-
-ENTRY(__memcpy_sse2_unaligned)
-	movq	%rsi, %rax
-	leaq	(%rdx,%rdx), %rcx
-	subq	%rdi, %rax
-	subq	%rdx, %rax
-	cmpq	%rcx, %rax
-	jb	L(overlapping)
-	cmpq	$16, %rdx
-	jbe	L(less_16)
-	movdqu	(%rsi), %xmm8
-	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	-16(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	ja	.L31
-L(return):
-	movq	%rdi, %rax
-	ret
-	.p2align 4,,10
-	.p2align 4
-.L31:
-	movdqu	16(%rsi), %xmm8
-	cmpq	$64, %rdx
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	-32(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	jbe	L(return)
-	movdqu	32(%rsi), %xmm8
-	cmpq	$128, %rdx
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	-48(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	48(%rsi), %xmm8
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	-64(%rsi,%rdx), %xmm8
-	movdqu	%xmm8, -64(%rdi,%rdx)
-	jbe	L(return)
-	leaq	64(%rdi), %rcx
-	addq	%rdi, %rdx
-	andq	$-64, %rdx
-	andq	$-64, %rcx
-	movq	%rcx, %rax
-	subq	%rdi, %rax
-	addq	%rax, %rsi
-	cmpq	%rdx, %rcx
-	je	L(return)
-	movq	%rsi, %r10
-	subq	%rcx, %r10
-	leaq	16(%r10), %r9
-	leaq	32(%r10), %r8
-	leaq	48(%r10), %rax
-	.p2align 4,,10
-	.p2align 4
-L(loop):
-	movdqu	(%rcx,%r10), %xmm8
-	movdqa	%xmm8, (%rcx)
-	movdqu	(%rcx,%r9), %xmm8
-	movdqa	%xmm8, 16(%rcx)
-	movdqu	(%rcx,%r8), %xmm8
-	movdqa	%xmm8, 32(%rcx)
-	movdqu	(%rcx,%rax), %xmm8
-	movdqa	%xmm8, 48(%rcx)
-	addq	$64, %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
-	jmp	L(return)
-L(overlapping):
-	cmpq	%rsi, %rdi
-	jae	.L3
-	testq	%rdx, %rdx
-	.p2align 4,,5
-	je	L(return)
-	movq	%rdx, %r9
-	leaq	16(%rsi), %rcx
-	leaq	16(%rdi), %r8
-	shrq	$4, %r9
-	movq	%r9, %rax
-	salq	$4, %rax
-	cmpq	%rcx, %rdi
-	setae	%cl
-	cmpq	%r8, %rsi
-	setae	%r8b
-	orl	%r8d, %ecx
-	cmpq	$15, %rdx
-	seta	%r8b
-	testb	%r8b, %cl
-	je	.L16
-	testq	%rax, %rax
-	je	.L16
-	xorl	%ecx, %ecx
-	xorl	%r8d, %r8d
-.L7:
-	movdqu	(%rsi,%rcx), %xmm8
-	addq	$1, %r8
-	movdqu	%xmm8, (%rdi,%rcx)
-	addq	$16, %rcx
-	cmpq	%r8, %r9
-	ja	.L7
-	cmpq	%rax, %rdx
-	je	L(return)
-.L21:
-	movzbl	(%rsi,%rax), %ecx
-	movb	%cl, (%rdi,%rax)
-	addq	$1, %rax
-	cmpq	%rax, %rdx
-	ja	.L21
-	jmp	L(return)
-L(less_16):
-	testb	$24, %dl
-	jne	L(between_9_16)
-	testb	$4, %dl
-	.p2align 4,,5
-	jne	L(between_5_8)
-	testq	%rdx, %rdx
-	.p2align 4,,2
-	je	L(return)
-	movzbl	(%rsi), %eax
-	testb	$2, %dl
-	movb	%al, (%rdi)
-	je	L(return)
-	movzwl	-2(%rsi,%rdx), %eax
-	movw	%ax, -2(%rdi,%rdx)
-	jmp	L(return)
-.L3:
-	leaq	-1(%rdx), %rax
-	.p2align 4,,10
-	.p2align 4
-.L11:
-	movzbl	(%rsi,%rax), %edx
-	movb	%dl, (%rdi,%rax)
-	subq	$1, %rax
-	jmp	.L11
-L(between_9_16):
-	movq	(%rsi), %rax
-	movq	%rax, (%rdi)
-	movq	-8(%rsi,%rdx), %rax
-	movq	%rax, -8(%rdi,%rdx)
-	jmp	L(return)
-.L16:
-	xorl	%eax, %eax
-	jmp	.L21
-L(between_5_8):
-	movl	(%rsi), %eax
-	movl	%eax, (%rdi)
-	movl	-4(%rsi,%rdx), %eax
-	movl	%eax, -4(%rdi,%rdx)
-	jmp	L(return)
-END(__memcpy_sse2_unaligned)
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 5b045d7..f6771a4 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -19,7 +19,6 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <shlib-compat.h>
 #include <init-arch.h>
 
 /* Define multiple versions only for the definition in lib and for
@@ -30,21 +29,34 @@
 ENTRY(__new_memcpy)
 	.type	__new_memcpy, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
+	lea	__memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	1f
-	lea    __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
+	jnz	2f
+	lea	__memcpy_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memcpy_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
+# endif
 1:	lea	__memcpy_avx_unaligned(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jnz	2f
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memcpy_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
 	lea	__memcpy_sse2_unaligned(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
-	jnz	2f
-	lea	__memcpy_sse2(%rip), %RAX_LP
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memcpy_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
 	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
 	lea    __memcpy_ssse3_back(%rip), %RAX_LP
@@ -54,37 +66,7 @@ ENTRY(__new_memcpy)
 2:	ret
 END(__new_memcpy)
 
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memcpy_sse2, @function; \
-	.globl __memcpy_sse2; \
-	.hidden __memcpy_sse2; \
-	.p2align 4; \
-	__memcpy_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __memcpy_sse2, .-__memcpy_sse2
-
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
-	.type __memcpy_chk_sse2, @function; \
-	.globl __memcpy_chk_sse2; \
-	.p2align 4; \
-	__memcpy_chk_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
-	cfi_endproc; .size __memcpy_chk_sse2, .-__memcpy_chk_sse2
-
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memcpy calls through a PLT.
-   The speedup we get from using SSSE3 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_memcpy; __GI_memcpy = __memcpy_sse2
-
+# undef memcpy
+# include <shlib-compat.h>
 versioned_symbol (libc, __new_memcpy, memcpy, GLIBC_2_14);
 #endif
-
-#include "../memcpy.S"
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 648217e..11f1310 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -30,24 +30,40 @@
 ENTRY(__memcpy_chk)
 	.type	__memcpy_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
-	jz      1f
+	jz	1f
+	lea	__memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz      1f
-	leaq    __memcpy_chk_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__memcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memcpy_chk_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
-1:	leaq	__memcpy_chk_sse2(%rip), %rax
+# endif
+1:	lea	__memcpy_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memcpy_chk_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__memcpy_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
 	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__memcpy_chk_ssse3(%rip), %rax
+	lea    __memcpy_chk_ssse3_back(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Fast_Copy_Backward)
-	jz	2f
-	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz  2f
-	leaq    __memcpy_chk_avx_unaligned(%rip), %rax
+	jnz	2f
+	lea	__memcpy_chk_ssse3(%rip), %RAX_LP
 2:	ret
 END(__memcpy_chk)
 # else
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
deleted file mode 100644
index 75e35f2..0000000
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* memmove with AVX
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_avx_unaligned
-#define MEMCPY_CHK	__memmove_chk_avx_unaligned
-#include "memcpy-avx-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
deleted file mode 100644
index d7edb18..0000000
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ /dev/null
@@ -1,13 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE	16
-# define VEC(i)		xmm##i
-# define VMOVNT		movntdq
-/* Use movups and movaps for smaller code sizes.  */
-# define VMOVU		movups
-# define VMOVA		movaps
-
-# define SECTION(p)		p
-# define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
-
-# include "memmove-vec-unaligned-erms.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 51e562b..689f5fd 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -105,28 +105,28 @@
 
 	.section SECTION(.text),"ax",@progbits
 #if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
 #endif
 
 #if VEC_SIZE == 16 || defined SHARED
-ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
+ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
 	movq	%rdi, %rax
 	addq	%rdx, %rax
 	jmp	L(start)
-END (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
+END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
 #endif
 
 #if defined SHARED && IS_IN (libc)
-ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 #endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
 	movq	%rdi, %rax
 L(start):
 	cmpq	$VEC_SIZE, %rdx
@@ -147,7 +147,7 @@ L(nop):
 #endif
 	ret
 #if defined USE_MULTIARCH && IS_IN (libc)
-END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+END (MEMMOVE_SYMBOL (__memmove, unaligned))
 
 # if VEC_SIZE == 16 && defined SHARED
 /* Only used to measure performance of REP MOVSB.  */
@@ -526,11 +526,11 @@ strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
 strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
 	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
 #  endif
-strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2),
-	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned_2))
+strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
+	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
 # endif
 #endif
 #if VEC_SIZE == 16 || defined SHARED
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
-	      MEMCPY_SYMBOL (__memcpy, unaligned_2))
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
+	      MEMCPY_SYMBOL (__memcpy, unaligned))
 #endif
diff --git a/sysdeps/x86_64/multiarch/memmove.S b/sysdeps/x86_64/multiarch/memmove.S
new file mode 100644
index 0000000..25c3586
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove.S
@@ -0,0 +1,98 @@
+/* Multiple versions of memmove
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in lib and for
+   DSO.  */
+#if IS_IN (libc)
+	.text
+ENTRY(__libc_memmove)
+	.type	__libc_memmove, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+# ifdef HAVE_AVX512_ASM_SUPPORT
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	lea	__memmove_avx512_no_vzeroupper(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	2f
+	lea	__memmove_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memmove_avx512_unaligned(%rip), %RAX_LP
+	ret
+# endif
+1:	lea	__memmove_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memmove_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__memmove_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
+	jz	2f
+	lea	__memmove_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	lea    __memmove_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__memmove_ssse3(%rip), %RAX_LP
+2:	ret
+END(__libc_memmove)
+#endif
+
+#if IS_IN (libc)
+# define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
+
+# ifdef SHARED
+libc_hidden_ver (__memmove_sse2_unaligned, memmove)
+libc_hidden_ver (__memcpy_sse2_unaligned, memcpy)
+libc_hidden_ver (__mempcpy_sse2_unaligned, mempcpy)
+libc_hidden_ver (__mempcpy_sse2_unaligned, __mempcpy)
+
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memmove calls through a PLT.
+   The speedup we get from using SSE2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def
+# endif
+strong_alias (__libc_memmove, memmove)
+#endif
+
+#if !defined SHARED || !IS_IN (libc)
+weak_alias (__mempcpy, mempcpy)
+#endif
+
+#include "../memmove.S"
+
+#if defined SHARED && IS_IN (libc)
+# include <shlib-compat.h>
+# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
+/* Use __memmove_sse2_unaligned to support overlapping addresses.  */
+compat_symbol (libc, __memmove_sse2_unaligned, memcpy, GLIBC_2_2_5);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
deleted file mode 100644
index 8da5640..0000000
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Multiple versions of memmove.
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-# define MEMMOVE __memmove_sse2
-# ifdef SHARED
-#  undef libc_hidden_builtin_def
-#  define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__memmove_sse2, __GI_memmove, __memmove_sse2);
-# endif
-
-/* Redefine memmove so that the compiler won't complain about the type
-   mismatch with the IFUNC selector in strong_alias, below.  */
-# undef memmove
-# define memmove __redirect_memmove
-# include <string.h>
-# undef memmove
-
-extern __typeof (__redirect_memmove) __memmove_sse2 attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_ssse3 attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_ssse3_back attribute_hidden;
-extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
-# ifdef HAVE_AVX512_ASM_SUPPORT
-  extern __typeof (__redirect_memmove) __memmove_avx512_no_vzeroupper attribute_hidden;
-# endif
-
-#endif
-
-#include "string/memmove.c"
-
-#if IS_IN (libc)
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-   ifunc symbol properly.  */
-extern __typeof (__redirect_memmove) __libc_memmove;
-libc_ifunc (__libc_memmove,
-#ifdef HAVE_AVX512_ASM_SUPPORT
-	    HAS_ARCH_FEATURE (AVX512F_Usable)
-	      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	    ? __memmove_avx512_no_vzeroupper
-	    :
-#endif
-	    (HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	    ? __memmove_avx_unaligned
-	    : (HAS_CPU_FEATURE (SSSE3)
-	       ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
-	          ? __memmove_ssse3_back : __memmove_ssse3)
-	       : __memmove_sse2)));
-
-strong_alias (__libc_memmove, memmove)
-
-# if SHLIB_COMPAT (libc, GLIBC_2_2_5, GLIBC_2_14)
-compat_symbol (libc, memmove, memcpy, GLIBC_2_2_5);
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/memmove_chk.S
similarity index 55%
copy from sysdeps/x86_64/multiarch/mempcpy_chk.S
copy to sysdeps/x86_64/multiarch/memmove_chk.S
index 6e8a89d..cd639b8 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memmove_chk.S
@@ -1,7 +1,6 @@
-/* Multiple versions of __mempcpy_chk
+/* Multiple versions of __memmove_chk
    All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2016 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
+   Copyright (C) 2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -22,35 +21,51 @@
 #include <init-arch.h>
 
 /* Define multiple versions only for the definition in lib and for
-   DSO.  There are no multiarch mempcpy functions for static binaries.
+   DSO.  There are no multiarch memmove functions for static binaries.
  */
 #if IS_IN (libc)
 # ifdef SHARED
 	.text
-ENTRY(__mempcpy_chk)
-	.type	__mempcpy_chk, @gnu_indirect_function
+ENTRY(__memmove_chk)
+	.type	__memmove_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
+	lea	__memmove_chk_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	1f
-	leaq    __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__memmove_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memmove_chk_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
-1:	leaq	__mempcpy_chk_sse2(%rip), %rax
-	HAS_CPU_FEATURE (SSSE3)
+# endif
+1:	lea	__memmove_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_chk_ssse3(%rip), %rax
-	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	lea	__memmove_chk_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__memmove_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	lea	__memmove_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
+	lea    __memmove_chk_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__memmove_chk_ssse3(%rip), %RAX_LP
 2:	ret
-END(__mempcpy_chk)
+END(__memmove_chk)
 # else
-#  include "../mempcpy_chk.S"
+#  include "../memmove_chk.S"
 # endif
 #endif
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
deleted file mode 100644
index f64da63..0000000
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/* Multiple versions of __memmove_chk.
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2010-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-#include "init-arch.h"
-
-#define MEMMOVE_CHK __memmove_chk_sse2
-
-extern __typeof (__memmove_chk) __memmove_chk_sse2 attribute_hidden;
-extern __typeof (__memmove_chk) __memmove_chk_ssse3 attribute_hidden;
-extern __typeof (__memmove_chk) __memmove_chk_ssse3_back attribute_hidden;
-extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
-# ifdef HAVE_AVX512_ASM_SUPPORT
-  extern __typeof (__memmove_chk) __memmove_chk_avx512_no_vzeroupper attribute_hidden;
-# endif
-
-#include "debug/memmove_chk.c"
-
-libc_ifunc (__memmove_chk,
-#ifdef HAVE_AVX512_ASM_SUPPORT
-	    HAS_ARCH_FEATURE (AVX512F_Usable)
-	      && HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	    ? __memmove_chk_avx512_no_vzeroupper
-	    :
-#endif
-	    HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load) ? __memmove_chk_avx_unaligned :
-	    (HAS_CPU_FEATURE (SSSE3)
-	    ? (HAS_ARCH_FEATURE (Fast_Copy_Backward)
-	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
-	    : __memmove_chk_sse2));
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index ed78623..f9c6df3 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -25,62 +25,46 @@
    DSO.  In static binaries we need mempcpy before the initialization
    happened.  */
 #if defined SHARED && IS_IN (libc)
+	.text
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
+	lea	__mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	1f
-	leaq    __mempcpy_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__mempcpy_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__mempcpy_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
-1:	leaq	__mempcpy_sse2(%rip), %rax
-	HAS_CPU_FEATURE (SSSE3)
+# endif
+1:	lea	__mempcpy_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_ssse3(%rip), %rax
-	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	lea	__mempcpy_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__mempcpy_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	lea	__mempcpy_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__mempcpy_avx_unaligned(%rip), %rax
+	lea    __mempcpy_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__mempcpy_ssse3(%rip), %RAX_LP
 2:	ret
 END(__mempcpy)
 
-# undef ENTRY
-# define ENTRY(name) \
-	.type __mempcpy_sse2, @function; \
-	.p2align 4; \
-	.globl __mempcpy_sse2; \
-	.hidden __mempcpy_sse2; \
-	__mempcpy_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __mempcpy_sse2, .-__mempcpy_sse2
-
-# undef ENTRY_CHK
-# define ENTRY_CHK(name) \
-	.type __mempcpy_chk_sse2, @function; \
-	.globl __mempcpy_chk_sse2; \
-	.p2align 4; \
-	__mempcpy_chk_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END_CHK
-# define END_CHK(name) \
-	cfi_endproc; .size __mempcpy_chk_sse2, .-__mempcpy_chk_sse2
-
-# undef libc_hidden_def
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal mempcpy calls through a PLT.
-   The speedup we get from using SSSE3 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_def(name) \
-	.globl __GI_mempcpy; __GI_mempcpy = __mempcpy_sse2
-# define libc_hidden_builtin_def(name) \
-	.globl __GI___mempcpy; __GI___mempcpy = __mempcpy_sse2
+weak_alias (__mempcpy, mempcpy)
 #endif
-
-#include "../mempcpy.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 6e8a89d..80f460f 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -30,24 +30,40 @@
 ENTRY(__mempcpy_chk)
 	.type	__mempcpy_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-#ifdef HAVE_AVX512_ASM_SUPPORT
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	1f
+	lea	__mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	1f
-	leaq    __mempcpy_chk_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__mempcpy_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__mempcpy_chk_avx512_unaligned(%rip), %RAX_LP
 	ret
-#endif
-1:	leaq	__mempcpy_chk_sse2(%rip), %rax
-	HAS_CPU_FEATURE (SSSE3)
+# endif
+1:	lea	__mempcpy_chk_avx_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	L(Fast_Unaligned_Load)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_chk_ssse3(%rip), %rax
-	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	lea	__mempcpy_chk_avx_unaligned_erms(%rip), %RAX_LP
+	ret
+L(Fast_Unaligned_Load):
+	lea	__mempcpy_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
+	jz	L(SSSE3)
+	HAS_CPU_FEATURE (ERMS)
 	jz	2f
-	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	lea	__mempcpy_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	ret
+L(SSSE3):
+	HAS_CPU_FEATURE (SSSE3)
 	jz	2f
-	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
+	lea    __mempcpy_chk_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__mempcpy_chk_ssse3(%rip), %RAX_LP
 2:	ret
 END(__mempcpy_chk)
 # else

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c97c370612496379176be8e33c19dc4f80b7f01c

commit c97c370612496379176be8e33c19dc4f80b7f01c
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Mar 31 10:42:30 2016 -0700

    X86-64: Remove the previous SSE2/AVX2 memsets
    
    Since the new SSE2/AVX2 memsets are faster than the previous ones, we
    can remove the previous SSE2/AVX2 memsets and replace them with the
    new ones.  This reduces the size of libc.so by about 900 bytes.
    
    No change in IFUNC selection if SSE2 and AVX2 memsets weren't used
    before.  If SSE2 or AVX2 memset was used, the new SSE2 or AVX2 memset
    optimized with Enhanced REP STOSB will be used for processors with
    ERMS.  The new AVX512 memset will be used for processors with AVX512
    which prefer vzeroupper.
    
    	[BZ #19881]
    	* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S: Folded
    	into ...
    	* sysdeps/x86_64/memset.S: This.
    	(__bzero): Removed.
    	(__memset_tail): Likewise.
    	(__memset_chk): Likewise.
    	(memset): Likewise.
    	(MEMSET_CHK_SYMBOL): New. Define only if MEMSET_SYMBOL isn't
    	defined.
    	(MEMSET_SYMBOL): Define only if MEMSET_SYMBOL isn't defined.
    	* sysdeps/x86_64/multiarch/memset-avx2.S: Removed.
    	(__memset_zero_constant_len_parameter): Check SHARED instead of
    	PIC.
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove
    	memset-avx2 and memset-sse2-unaligned-erms.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Remove __memset_chk_sse2,
    	__memset_chk_avx2, __memset_sse2 and __memset_avx2_unaligned.
    	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: Skip
    	if not in libc.
    	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
    	(__bzero): Enabled.
    	* sysdeps/x86_64/multiarch/memset.S (memset): Replace
    	__memset_sse2 and __memset_avx2 with __memset_sse2_unaligned
    	and __memset_avx2_unaligned.  Use __memset_sse2_unaligned_erms
    	or __memset_avx2_unaligned_erms if processor has ERMS.  Support
    	__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
    	(memset): Removed.
    	(__memset_chk): Likewise.
    	(MEMSET_SYMBOL): New.
    	(libc_hidden_builtin_def): Replace __memset_sse2 with
    	__memset_sse2_unaligned.
    	* sysdeps/x86_64/multiarch/memset_chk.S (__memset_chk): Replace
    	__memset_chk_sse2 and __memset_chk_avx2 with
    	__memset_chk_sse2_unaligned and __memset_chk_avx2_unaligned_erms.
    	Use __memset_chk_sse2_unaligned_erms or
    	__memset_chk_avx2_unaligned_erms if processor has ERMS.  Support
    	__memset_chk_avx512_unaligned_erms and
    	__memset_chk_avx512_unaligned.

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 4cf0da0..62b85c3 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -19,113 +19,32 @@
 
 #include <sysdep.h>
 
-	.text
-#if IS_IN (libc)
-ENTRY(__bzero)
-	movq	%rdi, %rax /* Set return value.  */
-	movq	%rsi, %rdx /* Set n.  */
-	pxor	%xmm0, %xmm0
-	jmp	L(entry_from_bzero)
-END(__bzero)
-weak_alias (__bzero, bzero)
-
-/* Like memset but takes additional parameter with return value.  */
-ENTRY(__memset_tail)
-	movq	%rcx, %rax /* Set return value.  */
-
-	movd	%esi, %xmm0
-	punpcklbw	%xmm0, %xmm0
-	punpcklwd	%xmm0, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-
-	jmp	L(entry_from_bzero)
-END(__memset_tail)
-#endif
-
-#if defined PIC && IS_IN (libc)
-ENTRY_CHK (__memset_chk)
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (__memset_chk)
+#define VEC_SIZE	16
+#define VEC(i)		xmm##i
+/* Don't use movups and movaps since it will get larger nop paddings for
+   alignment.  */
+#define VMOVU		movdqu
+#define VMOVA		movdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movd d, %xmm0; \
+  movq r, %rax; \
+  punpcklbw %xmm0, %xmm0; \
+  punpcklwd %xmm0, %xmm0; \
+  pshufd $0, %xmm0, %xmm0
+
+#define SECTION(p)		p
+
+#ifndef MEMSET_SYMBOL
+# define MEMSET_CHK_SYMBOL(p,s)	p
+# define MEMSET_SYMBOL(p,s)	memset
 #endif
 
-ENTRY (memset)
-	movd	%esi, %xmm0
-	movq	%rdi, %rax
-	punpcklbw	%xmm0, %xmm0
-	punpcklwd	%xmm0, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-L(entry_from_bzero):
-	cmpq	$64, %rdx
-	ja	L(loop_start)
-	cmpq	$16, %rdx
-	jbe	L(less_16_bytes)
-	cmpq	$32, %rdx
-	movdqu	%xmm0, (%rdi)
-	movdqu	%xmm0, -16(%rdi,%rdx)
-	ja	L(between_32_64_bytes)
-L(return):
-	rep
-	ret
-	.p2align 4
-L(between_32_64_bytes):
-	movdqu	%xmm0, 16(%rdi)
-	movdqu	%xmm0, -32(%rdi,%rdx)
-	ret
-	.p2align 4
-L(loop_start):
-	leaq	64(%rdi), %rcx
-	movdqu	%xmm0, (%rdi)
-	andq	$-64, %rcx
-	movdqu	%xmm0, -16(%rdi,%rdx)
-	movdqu	%xmm0, 16(%rdi)
-	movdqu	%xmm0, -32(%rdi,%rdx)
-	movdqu	%xmm0, 32(%rdi)
-	movdqu	%xmm0, -48(%rdi,%rdx)
-	movdqu	%xmm0, 48(%rdi)
-	movdqu	%xmm0, -64(%rdi,%rdx)
-	addq	%rdi, %rdx
-	andq	$-64, %rdx
-	cmpq	%rdx, %rcx
-	je	L(return)
-	.p2align 4
-L(loop):
-	movdqa	%xmm0, (%rcx)
-	movdqa	%xmm0, 16(%rcx)
-	movdqa	%xmm0, 32(%rcx)
-	movdqa	%xmm0, 48(%rcx)
-	addq	$64, %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
-	rep
-	ret
-L(less_16_bytes):
-	movq %xmm0, %rcx
-	testb	$24, %dl
-	jne	L(between8_16bytes)
-	testb	$4, %dl
-	jne	L(between4_7bytes)
-	testb	$1, %dl
-	je	L(odd_byte)
-	movb	%cl, (%rdi)
-L(odd_byte):
-	testb	$2, %dl
-	je	L(return)
-	movw	%cx, -2(%rax,%rdx)
-	ret
-L(between4_7bytes):
-	movl	%ecx, (%rdi)
-	movl	%ecx, -4(%rdi,%rdx)
-	ret
-L(between8_16bytes):
-	movq	%rcx, (%rdi)
-	movq	%rcx, -8(%rdi,%rdx)
-	ret
+#include "multiarch/memset-vec-unaligned-erms.S"
 
-END (memset)
 libc_hidden_builtin_def (memset)
 
-#if defined PIC && IS_IN (libc) && !defined USE_MULTIARCH
+#if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
 strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
 	.section .gnu.warning.__memset_zero_constant_len_parameter
 	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 8878efb..35752a8 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -19,12 +19,11 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
-		   strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
+		   strcspn-c strpbrk-c strspn-c varshift \
 		   memset-avx512-no-vzeroupper \
 		   memmove-sse2-unaligned-erms \
 		   memmove-avx-unaligned-erms \
 		   memmove-avx512-unaligned-erms \
-		   memset-sse2-unaligned-erms \
 		   memset-avx2-unaligned-erms \
 		   memset-avx512-unaligned-erms
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1e880f6..ca05ff6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -117,16 +117,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
-			      __memset_chk_sse2)
-	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_sse2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
-			      __memset_chk_avx2)
-	      IFUNC_IMPL_ADD (array, i, __memset_chk,
-			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_chk_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
@@ -146,7 +141,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
-	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
 	      IFUNC_IMPL_ADD (array, i, memset, 1,
 			      __memset_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memset, 1,
@@ -154,9 +148,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
-			      __memset_avx2)
-	      IFUNC_IMPL_ADD (array, i, memset,
-			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
deleted file mode 100644
index df63472..0000000
--- a/sysdeps/x86_64/multiarch/memset-avx2.S
+++ /dev/null
@@ -1,168 +0,0 @@
-/* memset with AVX2
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-#ifndef MEMSET
-# define MEMSET	__memset_avx2
-# define MEMSET_CHK	__memset_chk_avx2
-#endif
-
-	.section .text.avx2,"ax",@progbits
-#if defined PIC
-ENTRY (MEMSET_CHK)
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMSET_CHK)
-#endif
-
-ENTRY (MEMSET)
-	vpxor	%xmm0, %xmm0, %xmm0
-	vmovd	%esi, %xmm1
-	lea	(%rdi, %rdx), %rsi
-	mov	%rdi, %rax
-	vpshufb	%xmm0, %xmm1, %xmm0
-	cmp	$16, %rdx
-	jb	L(less_16bytes)
-	cmp	$256, %rdx
-	jae	L(256bytesormore)
-	cmp	$128, %dl
-	jb	L(less_128bytes)
-	vmovdqu	%xmm0, (%rdi)
-	vmovdqu %xmm0, 0x10(%rdi)
-	vmovdqu %xmm0, 0x20(%rdi)
-	vmovdqu %xmm0, 0x30(%rdi)
-	vmovdqu %xmm0, 0x40(%rdi)
-	vmovdqu %xmm0, 0x50(%rdi)
-	vmovdqu %xmm0, 0x60(%rdi)
-	vmovdqu %xmm0, 0x70(%rdi)
-	vmovdqu %xmm0, -0x80(%rsi)
-	vmovdqu %xmm0, -0x70(%rsi)
-	vmovdqu %xmm0, -0x60(%rsi)
-	vmovdqu %xmm0, -0x50(%rsi)
-	vmovdqu %xmm0, -0x40(%rsi)
-	vmovdqu %xmm0, -0x30(%rsi)
-	vmovdqu %xmm0, -0x20(%rsi)
-	vmovdqu %xmm0, -0x10(%rsi)
-	ret
-
-	.p2align 4
-L(less_128bytes):
-	cmp	$64, %dl
-	jb	L(less_64bytes)
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm0, 0x10(%rdi)
-	vmovdqu %xmm0, 0x20(%rdi)
-	vmovdqu %xmm0, 0x30(%rdi)
-	vmovdqu %xmm0, -0x40(%rsi)
-	vmovdqu %xmm0, -0x30(%rsi)
-	vmovdqu %xmm0, -0x20(%rsi)
-	vmovdqu %xmm0, -0x10(%rsi)
-	ret
-
-	.p2align 4
-L(less_64bytes):
-	cmp	$32, %dl
-	jb	L(less_32bytes)
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm0, 0x10(%rdi)
-	vmovdqu %xmm0, -0x20(%rsi)
-	vmovdqu %xmm0, -0x10(%rsi)
-	ret
-
-	.p2align 4
-L(less_32bytes):
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm0, -0x10(%rsi)
-	ret
-
-	.p2align 4
-L(less_16bytes):
-	cmp	$8, %dl
-	jb	L(less_8bytes)
-	vmovq %xmm0, (%rdi)
-	vmovq %xmm0, -0x08(%rsi)
-	ret
-
-	.p2align 4
-L(less_8bytes):
-	vmovd	%xmm0, %ecx
-	cmp	$4, %dl
-	jb	L(less_4bytes)
-	mov	%ecx, (%rdi)
-	mov	%ecx, -0x04(%rsi)
-	ret
-
-	.p2align 4
-L(less_4bytes):
-	cmp	$2, %dl
-	jb	L(less_2bytes)
-	mov	%cx, (%rdi)
-	mov	%cx, -0x02(%rsi)
-	ret
-
-	.p2align 4
-L(less_2bytes):
-	cmp	$1, %dl
-	jb	L(less_1bytes)
-	mov	%cl, (%rdi)
-L(less_1bytes):
-	ret
-
-	.p2align 4
-L(256bytesormore):
-	vinserti128 $1, %xmm0, %ymm0, %ymm0
-	and	$-0x20, %rdi
-	add	$0x20, %rdi
-	vmovdqu	%ymm0, (%rax)
-	sub	%rdi, %rax
-	lea	-0x80(%rax, %rdx), %rcx
-	cmp	$4096, %rcx
-	ja	L(gobble_data)
-L(gobble_128_loop):
-	vmovdqa	%ymm0, (%rdi)
-	vmovdqa	%ymm0, 0x20(%rdi)
-	vmovdqa	%ymm0, 0x40(%rdi)
-	vmovdqa	%ymm0, 0x60(%rdi)
-	sub	$-0x80, %rdi
-	add	$-0x80, %ecx
-	jb	L(gobble_128_loop)
-	mov	%rsi, %rax
-	vmovdqu	%ymm0, -0x80(%rsi)
-	vmovdqu	%ymm0, -0x60(%rsi)
-	vmovdqu	%ymm0, -0x40(%rsi)
-	vmovdqu	%ymm0, -0x20(%rsi)
-	sub	%rdx, %rax
-	vzeroupper
-	ret
-
-	.p2align 4
-L(gobble_data):
-	sub	$-0x80, %rcx
-	vmovd	%xmm0, %eax
-	rep	stosb
-	mov	%rsi, %rax
-	sub	%rdx, %rax
-	vzeroupper
-	ret
-
-END (MEMSET)
-#endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
deleted file mode 100644
index 4bf3d36..0000000
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#if IS_IN (libc)
-# define VEC_SIZE	16
-# define VEC(i)		xmm##i
-/* Don't use movups and movaps since it will get larger nop paddings
-   for alignment.  */
-# define VMOVU		movdqu
-# define VMOVA		movdqa
-
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movd d, %xmm0; \
-  movq r, %rax; \
-  punpcklbw %xmm0, %xmm0; \
-  punpcklwd %xmm0, %xmm0; \
-  pshufd $0, %xmm0, %xmm0
-
-# define SECTION(p)		p
-# define MEMSET_SYMBOL(p,s)	p##_sse2_##s
-
-# include "memset-vec-unaligned-erms.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 578a5ae..89f6886 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -71,7 +71,7 @@
 #endif
 
 	.section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc) && 0
+#if VEC_SIZE == 16 && IS_IN (libc)
 ENTRY (__bzero)
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 8e3b9b9..4e52d8f 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -26,35 +26,43 @@
 ENTRY(memset)
 	.type	memset, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-	leaq	__memset_sse2(%rip), %rax
+	lea	__memset_sse2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	1f
+	lea	__memset_sse2_unaligned(%rip), %RAX_LP
+1:
 	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f
-	leaq	__memset_avx2(%rip), %rax
-#ifdef HAVE_AVX512_ASM_SUPPORT
+	lea	__memset_avx2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	L(AVX512F)
+	lea	__memset_avx2_unaligned(%rip), %RAX_LP
+L(AVX512F):
+# ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	2f
+	lea	__memset_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	2f
-	leaq	__memset_avx512_no_vzeroupper(%rip), %rax
-#endif
+	jnz	2f
+	lea	__memset_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memset_avx512_unaligned(%rip), %RAX_LP
+# endif
 2:	ret
 END(memset)
 #endif
 
 #if IS_IN (libc)
-# undef memset
-# define memset __memset_sse2
-
-# undef __memset_chk
-# define __memset_chk __memset_chk_sse2
+# define MEMSET_SYMBOL(p,s)	p##_sse2_##s
 
 # ifdef SHARED
 # undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal memset calls through a PLT.
-   The speedup we get from using GPR instruction is likely eaten away
+   The speedup we get from using SSE2 instructions is likely eaten away
    by the indirect call in the PLT.  */
 # define libc_hidden_builtin_def(name) \
-	.globl __GI_memset; __GI_memset = __memset_sse2
+	.globl __GI_memset; __GI_memset = __memset_sse2_unaligned
 # endif
 
 # undef strong_alias
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
index 9a7b270..8517cfc 100644
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -26,16 +26,28 @@
 ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-	leaq	__memset_chk_sse2(%rip), %rax
+	lea	__memset_chk_sse2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	1f
+	lea	__memset_chk_sse2_unaligned(%rip), %RAX_LP
+1:
 	HAS_ARCH_FEATURE (AVX2_Usable)
 	jz	2f
-	leaq	__memset_chk_avx2(%rip), %rax
+	lea	__memset_chk_avx2_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	L(AVX512F)
+	lea	__memset_chk_avx2_unaligned(%rip), %RAX_LP
+L(AVX512F):
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	HAS_ARCH_FEATURE (AVX512F_Usable)
 	jz	2f
+	lea	__memset_chk_avx512_no_vzeroupper(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jz	2f
-	leaq	__memset_chk_avx512_no_vzeroupper(%rip), %rax
+	jnz	2f
+	lea	__memset_chk_avx512_unaligned_erms(%rip), %RAX_LP
+	HAS_CPU_FEATURE (ERMS)
+	jnz	2f
+	lea	__memset_chk_avx512_unaligned(%rip), %RAX_LP
 #endif
 2:	ret
 END(__memset_chk)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=121270b79236d7c5802e8d9af2d27952cb9efae9

commit 121270b79236d7c5802e8d9af2d27952cb9efae9
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Apr 3 17:21:45 2016 -0700

    X86-64: Use non-temporal store in memcpy on large data
    
    The large memcpy micro benchmark in glibc shows that there is a
    regression with large data on Haswell machine.  non-temporal store in
    memcpy on large data can improve performance significantly.  This
    patch adds a threshold to use non temporal store which is 6 times of
    shared cache size.  When size is above the threshold, non temporal
    store will be used.
    
    For size below 8 vector register width, we load all data into registers
    and store them together.  Only forward and backward loops, which move 4
    vector registers at a time, are used to support overlapping addresses.
    For forward loop, we load the last 4 vector register width of data and
    the first vector register width of data into vector registers before the
    loop and store them after the loop.  For backward loop, we load the first
    4 vector register width of data and the last vector register width of
    data into vector registers before the loop and store them after the loop.
    
    	* sysdeps/x86_64/cacheinfo.c (__x86_shared_non_temporal_threshold):
    	New.
    	(init_cacheinfo): Set __x86_shared_non_temporal_threshold to
    	6 times of shared cache size.
    	* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
    	(VMOVNT): New.
    	* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
    	(VMOVNT): Likewise.
    	* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
    	(VMOVNT): Likewise.
    	(VMOVU): Changed to movups for smaller code sizes.
    	(VMOVA): Changed to movaps for smaller code sizes.
    	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: Update
    	comments.
    	(PREFETCH): New.
    	(PREFETCH_SIZE): Likewise.
    	(PREFETCHED_LOAD_SIZE): Likewise.
    	(PREFETCH_ONE_SET): Likewise.
    	Rewrite to use forward and backward loops, which move 4 vector
    	registers at a time, to support overlapping addresses and use
    	non temporal store if size is above the threshold.

diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index 96463df..143b333 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -464,6 +464,9 @@ long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
 /* Similar to __x86_shared_cache_size, but not rounded.  */
 long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
 
+/* Threshold to use non temporal store.  */
+long int __x86_shared_non_temporal_threshold attribute_hidden;
+
 #ifndef DISABLE_PREFETCHW
 /* PREFETCHW support flag for use in memory and string routines.  */
 int __x86_prefetchw attribute_hidden;
@@ -662,4 +665,9 @@ init_cacheinfo (void)
       __x86_shared_cache_size_half = shared / 2;
       __x86_shared_cache_size = shared;
     }
+
+  /* The large memcpy micro benchmark in glibc shows that 6 times of
+     shared cache size is the approximate value above which non-temporal
+     store becomes faster.  */
+  __x86_shared_non_temporal_threshold = __x86_shared_cache_size * 6;
 }
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index 44711c3..e195e93 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -1,6 +1,7 @@
 #if IS_IN (libc)
 # define VEC_SIZE	32
 # define VEC(i)		ymm##i
+# define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
 
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index c2c5293..f9af6fd 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,6 +1,7 @@
 #if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
 # define VEC_SIZE	64
 # define VEC(i)		zmm##i
+# define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 85214fe..d7edb18 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -1,8 +1,10 @@
 #if IS_IN (libc)
 # define VEC_SIZE	16
 # define VEC(i)		xmm##i
-# define VMOVU		movdqu
-# define VMOVA		movdqa
+# define VMOVNT		movntdq
+/* Use movups and movaps for smaller code sizes.  */
+# define VMOVU		movups
+# define VMOVA		movaps
 
 # define SECTION(p)		p
 # define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 8a60d0f..51e562b 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -18,19 +18,20 @@
 
 /* memmove/memcpy/mempcpy is implemented as:
    1. Use overlapping load and store to avoid branch.
-   2. Use 8-bit or 32-bit displacements for branches and nop paddings
-      to avoid long nop between instructions.
-   3. Load all sources into registers and store them together to avoid
+   2. Load all sources into registers and store them together to avoid
       possible address overflap between source and destination.
-   4. If size is 2 * VEC_SIZE or less, load all sources into registers
+   3. If size is 8 * VEC_SIZE or less, load all sources into registers
       and store them together.
-   5. If there is no address overflap, copy from both ends with
-      4 * VEC_SIZE at a time.
-   6. If size is 8 * VEC_SIZE or less, load all sources into registers
-      and store them together.
-   7. If address of destination > address of source, backward copy
-      8 * VEC_SIZE at a time.
-   8. Otherwise, forward copy 8 * VEC_SIZE at a time.  */
+   4. If address of destination > address of source, backward copy
+      4 * VEC_SIZE at a time with unaligned load and aligned store.
+      Load the first 4 * VEC and last VEC before the loop and store
+      them after the loop to support overlapping addresses.
+   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
+      load and aligned store.  Load the last 4 * VEC and first VEC
+      before the loop and store them after the loop to support
+      overlapping addresses.
+   6. If size >= __x86_shared_non_temporal_threshold, use non-temporal
+      store instead of aligned store.  */
 
 #include <sysdep.h>
 
@@ -65,6 +66,39 @@
 # define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
 #endif
 
+#ifndef PREFETCH
+# define PREFETCH(addr) prefetcht0 addr
+#endif
+
+/* Assume 64-byte prefetch size.  */
+#ifndef PREFETCH_SIZE
+# define PREFETCH_SIZE 64
+#endif
+
+#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)
+
+#if PREFETCH_SIZE == 64
+# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
+#  define PREFETCH_ONE_SET(dir, base, offset) \
+	PREFETCH ((offset)base)
+# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
+#  define PREFETCH_ONE_SET(dir, base, offset) \
+	PREFETCH ((offset)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
+# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
+#  define PREFETCH_ONE_SET(dir, base, offset) \
+	PREFETCH ((offset)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
+	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
+# else
+#   error Unsupported PREFETCHED_LOAD_SIZE!
+# endif
+#else
+# error Unsupported PREFETCH_SIZE!
+#endif
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -185,6 +219,8 @@ L(return):
 	ret
 
 L(movsb):
+	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	jae	L(more_8x_vec)
 	cmpq	%rsi, %rdi
 	jb	1f
 	/* Source == destination is less common.  */
@@ -201,97 +237,8 @@ L(movsb):
 	rep movsb
 L(nop):
 	ret
-
-	.p2align 4
-L(movsb_more_2x_vec):
-	cmpq	$REP_MOVSB_THRESHOLD, %rdx
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	ja.d32	L(movsb)
-#endif
-	.p2align 4
-L(more_2x_vec):
-	/* More than 2 * VEC.  */
-	cmpq	%rsi, %rdi
-	jb	L(copy_forward)
-	/* Source == destination is less common.  */
-	je	L(nop)
-	leaq	(%rsi,%rdx), %rcx
-	cmpq	%rcx, %rdi
-	jb	L(more_2x_vec_overlap)
-L(copy_forward):
-	leaq	(%rdi,%rdx), %rcx
-	cmpq	%rcx, %rsi
-	jb	L(more_2x_vec_overlap)
-	VMOVU	(%rsi), %VEC(0)
-	VMOVU	VEC_SIZE(%rsi), %VEC(1)
-	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
-	cmpq	$(VEC_SIZE * 4), %rdx
-	/* Force 32-bit displacement to avoid long nop between
-	   instructions.  */
-	jbe.d32	L(return)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(0)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
-	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
-	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(1), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
-	VMOVU	%VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
-	cmpq	$(VEC_SIZE * 8), %rdx
-#if  VEC_SIZE == 16
-# if defined USE_MULTIARCH && IS_IN (libc)
-	jbe	L(return)
-# else
-	/* Use 32-bit displacement to avoid long nop between
-	   instructions.  */
-	jbe.d32	L(return)
-# endif
-#else
-	/* Use 8-bit displacement to avoid long nop between
-	   instructions.  */
-	jbe	L(return_disp8)
-#endif
-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
-	addq	%rdi, %rdx
-	andq	$-(VEC_SIZE * 4), %rdx
-	andq	$-(VEC_SIZE * 4), %rcx
-	movq	%rcx, %r11
-	subq	%rdi, %r11
-	addq	%r11, %rsi
-	cmpq	%rdx, %rcx
-	/* Use 8-bit displacement to avoid long nop between
-	   instructions.  */
-	je	L(return_disp8)
-	movq	%rsi, %r10
-	subq	%rcx, %r10
-	leaq	VEC_SIZE(%r10), %r9
-	leaq	(VEC_SIZE * 2)(%r10), %r8
-	leaq	(VEC_SIZE * 3)(%r10), %r11
-	.p2align 4
-L(loop):
-	VMOVU	(%rcx,%r10), %VEC(0)
-	VMOVU	(%rcx,%r9), %VEC(1)
-	VMOVU	(%rcx,%r8), %VEC(2)
-	VMOVU	(%rcx,%r11), %VEC(3)
-	VMOVA	%VEC(0), (%rcx)
-	VMOVA	%VEC(1), VEC_SIZE(%rcx)
-	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
-	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rcx)
-	addq	$(VEC_SIZE * 4), %rcx
-	cmpq	%rcx, %rdx
-	jne	L(loop)
-#if !defined USE_MULTIARCH || !IS_IN (libc)
-L(return):
 #endif
-L(return_disp8):
-	VZEROUPPER
-	ret
+
 L(less_vec):
 	/* Less than 1 VEC.  */
 #if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -357,18 +304,18 @@ L(between_2_3):
 	movw	%si, (%rdi)
 	ret
 
-#if VEC_SIZE > 16
-	/* Align to 16 bytes to avoid long nop between instructions.  */
-	.p2align 4
+#if defined USE_MULTIARCH && IS_IN (libc)
+L(movsb_more_2x_vec):
+	cmpq	$REP_MOVSB_THRESHOLD, %rdx
+	ja	L(movsb)
 #endif
-L(more_2x_vec_overlap):
-	/* More than 2 * VEC and there is overlap bewteen destination
+L(more_2x_vec):
+	/* More than 2 * VEC and there may be overlap bewteen destination
 	   and source.  */
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
 	cmpq	$(VEC_SIZE * 4), %rdx
 	jb	L(last_4x_vec)
-L(between_4x_vec_and_8x_vec):
 	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
@@ -400,84 +347,175 @@ L(last_4x_vec):
 	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VZEROUPPER
 	ret
-L(between_0_and_4x_vec):
-	/* Copy from 0 to 4 * VEC. */
-	cmpl	$(VEC_SIZE * 2), %edx
-	jae	L(last_4x_vec)
-	/* Copy from 0 to 2 * VEC. */
-	cmpl	$VEC_SIZE, %edx
-	jae	L(last_2x_vec)
-	/* Copy from 0 to VEC. */
-	VZEROUPPER
-	jmp	L(less_vec)
+
 L(more_8x_vec):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
-
-	.p2align 4
-L(loop_8x_vec_forward):
-	/* Copy 8 * VEC a time forward.  */
+	/* Source == destination is less common.  */
+	je	L(nop)
+	/* Load the first VEC and last 4 * VEC to support overlapping
+	   addresses.  */
+	VMOVU	(%rsi), %VEC(4)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+	/* Save start and stop of the destination buffer.  */
+	movq	%rdi, %r11
+	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
+	/* Align destination for aligned stores in the loop.  Compute
+	   how much destination is misaligned.  */
+	movq	%rdi, %r8
+	andq	$(VEC_SIZE - 1), %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$VEC_SIZE, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	ja	L(loop_large_forward)
+#endif
+L(loop_4x_vec_forward):
+	/* Copy 4 * VEC a time forward.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	VMOVU	(VEC_SIZE * 4)(%rsi), %VEC(4)
-	VMOVU	(VEC_SIZE * 5)(%rsi), %VEC(5)
-	VMOVU	(VEC_SIZE * 6)(%rsi), %VEC(6)
-	VMOVU	(VEC_SIZE * 7)(%rsi), %VEC(7)
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(1), VEC_SIZE(%rdi)
-	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	VMOVU	%VEC(4), (VEC_SIZE * 4)(%rdi)
-	VMOVU	%VEC(5), (VEC_SIZE * 5)(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 6)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 7)(%rdi)
-	addq	$(VEC_SIZE * 8), %rdi
-	addq	$(VEC_SIZE * 8), %rsi
-	subq	$(VEC_SIZE * 8), %rdx
-	cmpq	$(VEC_SIZE * 8), %rdx
-	je	L(between_4x_vec_and_8x_vec)
-	ja	L(loop_8x_vec_forward)
-	/* Less than 8 * VEC to copy.  */
+	addq	$(VEC_SIZE * 4), %rsi
+	subq	$(VEC_SIZE * 4), %rdx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$(VEC_SIZE * 4), %rdi
 	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(between_0_and_4x_vec)
-	jmp	L(between_4x_vec_and_8x_vec)
+	ja	L(loop_4x_vec_forward)
+	/* Store the last 4 * VEC.  */
+	VMOVU	%VEC(5), (%rcx)
+	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	/* Store the first VEC.  */
+	VMOVU	%VEC(4), (%r11)
+	VZEROUPPER
+	ret
 
-	.p2align 4
 L(more_8x_vec_backward):
+	/* Load the first 4 * VEC and last VEC to support overlapping
+	   addresses.  */
+	VMOVU	(%rsi), %VEC(4)
+	VMOVU	VEC_SIZE(%rsi), %VEC(5)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
+	/* Save stop of the destination buffer.  */
+	leaq	-VEC_SIZE(%rdi, %rdx), %r11
+	/* Align destination end for aligned stores in the loop.  Compute
+	   how much destination end is misaligned.  */
 	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
-	leaq	-VEC_SIZE(%rdi, %rdx), %r9
+	movq	%r11, %r9
+	movq	%r11, %r8
+	andq	$(VEC_SIZE - 1), %r8
+	/* Adjust source.  */
+	subq	%r8, %rcx
+	/* Adjust the end of destination which should be aligned now.  */
+	subq	%r8, %r9
+	/* Adjust length.  */
+	subq	%r8, %rdx
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	ja	L(loop_large_backward)
+#endif
+L(loop_4x_vec_backward):
+	/* Copy 4 * VEC a time backward.  */
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+	subq	$(VEC_SIZE * 4), %rcx
+	subq	$(VEC_SIZE * 4), %rdx
+	VMOVA	%VEC(0), (%r9)
+	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
+	subq	$(VEC_SIZE * 4), %r9
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec_backward)
+	/* Store the first 4 * VEC.  */
+	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	/* Store the last VEC.  */
+	VMOVU	%VEC(8), (%r11)
+	VZEROUPPER
+	ret
 
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	.p2align 4
-L(loop_8x_vec_backward):
-	/* Copy 8 * VEC a time backward.  */
+L(loop_large_forward):
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	addq	$PREFETCHED_LOAD_SIZE, %rsi
+	subq	$PREFETCHED_LOAD_SIZE, %rdx
+	VMOVNT	%VEC(0), (%rdi)
+	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
+	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	addq	$PREFETCHED_LOAD_SIZE, %rdi
+	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+	ja	L(loop_large_forward)
+	sfence
+	/* Store the last 4 * VEC.  */
+	VMOVU	%VEC(5), (%rcx)
+	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+	/* Store the first VEC.  */
+	VMOVU	%VEC(4), (%r11)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(loop_large_backward):
+	/* Copy 4 * VEC a time backward with non-temporal stores.  */
+	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
 	VMOVU	(%rcx), %VEC(0)
 	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
 	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
 	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	VMOVU	-(VEC_SIZE * 4)(%rcx), %VEC(4)
-	VMOVU	-(VEC_SIZE * 5)(%rcx), %VEC(5)
-	VMOVU	-(VEC_SIZE * 6)(%rcx), %VEC(6)
-	VMOVU	-(VEC_SIZE * 7)(%rcx), %VEC(7)
-	VMOVU	%VEC(0), (%r9)
-	VMOVU	%VEC(1), -VEC_SIZE(%r9)
-	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVU	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	VMOVU	%VEC(4), -(VEC_SIZE * 4)(%r9)
-	VMOVU	%VEC(5), -(VEC_SIZE * 5)(%r9)
-	VMOVU	%VEC(6), -(VEC_SIZE * 6)(%r9)
-	VMOVU	%VEC(7), -(VEC_SIZE * 7)(%r9)
-	subq	$(VEC_SIZE * 8), %rcx
-	subq	$(VEC_SIZE * 8), %r9
-	subq	$(VEC_SIZE * 8), %rdx
-	cmpq	$(VEC_SIZE * 8), %rdx
-	je	L(between_4x_vec_and_8x_vec)
-	ja	L(loop_8x_vec_backward)
-	/* Less than 8 * VEC to copy.  */
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jb	L(between_0_and_4x_vec)
-	jmp	L(between_4x_vec_and_8x_vec)
+	subq	$PREFETCHED_LOAD_SIZE, %rcx
+	subq	$PREFETCHED_LOAD_SIZE, %rdx
+	VMOVNT	%VEC(0), (%r9)
+	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
+	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
+	subq	$PREFETCHED_LOAD_SIZE, %r9
+	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+	ja	L(loop_large_backward)
+	sfence
+	/* Store the first 4 * VEC.  */
+	VMOVU	%VEC(4), (%rdi)
+	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+	/* Store the last VEC.  */
+	VMOVU	%VEC(8), (%r11)
+	VZEROUPPER
+	ret
+#endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 
 #ifdef SHARED

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0932dd8b56db46dd421a4855fb5dee9de092538d

commit 0932dd8b56db46dd421a4855fb5dee9de092538d
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Apr 6 10:19:16 2016 -0700

    X86-64: Prepare memmove-vec-unaligned-erms.S
    
    Prepare memmove-vec-unaligned-erms.S to make the SSE2 version as the
    default memcpy, mempcpy and memmove.
    
    	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
    	(MEMCPY_SYMBOL): New.
    	(MEMPCPY_SYMBOL): Likewise.
    	(MEMMOVE_CHK_SYMBOL): Likewise.
    	Replace MEMMOVE_SYMBOL with MEMMOVE_CHK_SYMBOL on __mempcpy_chk
    	symbols.  Replace MEMMOVE_SYMBOL with MEMPCPY_SYMBOL on
    	__mempcpy symbols.  Provide alias for __memcpy_chk in libc.a.
    	Provide alias for memcpy in libc.a and ld.so.
    
    (cherry picked from commit a7d1c51482d15ab6c07e2ee0ae5e007067b18bfb)

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 66779a3..8a60d0f 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -32,18 +32,27 @@
       8 * VEC_SIZE at a time.
    8. Otherwise, forward copy 8 * VEC_SIZE at a time.  */
 
-#if IS_IN (libc)
+#include <sysdep.h>
 
-# include <sysdep.h>
-# include "asm-syntax.h"
+#ifndef MEMCPY_SYMBOL
+# define MEMCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
+#endif
 
-# ifndef VZEROUPPER
-#  if VEC_SIZE > 16
-#   define VZEROUPPER vzeroupper
-#  else
-#   define VZEROUPPER
-#  endif
+#ifndef MEMPCPY_SYMBOL
+# define MEMPCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef MEMMOVE_CHK_SYMBOL
+# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
+#endif
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+#  define VZEROUPPER vzeroupper
+# else
+#  define VZEROUPPER
 # endif
+#endif
 
 /* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
    up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
@@ -52,32 +61,36 @@
    on processors with Enhanced REP MOVSB.  Since larger register size
    can move more data with a single load and store, the threshold is
    higher with larger register size.  */
-# ifndef REP_MOVSB_THRESHOLD
-#  define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
-# endif
+#ifndef REP_MOVSB_THRESHOLD
+# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
+#endif
 
-# ifndef SECTION
-#  error SECTION is not defined!
-# endif
-	.section SECTION(.text),"ax",@progbits
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
 
-# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+	.section SECTION(.text),"ax",@progbits
+#if defined SHARED && IS_IN (libc)
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_2))
+#endif
 
-ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+#if VEC_SIZE == 16 || defined SHARED
+ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
 	movq	%rdi, %rax
 	addq	%rdx, %rax
 	jmp	L(start)
-END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+END (MEMPCPY_SYMBOL (__mempcpy, unaligned_2))
+#endif
 
-ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+#if defined SHARED && IS_IN (libc)
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
-# endif
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2))
+#endif
 
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
 	movq	%rdi, %rax
@@ -86,24 +99,29 @@ L(start):
 	jb	L(less_vec)
 	cmpq	$(VEC_SIZE * 2), %rdx
 	ja	L(more_2x_vec)
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(last_2x_vec):
+#endif
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
 	VZEROUPPER
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(nop):
+#endif
 	ret
+#if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
 
-# if VEC_SIZE == 16
+# if VEC_SIZE == 16 && defined SHARED
 /* Only used to measure performance of REP MOVSB.  */
-#  ifdef SHARED
 ENTRY (__mempcpy_erms)
 	movq	%rdi, %rax
 	addq	%rdx, %rax
 	jmp	L(start_movsb)
 END (__mempcpy_erms)
-#  endif
 
 ENTRY (__memmove_erms)
 	movq	%rdi, %rax
@@ -132,11 +150,10 @@ strong_alias (__memmove_erms, __memcpy_erms)
 # endif
 
 # ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
-# endif
+END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
 
 ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 	movq	%rdi, %rax
@@ -144,11 +161,10 @@ ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 	jmp	L(start_erms)
 END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 
-# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
 
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
@@ -192,6 +208,7 @@ L(movsb_more_2x_vec):
 	/* Force 32-bit displacement to avoid long nop between
 	   instructions.  */
 	ja.d32	L(movsb)
+#endif
 	.p2align 4
 L(more_2x_vec):
 	/* More than 2 * VEC.  */
@@ -227,13 +244,19 @@ L(copy_forward):
 	VMOVU	%VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
 	VMOVU	%VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
 	cmpq	$(VEC_SIZE * 8), %rdx
-# if  VEC_SIZE == 16
+#if  VEC_SIZE == 16
+# if defined USE_MULTIARCH && IS_IN (libc)
 	jbe	L(return)
 # else
+	/* Use 32-bit displacement to avoid long nop between
+	   instructions.  */
+	jbe.d32	L(return)
+# endif
+#else
 	/* Use 8-bit displacement to avoid long nop between
 	   instructions.  */
 	jbe	L(return_disp8)
-# endif
+#endif
 	leaq	(VEC_SIZE * 4)(%rdi), %rcx
 	addq	%rdi, %rdx
 	andq	$-(VEC_SIZE * 4), %rdx
@@ -263,22 +286,25 @@ L(loop):
 	addq	$(VEC_SIZE * 4), %rcx
 	cmpq	%rcx, %rdx
 	jne	L(loop)
+#if !defined USE_MULTIARCH || !IS_IN (libc)
+L(return):
+#endif
 L(return_disp8):
 	VZEROUPPER
 	ret
 L(less_vec):
 	/* Less than 1 VEC.  */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-#  error Unsupported VEC_SIZE!
-# endif
-# if VEC_SIZE > 32
+#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+#endif
+#if VEC_SIZE > 32
 	cmpb	$32, %dl
 	jae	L(between_32_63)
-# endif
-# if VEC_SIZE > 16
+#endif
+#if VEC_SIZE > 16
 	cmpb	$16, %dl
 	jae	L(between_16_31)
-# endif
+#endif
 	cmpb	$8, %dl
 	jae	L(between_8_15)
 	cmpb	$4, %dl
@@ -290,7 +316,7 @@ L(less_vec):
 	movb	%cl, (%rdi)
 1:
 	ret
-# if VEC_SIZE > 32
+#if VEC_SIZE > 32
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
 	vmovdqu	(%rsi), %ymm0
@@ -299,8 +325,8 @@ L(between_32_63):
 	vmovdqu	%ymm1, -32(%rdi,%rdx)
 	VZEROUPPER
 	ret
-# endif
-# if VEC_SIZE > 16
+#endif
+#if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
 	vmovdqu	(%rsi), %xmm0
@@ -308,7 +334,7 @@ L(between_16_31):
 	vmovdqu	%xmm0, (%rdi)
 	vmovdqu	%xmm1, -16(%rdi,%rdx)
 	ret
-# endif
+#endif
 L(between_8_15):
 	/* From 8 to 15.  No branch when size == 8.  */
 	movq	-8(%rsi,%rdx), %rcx
@@ -331,10 +357,10 @@ L(between_2_3):
 	movw	%si, (%rdi)
 	ret
 
-# if VEC_SIZE > 16
+#if VEC_SIZE > 16
 	/* Align to 16 bytes to avoid long nop between instructions.  */
 	.p2align 4
-# endif
+#endif
 L(more_2x_vec_overlap):
 	/* More than 2 * VEC and there is overlap bewteen destination
 	   and source.  */
@@ -454,15 +480,19 @@ L(loop_8x_vec_backward):
 	jmp	L(between_4x_vec_and_8x_vec)
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 
-# ifdef SHARED
+#ifdef SHARED
+# if IS_IN (libc)
+#  ifdef USE_MULTIARCH
 strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
 	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
 strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
 	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
-strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
-	      MEMMOVE_SYMBOL (__memcpy, unaligned_2))
-strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
-	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
+#  endif
+strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_2),
+	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned_2))
 # endif
-
+#endif
+#if VEC_SIZE == 16 || defined SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
+	      MEMCPY_SYMBOL (__memcpy, unaligned_2))
 #endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=da2da79262814ba4ead3ee487549949096d8ad2d

commit da2da79262814ba4ead3ee487549949096d8ad2d
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Apr 6 09:10:18 2016 -0700

    X86-64: Prepare memset-vec-unaligned-erms.S
    
    Prepare memset-vec-unaligned-erms.S to make the SSE2 version as the
    default memset.
    
    	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
    	(MEMSET_CHK_SYMBOL): New.  Define if not defined.
    	(__bzero): Check VEC_SIZE == 16 instead of USE_MULTIARCH.
    	Disabled fro now.
    	Replace MEMSET_SYMBOL with MEMSET_CHK_SYMBOL on __memset_chk
    	symbols.  Properly check USE_MULTIARCH on __memset symbols.
    
    (cherry picked from commit 4af1bb06c59d24f35bf8dc55897838d926c05892)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index fe0f745..578a5ae 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -28,6 +28,10 @@
 
 #include <sysdep.h>
 
+#ifndef MEMSET_CHK_SYMBOL
+# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -66,8 +70,8 @@
 # error SECTION is not defined!
 #endif
 
-#if !defined USE_MULTIARCH && IS_IN (libc)
 	.section SECTION(.text),"ax",@progbits
+#if VEC_SIZE == 16 && IS_IN (libc) && 0
 ENTRY (__bzero)
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
@@ -78,10 +82,10 @@ weak_alias (__bzero, bzero)
 #endif
 
 #if defined SHARED && IS_IN (libc)
-ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
@@ -97,15 +101,16 @@ L(entry_from_bzero):
 	VMOVU	%VEC(0), (%rdi)
 	VZEROUPPER
 	ret
+#if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
-#if VEC_SIZE == 16
+# if VEC_SIZE == 16
 /* Only used to measure performance of REP STOSB.  */
 ENTRY (__memset_erms)
-#else
+# else
 /* Provide a symbol to debugger.  */
 ENTRY (MEMSET_SYMBOL (__memset, erms))
-#endif
+# endif
 L(stosb):
 	movq	%rdx, %rcx
 	movzbl	%sil, %eax
@@ -113,18 +118,18 @@ L(stosb):
 	rep stosb
 	movq	%rdx, %rax
 	ret
-#if VEC_SIZE == 16
+# if VEC_SIZE == 16
 END (__memset_erms)
-#else
+# else
 END (MEMSET_SYMBOL (__memset, erms))
-#endif
+# endif
 
-#if defined SHARED && IS_IN (libc)
-ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+# if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 	cmpq	%rdx, %rcx
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
-#endif
+END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+# endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
@@ -144,6 +149,7 @@ L(stosb_more_2x_vec):
 	/* Force 32-bit displacement to avoid long nop between
 	   instructions.  */
 	ja.d32	L(stosb)
+#endif
 	.p2align 4
 L(more_2x_vec):
 	cmpq  $(VEC_SIZE * 4), %rdx

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9a93bdbaff81edf67c5486c84f8098055e355abb

commit 9a93bdbaff81edf67c5486c84f8098055e355abb
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Apr 5 05:21:07 2016 -0700

    Force 32-bit displacement in memset-vec-unaligned-erms.S
    
    	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Force
    	32-bit displacement to avoid long nop between instructions.
    
    (cherry picked from commit ec0cac9a1f4094bd0db6f77c1b329e7a40eecc10)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9383517..fe0f745 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -159,10 +159,23 @@ L(return):
 	.p2align 4
 L(loop_start):
 	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	VMOVU.d32 %VEC(0), (%rdi)
+# else
 	VMOVU	%VEC(0), (%rdi)
+# endif
 	andq	$-(VEC_SIZE * 4), %rcx
+# if VEC_SIZE == 32
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	VMOVU.d32 %VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU.d32 %VEC(0), VEC_SIZE(%rdi)
+# else
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+# endif
 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
 	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5118e532600549ad0f56cb9b1a179b8eab70c483

commit 5118e532600549ad0f56cb9b1a179b8eab70c483
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Apr 5 05:19:05 2016 -0700

    Add a comment in memset-sse2-unaligned-erms.S
    
    	* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S: Add
    	a comment on VMOVU and VMOVA.
    
    (cherry picked from commit 696ac774847b80cf994438739478b0c3003b5958)

diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 2deba42..4bf3d36 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -1,6 +1,8 @@
 #if IS_IN (libc)
 # define VEC_SIZE	16
 # define VEC(i)		xmm##i
+/* Don't use movups and movaps since it will get larger nop paddings
+   for alignment.  */
 # define VMOVU		movdqu
 # define VMOVA		movdqa
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=06c6d4ae6ee7e5b83fd5868bef494def01f59292

commit 06c6d4ae6ee7e5b83fd5868bef494def01f59292
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Apr 3 14:32:20 2016 -0700

    Don't put SSE2/AVX/AVX512 memmove/memset in ld.so
    
    Since memmove and memset in ld.so don't use IFUNC, don't put SSE2, AVX
    and AVX512 memmove and memset in ld.so.
    
    	* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: Skip
    	if not in libc.
    	* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
    	Likewise.
    
    (cherry picked from commit 5cd7af016d8587ff53b20ba259746f97edbddbf7)

diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index 3a72c7e..44711c3 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -1,9 +1,11 @@
-#define VEC_SIZE	32
-#define VEC(i)		ymm##i
-#define VMOVU		vmovdqu
-#define VMOVA		vmovdqa
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define VEC(i)		ymm##i
+# define VMOVU		vmovdqu
+# define VMOVA		vmovdqa
 
-#define SECTION(p)		p##.avx
-#define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
+# define SECTION(p)		p##.avx
+# define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
 
-#include "memmove-vec-unaligned-erms.S"
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index 38358fa..c2c5293 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,4 +1,4 @@
-#ifdef HAVE_AVX512_ASM_SUPPORT
+#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
 # define VEC_SIZE	64
 # define VEC(i)		zmm##i
 # define VMOVU		vmovdqu64
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 52b9ae0..85214fe 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -1,9 +1,11 @@
-#define VEC_SIZE	16
-#define VEC(i)		xmm##i
-#define VMOVU		movdqu
-#define VMOVA		movdqa
+#if IS_IN (libc)
+# define VEC_SIZE	16
+# define VEC(i)		xmm##i
+# define VMOVU		movdqu
+# define VMOVA		movdqa
 
-#define SECTION(p)		p
-#define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
+# define SECTION(p)		p
+# define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
 
-#include "memmove-vec-unaligned-erms.S"
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index e0dc565..79975e0 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,14 +1,16 @@
-#define VEC_SIZE	32
-#define VEC(i)		ymm##i
-#define VMOVU		vmovdqu
-#define VMOVA		vmovdqa
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define VEC(i)		ymm##i
+# define VMOVU		vmovdqu
+# define VMOVA		vmovdqa
 
-#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
   movq r, %rax; \
   vpbroadcastb %xmm0, %ymm0
 
-#define SECTION(p)		p##.avx
-#define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+# define SECTION(p)		p##.avx
+# define MEMSET_SYMBOL(p,s)	p##_avx2_##s
 
-#include "memset-vec-unaligned-erms.S"
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 72f4095..f1b3cb2 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,4 +1,4 @@
-#ifdef HAVE_AVX512_ASM_SUPPORT
+#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
 # define VEC_SIZE	64
 # define VEC(i)		zmm##i
 # define VMOVU		vmovdqu64
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 437a858..2deba42 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -1,16 +1,18 @@
-#define VEC_SIZE	16
-#define VEC(i)		xmm##i
-#define VMOVU		movdqu
-#define VMOVA		movdqa
+#if IS_IN (libc)
+# define VEC_SIZE	16
+# define VEC(i)		xmm##i
+# define VMOVU		movdqu
+# define VMOVA		movdqa
 
-#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   movq r, %rax; \
   punpcklbw %xmm0, %xmm0; \
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
-#define SECTION(p)		p
-#define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+# define SECTION(p)		p
+# define MEMSET_SYMBOL(p,s)	p##_sse2_##s
 
-#include "memset-vec-unaligned-erms.S"
+# include "memset-vec-unaligned-erms.S"
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a96379797a7eecc1b709cad7b68981eb698783dc

commit a96379797a7eecc1b709cad7b68981eb698783dc
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Apr 3 12:38:25 2016 -0700

    Fix memmove-vec-unaligned-erms.S
    
    __mempcpy_erms and __memmove_erms can't be placed between __memmove_chk
    and __memmove it breaks __memmove_chk.
    
    Don't check source == destination first since it is less common.
    
    	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
    	(__mempcpy_erms, __memmove_erms): Moved before __mempcpy_chk
    	with unaligned_erms.
    	(__memmove_erms): Skip if source == destination.
    	(__memmove_unaligned_erms): Don't check source == destination
    	first.
    
    (cherry picked from commit ea2785e96fa503f3a2b5dd9f3a6ca65622b3c5f2)

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index cf645dd..66779a3 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -95,46 +95,30 @@ L(start):
 	ret
 END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
 
-# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
-# endif
-
-ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-	movq	%rdi, %rax
-	addq	%rdx, %rax
-	jmp	L(start_erms)
-END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
-
-# ifdef SHARED
-ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
-# endif
-
 # if VEC_SIZE == 16
 /* Only used to measure performance of REP MOVSB.  */
 #  ifdef SHARED
 ENTRY (__mempcpy_erms)
 	movq	%rdi, %rax
 	addq	%rdx, %rax
-	jmp	L(movsb)
+	jmp	L(start_movsb)
 END (__mempcpy_erms)
 #  endif
 
 ENTRY (__memmove_erms)
 	movq	%rdi, %rax
+L(start_movsb):
 	movq	%rdx, %rcx
 	cmpq	%rsi, %rdi
-	jbe	1f
+	jb	1f
+	/* Source == destination is less common.  */
+	je	2f
 	leaq	(%rsi,%rcx), %rdx
 	cmpq	%rdx, %rdi
 	jb	L(movsb_backward)
 1:
 	rep movsb
+2:
 	ret
 L(movsb_backward):
 	leaq	-1(%rdi,%rcx), %rdi
@@ -147,6 +131,26 @@ END (__memmove_erms)
 strong_alias (__memmove_erms, __memcpy_erms)
 # endif
 
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start_erms)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+# endif
+
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 	movq	%rdi, %rax
 L(start_erms):
@@ -166,8 +170,9 @@ L(return):
 
 L(movsb):
 	cmpq	%rsi, %rdi
-	je	L(nop)
 	jb	1f
+	/* Source == destination is less common.  */
+	je	L(nop)
 	leaq	(%rsi,%rdx), %r9
 	cmpq	%r9, %rdi
 	/* Avoid slow backward REP MOVSB.  */
@@ -191,8 +196,9 @@ L(movsb_more_2x_vec):
 L(more_2x_vec):
 	/* More than 2 * VEC.  */
 	cmpq	%rsi, %rdi
-	je	L(nop)
 	jb	L(copy_forward)
+	/* Source == destination is less common.  */
+	je	L(nop)
 	leaq	(%rsi,%rdx), %rcx
 	cmpq	%rcx, %rdi
 	jb	L(more_2x_vec_overlap)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cfb059c79729b26284863334c9aa04f0a3b967b9

commit cfb059c79729b26284863334c9aa04f0a3b967b9
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Apr 1 15:08:48 2016 -0700

    Remove Fast_Copy_Backward from Intel Core processors
    
    Intel Core i3, i5 and i7 processors have fast unaligned copy and
    copy backward is ignored.  Remove Fast_Copy_Backward from Intel Core
    processors to avoid confusion.
    
    	* sysdeps/x86/cpu-features.c (init_cpu_features): Don't set
    	bit_arch_Fast_Copy_Backward for Intel Core proessors.
    
    (cherry picked from commit 27d3ce1467990f89126e228559dec8f84b96c60e)

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index de75c79..963b845 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -176,11 +176,8 @@ init_cpu_features (struct cpu_features *cpu_features)
 	    case 0x2c:
 	    case 0x2e:
 	    case 0x2f:
-	      /* Rep string instructions, copy backward, unaligned loads
+	      /* Rep string instructions, unaligned load, unaligned copy,
 		 and pminub are fast on Intel Core i3, i5 and i7.  */
-#if index_arch_Fast_Rep_String != index_arch_Fast_Copy_Backward
-# error index_arch_Fast_Rep_String != index_arch_Fast_Copy_Backward
-#endif
 #if index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Load
 # error index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Load
 #endif
@@ -192,7 +189,6 @@ init_cpu_features (struct cpu_features *cpu_features)
 #endif
 	      cpu_features->feature[index_arch_Fast_Rep_String]
 		|= (bit_arch_Fast_Rep_String
-		    | bit_arch_Fast_Copy_Backward
 		    | bit_arch_Fast_Unaligned_Load
 		    | bit_arch_Fast_Unaligned_Copy
 		    | bit_arch_Prefer_PMINUB_for_stringop);

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=30c389be1af67c4d0716d207b6780c6169d1355f

commit 30c389be1af67c4d0716d207b6780c6169d1355f
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Mar 31 10:05:51 2016 -0700

    Add x86-64 memset with unaligned store and rep stosb
    
    Implement x86-64 memset with unaligned store and rep movsb.  Support
    16-byte, 32-byte and 64-byte vector register sizes.  A single file
    provides 2 implementations of memset, one with rep stosb and the other
    without rep stosb.  They share the same codes when size is between 2
    times of vector register size and REP_STOSB_THRESHOLD which defaults
    to 2KB.
    
    Key features:
    
    1. Use overlapping store to avoid branch.
    2. For size <= 4 times of vector register size, fully unroll the loop.
    3. For size > 4 times of vector register size, store 4 times of vector
    register size at a time.
    
    	[BZ #19881]
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
    	memset-avx512-unaligned-erms.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
    	__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
    	__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
    	__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
    	__memset_sse2_unaligned_erms, __memset_erms,
    	__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
    	__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
    	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
    	file.
    	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
    	Likewise.
    
    (cherry picked from commit 830566307f038387ca0af3fd327706a8d1a2f595)

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ef4dbc0..8878efb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memset-avx512-no-vzeroupper \
 		   memmove-sse2-unaligned-erms \
 		   memmove-avx-unaligned-erms \
-		   memmove-avx512-unaligned-erms
+		   memmove-avx512-unaligned-erms \
+		   memset-sse2-unaligned-erms \
+		   memset-avx2-unaligned-erms \
+		   memset-avx512-unaligned-erms
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 9204da4..1e880f6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -118,12 +118,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, __memset_chk,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_sse2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
+			      __memset_chk_sse2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_chk_avx2)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_chk_avx2_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_chk_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_chk_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memset_chk_avx512_no_vzeroupper)
 #endif
 	      )
@@ -131,12 +147,29 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
+	      IFUNC_IMPL_ADD (array, i, memset, 1,
+			      __memset_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset, 1,
+			      __memset_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX2_Usable),
 			      __memset_avx2)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memset_avx2_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memset_avx512_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memset_avx512_no_vzeroupper)
 #endif
 	     )
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
new file mode 100644
index 0000000..e0dc565
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -0,0 +1,14 @@
+#define VEC_SIZE	32
+#define VEC(i)		ymm##i
+#define VMOVU		vmovdqu
+#define VMOVA		vmovdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastb %xmm0, %ymm0
+
+#define SECTION(p)		p##.avx
+#define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
new file mode 100644
index 0000000..72f4095
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -0,0 +1,17 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE	64
+# define VEC(i)		zmm##i
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
+# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastb %xmm0, %xmm0; \
+  vpbroadcastq %xmm0, %zmm0
+
+# define SECTION(p)		p##.avx512
+# define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
new file mode 100644
index 0000000..437a858
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -0,0 +1,16 @@
+#define VEC_SIZE	16
+#define VEC(i)		xmm##i
+#define VMOVU		movdqu
+#define VMOVA		movdqa
+
+#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movd d, %xmm0; \
+  movq r, %rax; \
+  punpcklbw %xmm0, %xmm0; \
+  punpcklwd %xmm0, %xmm0; \
+  pshufd $0, %xmm0, %xmm0
+
+#define SECTION(p)		p
+#define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+
+#include "memset-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
new file mode 100644
index 0000000..9383517
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -0,0 +1,251 @@
+/* memset/bzero with unaligned store and rep stosb
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* memset is implemented as:
+   1. Use overlapping store to avoid branch.
+   2. Force 32-bit displacement for branches to avoid long nop between
+      instructions.
+   3. If size is less than VEC, use integer register stores.
+   4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
+   5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
+   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
+      4 VEC stores and store 4 * VEC at a time until done.  */
+
+#include <sysdep.h>
+
+#ifndef VZEROUPPER
+# if VEC_SIZE > 16
+#  define VZEROUPPER			vzeroupper
+# else
+#  define VZEROUPPER
+# endif
+#endif
+
+#ifndef VZEROUPPER_SHORT_RETURN
+# if VEC_SIZE > 16
+#  define VZEROUPPER_SHORT_RETURN	vzeroupper
+# else
+#  define VZEROUPPER_SHORT_RETURN	rep
+# endif
+#endif
+
+#ifndef MOVQ
+# if VEC_SIZE > 16
+#  define MOVQ				vmovq
+# else
+#  define MOVQ				movq
+# endif
+#endif
+
+/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
+   up REP STOSB operation, REP STOSB isn't faster on short data.  The
+   memset micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP STOSB becomes faster on processors with
+   Enhanced REP STOSB.  Since the stored value is fixed, larger register
+   size has minimal impact on threshold.  */
+#ifndef REP_STOSB_THRESHOLD
+# define REP_STOSB_THRESHOLD		2048
+#endif
+
+#ifndef SECTION
+# error SECTION is not defined!
+#endif
+
+#if !defined USE_MULTIARCH && IS_IN (libc)
+	.section SECTION(.text),"ax",@progbits
+ENTRY (__bzero)
+	movq	%rdi, %rax /* Set return value.  */
+	movq	%rsi, %rdx /* Set n.  */
+	pxor	%xmm0, %xmm0
+	jmp	L(entry_from_bzero)
+END (__bzero)
+weak_alias (__bzero, bzero)
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+L(memset_entry):
+	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+L(entry_from_bzero):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(0), (%rdi)
+	VZEROUPPER
+	ret
+END (MEMSET_SYMBOL (__memset, unaligned))
+
+#if VEC_SIZE == 16
+/* Only used to measure performance of REP STOSB.  */
+ENTRY (__memset_erms)
+#else
+/* Provide a symbol to debugger.  */
+ENTRY (MEMSET_SYMBOL (__memset, erms))
+#endif
+L(stosb):
+	movq	%rdx, %rcx
+	movzbl	%sil, %eax
+	movq	%rdi, %rdx
+	rep stosb
+	movq	%rdx, %rax
+	ret
+#if VEC_SIZE == 16
+END (__memset_erms)
+#else
+END (MEMSET_SYMBOL (__memset, erms))
+#endif
+
+#if defined SHARED && IS_IN (libc)
+ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
+#endif
+
+ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(stosb_more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(0), (%rdi)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(stosb_more_2x_vec):
+	cmpq	$REP_STOSB_THRESHOLD, %rdx
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	ja.d32	L(stosb)
+	.p2align 4
+L(more_2x_vec):
+	cmpq  $(VEC_SIZE * 4), %rdx
+	ja	L(loop_start)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+L(return):
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(loop_start):
+	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+	VMOVU	%VEC(0), (%rdi)
+	andq	$-(VEC_SIZE * 4), %rcx
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+	addq	%rdi, %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+	cmpq	%rdx, %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	je.d32	L(return)
+# else
+	je	L(return)
+# endif
+	.p2align 4
+L(loop):
+	VMOVA	%VEC(0), (%rcx)
+	VMOVA	%VEC(0), VEC_SIZE(%rcx)
+	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
+	addq	$(VEC_SIZE * 4), %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+	VZEROUPPER_SHORT_RETURN
+	ret
+L(less_vec):
+	/* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+	cmpb	$32, %dl
+	jae	L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+# endif
+	MOVQ	%xmm0, %rcx
+	cmpb	$8, %dl
+	jae	L(between_8_15)
+	cmpb	$4, %dl
+	jae	L(between_4_7)
+	cmpb	$1, %dl
+	ja	L(between_2_3)
+	jb	1f
+	movb	%cl, (%rdi)
+1:
+	VZEROUPPER
+	ret
+# if VEC_SIZE > 32
+	/* From 32 to 63.  No branch when size == 32.  */
+L(between_32_63):
+	vmovdqu	%ymm0, -32(%rdi,%rdx)
+	vmovdqu	%ymm0, (%rdi)
+	VZEROUPPER
+	ret
+# endif
+# if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+L(between_16_31):
+	vmovdqu	%xmm0, -16(%rdi,%rdx)
+	vmovdqu	%xmm0, (%rdi)
+	VZEROUPPER
+	ret
+# endif
+	/* From 8 to 15.  No branch when size == 8.  */
+L(between_8_15):
+	movq	%rcx, -8(%rdi,%rdx)
+	movq	%rcx, (%rdi)
+	VZEROUPPER
+	ret
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	%ecx, -4(%rdi,%rdx)
+	movl	%ecx, (%rdi)
+	VZEROUPPER
+	ret
+L(between_2_3):
+	/* From 2 to 3.  No branch when size == 2.  */
+	movw	%cx, -2(%rdi,%rdx)
+	movw	%cx, (%rdi)
+	VZEROUPPER
+	ret
+END (MEMSET_SYMBOL (__memset, unaligned_erms))

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=980d639b4ae58209843f09a29d86b0a8303b6650

commit 980d639b4ae58209843f09a29d86b0a8303b6650
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Mar 31 10:04:26 2016 -0700

    Add x86-64 memmove with unaligned load/store and rep movsb
    
    Implement x86-64 memmove with unaligned load/store and rep movsb.
    Support 16-byte, 32-byte and 64-byte vector register sizes.  When
    size <= 8 times of vector register size, there is no check for
    address overlap bewteen source and destination.  Since overhead for
    overlap check is small when size > 8 times of vector register size,
    memcpy is an alias of memmove.
    
    A single file provides 2 implementations of memmove, one with rep movsb
    and the other without rep movsb.  They share the same codes when size is
    between 2 times of vector register size and REP_MOVSB_THRESHOLD which
    is 2KB for 16-byte vector register size and scaled up by large vector
    register size.
    
    Key features:
    
    1. Use overlapping load and store to avoid branch.
    2. For size <= 8 times of vector register size, load  all sources into
    registers and store them together.
    3. If there is no address overlap bewteen source and destination, copy
    from both ends with 4 times of vector register size at a time.
    4. If address of destination > address of source, backward copy 8 times
    of vector register size at a time.
    5. Otherwise, forward copy 8 times of vector register size at a time.
    6. Use rep movsb only for forward copy.  Avoid slow backward rep movsb
    by fallbacking to backward copy 8 times of vector register size at a
    time.
    7. Skip when address of destination == address of source.
    
    	[BZ #19776]
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
    	memmove-avx512-unaligned-erms.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Test
    	__memmove_chk_avx512_unaligned_2,
    	__memmove_chk_avx512_unaligned_erms,
    	__memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
    	__memmove_chk_sse2_unaligned_2,
    	__memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
    	__memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
    	__memmove_avx512_unaligned_erms, __memmove_erms,
    	__memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
    	__memcpy_chk_avx512_unaligned_2,
    	__memcpy_chk_avx512_unaligned_erms,
    	__memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
    	__memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
    	__memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
    	__memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
    	__memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
    	__memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
    	__mempcpy_chk_avx512_unaligned_erms,
    	__mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
    	__mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms,
    	__mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms,
    	__mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms,
    	__mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms and
    	__mempcpy_erms.
    	* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
    	file.
    	* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
    	Likwise.
    	* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
    	Likwise.
    	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
    	Likwise.
    
    (cherry picked from commit 88b57b8ed41d5ecf2e1bdfc19556f9246a665ebb)

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 7fc89c2..ef4dbc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -20,7 +20,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
 		   strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
-		   memset-avx512-no-vzeroupper
+		   memset-avx512-no-vzeroupper \
+		   memmove-sse2-unaligned-erms \
+		   memmove-avx-unaligned-erms \
+		   memmove-avx512-unaligned-erms
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 188b6d3..9204da4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -52,17 +52,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memmove_chk_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_chk_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_chk_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_chk_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memmove_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
+			      __memmove_chk_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
 			      __memmove_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memmove.S.  */
@@ -70,15 +86,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memmove_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memmove_avx_unaligned_erms)
 #ifdef HAVE_AVX512_ASM_SUPPORT
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memmove_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memmove_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
 			      __memmove_ssse3)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1,
+			      __memmove_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memmove, 1,
+			      __memmove_sse2_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
@@ -267,17 +300,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memcpy_chk_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_chk_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_chk_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_chk_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
+			      __memcpy_chk_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
 			      __memcpy_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/memcpy.S.  */
@@ -285,6 +334,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __memcpy_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __memcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
 			      __memcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
@@ -293,8 +348,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __memcpy_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __memcpy_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+			      __memcpy_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+			      __memcpy_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S.  */
@@ -303,17 +369,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __mempcpy_chk_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_chk_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_chk_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_chk_avx_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_chk_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_chk_ssse3)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
+			      __mempcpy_chk_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
 			      __mempcpy_chk_sse2))
 
   /* Support sysdeps/x86_64/multiarch/mempcpy.S.  */
@@ -322,14 +404,31 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX512F_Usable),
 			      __mempcpy_avx512_no_vzeroupper)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_avx512_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __mempcpy_avx512_unaligned_erms)
 #endif
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      HAS_ARCH_FEATURE (AVX_Usable),
 			      __mempcpy_avx_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_avx_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      HAS_ARCH_FEATURE (AVX_Usable),
+			      __mempcpy_avx_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
 			      __mempcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+			      __mempcpy_sse2_unaligned_2)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
+			      __mempcpy_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
new file mode 100644
index 0000000..3a72c7e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -0,0 +1,9 @@
+#define VEC_SIZE	32
+#define VEC(i)		ymm##i
+#define VMOVU		vmovdqu
+#define VMOVA		vmovdqa
+
+#define SECTION(p)		p##.avx
+#define MEMMOVE_SYMBOL(p,s)	p##_avx_##s
+
+#include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
new file mode 100644
index 0000000..38358fa
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -0,0 +1,11 @@
+#ifdef HAVE_AVX512_ASM_SUPPORT
+# define VEC_SIZE	64
+# define VEC(i)		zmm##i
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
+# define SECTION(p)		p##.avx512
+# define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
new file mode 100644
index 0000000..52b9ae0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -0,0 +1,9 @@
+#define VEC_SIZE	16
+#define VEC(i)		xmm##i
+#define VMOVU		movdqu
+#define VMOVA		movdqa
+
+#define SECTION(p)		p
+#define MEMMOVE_SYMBOL(p,s)	p##_sse2_##s
+
+#include "memmove-vec-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
new file mode 100644
index 0000000..cf645dd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -0,0 +1,462 @@
+/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
+   Copyright (C) 2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* memmove/memcpy/mempcpy is implemented as:
+   1. Use overlapping load and store to avoid branch.
+   2. Use 8-bit or 32-bit displacements for branches and nop paddings
+      to avoid long nop between instructions.
+   3. Load all sources into registers and store them together to avoid
+      possible address overflap between source and destination.
+   4. If size is 2 * VEC_SIZE or less, load all sources into registers
+      and store them together.
+   5. If there is no address overflap, copy from both ends with
+      4 * VEC_SIZE at a time.
+   6. If size is 8 * VEC_SIZE or less, load all sources into registers
+      and store them together.
+   7. If address of destination > address of source, backward copy
+      8 * VEC_SIZE at a time.
+   8. Otherwise, forward copy 8 * VEC_SIZE at a time.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+# include "asm-syntax.h"
+
+# ifndef VZEROUPPER
+#  if VEC_SIZE > 16
+#   define VZEROUPPER vzeroupper
+#  else
+#   define VZEROUPPER
+#  endif
+# endif
+
+/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
+   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
+   memcpy micro benchmark in glibc shows that 2KB is the approximate
+   value above which REP MOVSB becomes faster than SSE2 optimization
+   on processors with Enhanced REP MOVSB.  Since larger register size
+   can move more data with a single load and store, the threshold is
+   higher with larger register size.  */
+# ifndef REP_MOVSB_THRESHOLD
+#  define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
+# endif
+
+# ifndef SECTION
+#  error SECTION is not defined!
+# endif
+	.section SECTION(.text),"ax",@progbits
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2))
+
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+	movq	%rdi, %rax
+L(start):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+	VZEROUPPER
+	ret
+END (MEMMOVE_SYMBOL (__memmove, unaligned_2))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms))
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start_erms)
+END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+
+# ifdef SHARED
+ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms))
+# endif
+
+# if VEC_SIZE == 16
+/* Only used to measure performance of REP MOVSB.  */
+#  ifdef SHARED
+ENTRY (__mempcpy_erms)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(movsb)
+END (__mempcpy_erms)
+#  endif
+
+ENTRY (__memmove_erms)
+	movq	%rdi, %rax
+	movq	%rdx, %rcx
+	cmpq	%rsi, %rdi
+	jbe	1f
+	leaq	(%rsi,%rcx), %rdx
+	cmpq	%rdx, %rdi
+	jb	L(movsb_backward)
+1:
+	rep movsb
+	ret
+L(movsb_backward):
+	leaq	-1(%rdi,%rcx), %rdi
+	leaq	-1(%rsi,%rcx), %rsi
+	std
+	rep movsb
+	cld
+	ret
+END (__memmove_erms)
+strong_alias (__memmove_erms, __memcpy_erms)
+# endif
+
+ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+	movq	%rdi, %rax
+L(start_erms):
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(movsb_more_2x_vec)
+L(last_2x_vec):
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
+L(return):
+	VZEROUPPER
+	ret
+
+L(movsb):
+	cmpq	%rsi, %rdi
+	je	L(nop)
+	jb	1f
+	leaq	(%rsi,%rdx), %r9
+	cmpq	%r9, %rdi
+	/* Avoid slow backward REP MOVSB.  */
+# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
+#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
+# endif
+	jb	L(more_8x_vec_backward)
+1:
+	movq	%rdx, %rcx
+	rep movsb
+L(nop):
+	ret
+
+	.p2align 4
+L(movsb_more_2x_vec):
+	cmpq	$REP_MOVSB_THRESHOLD, %rdx
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	ja.d32	L(movsb)
+	.p2align 4
+L(more_2x_vec):
+	/* More than 2 * VEC.  */
+	cmpq	%rsi, %rdi
+	je	L(nop)
+	jb	L(copy_forward)
+	leaq	(%rsi,%rdx), %rcx
+	cmpq	%rcx, %rdi
+	jb	L(more_2x_vec_overlap)
+L(copy_forward):
+	leaq	(%rdi,%rdx), %rcx
+	cmpq	%rcx, %rsi
+	jb	L(more_2x_vec_overlap)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	/* Force 32-bit displacement to avoid long nop between
+	   instructions.  */
+	jbe.d32	L(return)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(0)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(1)
+	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
+	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(1), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 8), %rdx
+# if  VEC_SIZE == 16
+	jbe	L(return)
+# else
+	/* Use 8-bit displacement to avoid long nop between
+	   instructions.  */
+	jbe	L(return_disp8)
+# endif
+	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+	addq	%rdi, %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+	andq	$-(VEC_SIZE * 4), %rcx
+	movq	%rcx, %r11
+	subq	%rdi, %r11
+	addq	%r11, %rsi
+	cmpq	%rdx, %rcx
+	/* Use 8-bit displacement to avoid long nop between
+	   instructions.  */
+	je	L(return_disp8)
+	movq	%rsi, %r10
+	subq	%rcx, %r10
+	leaq	VEC_SIZE(%r10), %r9
+	leaq	(VEC_SIZE * 2)(%r10), %r8
+	leaq	(VEC_SIZE * 3)(%r10), %r11
+	.p2align 4
+L(loop):
+	VMOVU	(%rcx,%r10), %VEC(0)
+	VMOVU	(%rcx,%r9), %VEC(1)
+	VMOVU	(%rcx,%r8), %VEC(2)
+	VMOVU	(%rcx,%r11), %VEC(3)
+	VMOVA	%VEC(0), (%rcx)
+	VMOVA	%VEC(1), VEC_SIZE(%rcx)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rcx)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rcx)
+	addq	$(VEC_SIZE * 4), %rcx
+	cmpq	%rcx, %rdx
+	jne	L(loop)
+L(return_disp8):
+	VZEROUPPER
+	ret
+L(less_vec):
+	/* Less than 1 VEC.  */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+#  error Unsupported VEC_SIZE!
+# endif
+# if VEC_SIZE > 32
+	cmpb	$32, %dl
+	jae	L(between_32_63)
+# endif
+# if VEC_SIZE > 16
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+# endif
+	cmpb	$8, %dl
+	jae	L(between_8_15)
+	cmpb	$4, %dl
+	jae	L(between_4_7)
+	cmpb	$1, %dl
+	ja	L(between_2_3)
+	jb	1f
+	movzbl	(%rsi), %ecx
+	movb	%cl, (%rdi)
+1:
+	ret
+# if VEC_SIZE > 32
+L(between_32_63):
+	/* From 32 to 63.  No branch when size == 32.  */
+	vmovdqu	(%rsi), %ymm0
+	vmovdqu	-32(%rsi,%rdx), %ymm1
+	vmovdqu	%ymm0, (%rdi)
+	vmovdqu	%ymm1, -32(%rdi,%rdx)
+	VZEROUPPER
+	ret
+# endif
+# if VEC_SIZE > 16
+	/* From 16 to 31.  No branch when size == 16.  */
+L(between_16_31):
+	vmovdqu	(%rsi), %xmm0
+	vmovdqu	-16(%rsi,%rdx), %xmm1
+	vmovdqu	%xmm0, (%rdi)
+	vmovdqu	%xmm1, -16(%rdi,%rdx)
+	ret
+# endif
+L(between_8_15):
+	/* From 8 to 15.  No branch when size == 8.  */
+	movq	-8(%rsi,%rdx), %rcx
+	movq	(%rsi), %rsi
+	movq	%rcx, -8(%rdi,%rdx)
+	movq	%rsi, (%rdi)
+	ret
+L(between_4_7):
+	/* From 4 to 7.  No branch when size == 4.  */
+	movl	-4(%rsi,%rdx), %ecx
+	movl	(%rsi), %esi
+	movl	%ecx, -4(%rdi,%rdx)
+	movl	%esi, (%rdi)
+	ret
+L(between_2_3):
+	/* From 2 to 3.  No branch when size == 2.  */
+	movzwl	-2(%rsi,%rdx), %ecx
+	movzwl	(%rsi), %esi
+	movw	%cx, -2(%rdi,%rdx)
+	movw	%si, (%rdi)
+	ret
+
+# if VEC_SIZE > 16
+	/* Align to 16 bytes to avoid long nop between instructions.  */
+	.p2align 4
+# endif
+L(more_2x_vec_overlap):
+	/* More than 2 * VEC and there is overlap bewteen destination
+	   and source.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+L(between_4x_vec_and_8x_vec):
+	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
+	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+	VZEROUPPER
+	ret
+L(last_4x_vec):
+	/* Copy from 2 * VEC to 4 * VEC. */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
+	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VZEROUPPER
+	ret
+L(between_0_and_4x_vec):
+	/* Copy from 0 to 4 * VEC. */
+	cmpl	$(VEC_SIZE * 2), %edx
+	jae	L(last_4x_vec)
+	/* Copy from 0 to 2 * VEC. */
+	cmpl	$VEC_SIZE, %edx
+	jae	L(last_2x_vec)
+	/* Copy from 0 to VEC. */
+	VZEROUPPER
+	jmp	L(less_vec)
+L(more_8x_vec):
+	cmpq	%rsi, %rdi
+	ja	L(more_8x_vec_backward)
+
+	.p2align 4
+L(loop_8x_vec_forward):
+	/* Copy 8 * VEC a time forward.  */
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	VMOVU	(VEC_SIZE * 4)(%rsi), %VEC(4)
+	VMOVU	(VEC_SIZE * 5)(%rsi), %VEC(5)
+	VMOVU	(VEC_SIZE * 6)(%rsi), %VEC(6)
+	VMOVU	(VEC_SIZE * 7)(%rsi), %VEC(7)
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(1), VEC_SIZE(%rdi)
+	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	VMOVU	%VEC(4), (VEC_SIZE * 4)(%rdi)
+	VMOVU	%VEC(5), (VEC_SIZE * 5)(%rdi)
+	VMOVU	%VEC(6), (VEC_SIZE * 6)(%rdi)
+	VMOVU	%VEC(7), (VEC_SIZE * 7)(%rdi)
+	addq	$(VEC_SIZE * 8), %rdi
+	addq	$(VEC_SIZE * 8), %rsi
+	subq	$(VEC_SIZE * 8), %rdx
+	cmpq	$(VEC_SIZE * 8), %rdx
+	je	L(between_4x_vec_and_8x_vec)
+	ja	L(loop_8x_vec_forward)
+	/* Less than 8 * VEC to copy.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(between_0_and_4x_vec)
+	jmp	L(between_4x_vec_and_8x_vec)
+
+	.p2align 4
+L(more_8x_vec_backward):
+	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
+	leaq	-VEC_SIZE(%rdi, %rdx), %r9
+
+	.p2align 4
+L(loop_8x_vec_backward):
+	/* Copy 8 * VEC a time backward.  */
+	VMOVU	(%rcx), %VEC(0)
+	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+	VMOVU	-(VEC_SIZE * 4)(%rcx), %VEC(4)
+	VMOVU	-(VEC_SIZE * 5)(%rcx), %VEC(5)
+	VMOVU	-(VEC_SIZE * 6)(%rcx), %VEC(6)
+	VMOVU	-(VEC_SIZE * 7)(%rcx), %VEC(7)
+	VMOVU	%VEC(0), (%r9)
+	VMOVU	%VEC(1), -VEC_SIZE(%r9)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%r9)
+	VMOVU	%VEC(3), -(VEC_SIZE * 3)(%r9)
+	VMOVU	%VEC(4), -(VEC_SIZE * 4)(%r9)
+	VMOVU	%VEC(5), -(VEC_SIZE * 5)(%r9)
+	VMOVU	%VEC(6), -(VEC_SIZE * 6)(%r9)
+	VMOVU	%VEC(7), -(VEC_SIZE * 7)(%r9)
+	subq	$(VEC_SIZE * 8), %rcx
+	subq	$(VEC_SIZE * 8), %r9
+	subq	$(VEC_SIZE * 8), %rdx
+	cmpq	$(VEC_SIZE * 8), %rdx
+	je	L(between_4x_vec_and_8x_vec)
+	ja	L(loop_8x_vec_backward)
+	/* Less than 8 * VEC to copy.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(between_0_and_4x_vec)
+	jmp	L(between_4x_vec_and_8x_vec)
+END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+
+# ifdef SHARED
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
+	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
+	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
+strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_2),
+	      MEMMOVE_SYMBOL (__memcpy, unaligned_2))
+strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2),
+	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_2))
+# endif
+
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=bf2bc5e5c9d7aa8af28b299ec26b8a37352730cc

commit bf2bc5e5c9d7aa8af28b299ec26b8a37352730cc
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Mar 28 19:22:59 2016 -0700

    Initial Enhanced REP MOVSB/STOSB (ERMS) support
    
    The newer Intel processors support Enhanced REP MOVSB/STOSB (ERMS) which
    has a feature bit in CPUID.  This patch adds the Enhanced REP MOVSB/STOSB
    (ERMS) bit to x86 cpu-features.
    
    	* sysdeps/x86/cpu-features.h (bit_cpu_ERMS): New.
    	(index_cpu_ERMS): Likewise.
    	(reg_ERMS): Likewise.
    
    (cherry picked from commit 0791f91dff9a77263fa8173b143d854cad902c6d)

diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index bfe1f4c..8f946c4 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -53,6 +53,7 @@
 #define bit_cpu_FMA4		(1 << 16)
 
 /* COMMON_CPUID_INDEX_7.  */
+#define bit_cpu_ERMS		(1 << 9)
 #define bit_cpu_RTM		(1 << 11)
 #define bit_cpu_AVX2		(1 << 5)
 #define bit_cpu_AVX512F		(1 << 16)
@@ -84,6 +85,7 @@
 # define index_cpu_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 # define index_cpu_AVX	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 # define index_cpu_AVX2	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
+# define index_cpu_ERMS	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
 
 # define index_arch_Fast_Rep_String	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Fast_Copy_Backward	FEATURE_INDEX_1*FEATURE_SIZE
@@ -228,6 +230,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_cpu_AVX2		COMMON_CPUID_INDEX_7
 # define index_cpu_AVX512F	COMMON_CPUID_INDEX_7
 # define index_cpu_AVX512DQ	COMMON_CPUID_INDEX_7
+# define index_cpu_ERMS		COMMON_CPUID_INDEX_7
 # define index_cpu_RTM		COMMON_CPUID_INDEX_7
 # define index_cpu_FMA		COMMON_CPUID_INDEX_1
 # define index_cpu_FMA4		COMMON_CPUID_INDEX_80000001
@@ -244,6 +247,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define reg_AVX2		ebx
 # define reg_AVX512F		ebx
 # define reg_AVX512DQ		ebx
+# define reg_ERMS		ebx
 # define reg_RTM		ebx
 # define reg_FMA		ecx
 # define reg_FMA4		ecx

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=7c244283ff12329b3bca9878b8edac3b3fe5c7bc

commit 7c244283ff12329b3bca9878b8edac3b3fe5c7bc
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Mar 28 13:15:59 2016 -0700

    Make __memcpy_avx512_no_vzeroupper an alias
    
    Since x86-64 memcpy-avx512-no-vzeroupper.S implements memmove, make
    __memcpy_avx512_no_vzeroupper an alias of __memmove_avx512_no_vzeroupper
    to reduce code size of libc.so.
    
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove
    	memcpy-avx512-no-vzeroupper.
    	* sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S: Renamed
    	to ...
    	* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: This.
    	(MEMCPY): Don't define.
    	(MEMCPY_CHK): Likewise.
    	(MEMPCPY): Likewise.
    	(MEMPCPY_CHK): Likewise.
    	(MEMPCPY_CHK): Renamed to ...
    	(__mempcpy_chk_avx512_no_vzeroupper): This.
    	(MEMPCPY_CHK): Renamed to ...
    	(__mempcpy_chk_avx512_no_vzeroupper): This.
    	(MEMCPY_CHK): Renamed to ...
    	(__memmove_chk_avx512_no_vzeroupper): This.
    	(MEMCPY): Renamed to ...
    	(__memmove_avx512_no_vzeroupper): This.
    	(__memcpy_avx512_no_vzeroupper): New alias.
    	(__memcpy_chk_avx512_no_vzeroupper): Likewise.
    
    (cherry picked from commit 064f01b10b57ff09cda7025f484b848c38ddd57a)

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 39c0905..7fc89c2 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,7 +8,7 @@ ifeq ($(subdir),string)
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
 		   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
-		   memcpy-avx512-no-vzeroupper memmove-ssse3 \
+		   memmove-ssse3 \
 		   memcpy-ssse3-back memmove-avx-unaligned \
 		   memcpy-avx-unaligned \
 		   memmove-ssse3-back \
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
deleted file mode 100644
index 285bb83..0000000
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,424 +0,0 @@
-/* memcpy optimized with AVX512 for KNL hardware.
-   Copyright (C) 2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc) \
-    && (defined SHARED \
-	|| defined USE_AS_MEMMOVE \
-	|| !defined USE_MULTIARCH)
-
-#include "asm-syntax.h"
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_avx512_no_vzeroupper
-# define MEMCPY_CHK	__memcpy_chk_avx512_no_vzeroupper
-# define MEMPCPY	__mempcpy_avx512_no_vzeroupper
-# define MEMPCPY_CHK	__mempcpy_chk_avx512_no_vzeroupper
-#endif
-
-	.section .text.avx512,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
-	movq	%rdi, %rax
-	addq	%rdx, %rax
-	jmp	L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
-	cmpq	%rdx, %rcx
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
-	mov	%rdi, %rax
-#ifdef USE_AS_MEMPCPY
-	add	%rdx, %rax
-#endif
-L(start):
-	lea	(%rsi, %rdx), %rcx
-	lea	(%rdi, %rdx), %r9
-	cmp	$512, %rdx
-	ja	L(512bytesormore)
-
-L(check):
-	cmp	$16, %rdx
-	jbe	L(less_16bytes)
-	cmp	$256, %rdx
-	jb	L(less_256bytes)
-	vmovups	(%rsi), %zmm0
-	vmovups 0x40(%rsi), %zmm1
-	vmovups 0x80(%rsi), %zmm2
-	vmovups 0xC0(%rsi), %zmm3
-	vmovups	-0x100(%rcx), %zmm4
-	vmovups -0xC0(%rcx), %zmm5
-	vmovups -0x80(%rcx), %zmm6
-	vmovups -0x40(%rcx), %zmm7
-	vmovups %zmm0, (%rdi)
-	vmovups %zmm1, 0x40(%rdi)
-	vmovups %zmm2, 0x80(%rdi)
-	vmovups %zmm3, 0xC0(%rdi)
-	vmovups	%zmm4, -0x100(%r9)
-	vmovups %zmm5, -0xC0(%r9)
-	vmovups %zmm6, -0x80(%r9)
-	vmovups %zmm7, -0x40(%r9)
-	ret
-
-L(less_256bytes):
-	cmp	$128, %dl
-	jb	L(less_128bytes)
-	vmovups	(%rsi), %zmm0
-	vmovups 0x40(%rsi), %zmm1
-	vmovups -0x80(%rcx), %zmm2
-	vmovups -0x40(%rcx), %zmm3
-	vmovups	%zmm0, (%rdi)
-	vmovups %zmm1, 0x40(%rdi)
-	vmovups %zmm2, -0x80(%r9)
-	vmovups %zmm3, -0x40(%r9)
-	ret
-
-L(less_128bytes):
-	cmp	$64, %dl
-	jb	L(less_64bytes)
-	vmovdqu (%rsi), %ymm0
-	vmovdqu 0x20(%rsi), %ymm1
-	vmovdqu -0x40(%rcx), %ymm2
-	vmovdqu -0x20(%rcx), %ymm3
-	vmovdqu %ymm0, (%rdi)
-	vmovdqu %ymm1, 0x20(%rdi)
-	vmovdqu %ymm2, -0x40(%r9)
-	vmovdqu %ymm3, -0x20(%r9)
-	ret
-
-L(less_64bytes):
-	cmp	$32, %dl
-	jb	L(less_32bytes)
-	vmovdqu	(%rsi), %ymm0
-	vmovdqu -0x20(%rcx), %ymm1
-	vmovdqu	%ymm0, (%rdi)
-	vmovdqu	%ymm1, -0x20(%r9)
-	ret
-
-L(less_32bytes):
-	vmovdqu (%rsi), %xmm0
-	vmovdqu -0x10(%rcx), %xmm1
-	vmovdqu %xmm0, (%rdi)
-	vmovdqu %xmm1, -0x10(%r9)
-	ret
-
-L(less_16bytes):
-	cmp	$8, %dl
-	jb	L(less_8bytes)
-	movq	(%rsi), %rsi
-	movq	-0x8(%rcx), %rcx
-	movq	%rsi, (%rdi)
-	movq	%rcx, -0x8(%r9)
-	ret
-
-L(less_8bytes):
-	cmp	$4, %dl
-	jb	L(less_4bytes)
-	mov	(%rsi), %esi
-	mov	-0x4(%rcx), %ecx
-	mov	%esi, (%rdi)
-	mov	%ecx, -0x4(%r9)
-	ret
-
-L(less_4bytes):
-	cmp	$2, %dl
-	jb	L(less_2bytes)
-	mov	(%rsi), %si
-	mov	-0x2(%rcx), %cx
-	mov	%si, (%rdi)
-	mov	%cx, -0x2(%r9)
-	ret
-
-L(less_2bytes):
-	cmp	$1, %dl
-	jb	L(less_1bytes)
-	mov	(%rsi), %cl
-	mov	%cl, (%rdi)
-L(less_1bytes):
-	ret
-
-L(512bytesormore):
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %r8
-#else
-	mov	__x86_shared_cache_size_half(%rip), %r8
-#endif
-	cmp	%r8, %rdx
-	jae	L(preloop_large)
-	cmp	$1024, %rdx
-	ja	L(1024bytesormore)
-	prefetcht1 (%rsi)
-	prefetcht1 0x40(%rsi)
-	prefetcht1 0x80(%rsi)
-	prefetcht1 0xC0(%rsi)
-	prefetcht1 0x100(%rsi)
-	prefetcht1 0x140(%rsi)
-	prefetcht1 0x180(%rsi)
-	prefetcht1 0x1C0(%rsi)
-	prefetcht1 -0x200(%rcx)
-	prefetcht1 -0x1C0(%rcx)
-	prefetcht1 -0x180(%rcx)
-	prefetcht1 -0x140(%rcx)
-	prefetcht1 -0x100(%rcx)
-	prefetcht1 -0xC0(%rcx)
-	prefetcht1 -0x80(%rcx)
-	prefetcht1 -0x40(%rcx)
-	vmovups	(%rsi), %zmm0
-	vmovups 0x40(%rsi), %zmm1
-	vmovups 0x80(%rsi), %zmm2
-	vmovups 0xC0(%rsi), %zmm3
-	vmovups	0x100(%rsi), %zmm4
-	vmovups 0x140(%rsi), %zmm5
-	vmovups 0x180(%rsi), %zmm6
-	vmovups 0x1C0(%rsi), %zmm7
-	vmovups	-0x200(%rcx), %zmm8
-	vmovups -0x1C0(%rcx), %zmm9
-	vmovups -0x180(%rcx), %zmm10
-	vmovups -0x140(%rcx), %zmm11
-	vmovups	-0x100(%rcx), %zmm12
-	vmovups -0xC0(%rcx), %zmm13
-	vmovups -0x80(%rcx), %zmm14
-	vmovups -0x40(%rcx), %zmm15
-	vmovups %zmm0, (%rdi)
-	vmovups %zmm1, 0x40(%rdi)
-	vmovups %zmm2, 0x80(%rdi)
-	vmovups %zmm3, 0xC0(%rdi)
-	vmovups %zmm4, 0x100(%rdi)
-	vmovups %zmm5, 0x140(%rdi)
-	vmovups %zmm6, 0x180(%rdi)
-	vmovups %zmm7, 0x1C0(%rdi)
-	vmovups	%zmm8, -0x200(%r9)
-	vmovups %zmm9, -0x1C0(%r9)
-	vmovups %zmm10, -0x180(%r9)
-	vmovups %zmm11, -0x140(%r9)
-	vmovups	%zmm12, -0x100(%r9)
-	vmovups %zmm13, -0xC0(%r9)
-	vmovups %zmm14, -0x80(%r9)
-	vmovups %zmm15, -0x40(%r9)
-	ret
-
-L(1024bytesormore):
-	cmp	%rsi, %rdi
-	ja	L(1024bytesormore_bkw)
-	sub	$512, %r9
-	vmovups -0x200(%rcx), %zmm8
-	vmovups -0x1C0(%rcx), %zmm9
-	vmovups -0x180(%rcx), %zmm10
-	vmovups -0x140(%rcx), %zmm11
-	vmovups	-0x100(%rcx), %zmm12
-	vmovups -0xC0(%rcx), %zmm13
-	vmovups -0x80(%rcx), %zmm14
-	vmovups -0x40(%rcx), %zmm15
-	prefetcht1 (%rsi)
-	prefetcht1 0x40(%rsi)
-	prefetcht1 0x80(%rsi)
-	prefetcht1 0xC0(%rsi)
-	prefetcht1 0x100(%rsi)
-	prefetcht1 0x140(%rsi)
-	prefetcht1 0x180(%rsi)
-	prefetcht1 0x1C0(%rsi)
-
-/* Loop with unaligned memory access.  */
-L(gobble_512bytes_loop):
-	vmovups	(%rsi), %zmm0
-	vmovups 0x40(%rsi), %zmm1
-	vmovups 0x80(%rsi), %zmm2
-	vmovups 0xC0(%rsi), %zmm3
-	vmovups	0x100(%rsi), %zmm4
-	vmovups 0x140(%rsi), %zmm5
-	vmovups 0x180(%rsi), %zmm6
-	vmovups 0x1C0(%rsi), %zmm7
-	add	$512, %rsi
-	prefetcht1 (%rsi)
-	prefetcht1 0x40(%rsi)
-	prefetcht1 0x80(%rsi)
-	prefetcht1 0xC0(%rsi)
-	prefetcht1 0x100(%rsi)
-	prefetcht1 0x140(%rsi)
-	prefetcht1 0x180(%rsi)
-	prefetcht1 0x1C0(%rsi)
-	vmovups	%zmm0, (%rdi)
-	vmovups %zmm1, 0x40(%rdi)
-	vmovups %zmm2, 0x80(%rdi)
-	vmovups %zmm3, 0xC0(%rdi)
-	vmovups	%zmm4, 0x100(%rdi)
-	vmovups %zmm5, 0x140(%rdi)
-	vmovups %zmm6, 0x180(%rdi)
-	vmovups %zmm7, 0x1C0(%rdi)
-	add	$512, %rdi
-	cmp	%r9, %rdi
-	jb	L(gobble_512bytes_loop)
-	vmovups %zmm8, (%r9)
-	vmovups %zmm9, 0x40(%r9)
-	vmovups %zmm10, 0x80(%r9)
-	vmovups %zmm11, 0xC0(%r9)
-	vmovups %zmm12, 0x100(%r9)
-	vmovups %zmm13, 0x140(%r9)
-	vmovups %zmm14, 0x180(%r9)
-	vmovups %zmm15, 0x1C0(%r9)
-	ret
-
-L(1024bytesormore_bkw):
-	add	$512, %rdi
-	vmovups	0x1C0(%rsi), %zmm8
-	vmovups 0x180(%rsi), %zmm9
-	vmovups 0x140(%rsi), %zmm10
-	vmovups 0x100(%rsi), %zmm11
-	vmovups	0xC0(%rsi), %zmm12
-	vmovups 0x80(%rsi), %zmm13
-	vmovups 0x40(%rsi), %zmm14
-	vmovups (%rsi), %zmm15
-	prefetcht1 -0x40(%rcx)
-	prefetcht1 -0x80(%rcx)
-	prefetcht1 -0xC0(%rcx)
-	prefetcht1 -0x100(%rcx)
-	prefetcht1 -0x140(%rcx)
-	prefetcht1 -0x180(%rcx)
-	prefetcht1 -0x1C0(%rcx)
-	prefetcht1 -0x200(%rcx)
-
-/* Backward loop with unaligned memory access.  */
-L(gobble_512bytes_loop_bkw):
-	vmovups -0x40(%rcx), %zmm0
-	vmovups -0x80(%rcx), %zmm1
-	vmovups -0xC0(%rcx), %zmm2
-	vmovups	-0x100(%rcx), %zmm3
-	vmovups -0x140(%rcx), %zmm4
-	vmovups -0x180(%rcx), %zmm5
-	vmovups -0x1C0(%rcx), %zmm6
-	vmovups	-0x200(%rcx), %zmm7
-	sub	$512, %rcx
-	prefetcht1 -0x40(%rcx)
-	prefetcht1 -0x80(%rcx)
-	prefetcht1 -0xC0(%rcx)
-	prefetcht1 -0x100(%rcx)
-	prefetcht1 -0x140(%rcx)
-	prefetcht1 -0x180(%rcx)
-	prefetcht1 -0x1C0(%rcx)
-	prefetcht1 -0x200(%rcx)
-	vmovups %zmm0, -0x40(%r9)
-	vmovups %zmm1, -0x80(%r9)
-	vmovups %zmm2, -0xC0(%r9)
-	vmovups	%zmm3, -0x100(%r9)
-	vmovups %zmm4, -0x140(%r9)
-	vmovups %zmm5, -0x180(%r9)
-	vmovups %zmm6, -0x1C0(%r9)
-	vmovups	%zmm7, -0x200(%r9)
-	sub	$512, %r9
-	cmp	%rdi, %r9
-	ja	L(gobble_512bytes_loop_bkw)
-	vmovups %zmm8, -0x40(%rdi)
-	vmovups %zmm9, -0x80(%rdi)
-	vmovups %zmm10, -0xC0(%rdi)
-	vmovups %zmm11, -0x100(%rdi)
-	vmovups %zmm12, -0x140(%rdi)
-	vmovups %zmm13, -0x180(%rdi)
-	vmovups %zmm14, -0x1C0(%rdi)
-	vmovups %zmm15, -0x200(%rdi)
-	ret
-
-L(preloop_large):
-	cmp	%rsi, %rdi
-	ja	L(preloop_large_bkw)
-	vmovups	(%rsi), %zmm4
-	vmovups	0x40(%rsi), %zmm5
-
-/* Align destination for access with non-temporal stores in the loop.  */
-	mov	%rdi, %r8
-	and	$-0x80, %rdi
-	add	$0x80, %rdi
-	sub	%rdi, %r8
-	sub	%r8, %rsi
-	add	%r8, %rdx
-L(gobble_256bytes_nt_loop):
-	prefetcht1 0x200(%rsi)
-	prefetcht1 0x240(%rsi)
-	prefetcht1 0x280(%rsi)
-	prefetcht1 0x2C0(%rsi)
-	prefetcht1 0x300(%rsi)
-	prefetcht1 0x340(%rsi)
-	prefetcht1 0x380(%rsi)
-	prefetcht1 0x3C0(%rsi)
-	vmovdqu64 (%rsi), %zmm0
-	vmovdqu64 0x40(%rsi), %zmm1
-	vmovdqu64 0x80(%rsi), %zmm2
-	vmovdqu64 0xC0(%rsi), %zmm3
-	vmovntdq %zmm0, (%rdi)
-	vmovntdq %zmm1, 0x40(%rdi)
-	vmovntdq %zmm2, 0x80(%rdi)
-	vmovntdq %zmm3, 0xC0(%rdi)
-	sub	$256, %rdx
-	add	$256, %rsi
-	add	$256, %rdi
-	cmp	$256, %rdx
-	ja	L(gobble_256bytes_nt_loop)
-	sfence
-	vmovups	%zmm4, (%rax)
-	vmovups	%zmm5, 0x40(%rax)
-	jmp	L(check)
-
-L(preloop_large_bkw):
-	vmovups -0x80(%rcx), %zmm4
-	vmovups -0x40(%rcx), %zmm5
-
-/* Align end of destination for access with non-temporal stores.  */
-	mov	%r9, %r8
-	and	$-0x80, %r9
-	sub	%r9, %r8
-	sub	%r8, %rcx
-	sub	%r8, %rdx
-	add	%r9, %r8
-L(gobble_256bytes_nt_loop_bkw):
-	prefetcht1 -0x400(%rcx)
-	prefetcht1 -0x3C0(%rcx)
-	prefetcht1 -0x380(%rcx)
-	prefetcht1 -0x340(%rcx)
-	prefetcht1 -0x300(%rcx)
-	prefetcht1 -0x2C0(%rcx)
-	prefetcht1 -0x280(%rcx)
-	prefetcht1 -0x240(%rcx)
-	vmovdqu64 -0x100(%rcx), %zmm0
-	vmovdqu64 -0xC0(%rcx), %zmm1
-	vmovdqu64 -0x80(%rcx), %zmm2
-	vmovdqu64 -0x40(%rcx), %zmm3
-	vmovntdq %zmm0,	-0x100(%r9)
-	vmovntdq %zmm1,	-0xC0(%r9)
-	vmovntdq %zmm2,	-0x80(%r9)
-	vmovntdq %zmm3,	-0x40(%r9)
-	sub	$256, %rdx
-	sub	$256, %rcx
-	sub	$256, %r9
-	cmp	$256, %rdx
-	ja	L(gobble_256bytes_nt_loop_bkw)
-	sfence
-	vmovups	%zmm4, -0x80(%r8)
-	vmovups	%zmm5, -0x40(%r8)
-	jmp	L(check)
-END (MEMCPY)
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
index 518d1fe..5b8ff57 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -1,4 +1,4 @@
-/* memmove optimized with AVX512 for KNL hardware.
+/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
    Copyright (C) 2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -16,7 +16,405 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_avx512_no_vzeroupper
-#define MEMCPY_CHK	__memmove_chk_avx512_no_vzeroupper
-#include "memcpy-avx512-no-vzeroupper.S"
+#include <sysdep.h>
+
+#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
+
+# include "asm-syntax.h"
+
+	.section .text.avx512,"ax",@progbits
+# if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_avx512_no_vzeroupper)
+
+ENTRY (__mempcpy_avx512_no_vzeroupper)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (__mempcpy_avx512_no_vzeroupper)
+# endif
+
+# ifdef SHARED
+ENTRY (__memmove_chk_avx512_no_vzeroupper)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (__memmove_chk_avx512_no_vzeroupper)
+# endif
+
+ENTRY (__memmove_avx512_no_vzeroupper)
+	mov	%rdi, %rax
+# ifdef USE_AS_MEMPCPY
+	add	%rdx, %rax
+# endif
+L(start):
+	lea	(%rsi, %rdx), %rcx
+	lea	(%rdi, %rdx), %r9
+	cmp	$512, %rdx
+	ja	L(512bytesormore)
+
+L(check):
+	cmp	$16, %rdx
+	jbe	L(less_16bytes)
+	cmp	$256, %rdx
+	jb	L(less_256bytes)
+	vmovups	(%rsi), %zmm0
+	vmovups 0x40(%rsi), %zmm1
+	vmovups 0x80(%rsi), %zmm2
+	vmovups 0xC0(%rsi), %zmm3
+	vmovups	-0x100(%rcx), %zmm4
+	vmovups -0xC0(%rcx), %zmm5
+	vmovups -0x80(%rcx), %zmm6
+	vmovups -0x40(%rcx), %zmm7
+	vmovups %zmm0, (%rdi)
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm3, 0xC0(%rdi)
+	vmovups	%zmm4, -0x100(%r9)
+	vmovups %zmm5, -0xC0(%r9)
+	vmovups %zmm6, -0x80(%r9)
+	vmovups %zmm7, -0x40(%r9)
+	ret
+
+L(less_256bytes):
+	cmp	$128, %dl
+	jb	L(less_128bytes)
+	vmovups	(%rsi), %zmm0
+	vmovups 0x40(%rsi), %zmm1
+	vmovups -0x80(%rcx), %zmm2
+	vmovups -0x40(%rcx), %zmm3
+	vmovups	%zmm0, (%rdi)
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, -0x80(%r9)
+	vmovups %zmm3, -0x40(%r9)
+	ret
+
+L(less_128bytes):
+	cmp	$64, %dl
+	jb	L(less_64bytes)
+	vmovdqu (%rsi), %ymm0
+	vmovdqu 0x20(%rsi), %ymm1
+	vmovdqu -0x40(%rcx), %ymm2
+	vmovdqu -0x20(%rcx), %ymm3
+	vmovdqu %ymm0, (%rdi)
+	vmovdqu %ymm1, 0x20(%rdi)
+	vmovdqu %ymm2, -0x40(%r9)
+	vmovdqu %ymm3, -0x20(%r9)
+	ret
+
+L(less_64bytes):
+	cmp	$32, %dl
+	jb	L(less_32bytes)
+	vmovdqu	(%rsi), %ymm0
+	vmovdqu -0x20(%rcx), %ymm1
+	vmovdqu	%ymm0, (%rdi)
+	vmovdqu	%ymm1, -0x20(%r9)
+	ret
+
+L(less_32bytes):
+	vmovdqu (%rsi), %xmm0
+	vmovdqu -0x10(%rcx), %xmm1
+	vmovdqu %xmm0, (%rdi)
+	vmovdqu %xmm1, -0x10(%r9)
+	ret
+
+L(less_16bytes):
+	cmp	$8, %dl
+	jb	L(less_8bytes)
+	movq	(%rsi), %rsi
+	movq	-0x8(%rcx), %rcx
+	movq	%rsi, (%rdi)
+	movq	%rcx, -0x8(%r9)
+	ret
+
+L(less_8bytes):
+	cmp	$4, %dl
+	jb	L(less_4bytes)
+	mov	(%rsi), %esi
+	mov	-0x4(%rcx), %ecx
+	mov	%esi, (%rdi)
+	mov	%ecx, -0x4(%r9)
+	ret
+
+L(less_4bytes):
+	cmp	$2, %dl
+	jb	L(less_2bytes)
+	mov	(%rsi), %si
+	mov	-0x2(%rcx), %cx
+	mov	%si, (%rdi)
+	mov	%cx, -0x2(%r9)
+	ret
+
+L(less_2bytes):
+	cmp	$1, %dl
+	jb	L(less_1bytes)
+	mov	(%rsi), %cl
+	mov	%cl, (%rdi)
+L(less_1bytes):
+	ret
+
+L(512bytesormore):
+# ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %r8
+# else
+	mov	__x86_shared_cache_size_half(%rip), %r8
+# endif
+	cmp	%r8, %rdx
+	jae	L(preloop_large)
+	cmp	$1024, %rdx
+	ja	L(1024bytesormore)
+	prefetcht1 (%rsi)
+	prefetcht1 0x40(%rsi)
+	prefetcht1 0x80(%rsi)
+	prefetcht1 0xC0(%rsi)
+	prefetcht1 0x100(%rsi)
+	prefetcht1 0x140(%rsi)
+	prefetcht1 0x180(%rsi)
+	prefetcht1 0x1C0(%rsi)
+	prefetcht1 -0x200(%rcx)
+	prefetcht1 -0x1C0(%rcx)
+	prefetcht1 -0x180(%rcx)
+	prefetcht1 -0x140(%rcx)
+	prefetcht1 -0x100(%rcx)
+	prefetcht1 -0xC0(%rcx)
+	prefetcht1 -0x80(%rcx)
+	prefetcht1 -0x40(%rcx)
+	vmovups	(%rsi), %zmm0
+	vmovups 0x40(%rsi), %zmm1
+	vmovups 0x80(%rsi), %zmm2
+	vmovups 0xC0(%rsi), %zmm3
+	vmovups	0x100(%rsi), %zmm4
+	vmovups 0x140(%rsi), %zmm5
+	vmovups 0x180(%rsi), %zmm6
+	vmovups 0x1C0(%rsi), %zmm7
+	vmovups	-0x200(%rcx), %zmm8
+	vmovups -0x1C0(%rcx), %zmm9
+	vmovups -0x180(%rcx), %zmm10
+	vmovups -0x140(%rcx), %zmm11
+	vmovups	-0x100(%rcx), %zmm12
+	vmovups -0xC0(%rcx), %zmm13
+	vmovups -0x80(%rcx), %zmm14
+	vmovups -0x40(%rcx), %zmm15
+	vmovups %zmm0, (%rdi)
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm3, 0xC0(%rdi)
+	vmovups %zmm4, 0x100(%rdi)
+	vmovups %zmm5, 0x140(%rdi)
+	vmovups %zmm6, 0x180(%rdi)
+	vmovups %zmm7, 0x1C0(%rdi)
+	vmovups	%zmm8, -0x200(%r9)
+	vmovups %zmm9, -0x1C0(%r9)
+	vmovups %zmm10, -0x180(%r9)
+	vmovups %zmm11, -0x140(%r9)
+	vmovups	%zmm12, -0x100(%r9)
+	vmovups %zmm13, -0xC0(%r9)
+	vmovups %zmm14, -0x80(%r9)
+	vmovups %zmm15, -0x40(%r9)
+	ret
+
+L(1024bytesormore):
+	cmp	%rsi, %rdi
+	ja	L(1024bytesormore_bkw)
+	sub	$512, %r9
+	vmovups -0x200(%rcx), %zmm8
+	vmovups -0x1C0(%rcx), %zmm9
+	vmovups -0x180(%rcx), %zmm10
+	vmovups -0x140(%rcx), %zmm11
+	vmovups	-0x100(%rcx), %zmm12
+	vmovups -0xC0(%rcx), %zmm13
+	vmovups -0x80(%rcx), %zmm14
+	vmovups -0x40(%rcx), %zmm15
+	prefetcht1 (%rsi)
+	prefetcht1 0x40(%rsi)
+	prefetcht1 0x80(%rsi)
+	prefetcht1 0xC0(%rsi)
+	prefetcht1 0x100(%rsi)
+	prefetcht1 0x140(%rsi)
+	prefetcht1 0x180(%rsi)
+	prefetcht1 0x1C0(%rsi)
+
+/* Loop with unaligned memory access.  */
+L(gobble_512bytes_loop):
+	vmovups	(%rsi), %zmm0
+	vmovups 0x40(%rsi), %zmm1
+	vmovups 0x80(%rsi), %zmm2
+	vmovups 0xC0(%rsi), %zmm3
+	vmovups	0x100(%rsi), %zmm4
+	vmovups 0x140(%rsi), %zmm5
+	vmovups 0x180(%rsi), %zmm6
+	vmovups 0x1C0(%rsi), %zmm7
+	add	$512, %rsi
+	prefetcht1 (%rsi)
+	prefetcht1 0x40(%rsi)
+	prefetcht1 0x80(%rsi)
+	prefetcht1 0xC0(%rsi)
+	prefetcht1 0x100(%rsi)
+	prefetcht1 0x140(%rsi)
+	prefetcht1 0x180(%rsi)
+	prefetcht1 0x1C0(%rsi)
+	vmovups	%zmm0, (%rdi)
+	vmovups %zmm1, 0x40(%rdi)
+	vmovups %zmm2, 0x80(%rdi)
+	vmovups %zmm3, 0xC0(%rdi)
+	vmovups	%zmm4, 0x100(%rdi)
+	vmovups %zmm5, 0x140(%rdi)
+	vmovups %zmm6, 0x180(%rdi)
+	vmovups %zmm7, 0x1C0(%rdi)
+	add	$512, %rdi
+	cmp	%r9, %rdi
+	jb	L(gobble_512bytes_loop)
+	vmovups %zmm8, (%r9)
+	vmovups %zmm9, 0x40(%r9)
+	vmovups %zmm10, 0x80(%r9)
+	vmovups %zmm11, 0xC0(%r9)
+	vmovups %zmm12, 0x100(%r9)
+	vmovups %zmm13, 0x140(%r9)
+	vmovups %zmm14, 0x180(%r9)
+	vmovups %zmm15, 0x1C0(%r9)
+	ret
+
+L(1024bytesormore_bkw):
+	add	$512, %rdi
+	vmovups	0x1C0(%rsi), %zmm8
+	vmovups 0x180(%rsi), %zmm9
+	vmovups 0x140(%rsi), %zmm10
+	vmovups 0x100(%rsi), %zmm11
+	vmovups	0xC0(%rsi), %zmm12
+	vmovups 0x80(%rsi), %zmm13
+	vmovups 0x40(%rsi), %zmm14
+	vmovups (%rsi), %zmm15
+	prefetcht1 -0x40(%rcx)
+	prefetcht1 -0x80(%rcx)
+	prefetcht1 -0xC0(%rcx)
+	prefetcht1 -0x100(%rcx)
+	prefetcht1 -0x140(%rcx)
+	prefetcht1 -0x180(%rcx)
+	prefetcht1 -0x1C0(%rcx)
+	prefetcht1 -0x200(%rcx)
+
+/* Backward loop with unaligned memory access.  */
+L(gobble_512bytes_loop_bkw):
+	vmovups -0x40(%rcx), %zmm0
+	vmovups -0x80(%rcx), %zmm1
+	vmovups -0xC0(%rcx), %zmm2
+	vmovups	-0x100(%rcx), %zmm3
+	vmovups -0x140(%rcx), %zmm4
+	vmovups -0x180(%rcx), %zmm5
+	vmovups -0x1C0(%rcx), %zmm6
+	vmovups	-0x200(%rcx), %zmm7
+	sub	$512, %rcx
+	prefetcht1 -0x40(%rcx)
+	prefetcht1 -0x80(%rcx)
+	prefetcht1 -0xC0(%rcx)
+	prefetcht1 -0x100(%rcx)
+	prefetcht1 -0x140(%rcx)
+	prefetcht1 -0x180(%rcx)
+	prefetcht1 -0x1C0(%rcx)
+	prefetcht1 -0x200(%rcx)
+	vmovups %zmm0, -0x40(%r9)
+	vmovups %zmm1, -0x80(%r9)
+	vmovups %zmm2, -0xC0(%r9)
+	vmovups	%zmm3, -0x100(%r9)
+	vmovups %zmm4, -0x140(%r9)
+	vmovups %zmm5, -0x180(%r9)
+	vmovups %zmm6, -0x1C0(%r9)
+	vmovups	%zmm7, -0x200(%r9)
+	sub	$512, %r9
+	cmp	%rdi, %r9
+	ja	L(gobble_512bytes_loop_bkw)
+	vmovups %zmm8, -0x40(%rdi)
+	vmovups %zmm9, -0x80(%rdi)
+	vmovups %zmm10, -0xC0(%rdi)
+	vmovups %zmm11, -0x100(%rdi)
+	vmovups %zmm12, -0x140(%rdi)
+	vmovups %zmm13, -0x180(%rdi)
+	vmovups %zmm14, -0x1C0(%rdi)
+	vmovups %zmm15, -0x200(%rdi)
+	ret
+
+L(preloop_large):
+	cmp	%rsi, %rdi
+	ja	L(preloop_large_bkw)
+	vmovups	(%rsi), %zmm4
+	vmovups	0x40(%rsi), %zmm5
+
+/* Align destination for access with non-temporal stores in the loop.  */
+	mov	%rdi, %r8
+	and	$-0x80, %rdi
+	add	$0x80, %rdi
+	sub	%rdi, %r8
+	sub	%r8, %rsi
+	add	%r8, %rdx
+L(gobble_256bytes_nt_loop):
+	prefetcht1 0x200(%rsi)
+	prefetcht1 0x240(%rsi)
+	prefetcht1 0x280(%rsi)
+	prefetcht1 0x2C0(%rsi)
+	prefetcht1 0x300(%rsi)
+	prefetcht1 0x340(%rsi)
+	prefetcht1 0x380(%rsi)
+	prefetcht1 0x3C0(%rsi)
+	vmovdqu64 (%rsi), %zmm0
+	vmovdqu64 0x40(%rsi), %zmm1
+	vmovdqu64 0x80(%rsi), %zmm2
+	vmovdqu64 0xC0(%rsi), %zmm3
+	vmovntdq %zmm0, (%rdi)
+	vmovntdq %zmm1, 0x40(%rdi)
+	vmovntdq %zmm2, 0x80(%rdi)
+	vmovntdq %zmm3, 0xC0(%rdi)
+	sub	$256, %rdx
+	add	$256, %rsi
+	add	$256, %rdi
+	cmp	$256, %rdx
+	ja	L(gobble_256bytes_nt_loop)
+	sfence
+	vmovups	%zmm4, (%rax)
+	vmovups	%zmm5, 0x40(%rax)
+	jmp	L(check)
+
+L(preloop_large_bkw):
+	vmovups -0x80(%rcx), %zmm4
+	vmovups -0x40(%rcx), %zmm5
+
+/* Align end of destination for access with non-temporal stores.  */
+	mov	%r9, %r8
+	and	$-0x80, %r9
+	sub	%r9, %r8
+	sub	%r8, %rcx
+	sub	%r8, %rdx
+	add	%r9, %r8
+L(gobble_256bytes_nt_loop_bkw):
+	prefetcht1 -0x400(%rcx)
+	prefetcht1 -0x3C0(%rcx)
+	prefetcht1 -0x380(%rcx)
+	prefetcht1 -0x340(%rcx)
+	prefetcht1 -0x300(%rcx)
+	prefetcht1 -0x2C0(%rcx)
+	prefetcht1 -0x280(%rcx)
+	prefetcht1 -0x240(%rcx)
+	vmovdqu64 -0x100(%rcx), %zmm0
+	vmovdqu64 -0xC0(%rcx), %zmm1
+	vmovdqu64 -0x80(%rcx), %zmm2
+	vmovdqu64 -0x40(%rcx), %zmm3
+	vmovntdq %zmm0,	-0x100(%r9)
+	vmovntdq %zmm1,	-0xC0(%r9)
+	vmovntdq %zmm2,	-0x80(%r9)
+	vmovntdq %zmm3,	-0x40(%r9)
+	sub	$256, %rdx
+	sub	$256, %rcx
+	sub	$256, %r9
+	cmp	$256, %rdx
+	ja	L(gobble_256bytes_nt_loop_bkw)
+	sfence
+	vmovups	%zmm4, -0x80(%r8)
+	vmovups	%zmm5, -0x40(%r8)
+	jmp	L(check)
+END (__memmove_avx512_no_vzeroupper)
+
+# ifdef SHARED
+strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
+strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
+# endif
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a9a14991fb2d3e69f80d25e9bbf2f6b0bcf11c3d

commit a9a14991fb2d3e69f80d25e9bbf2f6b0bcf11c3d
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Mar 28 13:13:36 2016 -0700

    Implement x86-64 multiarch mempcpy in memcpy
    
    Implement x86-64 multiarch mempcpy in memcpy to share most of code.  It
    reduces code size of libc.so.
    
    	[BZ #18858]
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove
    	mempcpy-ssse3, mempcpy-ssse3-back, mempcpy-avx-unaligned
    	and mempcpy-avx512-no-vzeroupper.
    	* sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S (MEMPCPY_CHK):
    	New.
    	(MEMPCPY): Likewise.
    	* sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
    	(MEMPCPY_CHK): New.
    	(MEMPCPY): Likewise.
    	* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S (MEMPCPY_CHK): New.
    	(MEMPCPY): Likewise.
    	* sysdeps/x86_64/multiarch/memcpy-ssse3.S (MEMPCPY_CHK): New.
    	(MEMPCPY): Likewise.
    	* sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S: Removed.
    	* sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S:
    	Likewise.
    	* sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S: Likewise.
    	* sysdeps/x86_64/multiarch/mempcpy-ssse3.S: Likewise.
    
    (cherry picked from commit c365e615f7429aee302f8af7bf07ae262278febb)

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d234f4a..39c0905 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,10 +8,10 @@ ifeq ($(subdir),string)
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
 		   memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
-		   memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
-		   memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
-		   memcpy-avx-unaligned mempcpy-avx-unaligned \
-		   mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
+		   memcpy-avx512-no-vzeroupper memmove-ssse3 \
+		   memcpy-ssse3-back memmove-avx-unaligned \
+		   memcpy-avx-unaligned \
+		   memmove-ssse3-back \
 		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index b615d06..dd4187f 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -25,11 +25,26 @@
 
 #include "asm-syntax.h"
 #ifndef MEMCPY
-# define MEMCPY	__memcpy_avx_unaligned
+# define MEMCPY		__memcpy_avx_unaligned
 # define MEMCPY_CHK	__memcpy_chk_avx_unaligned
+# define MEMPCPY	__mempcpy_avx_unaligned
+# define MEMPCPY_CHK	__mempcpy_chk_avx_unaligned
 #endif
 
 	.section .text.avx,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
@@ -42,6 +57,7 @@ ENTRY (MEMCPY)
 #ifdef USE_AS_MEMPCPY
 	add	%rdx, %rax
 #endif
+L(start):
 	cmp	$256, %rdx
 	jae	L(256bytesormore)
 	cmp	$16, %dl
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
index 3d567fc..285bb83 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -27,9 +27,24 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_avx512_no_vzeroupper
 # define MEMCPY_CHK	__memcpy_chk_avx512_no_vzeroupper
+# define MEMPCPY	__mempcpy_avx512_no_vzeroupper
+# define MEMPCPY_CHK	__mempcpy_chk_avx512_no_vzeroupper
 #endif
 
 	.section .text.avx512,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
@@ -42,6 +57,7 @@ ENTRY (MEMCPY)
 #ifdef USE_AS_MEMPCPY
 	add	%rdx, %rax
 #endif
+L(start):
 	lea	(%rsi, %rdx), %rcx
 	lea	(%rdi, %rdx), %r9
 	cmp	$512, %rdx
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 08b41e9..b4890f4 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -29,6 +29,8 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_ssse3_back
 # define MEMCPY_CHK	__memcpy_chk_ssse3_back
+# define MEMPCPY	__mempcpy_ssse3_back
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3_back
 #endif
 
 #define JMPTBL(I, B)	I - B
@@ -44,6 +46,19 @@
   ud2
 
 	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
@@ -66,6 +81,7 @@ ENTRY (MEMCPY)
 	BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
 L(copy_forward):
 #endif
+L(start):
 	cmp	$144, %rdx
 	jae	L(144bytesormore)
 
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 95de969..1ca88c0 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -29,6 +29,8 @@
 #ifndef MEMCPY
 # define MEMCPY		__memcpy_ssse3
 # define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMPCPY	__mempcpy_ssse3
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3
 #endif
 
 #define JMPTBL(I, B)	I - B
@@ -44,6 +46,19 @@
   ud2
 
 	.section .text.ssse3,"ax",@progbits
+#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ENTRY (MEMPCPY_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMPCPY_CHK)
+
+ENTRY (MEMPCPY)
+	movq	%rdi, %rax
+	addq	%rdx, %rax
+	jmp	L(start)
+END (MEMPCPY)
+#endif
+
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
@@ -66,6 +81,7 @@ ENTRY (MEMCPY)
 	jmp	L(copy_backward)
 L(copy_forward):
 #endif
+L(start):
 	cmp	$79, %rdx
 	lea     L(table_less_80bytes)(%rip), %r11
 	ja	L(80bytesormore)
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
deleted file mode 100644
index 241378e..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-avx-unaligned.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* mempcpy with AVX
-   Copyright (C) 2014-2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_avx_unaligned
-#define MEMCPY_CHK	__mempcpy_chk_avx_unaligned
-#include "memcpy-avx-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
deleted file mode 100644
index fcc0945..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S
+++ /dev/null
@@ -1,22 +0,0 @@
-/* mempcpy optimized with AVX512 for KNL hardware.
-   Copyright (C) 2016 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_avx512_no_vzeroupper
-#define MEMCPY_CHK	__mempcpy_chk_avx512_no_vzeroupper
-#include "memcpy-avx512-no-vzeroupper.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
deleted file mode 100644
index 82ffacb..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_ssse3_back
-#define MEMCPY_CHK	__mempcpy_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
diff --git a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S b/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
deleted file mode 100644
index 822d98e..0000000
--- a/sysdeps/x86_64/multiarch/mempcpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMPCPY
-#define MEMCPY		__mempcpy_ssse3
-#define MEMCPY_CHK	__mempcpy_chk_ssse3
-#include "memcpy-ssse3.S"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4fc09dabecee1b7cafdbca26ee7c63f68e53c229

commit 4fc09dabecee1b7cafdbca26ee7c63f68e53c229
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Mar 28 04:39:48 2016 -0700

    [x86] Add a feature bit: Fast_Unaligned_Copy
    
    On AMD processors, memcpy optimized with unaligned SSE load is
    slower than emcpy optimized with aligned SSSE3 while other string
    functions are faster with unaligned SSE load.  A feature bit,
    Fast_Unaligned_Copy, is added to select memcpy optimized with
    unaligned SSE load.
    
    	[BZ #19583]
    	* sysdeps/x86/cpu-features.c (init_cpu_features): Set
    	Fast_Unaligned_Copy with Fast_Unaligned_Load for Intel
    	processors.  Set Fast_Copy_Backward for AMD Excavator
    	processors.
    	* sysdeps/x86/cpu-features.h (bit_arch_Fast_Unaligned_Copy):
    	New.
    	(index_arch_Fast_Unaligned_Copy): Likewise.
    	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check
    	Fast_Unaligned_Copy instead of Fast_Unaligned_Load.
    
    (cherry picked from commit e41b395523040fcb58c7d378475720c2836d280c)

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index c8f81ef..de75c79 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -153,8 +153,12 @@ init_cpu_features (struct cpu_features *cpu_features)
 #if index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
 # error index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
 #endif
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
+#endif
 	      cpu_features->feature[index_arch_Fast_Unaligned_Load]
 		|= (bit_arch_Fast_Unaligned_Load
+		    | bit_arch_Fast_Unaligned_Copy
 		    | bit_arch_Prefer_PMINUB_for_stringop
 		    | bit_arch_Slow_SSE4_2);
 	      break;
@@ -183,10 +187,14 @@ init_cpu_features (struct cpu_features *cpu_features)
 #if index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
 # error index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
 #endif
+#if index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
+# error index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
+#endif
 	      cpu_features->feature[index_arch_Fast_Rep_String]
 		|= (bit_arch_Fast_Rep_String
 		    | bit_arch_Fast_Copy_Backward
 		    | bit_arch_Fast_Unaligned_Load
+		    | bit_arch_Fast_Unaligned_Copy
 		    | bit_arch_Prefer_PMINUB_for_stringop);
 	      break;
 	    }
@@ -220,10 +228,14 @@ init_cpu_features (struct cpu_features *cpu_features)
 
       if (family == 0x15)
 	{
+#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
+#endif
 	  /* "Excavator"   */
 	  if (model >= 0x60 && model <= 0x7f)
 	    cpu_features->feature[index_arch_Fast_Unaligned_Load]
-	      |= bit_arch_Fast_Unaligned_Load;
+	      |= (bit_arch_Fast_Unaligned_Load
+		  | bit_arch_Fast_Copy_Backward);
 	}
     }
   else
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index e06eb7e..bfe1f4c 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -35,6 +35,7 @@
 #define bit_arch_I686				(1 << 15)
 #define bit_arch_Prefer_MAP_32BIT_EXEC		(1 << 16)
 #define bit_arch_Prefer_No_VZEROUPPER		(1 << 17)
+#define bit_arch_Fast_Unaligned_Copy		(1 << 18)
 
 /* CPUID Feature flags.  */
 
@@ -101,6 +102,7 @@
 # define index_arch_I686		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Fast_Unaligned_Copy	FEATURE_INDEX_1*FEATURE_SIZE
 
 
 # if defined (_LIBC) && !IS_IN (nonlib)
@@ -265,6 +267,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_arch_I686		FEATURE_INDEX_1
 # define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
 # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
+# define index_arch_Fast_Unaligned_Copy	FEATURE_INDEX_1
 
 #endif	/* !__ASSEMBLER__ */
 
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 8882590..5b045d7 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -42,7 +42,7 @@ ENTRY(__new_memcpy)
 	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 	jnz	2f
 	lea	__memcpy_sse2_unaligned(%rip), %RAX_LP
-	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	HAS_ARCH_FEATURE (Fast_Unaligned_Copy)
 	jnz	2f
 	lea	__memcpy_sse2(%rip), %RAX_LP
 	HAS_CPU_FEATURE (SSSE3)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=75f2d47e459a6bf5656a938e5c63f8b581eb3ee6

commit 75f2d47e459a6bf5656a938e5c63f8b581eb3ee6
Author: Florian Weimer <fweimer@redhat.com>
Date:   Fri Mar 25 11:11:42 2016 +0100

    tst-audit10: Fix compilation on compilers without bit_AVX512F [BZ #19860]
    
    	[BZ# 19860]
    	* sysdeps/x86_64/tst-audit10.c (avx512_enabled): Always return
    	zero if the compiler does not provide the AVX512F bit.
    
    (cherry picked from commit f327f5b47be57bc05a4077344b381016c1bb2c11)

diff --git a/sysdeps/x86_64/tst-audit10.c b/sysdeps/x86_64/tst-audit10.c
index a487b40..0df2275 100644
--- a/sysdeps/x86_64/tst-audit10.c
+++ b/sysdeps/x86_64/tst-audit10.c
@@ -17,13 +17,13 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <cpuid.h>
-#include <cpu-features.h>
 
 int tst_audit10_aux (void);
 
 static int
 avx512_enabled (void)
 {
+#ifdef bit_AVX512F
   unsigned int eax, ebx, ecx, edx;
 
   if (__get_cpuid (1, &eax, &ebx, &ecx, &edx) == 0
@@ -38,6 +38,9 @@ avx512_enabled (void)
 
   /* Verify that ZMM, YMM and XMM states are enabled.  */
   return (eax & 0xe6) == 0xe6;
+#else
+  return 0;
+#endif
 }
 
 static int

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=96c7375cb8b6f1875d9865f2ae92ecacf5f5e6fa

commit 96c7375cb8b6f1875d9865f2ae92ecacf5f5e6fa
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Mar 22 08:36:16 2016 -0700

    Don't set %rcx twice before "rep movsb"
    
    	* sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S (MEMCPY):
    	Don't set %rcx twice before "rep movsb".
    
    (cherry picked from commit 3c9a4cd16cbc7b79094fec68add2df66061ab5d7)

diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index 74fed18..b615d06 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -234,7 +234,6 @@ L(gobble_data_movsb):
 	cmp	%rcx, %rdx
 	jae	L(gobble_big_data_fwd)
 	mov	%rdx, %rcx
-	mov	%rdx, %rcx
 	rep	movsb
 	ret
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c273f613b0cc779ee33cc33d20941d271316e483

commit c273f613b0cc779ee33cc33d20941d271316e483
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Mar 22 07:46:56 2016 -0700

    Set index_arch_AVX_Fast_Unaligned_Load only for Intel processors
    
    Since only Intel processors with AVX2 have fast unaligned load, we
    should set index_arch_AVX_Fast_Unaligned_Load only for Intel processors.
    
    Move AVX, AVX2, AVX512, FMA and FMA4 detection into get_common_indeces
    and call get_common_indeces for other processors.
    
    Add CPU_FEATURES_CPU_P and CPU_FEATURES_ARCH_P to aoid loading
    GLRO(dl_x86_cpu_features) in cpu-features.c.
    
    	[BZ #19583]
    	* sysdeps/x86/cpu-features.c (get_common_indeces): Remove
    	inline.  Check family before setting family, model and
    	extended_model.  Set AVX, AVX2, AVX512, FMA and FMA4 usable
    	bits here.
    	(init_cpu_features): Replace HAS_CPU_FEATURE and
    	HAS_ARCH_FEATURE with CPU_FEATURES_CPU_P and
    	CPU_FEATURES_ARCH_P.  Set index_arch_AVX_Fast_Unaligned_Load
    	for Intel processors with usable AVX2.  Call get_common_indeces
    	for other processors with family == NULL.
    	* sysdeps/x86/cpu-features.h (CPU_FEATURES_CPU_P): New macro.
    	(CPU_FEATURES_ARCH_P): Likewise.
    	(HAS_CPU_FEATURE): Use CPU_FEATURES_CPU_P.
    	(HAS_ARCH_FEATURE): Use CPU_FEATURES_ARCH_P.
    
    (cherry picked from commit f781a9e96138d8839663af5e88649ab1fbed74f8)

diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 1787716..c8f81ef 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -19,23 +19,79 @@
 #include <cpuid.h>
 #include <cpu-features.h>
 
-static inline void
+static void
 get_common_indeces (struct cpu_features *cpu_features,
 		    unsigned int *family, unsigned int *model,
 		    unsigned int *extended_model)
 {
-  unsigned int eax;
-  __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx,
-	   cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx,
-	   cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx);
-  GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].eax = eax;
-  *family = (eax >> 8) & 0x0f;
-  *model = (eax >> 4) & 0x0f;
-  *extended_model = (eax >> 12) & 0xf0;
-  if (*family == 0x0f)
+  if (family)
     {
-      *family += (eax >> 20) & 0xff;
-      *model += *extended_model;
+      unsigned int eax;
+      __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx,
+	       cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx,
+	       cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx);
+      cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax = eax;
+      *family = (eax >> 8) & 0x0f;
+      *model = (eax >> 4) & 0x0f;
+      *extended_model = (eax >> 12) & 0xf0;
+      if (*family == 0x0f)
+	{
+	  *family += (eax >> 20) & 0xff;
+	  *model += *extended_model;
+	}
+    }
+
+  if (cpu_features->max_cpuid >= 7)
+    __cpuid_count (7, 0,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);
+
+  /* Can we call xgetbv?  */
+  if (CPU_FEATURES_CPU_P (cpu_features, OSXSAVE))
+    {
+      unsigned int xcrlow;
+      unsigned int xcrhigh;
+      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
+      /* Is YMM and XMM state usable?  */
+      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
+	  (bit_YMM_state | bit_XMM_state))
+	{
+	  /* Determine if AVX is usable.  */
+	  if (CPU_FEATURES_CPU_P (cpu_features, AVX))
+	    cpu_features->feature[index_arch_AVX_Usable]
+	      |= bit_arch_AVX_Usable;
+	  /* Determine if AVX2 is usable.  */
+	  if (CPU_FEATURES_CPU_P (cpu_features, AVX2))
+	    cpu_features->feature[index_arch_AVX2_Usable]
+	      |= bit_arch_AVX2_Usable;
+	  /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
+	     ZMM16-ZMM31 state are enabled.  */
+	  if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
+			 | bit_ZMM16_31_state)) ==
+	      (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
+	    {
+	      /* Determine if AVX512F is usable.  */
+	      if (CPU_FEATURES_CPU_P (cpu_features, AVX512F))
+		{
+		  cpu_features->feature[index_arch_AVX512F_Usable]
+		    |= bit_arch_AVX512F_Usable;
+		  /* Determine if AVX512DQ is usable.  */
+		  if (CPU_FEATURES_CPU_P (cpu_features, AVX512DQ))
+		    cpu_features->feature[index_arch_AVX512DQ_Usable]
+		      |= bit_arch_AVX512DQ_Usable;
+		}
+	    }
+	  /* Determine if FMA is usable.  */
+	  if (CPU_FEATURES_CPU_P (cpu_features, FMA))
+	    cpu_features->feature[index_arch_FMA_Usable]
+	      |= bit_arch_FMA_Usable;
+	  /* Determine if FMA4 is usable.  */
+	  if (CPU_FEATURES_CPU_P (cpu_features, FMA4))
+	    cpu_features->feature[index_arch_FMA4_Usable]
+	      |= bit_arch_FMA4_Usable;
+	}
     }
 }
 
@@ -135,6 +191,12 @@ init_cpu_features (struct cpu_features *cpu_features)
 	      break;
 	    }
 	}
+
+      /* Unaligned load with 256-bit AVX registers are faster on
+	 Intel processors with AVX2.  */
+      if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
+	cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
+	  |= bit_arch_AVX_Fast_Unaligned_Load;
     }
   /* This spells out "AuthenticAMD".  */
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
@@ -165,73 +227,19 @@ init_cpu_features (struct cpu_features *cpu_features)
 	}
     }
   else
-    kind = arch_kind_other;
+    {
+      kind = arch_kind_other;
+      get_common_indeces (cpu_features, NULL, NULL, NULL);
+    }
 
   /* Support i586 if CX8 is available.  */
-  if (HAS_CPU_FEATURE (CX8))
+  if (CPU_FEATURES_CPU_P (cpu_features, CX8))
     cpu_features->feature[index_arch_I586] |= bit_arch_I586;
 
   /* Support i686 if CMOV is available.  */
-  if (HAS_CPU_FEATURE (CMOV))
+  if (CPU_FEATURES_CPU_P (cpu_features, CMOV))
     cpu_features->feature[index_arch_I686] |= bit_arch_I686;
 
-  if (cpu_features->max_cpuid >= 7)
-    __cpuid_count (7, 0,
-		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
-		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
-		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
-		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);
-
-  /* Can we call xgetbv?  */
-  if (HAS_CPU_FEATURE (OSXSAVE))
-    {
-      unsigned int xcrlow;
-      unsigned int xcrhigh;
-      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
-      /* Is YMM and XMM state usable?  */
-      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
-	  (bit_YMM_state | bit_XMM_state))
-	{
-	  /* Determine if AVX is usable.  */
-	  if (HAS_CPU_FEATURE (AVX))
-	    cpu_features->feature[index_arch_AVX_Usable]
-	      |= bit_arch_AVX_Usable;
-#if index_arch_AVX2_Usable != index_arch_AVX_Fast_Unaligned_Load
-# error index_arch_AVX2_Usable != index_arch_AVX_Fast_Unaligned_Load
-#endif
-	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
-	     AVX registers are faster on processors with AVX2.  */
-	  if (HAS_CPU_FEATURE (AVX2))
-	    cpu_features->feature[index_arch_AVX2_Usable]
-	      |= bit_arch_AVX2_Usable | bit_arch_AVX_Fast_Unaligned_Load;
-	  /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
-	     ZMM16-ZMM31 state are enabled.  */
-	  if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
-			 | bit_ZMM16_31_state)) ==
-	      (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
-	    {
-	      /* Determine if AVX512F is usable.  */
-	      if (HAS_CPU_FEATURE (AVX512F))
-		{
-		  cpu_features->feature[index_arch_AVX512F_Usable]
-		    |= bit_arch_AVX512F_Usable;
-		  /* Determine if AVX512DQ is usable.  */
-		  if (HAS_CPU_FEATURE (AVX512DQ))
-		    cpu_features->feature[index_arch_AVX512DQ_Usable]
-		      |= bit_arch_AVX512DQ_Usable;
-		}
-	    }
-	  /* Determine if FMA is usable.  */
-	  if (HAS_CPU_FEATURE (FMA))
-	    cpu_features->feature[index_arch_FMA_Usable]
-	      |= bit_arch_FMA_Usable;
-	  /* Determine if FMA4 is usable.  */
-	  if (HAS_CPU_FEATURE (FMA4))
-	    cpu_features->feature[index_arch_FMA4_Usable]
-	      |= bit_arch_FMA4_Usable;
-	}
-    }
-
 #if !HAS_CPUID
 no_cpuid:
 #endif
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 0624a92..e06eb7e 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -204,11 +204,17 @@ extern const struct cpu_features *__get_cpu_features (void)
 # endif
 
 
+/* Only used directly in cpu-features.c.  */
+# define CPU_FEATURES_CPU_P(ptr, name) \
+  ((ptr->cpuid[index_cpu_##name].reg_##name & (bit_cpu_##name)) != 0)
+# define CPU_FEATURES_ARCH_P(ptr, name) \
+  ((ptr->feature[index_arch_##name] & (bit_arch_##name)) != 0)
+
 /* HAS_* evaluates to true if we may use the feature at runtime.  */
 # define HAS_CPU_FEATURE(name) \
-  ((__get_cpu_features ()->cpuid[index_cpu_##name].reg_##name & (bit_cpu_##name)) != 0)
+   CPU_FEATURES_CPU_P (__get_cpu_features (), name)
 # define HAS_ARCH_FEATURE(name) \
-  ((__get_cpu_features ()->feature[index_arch_##name] & (bit_arch_##name)) != 0)
+   CPU_FEATURES_ARCH_P (__get_cpu_features (), name)
 
 # define index_cpu_CX8		COMMON_CPUID_INDEX_1
 # define index_cpu_CMOV		COMMON_CPUID_INDEX_1

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c858d10a4e7fd682f2e7083836e4feacc2d580f4

commit c858d10a4e7fd682f2e7083836e4feacc2d580f4
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu Mar 10 05:26:46 2016 -0800

    Add _arch_/_cpu_ to index_*/bit_* in x86 cpu-features.h
    
    index_* and bit_* macros are used to access cpuid and feature arrays o
    struct cpu_features.  It is very easy to use bits and indices of cpuid
    array on feature array, especially in assembly codes.  For example,
    sysdeps/i386/i686/multiarch/bcopy.S has
    
    	HAS_CPU_FEATURE (Fast_Rep_String)
    
    which should be
    
    	HAS_ARCH_FEATURE (Fast_Rep_String)
    
    We change index_* and bit_* to index_cpu_*/index_arch_* and
    bit_cpu_*/bit_arch_* so that we can catch such error at build time.
    
    	[BZ #19762]
    	* sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h
    	(EXTRA_LD_ENVVARS): Add _arch_ to index_*/bit_*.
    	* sysdeps/x86/cpu-features.c (init_cpu_features): Likewise.
    	* sysdeps/x86/cpu-features.h (bit_*): Renamed to ...
    	(bit_arch_*): This for feature array.
    	(bit_*): Renamed to ...
    	(bit_cpu_*): This for cpu array.
    	(index_*): Renamed to ...
    	(index_arch_*): This for feature array.
    	(index_*): Renamed to ...
    	(index_cpu_*): This for cpu array.
    	[__ASSEMBLER__] (HAS_FEATURE): Add and use field.
    	[__ASSEMBLER__] (HAS_CPU_FEATURE)): Pass cpu to HAS_FEATURE.
    	[__ASSEMBLER__] (HAS_ARCH_FEATURE)): Pass arch to HAS_FEATURE.
    	[!__ASSEMBLER__] (HAS_CPU_FEATURE): Replace index_##name and
    	bit_##name with index_cpu_##name and bit_cpu_##name.
    	[!__ASSEMBLER__] (HAS_ARCH_FEATURE): Replace index_##name and
    	bit_##name with index_arch_##name and bit_arch_##name.
    
    (cherry picked from commit 6aa3e97e2530f9917f504eb4146af119a3f27229)

diff --git a/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h b/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h
index a759934..1c67050 100644
--- a/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h
+++ b/sysdeps/unix/sysv/linux/x86_64/64/dl-librecon.h
@@ -30,10 +30,10 @@
    is always disabled for SUID programs and can be enabled by setting
    environment variable, LD_PREFER_MAP_32BIT_EXEC.  */
 #define EXTRA_LD_ENVVARS \
-  case 21:							      \
-    if (memcmp (envline, "PREFER_MAP_32BIT_EXEC", 21) == 0)	      \
-      GLRO(dl_x86_cpu_features).feature[index_Prefer_MAP_32BIT_EXEC]  \
-	|= bit_Prefer_MAP_32BIT_EXEC;				      \
+  case 21:								  \
+    if (memcmp (envline, "PREFER_MAP_32BIT_EXEC", 21) == 0)		  \
+      GLRO(dl_x86_cpu_features).feature[index_arch_Prefer_MAP_32BIT_EXEC] \
+	|= bit_arch_Prefer_MAP_32BIT_EXEC;				  \
     break;
 
 /* Extra unsecure variables.  The names are all stuffed in a single
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 218ff2b..1787716 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -75,13 +75,14 @@ init_cpu_features (struct cpu_features *cpu_features)
 	    case 0x1c:
 	    case 0x26:
 	      /* BSF is slow on Atom.  */
-	      cpu_features->feature[index_Slow_BSF] |= bit_Slow_BSF;
+	      cpu_features->feature[index_arch_Slow_BSF]
+		|= bit_arch_Slow_BSF;
 	      break;
 
 	    case 0x57:
 	      /* Knights Landing.  Enable Silvermont optimizations.  */
-	      cpu_features->feature[index_Prefer_No_VZEROUPPER]
-		|= bit_Prefer_No_VZEROUPPER;
+	      cpu_features->feature[index_arch_Prefer_No_VZEROUPPER]
+		|= bit_arch_Prefer_No_VZEROUPPER;
 
 	    case 0x37:
 	    case 0x4a:
@@ -90,22 +91,22 @@ init_cpu_features (struct cpu_features *cpu_features)
 	    case 0x5d:
 	      /* Unaligned load versions are faster than SSSE3
 		 on Silvermont.  */
-#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
-# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
+#if index_arch_Fast_Unaligned_Load != index_arch_Prefer_PMINUB_for_stringop
+# error index_arch_Fast_Unaligned_Load != index_arch_Prefer_PMINUB_for_stringop
 #endif
-#if index_Fast_Unaligned_Load != index_Slow_SSE4_2
-# error index_Fast_Unaligned_Load != index_Slow_SSE4_2
+#if index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
+# error index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
 #endif
-	      cpu_features->feature[index_Fast_Unaligned_Load]
-		|= (bit_Fast_Unaligned_Load
-		    | bit_Prefer_PMINUB_for_stringop
-		    | bit_Slow_SSE4_2);
+	      cpu_features->feature[index_arch_Fast_Unaligned_Load]
+		|= (bit_arch_Fast_Unaligned_Load
+		    | bit_arch_Prefer_PMINUB_for_stringop
+		    | bit_arch_Slow_SSE4_2);
 	      break;
 
 	    default:
 	      /* Unknown family 0x06 processors.  Assuming this is one
 		 of Core i3/i5/i7 processors if AVX is available.  */
-	      if ((ecx & bit_AVX) == 0)
+	      if ((ecx & bit_cpu_AVX) == 0)
 		break;
 
 	    case 0x1a:
@@ -117,20 +118,20 @@ init_cpu_features (struct cpu_features *cpu_features)
 	    case 0x2f:
 	      /* Rep string instructions, copy backward, unaligned loads
 		 and pminub are fast on Intel Core i3, i5 and i7.  */
-#if index_Fast_Rep_String != index_Fast_Copy_Backward
-# error index_Fast_Rep_String != index_Fast_Copy_Backward
+#if index_arch_Fast_Rep_String != index_arch_Fast_Copy_Backward
+# error index_arch_Fast_Rep_String != index_arch_Fast_Copy_Backward
 #endif
-#if index_Fast_Rep_String != index_Fast_Unaligned_Load
-# error index_Fast_Rep_String != index_Fast_Unaligned_Load
+#if index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Load
+# error index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Load
 #endif
-#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
-# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
+#if index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
+# error index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
 #endif
-	      cpu_features->feature[index_Fast_Rep_String]
-		|= (bit_Fast_Rep_String
-		    | bit_Fast_Copy_Backward
-		    | bit_Fast_Unaligned_Load
-		    | bit_Prefer_PMINUB_for_stringop);
+	      cpu_features->feature[index_arch_Fast_Rep_String]
+		|= (bit_arch_Fast_Rep_String
+		    | bit_arch_Fast_Copy_Backward
+		    | bit_arch_Fast_Unaligned_Load
+		    | bit_arch_Prefer_PMINUB_for_stringop);
 	      break;
 	    }
 	}
@@ -159,8 +160,8 @@ init_cpu_features (struct cpu_features *cpu_features)
 	{
 	  /* "Excavator"   */
 	  if (model >= 0x60 && model <= 0x7f)
-	    cpu_features->feature[index_Fast_Unaligned_Load]
-	      |= bit_Fast_Unaligned_Load;
+	    cpu_features->feature[index_arch_Fast_Unaligned_Load]
+	      |= bit_arch_Fast_Unaligned_Load;
 	}
     }
   else
@@ -168,11 +169,11 @@ init_cpu_features (struct cpu_features *cpu_features)
 
   /* Support i586 if CX8 is available.  */
   if (HAS_CPU_FEATURE (CX8))
-    cpu_features->feature[index_I586] |= bit_I586;
+    cpu_features->feature[index_arch_I586] |= bit_arch_I586;
 
   /* Support i686 if CMOV is available.  */
   if (HAS_CPU_FEATURE (CMOV))
-    cpu_features->feature[index_I686] |= bit_I686;
+    cpu_features->feature[index_arch_I686] |= bit_arch_I686;
 
   if (cpu_features->max_cpuid >= 7)
     __cpuid_count (7, 0,
@@ -193,15 +194,16 @@ init_cpu_features (struct cpu_features *cpu_features)
 	{
 	  /* Determine if AVX is usable.  */
 	  if (HAS_CPU_FEATURE (AVX))
-	    cpu_features->feature[index_AVX_Usable] |= bit_AVX_Usable;
-#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
-# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+	    cpu_features->feature[index_arch_AVX_Usable]
+	      |= bit_arch_AVX_Usable;
+#if index_arch_AVX2_Usable != index_arch_AVX_Fast_Unaligned_Load
+# error index_arch_AVX2_Usable != index_arch_AVX_Fast_Unaligned_Load
 #endif
 	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
 	     AVX registers are faster on processors with AVX2.  */
 	  if (HAS_CPU_FEATURE (AVX2))
-	    cpu_features->feature[index_AVX2_Usable]
-	      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
+	    cpu_features->feature[index_arch_AVX2_Usable]
+	      |= bit_arch_AVX2_Usable | bit_arch_AVX_Fast_Unaligned_Load;
 	  /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
 	     ZMM16-ZMM31 state are enabled.  */
 	  if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
@@ -211,20 +213,22 @@ init_cpu_features (struct cpu_features *cpu_features)
 	      /* Determine if AVX512F is usable.  */
 	      if (HAS_CPU_FEATURE (AVX512F))
 		{
-		  cpu_features->feature[index_AVX512F_Usable]
-		    |= bit_AVX512F_Usable;
+		  cpu_features->feature[index_arch_AVX512F_Usable]
+		    |= bit_arch_AVX512F_Usable;
 		  /* Determine if AVX512DQ is usable.  */
 		  if (HAS_CPU_FEATURE (AVX512DQ))
-		    cpu_features->feature[index_AVX512DQ_Usable]
-		      |= bit_AVX512DQ_Usable;
+		    cpu_features->feature[index_arch_AVX512DQ_Usable]
+		      |= bit_arch_AVX512DQ_Usable;
 		}
 	    }
 	  /* Determine if FMA is usable.  */
 	  if (HAS_CPU_FEATURE (FMA))
-	    cpu_features->feature[index_FMA_Usable] |= bit_FMA_Usable;
+	    cpu_features->feature[index_arch_FMA_Usable]
+	      |= bit_arch_FMA_Usable;
 	  /* Determine if FMA4 is usable.  */
 	  if (HAS_CPU_FEATURE (FMA4))
-	    cpu_features->feature[index_FMA4_Usable] |= bit_FMA4_Usable;
+	    cpu_features->feature[index_arch_FMA4_Usable]
+	      |= bit_arch_FMA4_Usable;
 	}
     }
 
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index e354920..0624a92 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -18,48 +18,48 @@
 #ifndef cpu_features_h
 #define cpu_features_h
 
-#define bit_Fast_Rep_String		(1 << 0)
-#define bit_Fast_Copy_Backward		(1 << 1)
-#define bit_Slow_BSF			(1 << 2)
-#define bit_Fast_Unaligned_Load		(1 << 4)
-#define bit_Prefer_PMINUB_for_stringop	(1 << 5)
-#define bit_AVX_Usable			(1 << 6)
-#define bit_FMA_Usable			(1 << 7)
-#define bit_FMA4_Usable			(1 << 8)
-#define bit_Slow_SSE4_2			(1 << 9)
-#define bit_AVX2_Usable			(1 << 10)
-#define bit_AVX_Fast_Unaligned_Load	(1 << 11)
-#define bit_AVX512F_Usable		(1 << 12)
-#define bit_AVX512DQ_Usable		(1 << 13)
-#define bit_I586			(1 << 14)
-#define bit_I686			(1 << 15)
-#define bit_Prefer_MAP_32BIT_EXEC	(1 << 16)
-#define bit_Prefer_No_VZEROUPPER	(1 << 17)
+#define bit_arch_Fast_Rep_String		(1 << 0)
+#define bit_arch_Fast_Copy_Backward		(1 << 1)
+#define bit_arch_Slow_BSF			(1 << 2)
+#define bit_arch_Fast_Unaligned_Load		(1 << 4)
+#define bit_arch_Prefer_PMINUB_for_stringop	(1 << 5)
+#define bit_arch_AVX_Usable			(1 << 6)
+#define bit_arch_FMA_Usable			(1 << 7)
+#define bit_arch_FMA4_Usable			(1 << 8)
+#define bit_arch_Slow_SSE4_2			(1 << 9)
+#define bit_arch_AVX2_Usable			(1 << 10)
+#define bit_arch_AVX_Fast_Unaligned_Load	(1 << 11)
+#define bit_arch_AVX512F_Usable			(1 << 12)
+#define bit_arch_AVX512DQ_Usable		(1 << 13)
+#define bit_arch_I586				(1 << 14)
+#define bit_arch_I686				(1 << 15)
+#define bit_arch_Prefer_MAP_32BIT_EXEC		(1 << 16)
+#define bit_arch_Prefer_No_VZEROUPPER		(1 << 17)
 
 /* CPUID Feature flags.  */
 
 /* COMMON_CPUID_INDEX_1.  */
-#define bit_CX8		(1 << 8)
-#define bit_CMOV	(1 << 15)
-#define bit_SSE2	(1 << 26)
-#define bit_SSSE3	(1 << 9)
-#define bit_SSE4_1	(1 << 19)
-#define bit_SSE4_2	(1 << 20)
-#define bit_OSXSAVE	(1 << 27)
-#define bit_AVX		(1 << 28)
-#define bit_POPCOUNT	(1 << 23)
-#define bit_FMA		(1 << 12)
-#define bit_FMA4	(1 << 16)
+#define bit_cpu_CX8		(1 << 8)
+#define bit_cpu_CMOV		(1 << 15)
+#define bit_cpu_SSE2		(1 << 26)
+#define bit_cpu_SSSE3		(1 << 9)
+#define bit_cpu_SSE4_1		(1 << 19)
+#define bit_cpu_SSE4_2		(1 << 20)
+#define bit_cpu_OSXSAVE		(1 << 27)
+#define bit_cpu_AVX		(1 << 28)
+#define bit_cpu_POPCOUNT	(1 << 23)
+#define bit_cpu_FMA		(1 << 12)
+#define bit_cpu_FMA4		(1 << 16)
 
 /* COMMON_CPUID_INDEX_7.  */
-#define bit_RTM		(1 << 11)
-#define bit_AVX2	(1 << 5)
-#define bit_AVX512F	(1 << 16)
-#define bit_AVX512DQ	(1 << 17)
+#define bit_cpu_RTM		(1 << 11)
+#define bit_cpu_AVX2		(1 << 5)
+#define bit_cpu_AVX512F		(1 << 16)
+#define bit_cpu_AVX512DQ	(1 << 17)
 
 /* XCR0 Feature flags.  */
-#define bit_XMM_state  (1 << 1)
-#define bit_YMM_state  (2 << 1)
+#define bit_XMM_state		(1 << 1)
+#define bit_YMM_state		(2 << 1)
 #define bit_Opmask_state	(1 << 5)
 #define bit_ZMM0_15_state	(1 << 6)
 #define bit_ZMM16_31_state	(1 << 7)
@@ -75,32 +75,32 @@
 # include <ifunc-defines.h>
 # include <rtld-global-offsets.h>
 
-# define index_CX8	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
-# define index_CMOV	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
-# define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
-# define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_SSE4_1	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_SSE4_2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_AVX	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_AVX2	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
+# define index_cpu_CX8	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
+# define index_cpu_CMOV	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
+# define index_cpu_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
+# define index_cpu_SSSE3 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_cpu_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_cpu_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_cpu_AVX	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_cpu_AVX2	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
 
-# define index_Fast_Rep_String		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Fast_Copy_Backward	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Slow_BSF			FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_FMA_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_FMA4_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX2_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX512F_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX512DQ_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_I586			FEATURE_INDEX_1*FEATURE_SIZE
-# define index_I686			FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Prefer_MAP_32BIT_EXEC	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Prefer_No_VZEROUPPER	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Fast_Rep_String	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Fast_Copy_Backward	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Slow_BSF		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_AVX_Usable		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_FMA_Usable		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_FMA4_Usable		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_AVX2_Usable		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_AVX_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_AVX512F_Usable	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_AVX512DQ_Usable	FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_I586		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_I686		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1*FEATURE_SIZE
 
 
 # if defined (_LIBC) && !IS_IN (nonlib)
@@ -108,19 +108,21 @@
 #   ifdef SHARED
 #    if IS_IN (rtld)
 #     define LOAD_RTLD_GLOBAL_RO_RDX
-#     define HAS_FEATURE(offset, name) \
-  testl $(bit_##name), _rtld_local_ro+offset+(index_##name)(%rip)
+#     define HAS_FEATURE(offset, field, name) \
+  testl $(bit_##field##_##name), \
+	_rtld_local_ro+offset+(index_##field##_##name)(%rip)
 #    else
 #      define LOAD_RTLD_GLOBAL_RO_RDX \
   mov _rtld_global_ro@GOTPCREL(%rip), %RDX_LP
-#     define HAS_FEATURE(offset, name) \
-  testl $(bit_##name), \
-	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##name)(%rdx)
+#     define HAS_FEATURE(offset, field, name) \
+  testl $(bit_##field##_##name), \
+	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##field##_##name)(%rdx)
 #    endif
 #   else /* SHARED */
 #    define LOAD_RTLD_GLOBAL_RO_RDX
-#    define HAS_FEATURE(offset, name) \
-  testl $(bit_##name), _dl_x86_cpu_features+offset+(index_##name)(%rip)
+#    define HAS_FEATURE(offset, field, name) \
+  testl $(bit_##field##_##name), \
+	_dl_x86_cpu_features+offset+(index_##field##_##name)(%rip)
 #   endif /* !SHARED */
 #  else  /* __x86_64__ */
 #   ifdef SHARED
@@ -129,22 +131,24 @@
 #    if IS_IN (rtld)
 #    define LOAD_GOT_AND_RTLD_GLOBAL_RO \
   LOAD_PIC_REG(dx)
-#     define HAS_FEATURE(offset, name) \
-  testl $(bit_##name), offset+(index_##name)+_rtld_local_ro@GOTOFF(%edx)
+#     define HAS_FEATURE(offset, field, name) \
+  testl $(bit_##field##_##name), \
+	offset+(index_##field##_##name)+_rtld_local_ro@GOTOFF(%edx)
 #    else
 #     define LOAD_GOT_AND_RTLD_GLOBAL_RO \
   LOAD_PIC_REG(dx); \
   mov _rtld_global_ro@GOT(%edx), %ecx
-#     define HAS_FEATURE(offset, name) \
-  testl $(bit_##name), \
-	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##name)(%ecx)
+#     define HAS_FEATURE(offset, field, name) \
+  testl $(bit_##field##_##name), \
+	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##field##_##name)(%ecx)
 #    endif
 #   else  /* SHARED */
 #    define LOAD_FUNC_GOT_EAX(func) \
   leal func, %eax
 #    define LOAD_GOT_AND_RTLD_GLOBAL_RO
-#    define HAS_FEATURE(offset, name) \
-  testl $(bit_##name), _dl_x86_cpu_features+offset+(index_##name)
+#    define HAS_FEATURE(offset, field, name) \
+  testl $(bit_##field##_##name), \
+	_dl_x86_cpu_features+offset+(index_##field##_##name)
 #   endif /* !SHARED */
 #  endif /* !__x86_64__ */
 # else /* _LIBC && !nonlib */
@@ -152,8 +156,8 @@
 # endif /* !_LIBC || nonlib */
 
 /* HAS_* evaluates to true if we may use the feature at runtime.  */
-# define HAS_CPU_FEATURE(name)	HAS_FEATURE (CPUID_OFFSET, name)
-# define HAS_ARCH_FEATURE(name) HAS_FEATURE (FEATURE_OFFSET, name)
+# define HAS_CPU_FEATURE(name)	HAS_FEATURE (CPUID_OFFSET, cpu, name)
+# define HAS_ARCH_FEATURE(name) HAS_FEATURE (FEATURE_OFFSET, arch, name)
 
 #else	/* __ASSEMBLER__ */
 
@@ -202,25 +206,25 @@ extern const struct cpu_features *__get_cpu_features (void)
 
 /* HAS_* evaluates to true if we may use the feature at runtime.  */
 # define HAS_CPU_FEATURE(name) \
-  ((__get_cpu_features ()->cpuid[index_##name].reg_##name & (bit_##name)) != 0)
+  ((__get_cpu_features ()->cpuid[index_cpu_##name].reg_##name & (bit_cpu_##name)) != 0)
 # define HAS_ARCH_FEATURE(name) \
-  ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
+  ((__get_cpu_features ()->feature[index_arch_##name] & (bit_arch_##name)) != 0)
 
-# define index_CX8		COMMON_CPUID_INDEX_1
-# define index_CMOV		COMMON_CPUID_INDEX_1
-# define index_SSE2		COMMON_CPUID_INDEX_1
-# define index_SSSE3		COMMON_CPUID_INDEX_1
-# define index_SSE4_1		COMMON_CPUID_INDEX_1
-# define index_SSE4_2		COMMON_CPUID_INDEX_1
-# define index_AVX		COMMON_CPUID_INDEX_1
-# define index_AVX2		COMMON_CPUID_INDEX_7
-# define index_AVX512F		COMMON_CPUID_INDEX_7
-# define index_AVX512DQ		COMMON_CPUID_INDEX_7
-# define index_RTM		COMMON_CPUID_INDEX_7
-# define index_FMA		COMMON_CPUID_INDEX_1
-# define index_FMA4		COMMON_CPUID_INDEX_80000001
-# define index_POPCOUNT		COMMON_CPUID_INDEX_1
-# define index_OSXSAVE		COMMON_CPUID_INDEX_1
+# define index_cpu_CX8		COMMON_CPUID_INDEX_1
+# define index_cpu_CMOV		COMMON_CPUID_INDEX_1
+# define index_cpu_SSE2		COMMON_CPUID_INDEX_1
+# define index_cpu_SSSE3	COMMON_CPUID_INDEX_1
+# define index_cpu_SSE4_1	COMMON_CPUID_INDEX_1
+# define index_cpu_SSE4_2	COMMON_CPUID_INDEX_1
+# define index_cpu_AVX		COMMON_CPUID_INDEX_1
+# define index_cpu_AVX2		COMMON_CPUID_INDEX_7
+# define index_cpu_AVX512F	COMMON_CPUID_INDEX_7
+# define index_cpu_AVX512DQ	COMMON_CPUID_INDEX_7
+# define index_cpu_RTM		COMMON_CPUID_INDEX_7
+# define index_cpu_FMA		COMMON_CPUID_INDEX_1
+# define index_cpu_FMA4		COMMON_CPUID_INDEX_80000001
+# define index_cpu_POPCOUNT	COMMON_CPUID_INDEX_1
+# define index_cpu_OSXSAVE	COMMON_CPUID_INDEX_1
 
 # define reg_CX8		edx
 # define reg_CMOV		edx
@@ -238,23 +242,23 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define reg_POPCOUNT		ecx
 # define reg_OSXSAVE		ecx
 
-# define index_Fast_Rep_String		FEATURE_INDEX_1
-# define index_Fast_Copy_Backward	FEATURE_INDEX_1
-# define index_Slow_BSF			FEATURE_INDEX_1
-# define index_Fast_Unaligned_Load	FEATURE_INDEX_1
-# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
-# define index_AVX_Usable		FEATURE_INDEX_1
-# define index_FMA_Usable		FEATURE_INDEX_1
-# define index_FMA4_Usable		FEATURE_INDEX_1
-# define index_Slow_SSE4_2		FEATURE_INDEX_1
-# define index_AVX2_Usable		FEATURE_INDEX_1
-# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1
-# define index_AVX512F_Usable		FEATURE_INDEX_1
-# define index_AVX512DQ_Usable		FEATURE_INDEX_1
-# define index_I586			FEATURE_INDEX_1
-# define index_I686			FEATURE_INDEX_1
-# define index_Prefer_MAP_32BIT_EXEC	FEATURE_INDEX_1
-# define index_Prefer_No_VZEROUPPER     FEATURE_INDEX_1
+# define index_arch_Fast_Rep_String	FEATURE_INDEX_1
+# define index_arch_Fast_Copy_Backward	FEATURE_INDEX_1
+# define index_arch_Slow_BSF		FEATURE_INDEX_1
+# define index_arch_Fast_Unaligned_Load	FEATURE_INDEX_1
+# define index_arch_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
+# define index_arch_AVX_Usable		FEATURE_INDEX_1
+# define index_arch_FMA_Usable		FEATURE_INDEX_1
+# define index_arch_FMA4_Usable		FEATURE_INDEX_1
+# define index_arch_Slow_SSE4_2		FEATURE_INDEX_1
+# define index_arch_AVX2_Usable		FEATURE_INDEX_1
+# define index_arch_AVX_Fast_Unaligned_Load FEATURE_INDEX_1
+# define index_arch_AVX512F_Usable	FEATURE_INDEX_1
+# define index_arch_AVX512DQ_Usable	FEATURE_INDEX_1
+# define index_arch_I586		FEATURE_INDEX_1
+# define index_arch_I686		FEATURE_INDEX_1
+# define index_arch_Prefer_MAP_32BIT_EXEC FEATURE_INDEX_1
+# define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
 
 #endif	/* !__ASSEMBLER__ */
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=7a90b56b0c3f8e55df44957cf6de7d3c9c04cbb9

commit 7a90b56b0c3f8e55df44957cf6de7d3c9c04cbb9
Author: Roland McGrath <roland@hack.frob.com>
Date:   Tue Mar 8 12:31:13 2016 -0800

    Fix tst-audit10 build when -mavx512f is not supported.
    
    (cherry picked from commit 3bd80c0de2f8e7ca8020d37739339636d169957e)

diff --git a/sysdeps/x86_64/tst-audit10-aux.c b/sysdeps/x86_64/tst-audit10-aux.c
index 4398b8f..992a16c 100644
--- a/sysdeps/x86_64/tst-audit10-aux.c
+++ b/sysdeps/x86_64/tst-audit10-aux.c
@@ -20,13 +20,13 @@
 #include <stdlib.h>
 #include <string.h>
 
-extern __m512i audit_test (__m512i, __m512i, __m512i, __m512i,
-			   __m512i, __m512i, __m512i, __m512i);
-
 int
 tst_audit10_aux (void)
 {
 #ifdef __AVX512F__
+  extern __m512i audit_test (__m512i, __m512i, __m512i, __m512i,
+                             __m512i, __m512i, __m512i, __m512i);
+
   __m512i zmm = _mm512_setzero_si512 ();
   __m512i ret = audit_test (zmm, zmm, zmm, zmm, zmm, zmm, zmm, zmm);
 
diff --git a/sysdeps/x86_64/tst-audit10.c b/sysdeps/x86_64/tst-audit10.c
index 92e0cb4..a487b40 100644
--- a/sysdeps/x86_64/tst-audit10.c
+++ b/sysdeps/x86_64/tst-audit10.c
@@ -17,6 +17,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <cpuid.h>
+#include <cpu-features.h>
 
 int tst_audit10_aux (void);
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ba80f6ceea3a6b6f711038646f419125fe3ad39c

commit ba80f6ceea3a6b6f711038646f419125fe3ad39c
Author: Florian Weimer <fweimer@redhat.com>
Date:   Mon Mar 7 16:00:25 2016 +0100

    tst-audit4, tst-audit10: Compile AVX/AVX-512 code separately [BZ #19269]
    
    This ensures that GCC will not use unsupported instructions before
    the run-time check to ensure support.
    
    (cherry picked from commit 3c0f7407eedb524c9114bb675cd55b903c71daaa)

diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 67ed5ba..cc47b88 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -60,7 +60,7 @@ $(objpfx)tst-audit3: $(objpfx)tst-auditmod3a.so
 $(objpfx)tst-audit3.out: $(objpfx)tst-auditmod3b.so
 tst-audit3-ENV = LD_AUDIT=$(objpfx)tst-auditmod3b.so
 
-$(objpfx)tst-audit4: $(objpfx)tst-auditmod4a.so
+$(objpfx)tst-audit4: $(objpfx)tst-audit4-aux.o $(objpfx)tst-auditmod4a.so
 $(objpfx)tst-audit4.out: $(objpfx)tst-auditmod4b.so
 tst-audit4-ENV = LD_AUDIT=$(objpfx)tst-auditmod4b.so
 
@@ -77,12 +77,12 @@ $(objpfx)tst-audit7: $(objpfx)tst-auditmod7a.so
 $(objpfx)tst-audit7.out: $(objpfx)tst-auditmod7b.so
 tst-audit7-ENV = LD_AUDIT=$(objpfx)tst-auditmod7b.so
 
-$(objpfx)tst-audit10: $(objpfx)tst-auditmod10a.so
+$(objpfx)tst-audit10: $(objpfx)tst-audit10-aux.o $(objpfx)tst-auditmod10a.so
 $(objpfx)tst-audit10.out: $(objpfx)tst-auditmod10b.so
 tst-audit10-ENV = LD_AUDIT=$(objpfx)tst-auditmod10b.so
 
 AVX-CFLAGS=-mavx -mno-vzeroupper
-CFLAGS-tst-audit4.c += $(AVX-CFLAGS)
+CFLAGS-tst-audit4-aux.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod4a.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod4b.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod6b.c += $(AVX-CFLAGS)
@@ -90,7 +90,7 @@ CFLAGS-tst-auditmod6c.c += $(AVX-CFLAGS)
 CFLAGS-tst-auditmod7b.c += $(AVX-CFLAGS)
 ifeq (yes,$(config-cflags-avx512))
 AVX512-CFLAGS = -mavx512f
-CFLAGS-tst-audit10.c += $(AVX512-CFLAGS)
+CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS)
 CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS)
 CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS)
 endif
diff --git a/sysdeps/x86_64/tst-audit10-aux.c b/sysdeps/x86_64/tst-audit10-aux.c
new file mode 100644
index 0000000..4398b8f
--- /dev/null
+++ b/sysdeps/x86_64/tst-audit10-aux.c
@@ -0,0 +1,41 @@
+/* Test case for preserved AVX512 registers in dynamic linker, -mavx512f part.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <stdlib.h>
+#include <string.h>
+
+extern __m512i audit_test (__m512i, __m512i, __m512i, __m512i,
+			   __m512i, __m512i, __m512i, __m512i);
+
+int
+tst_audit10_aux (void)
+{
+#ifdef __AVX512F__
+  __m512i zmm = _mm512_setzero_si512 ();
+  __m512i ret = audit_test (zmm, zmm, zmm, zmm, zmm, zmm, zmm, zmm);
+
+  zmm = _mm512_set1_epi64 (0x12349876);
+
+  if (memcmp (&zmm, &ret, sizeof (ret)))
+    abort ();
+  return 0;
+#else /* __AVX512F__ */
+  return 77;
+#endif /* __AVX512F__ */
+}
diff --git a/sysdeps/x86_64/tst-audit10.c b/sysdeps/x86_64/tst-audit10.c
index d104341..92e0cb4 100644
--- a/sysdeps/x86_64/tst-audit10.c
+++ b/sysdeps/x86_64/tst-audit10.c
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2016 Free Software Foundation, Inc.
+/* Test case for preserved AVX512 registers in dynamic linker.
+   Copyright (C) 2012-2016 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -15,13 +16,9 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* Test case for x86-64 preserved registers in dynamic linker.  */
-
-#ifdef __AVX512F__
-#include <stdlib.h>
-#include <string.h>
 #include <cpuid.h>
-#include <immintrin.h>
+
+int tst_audit10_aux (void);
 
 static int
 avx512_enabled (void)
@@ -42,32 +39,15 @@ avx512_enabled (void)
   return (eax & 0xe6) == 0xe6;
 }
 
-
-extern __m512i audit_test (__m512i, __m512i, __m512i, __m512i,
-			   __m512i, __m512i, __m512i, __m512i);
 static int
 do_test (void)
 {
   /* Run AVX512 test only if AVX512 is supported.  */
   if (avx512_enabled ())
-    {
-      __m512i zmm = _mm512_setzero_si512 ();
-      __m512i ret = audit_test (zmm, zmm, zmm, zmm, zmm, zmm, zmm, zmm);
-
-      zmm = _mm512_set1_epi64 (0x12349876);
-
-      if (memcmp (&zmm, &ret, sizeof (ret)))
-	abort ();
-    }
-  return 0;
-}
-#else
-static int
-do_test (void)
-{
-  return 0;
+    return tst_audit10_aux ();
+  else
+    return 77;
 }
-#endif
 
 #define TEST_FUNCTION do_test ()
 #include "../../test-skeleton.c"
diff --git a/sysdeps/x86_64/tst-audit4-aux.c b/sysdeps/x86_64/tst-audit4-aux.c
new file mode 100644
index 0000000..a1aeb65
--- /dev/null
+++ b/sysdeps/x86_64/tst-audit4-aux.c
@@ -0,0 +1,39 @@
+/* Test case for preserved AVX registers in dynamic linker, -mavx part.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <stdlib.h>
+#include <string.h>
+
+extern __m256i audit_test (__m256i, __m256i, __m256i, __m256i,
+			   __m256i, __m256i, __m256i, __m256i);
+
+int
+tst_audit4_aux (void)
+{
+#ifdef __AVX__
+  __m256i ymm = _mm256_setzero_si256 ();
+  __m256i ret = audit_test (ymm, ymm, ymm, ymm, ymm, ymm, ymm, ymm);
+  ymm =	 _mm256_set1_epi32 (0x12349876);
+  if (memcmp (&ymm, &ret, sizeof (ret)))
+    abort ();
+  return 0;
+#else  /* __AVX__ */
+  return 77;
+#endif  /* __AVX__ */
+}
diff --git a/sysdeps/x86_64/tst-audit4.c b/sysdeps/x86_64/tst-audit4.c
index 44d5123..d8e2ab1 100644
--- a/sysdeps/x86_64/tst-audit4.c
+++ b/sysdeps/x86_64/tst-audit4.c
@@ -1,11 +1,24 @@
-/* Test case for x86-64 preserved registers in dynamic linker.  */
+/* Test case for preserved AVX registers in dynamic linker.
+   Copyright (C) 2009-2016 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
 
-#ifdef __AVX__
-#include <stdlib.h>
-#include <string.h>
 #include <cpuid.h>
-#include <immintrin.h>
 
+int tst_audit4_aux (void);
 
 static int
 avx_enabled (void)
@@ -22,31 +35,15 @@ avx_enabled (void)
   return (eax & 6) == 6;
 }
 
-
-extern __m256i audit_test (__m256i, __m256i, __m256i, __m256i,
-			   __m256i, __m256i, __m256i, __m256i);
 static int
 do_test (void)
 {
   /* Run AVX test only if AVX is supported.  */
   if (avx_enabled ())
-    {
-      __m256i ymm = _mm256_setzero_si256 ();
-      __m256i ret = audit_test (ymm, ymm, ymm, ymm, ymm, ymm, ymm, ymm);
-
-      ymm =  _mm256_set1_epi32 (0x12349876);
-      if (memcmp (&ymm, &ret, sizeof (ret)))
-	abort ();
-    }
-  return 0;
-}
-#else
-static int
-do_test (void)
-{
-  return 0;
+    return tst_audit4_aux ();
+  else
+    return 77;
 }
-#endif
 
 #define TEST_FUNCTION do_test ()
 #include "../../test-skeleton.c"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b8fe596e7f750d4ee2fca14d6a3999364c02662e

commit b8fe596e7f750d4ee2fca14d6a3999364c02662e
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Mar 6 16:48:11 2016 -0800

    Group AVX512 functions in .text.avx512 section
    
    	* sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S:
    	Replace .text with .text.avx512.
    	* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S:
    	Likewise.
    
    (cherry picked from commit fee9eb6200f0e44a4b684903bc47fde36d46f1a5)

diff --git a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
index 1bb12e8..3d567fc 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S
@@ -29,7 +29,7 @@
 # define MEMCPY_CHK	__memcpy_chk_avx512_no_vzeroupper
 #endif
 
-	.section .text,"ax",@progbits
+	.section .text.avx512,"ax",@progbits
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	cmpq	%rdx, %rcx
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
index 1e638d7..eab8c5a 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
@@ -26,7 +26,7 @@
 # define MEMSET_CHK __memset_chk_avx512_no_vzeroupper
 #endif
 
-	.section .text,"ax",@progbits
+	.section .text.avx512,"ax",@progbits
 #if defined PIC
 ENTRY (MEMSET_CHK)
 	cmpq	%rdx, %rcx

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e455d17680cfaebb12692547422f95ba1ed30e29

commit e455d17680cfaebb12692547422f95ba1ed30e29
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Mar 4 08:37:40 2016 -0800

    x86-64: Fix memcpy IFUNC selection
    
    Chek Fast_Unaligned_Load, instead of Slow_BSF, and also check for
    Fast_Copy_Backward to enable __memcpy_ssse3_back.  Existing selection
    order is updated with following selection order:
    
    1. __memcpy_avx_unaligned if AVX_Fast_Unaligned_Load bit is set.
    2. __memcpy_sse2_unaligned if Fast_Unaligned_Load bit is set.
    3. __memcpy_sse2 if SSSE3 isn't available.
    4. __memcpy_ssse3_back if Fast_Copy_Backward bit it set.
    5. __memcpy_ssse3
    
    	[BZ #18880]
    	* sysdeps/x86_64/multiarch/memcpy.S: Check Fast_Unaligned_Load,
    	instead of Slow_BSF, and also check for Fast_Copy_Backward to
    	enable __memcpy_ssse3_back.
    
    (cherry picked from commit 14a1d7cc4c4fd5ee8e4e66b777221dd32a84efe8)

diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 64a1bcd..8882590 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -35,22 +35,23 @@ ENTRY(__new_memcpy)
 	jz	1f
 	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
 	jz	1f
-	leaq    __memcpy_avx512_no_vzeroupper(%rip), %rax
+	lea    __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
 	ret
 #endif
-1:	leaq	__memcpy_avx_unaligned(%rip), %rax
+1:	lea	__memcpy_avx_unaligned(%rip), %RAX_LP
 	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz 2f
-	ret
-2:	leaq	__memcpy_sse2(%rip), %rax
-	HAS_ARCH_FEATURE (Slow_BSF)
-	jnz	3f
-	leaq	__memcpy_sse2_unaligned(%rip), %rax
-	ret
-3:	HAS_CPU_FEATURE (SSSE3)
-	jz 4f
-	leaq    __memcpy_ssse3(%rip), %rax
-4:	ret
+	jnz	2f
+	lea	__memcpy_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+	jnz	2f
+	lea	__memcpy_sse2(%rip), %RAX_LP
+	HAS_CPU_FEATURE (SSSE3)
+	jz	2f
+	lea    __memcpy_ssse3_back(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Fast_Copy_Backward)
+	jnz	2f
+	lea	__memcpy_ssse3(%rip), %RAX_LP
+2:	ret
 END(__new_memcpy)
 
 # undef ENTRY

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]