This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


GNU C Library master sources branch hjl/avx2/master created. glibc-2.25-401-g6c8356d


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/avx2/master has been created
        at  6c8356dd950dc30c4d79ffbf51beb30c4a1bc62f (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6c8356dd950dc30c4d79ffbf51beb30c4a1bc62f

commit 6c8356dd950dc30c4d79ffbf51beb30c4a1bc62f
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jun 2 05:52:09 2017 -0700

    memrchr.S: memrchr.c
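
For context, the memrchr.c added below replaces the hand-written assembly
IFUNC stub with a C selector built on glibc's libc_ifunc_redirected macro and
the shared IFUNC_SELECTOR from string-sse2-avx2.h.  A rough stand-alone sketch
of the same load-time dispatch pattern, using GCC's plain ifunc attribute and
stub variants (all names below are illustrative, not the committed code):

#include <stddef.h>

typedef void *(*memrchr_fn) (const void *, int, size_t);

/* Stand-ins for the real __memrchr_sse2 / __memrchr_avx2 bodies.  */
static void *
my_memrchr_base (const void *s, int c, size_t n)
{
  const unsigned char *p = s;
  while (n--)
    if (p[n] == (unsigned char) c)
      return (void *) (p + n);
  return NULL;
}

static void *
my_memrchr_fast (const void *s, int c, size_t n)
{
  /* A real variant would use AVX2; reuse the scalar loop here.  */
  return my_memrchr_base (s, c, n);
}

/* The resolver runs during relocation and returns the chosen variant,
   which is what the IFUNC_SELECTOR in string-sse2-avx2.h does with
   glibc's cpu_features checks.  */
static memrchr_fn
my_memrchr_resolver (void)
{
  __builtin_cpu_init ();
  return __builtin_cpu_supports ("avx2") ? my_memrchr_fast : my_memrchr_base;
}

void *my_memrchr (const void *, int, size_t)
     __attribute__ ((ifunc ("my_memrchr_resolver")));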

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 68fd4f4..f206469 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,6 +7,7 @@ ifeq ($(subdir),string)
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
 		   memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
+		   memrchr-sse2 memrchr-avx2 \
 		   memcmp-avx2-movbe \
 		   memcmp-sse4 memcpy-ssse3 \
 		   memmove-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
new file mode 100644
index 0000000..f518819
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
@@ -0,0 +1,26 @@
+/* memrchr optimized with SSE2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define __memrchr __memrchr_sse2
+
+# undef weak_alias
+# define weak_alias(__memrchr, memrchr)
+#endif
+
+#include "../memrchr.S"
diff --git a/sysdeps/x86_64/multiarch/memrchr.S b/sysdeps/x86_64/multiarch/memrchr.c
similarity index 59%
rename from sysdeps/x86_64/multiarch/memrchr.S
rename to sysdeps/x86_64/multiarch/memrchr.c
index 20a9e76..26e777c 100644
--- a/sysdeps/x86_64/multiarch/memrchr.S
+++ b/sysdeps/x86_64/multiarch/memrchr.c
@@ -17,39 +17,15 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <init-arch.h>
-
 /* Define multiple versions only for the definition in libc. */
 #if IS_IN (libc)
-	.text
-ENTRY(__memrchr)
-	.type	__memrchr, @gnu_indirect_function
-	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jnz	1f
-	HAS_ARCH_FEATURE (AVX2_Usable)
-	jz	1f
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	1f
-	leaq	__memrchr_avx2(%rip), %rax
-	ret
+# define memrchr __redirect_memrchr
+# include <string.h>
+# undef memrchr
 
-1:	leaq	__memrchr_sse2(%rip), %rax
-	ret
-END(__memrchr)
+# define SYMBOL_NAME memrchr
+# include "string-sse2-avx2.h"
 
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memrchr_sse2, @function; \
-	.p2align 4; \
-	.globl __memrchr_sse2; \
-	.hidden __memrchr_sse2; \
-	__memrchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __memrchr_sse2, .-__memrchr_sse2
+libc_ifunc_redirected (__redirect_memrchr, __memrchr, IFUNC_SELECTOR ());
+weak_alias (__memrchr, memrchr)
 #endif
-
-#include "../memrchr.S"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c8405630dd8a3b83821d4fc77699c61a1df6a073

commit c8405630dd8a3b83821d4fc77699c61a1df6a073
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jun 2 05:51:12 2017 -0700

    memchr/rawmemchr/wmemchr: string-sse2-avx2.h
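
For reference, the init-arch.h hunk below adds the token-pasting helpers that
the new C ifunc sources build on.  With "# define SYMBOL_NAME memchr", as in
the new memchr.c, the macros expand to:

  REDIRECT_NAME    -> __redirect_memchr
  OPTIMIZE (sse2)  -> __memchr_sse2
  OPTIMIZE (avx2)  -> __memchr_avx2
  IFUNC_SELECTOR   -> memchr_ifunc_selector

so string-sse2-avx2.h ends up declaring __memchr_sse2 and __memchr_avx2 and
defining a static memchr_ifunc_selector, whose result libc_ifunc_redirected
uses to resolve the exported memchr at load time.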

diff --git a/sysdeps/x86/init-arch.h b/sysdeps/x86/init-arch.h
index 6cbc16b..44664a5 100644
--- a/sysdeps/x86/init-arch.h
+++ b/sysdeps/x86/init-arch.h
@@ -21,6 +21,18 @@
 # include <ldsodefs.h>
 #endif
 
+#define PASTER1(x,y)	x##_##y
+#define EVALUATOR1(x,y)	PASTER1 (x,y)
+#define PASTER2(x,y)	__##x##_##y
+#define EVALUATOR2(x,y)	PASTER2 (x,y)
+
+/* Basically set '__redirect_<symbol>' to use as type definition,
+   '__<symbol>_<variant>' as the optimized implementation and
+   '<symbol>_ifunc_selector' as the IFUNC selector.  */
+#define REDIRECT_NAME	EVALUATOR1 (__redirect, SYMBOL_NAME)
+#define OPTIMIZE(name)	EVALUATOR2 (SYMBOL_NAME, name)
+#define IFUNC_SELECTOR	EVALUATOR1 (SYMBOL_NAME, ifunc_selector)
+
 #ifndef __x86_64__
 /* Due to the reordering and the other nifty extensions in i686, it is
    not really good to use heavily i586 optimized code on an i686.  It's
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 3daae7c..68fd4f4 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,8 +6,7 @@ ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
-		   memchr-avx2 rawmemchr-avx2 \
-		   memrchr-avx2 \
+		   memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
 		   memcmp-avx2-movbe \
 		   memcmp-sse4 memcpy-ssse3 \
 		   memmove-ssse3 \
@@ -37,7 +36,7 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
-		   wmemchr-avx2 \
+		   wmemchr-sse2 wmemchr-avx2 \
 		   wmemcmp-avx2-movbe \
 		   wcscpy-ssse3 wcscpy-c \
 		   wcschr-avx2 \
diff --git a/sysdeps/x86/init-arch.h b/sysdeps/x86_64/multiarch/memchr-sse2.S
similarity index 54%
copy from sysdeps/x86/init-arch.h
copy to sysdeps/x86_64/multiarch/memchr-sse2.S
index 6cbc16b..12c9c70 100644
--- a/sysdeps/x86/init-arch.h
+++ b/sysdeps/x86_64/multiarch/memchr-sse2.S
@@ -1,5 +1,6 @@
-/* This file is part of the GNU C Library.
-   Copyright (C) 2008-2017 Free Software Foundation, Inc.
+/* memchr optimized with SSE2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -15,21 +16,20 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef  __ASSEMBLER__
-# include <cpu-features.h>
-#else
-# include <ldsodefs.h>
-#endif
+#if IS_IN (libc)
+# define memchr __memchr_sse2
 
-#ifndef __x86_64__
-/* Due to the reordering and the other nifty extensions in i686, it is
-   not really good to use heavily i586 optimized code on an i686.  It's
-   better to use i486 code if it isn't an i586.  */
-# if MINIMUM_ISA == 686
-#  define USE_I586 0
-#  define USE_I686 1
-# else
-#  define USE_I586 (HAS_ARCH_FEATURE (I586) && !HAS_ARCH_FEATURE (I686))
-#  define USE_I686 HAS_ARCH_FEATURE (I686)
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memchr calls through a PLT.
+   The speedup we get from using AVX2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memchr; __GI_memchr = __memchr_sse2
 # endif
+
+# undef strong_alias
+# define strong_alias(memchr, __memchr)
 #endif
+
+#include "../memchr.S"
diff --git a/sysdeps/x86_64/multiarch/memchr.S b/sysdeps/x86_64/multiarch/memchr.S
deleted file mode 100644
index dee3fd1..0000000
--- a/sysdeps/x86_64/multiarch/memchr.S
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Multiple versions of memchr
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
-	.text
-ENTRY(memchr)
-	.type	memchr, @gnu_indirect_function
-	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jnz	1f
-	HAS_ARCH_FEATURE (AVX2_Usable)
-	jz	1f
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	1f
-	leaq	__memchr_avx2(%rip), %rax
-	ret
-
-1:	leaq	__memchr_sse2(%rip), %rax
-	ret
-END(memchr)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __memchr_sse2, @function; \
-	.p2align 4; \
-	.globl __memchr_sse2; \
-	.hidden __memchr_sse2; \
-	__memchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __memchr_sse2, .-__memchr_sse2
-
-# ifdef SHARED
-#  undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal memchr calls through a PLT.
-   The speedup we get from using AVX2 instructions is likely eaten away
-   by the indirect call in the PLT.  */
-#  define libc_hidden_builtin_def(name) \
-	.globl __GI_memchr; __GI_memchr = __memchr_sse2
-# endif
-#endif
-
-#include "../memchr.S"
diff --git a/sysdeps/x86/init-arch.h b/sysdeps/x86_64/multiarch/memchr.c
similarity index 53%
copy from sysdeps/x86/init-arch.h
copy to sysdeps/x86_64/multiarch/memchr.c
index 6cbc16b..5e4ee63 100644
--- a/sysdeps/x86/init-arch.h
+++ b/sysdeps/x86_64/multiarch/memchr.c
@@ -1,5 +1,7 @@
-/* This file is part of the GNU C Library.
-   Copyright (C) 2008-2017 Free Software Foundation, Inc.
+/* Multiple versions of memchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -15,21 +17,15 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef  __ASSEMBLER__
-# include <cpu-features.h>
-#else
-# include <ldsodefs.h>
-#endif
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define memchr __redirect_memchr
+# include <string.h>
+# undef memchr
+
+# define SYMBOL_NAME memchr
+# include "string-sse2-avx2.h"
 
-#ifndef __x86_64__
-/* Due to the reordering and the other nifty extensions in i686, it is
-   not really good to use heavily i586 optimized code on an i686.  It's
-   better to use i486 code if it isn't an i586.  */
-# if MINIMUM_ISA == 686
-#  define USE_I586 0
-#  define USE_I686 1
-# else
-#  define USE_I586 (HAS_ARCH_FEATURE (I586) && !HAS_ARCH_FEATURE (I686))
-#  define USE_I686 HAS_ARCH_FEATURE (I686)
-# endif
+libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
+strong_alias (memchr, __memchr)
 #endif
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S
similarity index 62%
rename from sysdeps/x86_64/multiarch/rawmemchr.S
rename to sysdeps/x86_64/multiarch/rawmemchr-sse2.S
index b938f13..a44a7ff 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S
@@ -1,5 +1,4 @@
-/* Multiple versions of rawmemchr
-   All versions must be listed in ifunc-impl-list.c.
+/* rawmemchr optimized with SSE2.
    Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,39 +16,9 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <init-arch.h>
-
 /* Define multiple versions only for the definition in libc. */
 #if IS_IN (libc)
-	.text
-ENTRY(__rawmemchr)
-	.type	__rawmemchr, @gnu_indirect_function
-	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jnz	1f
-	HAS_ARCH_FEATURE (AVX2_Usable)
-	jz	1f
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	1f
-	leaq	__rawmemchr_avx2(%rip), %rax
-	ret
-
-1:	leaq	__rawmemchr_sse2(%rip), %rax
-	ret
-END(__rawmemchr)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __rawmemchr_sse2, @function; \
-	.p2align 4; \
-	.globl __rawmemchr_sse2; \
-	.hidden __rawmemchr_sse2; \
-	__rawmemchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2
+# define __rawmemchr __rawmemchr_sse2
 
 # ifdef SHARED
 /* It doesn't make sense to send libc-internal rawmemchr calls through a
@@ -59,6 +28,9 @@ END(__rawmemchr)
 #  define libc_hidden_def(name) \
 	.globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
 # endif
+
+# undef weak_alias
+# define weak_alias(__rawmemchr, rawmemchr)
 #endif
 
 #include "../rawmemchr.S"
diff --git a/sysdeps/x86/init-arch.h b/sysdeps/x86_64/multiarch/rawmemchr.c
similarity index 53%
copy from sysdeps/x86/init-arch.h
copy to sysdeps/x86_64/multiarch/rawmemchr.c
index 6cbc16b..d81e9a8 100644
--- a/sysdeps/x86/init-arch.h
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
@@ -1,5 +1,7 @@
-/* This file is part of the GNU C Library.
-   Copyright (C) 2008-2017 Free Software Foundation, Inc.
+/* Multiple versions of rawmemchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -15,21 +17,18 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef  __ASSEMBLER__
-# include <cpu-features.h>
-#else
-# include <ldsodefs.h>
-#endif
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define rawmemchr __redirect_rawmemchr
+# define __rawmemchr __redirect___rawmemchr
+# include <string.h>
+# undef rawmemchr
+# undef __rawmemchr
+
+# define SYMBOL_NAME rawmemchr
+# include "string-sse2-avx2.h"
 
-#ifndef __x86_64__
-/* Due to the reordering and the other nifty extensions in i686, it is
-   not really good to use heavily i586 optimized code on an i686.  It's
-   better to use i486 code if it isn't an i586.  */
-# if MINIMUM_ISA == 686
-#  define USE_I586 0
-#  define USE_I686 1
-# else
-#  define USE_I586 (HAS_ARCH_FEATURE (I586) && !HAS_ARCH_FEATURE (I686))
-#  define USE_I686 HAS_ARCH_FEATURE (I686)
-# endif
+libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
+		       IFUNC_SELECTOR ());
+weak_alias (__rawmemchr, rawmemchr)
 #endif
diff --git a/sysdeps/x86_64/multiarch/string-sse2-avx2.h b/sysdeps/x86_64/multiarch/string-sse2-avx2.h
new file mode 100644
index 0000000..9b26086
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/string-sse2-avx2.h
@@ -0,0 +1,37 @@
+/* Common definition for ifunc selections of string functions optimized
+   with SSE2 and AVX2.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    return OPTIMIZE (avx2);
+
+  return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/wmemchr.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S
similarity index 64%
rename from sysdeps/x86_64/multiarch/wmemchr.S
rename to sysdeps/x86_64/multiarch/wmemchr-sse2.S
index b79df23..49cd86a 100644
--- a/sysdeps/x86_64/multiarch/wmemchr.S
+++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S
@@ -1,5 +1,4 @@
-/* Multiple versions of wmemchr
-   All versions must be listed in ifunc-impl-list.c.
+/* wmemchr optimized with SSE2.
    Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,39 +16,9 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <init-arch.h>
-
 /* Define multiple versions only for the definition in libc. */
 #if IS_IN (libc)
-	.text
-ENTRY(__wmemchr)
-	.type	__wmemchr, @gnu_indirect_function
-	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jnz	1f
-	HAS_ARCH_FEATURE (AVX2_Usable)
-	jz	1f
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	1f
-	leaq	__wmemchr_avx2(%rip), %rax
-	ret
-
-1:	leaq	__wmemchr_sse2(%rip), %rax
-	ret
-END(__wmemchr)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __wmemchr_sse2, @function; \
-	.p2align 4; \
-	.globl __wmemchr_sse2; \
-	.hidden __wmemchr_sse2; \
-	__wmemchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __wmemchr_sse2, .-__wmemchr_sse2
+# define wmemchr __wmemchr_sse2
 
 # ifdef SHARED
 /* It doesn't make sense to send libc-internal wmemchr calls through a PLT.
@@ -62,6 +31,9 @@ END(__wmemchr)
 #  define libc_hidden_weak(name) \
 	.weak __GI_wmemchr; __GI_wmemchr = __wmemchr_sse2
 # endif
+
+# undef weak_alias
+# define weak_alias(__wmemchr, wmemchr)
 #endif
 
 #include "../wmemchr.S"
diff --git a/sysdeps/x86/init-arch.h b/sysdeps/x86_64/multiarch/wmemchr.c
similarity index 53%
copy from sysdeps/x86/init-arch.h
copy to sysdeps/x86_64/multiarch/wmemchr.c
index 6cbc16b..9ece9df 100644
--- a/sysdeps/x86/init-arch.h
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
@@ -1,5 +1,7 @@
-/* This file is part of the GNU C Library.
-   Copyright (C) 2008-2017 Free Software Foundation, Inc.
+/* Multiple versions of wmemchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -15,21 +17,17 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef  __ASSEMBLER__
-# include <cpu-features.h>
-#else
-# include <ldsodefs.h>
-#endif
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define wmemchr __redirect_wmemchr
+# define __wmemchr __redirect___wmemchr
+# include <wchar.h>
+# undef wmemchr
+# undef __wmemchr
+
+# define SYMBOL_NAME wmemchr
+# include "string-sse2-avx2.h"
 
-#ifndef __x86_64__
-/* Due to the reordering and the other nifty extensions in i686, it is
-   not really good to use heavily i586 optimized code on an i686.  It's
-   better to use i486 code if it isn't an i586.  */
-# if MINIMUM_ISA == 686
-#  define USE_I586 0
-#  define USE_I686 1
-# else
-#  define USE_I586 (HAS_ARCH_FEATURE (I586) && !HAS_ARCH_FEATURE (I686))
-#  define USE_I686 HAS_ARCH_FEATURE (I686)
-# endif
+libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
+weak_alias (__wmemchr, wmemchr)
 #endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5c0699577cfb3ced6beda3eebf88bf425a972cfc

commit 5c0699577cfb3ced6beda3eebf88bf425a972cfc
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jun 2 05:18:22 2017 -0700

    strrchr/wcsrchr: string-sse2-avx2.h

diff --git a/sysdeps/x86_64/multiarch/strrchr.c b/sysdeps/x86_64/multiarch/strrchr.c
index c434090..488c4d8 100644
--- a/sysdeps/x86_64/multiarch/strrchr.c
+++ b/sysdeps/x86_64/multiarch/strrchr.c
@@ -23,7 +23,7 @@
 # undef strrchr
 
 # define SYMBOL_NAME strrchr
-# include "strrchr.h"
+# include "string-sse2-avx2.h"
 
 extern __typeof (__redirect_strrchr) __libc_strrchr;
 
diff --git a/sysdeps/x86_64/multiarch/strrchr.h b/sysdeps/x86_64/multiarch/strrchr.h
deleted file mode 100644
index 9aa815b..0000000
--- a/sysdeps/x86_64/multiarch/strrchr.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Common definition for strrchr and wcsrchr ifunc selections.
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2017 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <init-arch.h>
-
-/* strrchr and wcsrchr share the same logic for ifunc selection.  */
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
-  const struct cpu_features* cpu_features = __get_cpu_features ();
-
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
-      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2);
-
-  return OPTIMIZE (sse2);
-}
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.c b/sysdeps/x86_64/multiarch/wcsrchr.c
index ea29083..2d55f56 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr.c
+++ b/sysdeps/x86_64/multiarch/wcsrchr.c
@@ -23,7 +23,7 @@
 # undef wcsrchr
 
 # define SYMBOL_NAME wcsrchr
-# include "strrchr.h"
+# include "string-sse2-avx2.h"
 
 extern __typeof (__redirect_wcsrchr) __libc_wcsrchr;
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6a421960a31a098a5b41625e5d94458a9702aecf

commit 6a421960a31a098a5b41625e5d94458a9702aecf
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jun 2 04:55:46 2017 -0700

    strrchr/wcsrchr AVX2: c
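
For reference, each new *-sse2.S wrapper in this series renames the generic
implementation and neutralizes its alias macros before including it, so the
strrchr-sse2.S added below boils down to roughly:

  #define strrchr __strrchr_sse2          /* generic code assembles as __strrchr_sse2 */
  #define weak_alias(strrchr, rindex)     /* expands to nothing: no rindex alias here */
  #define libc_hidden_builtin_def(name) \
          .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
  #include "../strrchr.S"

The libc_hidden_builtin_def redefinition keeps libc-internal (__GI_strrchr)
calls bound directly to the SSE2 version so they never go through the PLT,
while the exported strrchr and its rindex alias are now provided by the C
ifunc in strrchr.c.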

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 06568cb..3daae7c 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,7 +15,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memmove-ssse3-back \
 		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
 		   strchr-avx2 strchrnul-avx2 \
-		   strrchr-avx2 \
+		   strrchr-sse2 strrchr-avx2 \
 		   strlen-avx2 strnlen-avx2 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -41,6 +41,6 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 		   wmemcmp-avx2-movbe \
 		   wcscpy-ssse3 wcscpy-c \
 		   wcschr-avx2 \
-		   wcsrchr-avx2 \
+		   wcsrchr-sse2 wcsrchr-avx2 \
 		   wcslen-avx2 wcsnlen-avx2
 endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
similarity index 58%
rename from sysdeps/x86_64/multiarch/strrchr.S
rename to sysdeps/x86_64/multiarch/strrchr-sse2.S
index e5b49a9..bc60e8f 100644
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strrchr
-   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+/* strrchr optimized with SSE2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,48 +16,18 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc.  */
 #if IS_IN (libc)
-	.text
-ENTRY(strrchr)
-	.type	strrchr, @gnu_indirect_function
-	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jnz	1f
-	HAS_ARCH_FEATURE (AVX2_Usable)
-	jz	1f
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	1f
-	leaq	__strrchr_avx2(%rip), %rax
-	ret
-1:
-	leaq	__strrchr_sse2(%rip), %rax
-	ret
-END(strrchr)
-
+# define strrchr __strrchr_sse2
 
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __strrchr_sse2, @function; \
-	.align 16; \
-	.globl __strrchr_sse2; \
-	.hidden __strrchr_sse2; \
-	__strrchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
 # undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal strrchr calls through a PLT.
    The speedup we get from using SSE4.2 instruction is likely eaten away
    by the indirect call in the PLT.  */
 # define libc_hidden_builtin_def(name) \
 	.globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
+
+# undef weak_alias
+# define weak_alias(strrchr, rindex)
 #endif
 
 #include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.S b/sysdeps/x86_64/multiarch/strrchr.c
similarity index 50%
copy from sysdeps/x86_64/multiarch/wcsrchr.S
copy to sysdeps/x86_64/multiarch/strrchr.c
index 59276cb..c434090 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr.S
+++ b/sysdeps/x86_64/multiarch/strrchr.c
@@ -1,5 +1,4 @@
-/* Multiple versions of wcsrchr
-   All versions must be listed in ifunc-impl-list.c.
+/* Multiple versions of strrchr.
    Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,39 +16,17 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
+/* Define multiple versions only for the definition in libc.  */
 #if IS_IN (libc)
-	.text
-ENTRY(wcsrchr)
-	.type	wcsrchr, @gnu_indirect_function
-	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jnz	1f
-	HAS_ARCH_FEATURE (AVX2_Usable)
-	jz	1f
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	1f
-	leaq	__wcsrchr_avx2(%rip), %rax
-	ret
+# define strrchr __redirect_strrchr
+# include <string.h>
+# undef strrchr
 
-1:	leaq	__wcsrchr_sse2(%rip), %rax
-	ret
-END(wcsrchr)
+# define SYMBOL_NAME strrchr
+# include "strrchr.h"
 
-# undef ENTRY
-# define ENTRY(name) \
-	.type __wcsrchr_sse2, @function; \
-	.p2align 4; \
-	.globl __wcsrchr_sse2; \
-	.hidden __wcsrchr_sse2; \
-	__wcsrchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __wcsrchr_sse2, .-__wcsrchr_sse2
-#endif
+extern __typeof (__redirect_strrchr) __libc_strrchr;
 
-#include "../wcsrchr.S"
+libc_ifunc_redirected (__redirect_strrchr, strrchr, IFUNC_SELECTOR ());
+weak_alias (strrchr, rindex);
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.S b/sysdeps/x86_64/multiarch/strrchr.h
similarity index 53%
copy from sysdeps/x86_64/multiarch/wcsrchr.S
copy to sysdeps/x86_64/multiarch/strrchr.h
index 59276cb..9aa815b 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr.S
+++ b/sysdeps/x86_64/multiarch/strrchr.h
@@ -1,4 +1,4 @@
-/* Multiple versions of wcsrchr
+/* Common definition for strrchr and wcsrchr ifunc selections.
    All versions must be listed in ifunc-impl-list.c.
    Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
@@ -17,39 +17,22 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 #include <init-arch.h>
 
-/* Define multiple versions only for the definition in libc. */
-#if IS_IN (libc)
-	.text
-ENTRY(wcsrchr)
-	.type	wcsrchr, @gnu_indirect_function
-	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jnz	1f
-	HAS_ARCH_FEATURE (AVX2_Usable)
-	jz	1f
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	1f
-	leaq	__wcsrchr_avx2(%rip), %rax
-	ret
-
-1:	leaq	__wcsrchr_sse2(%rip), %rax
-	ret
-END(wcsrchr)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __wcsrchr_sse2, @function; \
-	.p2align 4; \
-	.globl __wcsrchr_sse2; \
-	.hidden __wcsrchr_sse2; \
-	__wcsrchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __wcsrchr_sse2, .-__wcsrchr_sse2
-#endif
-
-#include "../wcsrchr.S"
+/* strrchr and wcsrchr share the same logic for ifunc selection.  */
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    return OPTIMIZE (avx2);
+
+  return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
similarity index 51%
copy from sysdeps/x86_64/multiarch/wcsrchr.S
copy to sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 59276cb..967a7b1 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -1,5 +1,4 @@
-/* Multiple versions of wcsrchr
-   All versions must be listed in ifunc-impl-list.c.
+/* wcsrchr optimized with SSE2.
    Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,39 +16,15 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
 #if IS_IN (libc)
-	.text
-ENTRY(wcsrchr)
-	.type	wcsrchr, @gnu_indirect_function
-	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jnz	1f
-	HAS_ARCH_FEATURE (AVX2_Usable)
-	jz	1f
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	1f
-	leaq	__wcsrchr_avx2(%rip), %rax
-	ret
-
-1:	leaq	__wcsrchr_sse2(%rip), %rax
-	ret
-END(wcsrchr)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __wcsrchr_sse2, @function; \
-	.p2align 4; \
-	.globl __wcsrchr_sse2; \
-	.hidden __wcsrchr_sse2; \
-	__wcsrchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __wcsrchr_sse2, .-__wcsrchr_sse2
+# define wcsrchr __wcsrchr_sse2
+
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal wcsrchr calls through a PLT.
+   The speedup we get from using SSE4.2 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_wcsrchr; __GI_wcsrchr = __wcsrchr_sse2
 #endif
 
 #include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.S b/sysdeps/x86_64/multiarch/wcsrchr.c
similarity index 50%
rename from sysdeps/x86_64/multiarch/wcsrchr.S
rename to sysdeps/x86_64/multiarch/wcsrchr.c
index 59276cb..ea29083 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr.c
@@ -1,5 +1,4 @@
-/* Multiple versions of wcsrchr
-   All versions must be listed in ifunc-impl-list.c.
+/* Multiple versions of wcsrchr.
    Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,39 +16,16 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include <init-arch.h>
-
-/* Define multiple versions only for the definition in libc. */
+/* Define multiple versions only for the definition in libc.  */
 #if IS_IN (libc)
-	.text
-ENTRY(wcsrchr)
-	.type	wcsrchr, @gnu_indirect_function
-	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
-	jnz	1f
-	HAS_ARCH_FEATURE (AVX2_Usable)
-	jz	1f
-	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-	jz	1f
-	leaq	__wcsrchr_avx2(%rip), %rax
-	ret
+# define wcsrchr __redirect_wcsrchr
+# include <wchar.h>
+# undef wcsrchr
 
-1:	leaq	__wcsrchr_sse2(%rip), %rax
-	ret
-END(wcsrchr)
+# define SYMBOL_NAME wcsrchr
+# include "strrchr.h"
 
-# undef ENTRY
-# define ENTRY(name) \
-	.type __wcsrchr_sse2, @function; \
-	.p2align 4; \
-	.globl __wcsrchr_sse2; \
-	.hidden __wcsrchr_sse2; \
-	__wcsrchr_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __wcsrchr_sse2, .-__wcsrchr_sse2
-#endif
+extern __typeof (__redirect_wcsrchr) __libc_wcsrchr;
 
-#include "../wcsrchr.S"
+libc_ifunc_redirected (__redirect_wcsrchr, wcsrchr, IFUNC_SELECTOR ());
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4a3951d0b577efd45d064ec466fa4cf76c11d9c3

commit 4a3951d0b577efd45d064ec466fa4cf76c11d9c3
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri May 26 12:21:55 2017 -0700

    x86-64: Optimize strrchr/wcsrchr with AVX2
    
    Optimize strrchr/wcsrchr with AVX2 to check 32 bytes with vector
    instructions.  It is as fast as SSE2 version for small data sizes
    and up to 1X faster for large data sizes on Haswell.  Select AVX2
    version on AVX2 machines where vzeroupper is preferred and AVX
    unaligned load is fast.
    
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	strrchr-avx2 and wcsrchr-avx2.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Add tests for __strrchr_avx2,
    	__strrchr_sse2, __wcsrchr_avx2 and __wcsrchr_sse2.
    	* sysdeps/x86_64/multiarch/strrchr-avx2.S: New file.
    	* sysdeps/x86_64/multiarch/strrchr.S: Likewise.
    	* sysdeps/x86_64/multiarch/wcsrchr-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/wcsrchr.S: Likewise.
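
One detail worth noting in the new strrchr-avx2.S below: after vpcmpeqb
produces one bitmask for CHAR matches and one for nul bytes, matches that
fall after the terminating nul are discarded with a small bit trick before
bsrl picks the last survivor.  An equivalent C sketch (names illustrative):

#include <stdint.h>

/* char_mask / nul_mask are vpmovmskb results for one 32-byte vector:
   bit i set means byte i matched CHAR / was the nul terminator.
   nul_mask must be non-zero, as in the L(find_nul) path.  */
static inline uint32_t
matches_before_nul (uint32_t char_mask, uint32_t nul_mask)
{
  /* (m - 1) ^ m sets every bit up to and including the lowest set bit
     of m, i.e. every byte position not after the first nul.  */
  return char_mask & ((nul_mask - 1) ^ nul_mask);
}

The highest bit still set afterwards (bsrl in the assembly) is the offset of
the last CHAR occurrence before the end of the string within that vector.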

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 55d4147..06568cb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,6 +15,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memmove-ssse3-back \
 		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
 		   strchr-avx2 strchrnul-avx2 \
+		   strrchr-avx2 \
 		   strlen-avx2 strnlen-avx2 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -40,5 +41,6 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 		   wmemcmp-avx2-movbe \
 		   wcscpy-ssse3 wcscpy-c \
 		   wcschr-avx2 \
+		   wcsrchr-avx2 \
 		   wcslen-avx2 wcsnlen-avx2
 endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 86bd4ab..c2578cb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -251,6 +251,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strchrnul_avx2)
 	      IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/strrchr.S.  */
+  IFUNC_IMPL (i, name, strrchr,
+	      IFUNC_IMPL_ADD (array, i, strrchr,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strrchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
+
   /* Support sysdeps/x86_64/multiarch/strcmp.S.  */
   IFUNC_IMPL (i, name, strcmp,
 	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
@@ -342,6 +349,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __wcschr_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/wcsrchr.S.  */
+  IFUNC_IMPL (i, name, wcsrchr,
+	      IFUNC_IMPL_ADD (array, i, wcsrchr,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wcsrchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
+
   /* Support sysdeps/x86_64/multiarch/wcscpy.S.  */
   IFUNC_IMPL (i, name, wcscpy,
 	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
new file mode 100644
index 0000000..36ef660
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -0,0 +1,235 @@
+/* strrchr/wcsrchr optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRRCHR
+#  define STRRCHR	__strrchr_avx2
+# endif
+
+# ifdef USE_AS_WCSRCHR
+#  define VPBROADCAST	vpbroadcastd
+#  define VPCMPEQ	vpcmpeqd
+# else
+#  define VPBROADCAST	vpbroadcastb
+#  define VPCMPEQ	vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE	32
+
+	.section .text.avx,"ax",@progbits
+ENTRY (STRRCHR)
+	movd	%esi, %xmm4
+	movl	%edi, %ecx
+	/* Broadcast CHAR to YMM4.  */
+	VPBROADCAST %xmm4, %ymm4
+	vpxor	%ymm0, %ymm0, %ymm0
+
+	/* Check if we may cross page boundary with one vector load.  */
+	andl	$(2 * VEC_SIZE - 1), %ecx
+	cmpl	$VEC_SIZE, %ecx
+	ja	L(cros_page_boundary)
+
+	vmovdqu	(%rdi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm2
+	VPCMPEQ	%ymm1, %ymm4, %ymm3
+	vpmovmskb %ymm2, %ecx
+	vpmovmskb %ymm3, %eax
+	addq	$VEC_SIZE, %rdi
+
+	testl	%eax, %eax
+	jnz	L(first_vec)
+
+	testl	%ecx, %ecx
+	jnz	L(return_null)
+
+	andq	$-VEC_SIZE, %rdi
+	xorl	%edx, %edx
+	jmp	L(aligned_loop)
+
+	.p2align 4
+L(first_vec):
+	/* Check if there is a nul CHAR.  */
+	testl	%ecx, %ecx
+	jnz	L(char_and_nul_in_first_vec)
+
+	/* Remember the match and keep searching.  */
+	movl	%eax, %edx
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rdi
+	jmp	L(aligned_loop)
+
+	.p2align 4
+L(cros_page_boundary):
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+	vmovdqa	(%rdi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm2
+	VPCMPEQ	%ymm1, %ymm4, %ymm3
+	vpmovmskb %ymm2, %edx
+	vpmovmskb %ymm3, %eax
+	shrl	%cl, %edx
+	shrl	%cl, %eax
+	addq	$VEC_SIZE, %rdi
+
+	/* Check if there is a CHAR.  */
+	testl	%eax, %eax
+	jnz	L(found_char)
+
+	testl	%edx, %edx
+	jnz	L(return_null)
+
+	jmp	L(aligned_loop)
+
+	.p2align 4
+L(found_char):
+	testl	%edx, %edx
+	jnz	L(char_and_nul)
+
+	/* Remember the match and keep searching.  */
+	movl	%eax, %edx
+	leaq	(%rdi, %rcx), %rsi
+
+	.p2align 4
+L(aligned_loop):
+	vmovdqa	(%rdi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm2
+	addq	$VEC_SIZE, %rdi
+	VPCMPEQ	%ymm1, %ymm4, %ymm3
+	vpmovmskb %ymm2, %ecx
+	vpmovmskb %ymm3, %eax
+	orl	%eax, %ecx
+	jnz	L(char_nor_null)
+
+	vmovdqa	(%rdi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm2
+	add	$VEC_SIZE, %rdi
+	VPCMPEQ	%ymm1, %ymm4, %ymm3
+	vpmovmskb %ymm2, %ecx
+	vpmovmskb %ymm3, %eax
+	orl	%eax, %ecx
+	jnz	L(char_nor_null)
+
+	vmovdqa	(%rdi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm2
+	addq	$VEC_SIZE, %rdi
+	VPCMPEQ	%ymm1, %ymm4, %ymm3
+	vpmovmskb %ymm2, %ecx
+	vpmovmskb %ymm3, %eax
+	orl	%eax, %ecx
+	jnz	L(char_nor_null)
+
+	vmovdqa	(%rdi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm2
+	addq	$VEC_SIZE, %rdi
+	VPCMPEQ	%ymm1, %ymm4, %ymm3
+	vpmovmskb %ymm2, %ecx
+	vpmovmskb %ymm3, %eax
+	orl	%eax, %ecx
+	jz	L(aligned_loop)
+
+	.p2align 4
+L(char_nor_null):
+	/* Find a CHAR or a nul CHAR in a loop.  */
+	testl	%eax, %eax
+	jnz	L(match)
+L(return_value):
+	testl	%edx, %edx
+	jz	L(return_null)
+	movl	%edx, %eax
+	movq	%rsi, %rdi
+
+# ifdef USE_AS_WCSRCHR
+	/* Keep the first bit for each matching CHAR for bsr.  */
+	andl	$0x11111111, %eax
+# endif
+	bsrl	%eax, %eax
+	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(match):
+	/* Find a CHAR.  Check if there is a nul CHAR.  */
+	vpmovmskb %ymm2, %ecx
+	testl	%ecx, %ecx
+	jnz	L(find_nul)
+
+	/* Remember the match and keep searching.  */
+	movl	%eax, %edx
+	movq	%rdi, %rsi
+	jmp	L(aligned_loop)
+
+	.p2align 4
+L(find_nul):
+# ifdef USE_AS_WCSRCHR
+	/* Keep the first bit for each matching CHAR for bsr.  */
+	andl	$0x11111111, %ecx
+	andl	$0x11111111, %eax
+# endif
+	/* Mask out any matching bits after the nul CHAR.  */
+	movl	%ecx, %r8d
+	subl	$1, %r8d
+	xorl	%ecx, %r8d
+	andl	%r8d, %eax
+	testl	%eax, %eax
+	/* If there is no CHAR here, return the remembered one.  */
+	jz	L(return_value)
+	bsrl	%eax, %eax
+	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(char_and_nul):
+	/* Find both a CHAR and a nul CHAR.  */
+	addq	%rcx, %rdi
+	movl	%edx, %ecx
+L(char_and_nul_in_first_vec):
+# ifdef USE_AS_WCSRCHR
+	/* Keep the first bit for each matching CHAR for bsr.  */
+	andl	$0x11111111, %ecx
+	andl	$0x11111111, %eax
+# endif
+	/* Mask out any matching bits after the nul CHAR.  */
+	movl	%ecx, %r8d
+	subl	$1, %r8d
+	xorl	%ecx, %r8d
+	andl	%r8d, %eax
+	testl	%eax, %eax
+	/* Return null pointer if the nul CHAR comes first.  */
+	jz	L(return_null)
+	bsrl	%eax, %eax
+	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(return_null):
+	xorl	%eax, %eax
+	VZEROUPPER
+	ret
+
+END (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
new file mode 100644
index 0000000..e5b49a9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr.S
@@ -0,0 +1,63 @@
+/* Multiple versions of strrchr
+   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+	.text
+ENTRY(strrchr)
+	.type	strrchr, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__strrchr_avx2(%rip), %rax
+	ret
+1:
+	leaq	__strrchr_sse2(%rip), %rax
+	ret
+END(strrchr)
+
+
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strrchr_sse2, @function; \
+	.align 16; \
+	.globl __strrchr_sse2; \
+	.hidden __strrchr_sse2; \
+	__strrchr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
+   The speedup we get from using SSE4.2 instruction is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
+#endif
+
+#include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2.S
new file mode 100644
index 0000000..cf8a239
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2.S
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_avx2
+#define USE_AS_WCSRCHR 1
+#include "strrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.S b/sysdeps/x86_64/multiarch/wcsrchr.S
new file mode 100644
index 0000000..59276cb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr.S
@@ -0,0 +1,55 @@
+/* Multiple versions of wcsrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(wcsrchr)
+	.type	wcsrchr, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__wcsrchr_avx2(%rip), %rax
+	ret
+
+1:	leaq	__wcsrchr_sse2(%rip), %rax
+	ret
+END(wcsrchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __wcsrchr_sse2, @function; \
+	.p2align 4; \
+	.globl __wcsrchr_sse2; \
+	.hidden __wcsrchr_sse2; \
+	__wcsrchr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __wcsrchr_sse2, .-__wcsrchr_sse2
+#endif
+
+#include "../wcsrchr.S"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a13bc0517019cb698162accc0fa07d79cd9ad56e

commit a13bc0517019cb698162accc0fa07d79cd9ad56e
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue May 23 11:25:19 2017 -0700

    x86-64: Optimize memrchr with AVX2
    
    Optimize memrchr with AVX2 to search 32 bytes with a single vector
    compare instruction.  It is as fast as SSE2 memrchr for small data
    sizes and up to 1X faster for large data sizes on Haswell.  Select
    AVX2 memrchr on AVX2 machines where vzeroupper is preferred and AVX
    unaligned load is fast.
    
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	memrchr-avx2.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Add tests for __memrchr_avx2 and
    	__memrchr_sse2.
    	* sysdeps/x86_64/multiarch/memrchr-avx2.S: New file.
    	* sysdeps/x86_64/multiarch/memrchr.S: Likewise.
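
The core of the new __memrchr_avx2 below is a single 32-byte step: broadcast
CHAR, compare a whole vector at once, turn the comparison into a bitmask and
take its highest set bit.  A simplified C intrinsics sketch of just that step
(the committed routine additionally handles alignment, a 4-vector unrolled
loop and the short-length tail cases; names here are illustrative):

#include <immintrin.h>

/* Offset of the last occurrence of C in the 32 bytes at P, or -1.
   Build with -mavx2.  */
static inline long
last_match_in_vec (const unsigned char *p, int c)
{
  __m256i needle = _mm256_set1_epi8 ((char) c);                  /* vpbroadcastb */
  __m256i data = _mm256_loadu_si256 ((const __m256i *) p);       /* vmovdqu */
  unsigned int mask
    = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, needle));   /* vpcmpeqb + vpmovmskb */
  if (mask == 0)
    return -1;
  return 31 - __builtin_clz (mask);                              /* bsrl */
}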

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f63e3b5..55d4147 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,6 +7,7 @@ ifeq ($(subdir),string)
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
 		   memchr-avx2 rawmemchr-avx2 \
+		   memrchr-avx2 \
 		   memcmp-avx2-movbe \
 		   memcmp-sse4 memcpy-ssse3 \
 		   memmove-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 36f14a8..86bd4ab 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -112,6 +112,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove, 1,
 			      __memmove_sse2_unaligned_erms))
 
+  /* Support sysdeps/x86_64/multiarch/memrchr.S.  */
+  IFUNC_IMPL (i, name, memrchr,
+	      IFUNC_IMPL_ADD (array, i, memrchr,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memrchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
+
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
new file mode 100644
index 0000000..3ee02e1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -0,0 +1,359 @@
+/* memrchr optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+	.section .text.avx,"ax",@progbits
+ENTRY (__memrchr_avx2)
+	/* Broadcast CHAR to YMM0.  */
+	vmovd	%esi, %xmm0
+	vpbroadcastb %xmm0, %ymm0
+
+	subq	$VEC_SIZE, %rdx
+	jbe	L(last_vec_or_less)
+
+	addq	%rdx, %rdi
+
+	/* Check the last VEC_SIZE bytes.  */
+	vpcmpeqb (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x0)
+
+	subq	$(VEC_SIZE * 4), %rdi
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	jz	L(aligned_more)
+
+	/* Align data for aligned loads in the loop.  */
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rdx
+	andq	$-VEC_SIZE, %rdi
+	subq	%rcx, %rdx
+
+	.p2align 4
+L(aligned_more):
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+
+	/* Check the last 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+	vpmovmskb %ymm2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	vpcmpeqb (%rdi), %ymm0, %ymm4
+	vpmovmskb %ymm4, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x0)
+
+	/* Align data to 4 * VEC_SIZE for loop with fewer branches.
+	   There are some overlaps with above if data isn't aligned
+	   to 4 * VEC_SIZE.  */
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	jz	L(loop_4x_vec)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rdx
+	andq	$-(VEC_SIZE * 4), %rdi
+	subq	%rcx, %rdx
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	subq	$(VEC_SIZE * 4), %rdi
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+
+	vmovdqa	(%rdi), %ymm1
+	vmovdqa	VEC_SIZE(%rdi), %ymm2
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
+
+	vpcmpeqb %ymm1, %ymm0, %ymm1
+	vpcmpeqb %ymm2, %ymm0, %ymm2
+	vpcmpeqb %ymm3, %ymm0, %ymm3
+	vpcmpeqb %ymm4, %ymm0, %ymm4
+
+	vpor	%ymm1, %ymm2, %ymm5
+	vpor	%ymm3, %ymm4, %ymm6
+	vpor	%ymm5, %ymm6, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jz	L(loop_4x_vec)
+
+	/* There is a match.  */
+	vpmovmskb %ymm4, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	vpmovmskb %ymm2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	vpmovmskb %ymm1, %eax
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_4x_vec_or_less):
+	addl	$(VEC_SIZE * 4), %edx
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+	vpmovmskb %ymm2, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x1_check)
+	cmpl	$(VEC_SIZE * 3), %edx
+	jbe	L(zero)
+
+	vpcmpeqb (%rdi), %ymm0, %ymm4
+	vpmovmskb %ymm4, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 4), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3_check)
+	cmpl	$VEC_SIZE, %edx
+	jbe	L(zero)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 2), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$(VEC_SIZE * 2), %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x0):
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x1):
+	bsrl	%eax, %eax
+	addl	$VEC_SIZE, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x2):
+	bsrl	%eax, %eax
+	addl	$(VEC_SIZE * 2), %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x3):
+	bsrl	%eax, %eax
+	addl	$(VEC_SIZE * 3), %eax
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(last_vec_x1_check):
+	bsrl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$VEC_SIZE, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_x3_check):
+	bsrl	%eax, %eax
+	subq	$VEC_SIZE, %rdx
+	addq	%rax, %rdx
+	jl	L(zero)
+	addl	$(VEC_SIZE * 3), %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(zero):
+	VZEROUPPER
+L(null):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(last_vec_or_less_aligned):
+	movl	%edx, %ecx
+
+	vpcmpeqb (%rdi), %ymm0, %ymm1
+
+	movl	$1, %edx
+	/* Support rdx << 32.  */
+	salq	%cl, %rdx
+	subq	$1, %rdx
+
+	vpmovmskb %ymm1, %eax
+
+	/* Remove the trailing bytes.  */
+	andl	%edx, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_or_less):
+	addl	$VEC_SIZE, %edx
+
+	/* Check for zero length.  */
+	testl	%edx, %edx
+	jz	L(null)
+
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	jz	L(last_vec_or_less_aligned)
+
+	movl	%ecx, %esi
+	movl	%ecx, %r8d
+	addl	%edx, %esi
+	andq	$-VEC_SIZE, %rdi
+
+	subl	$VEC_SIZE, %esi
+	ja	L(last_vec_2x_aligned)
+
+	/* Check the last VEC.  */
+	vpcmpeqb (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Remove the leading and trailing bytes.  */
+	sarl	%cl, %eax
+	movl	%edx, %ecx
+
+	movl	$1, %edx
+	sall	%cl, %edx
+	subl	$1, %edx
+
+	andl	%edx, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	addq	%r8, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_vec_2x_aligned):
+	movl	%esi, %ecx
+
+	/* Check the last VEC.  */
+	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+
+	movl	$1, %edx
+	sall	%cl, %edx
+	subl	$1, %edx
+
+	vpmovmskb %ymm1, %eax
+
+	/* Remove the trailing bytes.  */
+	andl	%edx, %eax
+
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	/* Check the second last VEC.  */
+	vpcmpeqb (%rdi), %ymm0, %ymm1
+
+	movl	%r8d, %ecx
+
+	vpmovmskb %ymm1, %eax
+
+	/* Remove the leading bytes.  Must use unsigned right shift for
+	   bsrl below.  */
+	shrl	%cl, %eax
+	testl	%eax, %eax
+	jz	L(zero)
+
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	addq	%r8, %rax
+	VZEROUPPER
+	ret
+END (__memrchr_avx2)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memrchr.S b/sysdeps/x86_64/multiarch/memrchr.S
new file mode 100644
index 0000000..20a9e76
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr.S
@@ -0,0 +1,55 @@
+/* Multiple versions of memrchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(__memrchr)
+	.type	__memrchr, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__memrchr_avx2(%rip), %rax
+	ret
+
+1:	leaq	__memrchr_sse2(%rip), %rax
+	ret
+END(__memrchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memrchr_sse2, @function; \
+	.p2align 4; \
+	.globl __memrchr_sse2; \
+	.hidden __memrchr_sse2; \
+	__memrchr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memrchr_sse2, .-__memrchr_sse2
+#endif
+
+#include "../memrchr.S"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=7f51d82fb0dcab775f6b9eb4430c152aa9d48923

commit 7f51d82fb0dcab775f6b9eb4430c152aa9d48923
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon May 22 15:09:50 2017 -0700

    x86-64: Optimize strchr/strchrnul/wcschr with AVX2
    
    Optimize strchr/strchrnul/wcschr with AVX2 to search 32 bytes with vector
    instructions.  It is as fast as SSE2 versions for size <= 16 bytes and up
    to 1X faster for size > 16 bytes on Haswell.  Select the AVX2 version on
    AVX2 machines where vzeroupper is preferred and AVX unaligned load is fast.
    
    NB: It uses TZCNT instead of BSF since TZCNT produces the same result
    as BSF for non-zero input.  TZCNT is faster than BSF and is executed
    as BSF if the machine doesn't support TZCNT.
    
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	strchr-avx2, strchrnul-avx2 and wcschr-avx2.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Add tests for __strchr_avx2,
    	__strchrnul_avx2, __strchrnul_sse2, __wcschr_avx2 and
    	__wcschr_sse2.
    	* sysdeps/x86_64/multiarch/strchr-avx2.S: New file.
    	* sysdeps/x86_64/multiarch/strchrnul-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/strchrnul.S: Likewise.
    	* sysdeps/x86_64/multiarch/wcschr-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/wcschr.S: Likewise.
    	* sysdeps/x86_64/multiarch/strchr.S (strchr): Add support for
    	__strchr_avx2.
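
The hand-written assembly below boils down to one idiom per 32-byte block:
compare the block against both the broadcast CHAR and a zero vector, OR the
two comparison results, turn them into a byte mask with vpmovmskb, and locate
the first hit with tzcnt.  A rough C-intrinsics sketch of that single step
(illustrative only: it uses a hypothetical helper name and omits the
page-boundary, alignment and 4-vector-loop handling the committed
strchr-avx2.S performs) looks like this:

#include <immintrin.h>
#include <stddef.h>

/* Scan one 32-byte block.  *DONE is set when either CHAR or the
   terminating NUL was seen in the block; the return value is then the
   strchr result for this block.  The caller advances P by 32 while
   *DONE stays false.  Assumes the 32-byte load at P is safe.  */
static const char *
strchr_step_avx2 (const char *p, int c, int *done)
{
  __m256i vc   = _mm256_set1_epi8 ((char) c);
  __m256i zero = _mm256_setzero_si256 ();
  __m256i data = _mm256_loadu_si256 ((const __m256i *) p);
  __m256i eq_c = _mm256_cmpeq_epi8 (data, vc);     /* CHAR matches.  */
  __m256i eq_0 = _mm256_cmpeq_epi8 (data, zero);   /* NUL matches.  */
  unsigned int mask
    = _mm256_movemask_epi8 (_mm256_or_si256 (eq_c, eq_0));
  *done = (mask != 0);
  if (mask == 0)
    return NULL;
  unsigned int i = __builtin_ctz (mask);           /* tzcnt in the assembly.  */
  return p[i] == (char) c ? p + i : NULL;          /* NUL first: no match.  */
}

The strchrnul variant would simply return p + i without the final comparison,
which is exactly the difference the USE_AS_STRCHRNUL conditionals express in
the assembly; the wcschr variant compares 4-byte elements with vpcmpeqd.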

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4ba527c..f63e3b5 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memcpy-ssse3-back \
 		   memmove-ssse3-back \
 		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+		   strchr-avx2 strchrnul-avx2 \
 		   strlen-avx2 strnlen-avx2 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -37,5 +38,6 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 		   wmemchr-avx2 \
 		   wmemcmp-avx2-movbe \
 		   wcscpy-ssse3 wcscpy-c \
+		   wcschr-avx2 \
 		   wcslen-avx2 wcsnlen-avx2
 endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f139efc..36f14a8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -231,9 +231,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strchr.S.  */
   IFUNC_IMPL (i, name, strchr,
+	      IFUNC_IMPL_ADD (array, i, strchr,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/strchrnul.S.  */
+  IFUNC_IMPL (i, name, strchrnul,
+	      IFUNC_IMPL_ADD (array, i, strchrnul,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strchrnul_avx2)
+	      IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+
   /* Support sysdeps/x86_64/multiarch/strcmp.S.  */
   IFUNC_IMPL (i, name, strcmp,
 	      IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
@@ -318,6 +328,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/wcschr.S.  */
+  IFUNC_IMPL (i, name, wcschr,
+	      IFUNC_IMPL_ADD (array, i, wcschr,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wcschr_avx2)
+	      IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+
   /* Support sysdeps/x86_64/multiarch/wcscpy.S.  */
   IFUNC_IMPL (i, name, wcscpy,
 	      IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
new file mode 100644
index 0000000..e4292d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -0,0 +1,254 @@
+/* strchr/strchrnul optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCHR
+#  define STRCHR	__strchr_avx2
+# endif
+
+# ifdef USE_AS_WCSCHR
+#  define VPBROADCAST	vpbroadcastd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_REG	esi
+# else
+#  define VPBROADCAST	vpbroadcastb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_REG	sil
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+	.section .text.avx,"ax",@progbits
+ENTRY (STRCHR)
+	movl	%edi, %ecx
+	/* Broadcast CHAR to YMM0.  */
+	vmovd	%esi, %xmm0
+	vpxor	%xmm9, %xmm9, %xmm9
+	VPBROADCAST %xmm0, %ymm0
+	/* Check if we may cross page boundary with one vector load.  */
+	andl	$(2 * VEC_SIZE - 1), %ecx
+	cmpl	$VEC_SIZE, %ecx
+	ja	L(cros_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
+	   null byte.  */
+	vmovdqu	(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	/* Align data for aligned loads in the loop.  */
+	addq	$VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+
+	jmp	L(more_4x_vec)
+
+	.p2align 4
+L(cros_page_boundary):
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+	vmovdqu	(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bytes.  */
+	sarl	%cl, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	/* Found CHAR or the null byte.  */
+	tzcntl	%eax, %eax
+	addq	%rcx, %rax
+# ifdef USE_AS_STRCHRNUL
+	addq	%rdi, %rax
+# else
+	xorl	%edx, %edx
+	leaq	(%rdi, %rax), %rax
+	cmp	(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(aligned_more):
+	addq	$VEC_SIZE, %rdi
+
+L(more_4x_vec):
+	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vmovdqa	(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	vmovdqa	VEC_SIZE(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+	/* Align data to 4 * VEC_SIZE.  */
+	movq	%rdi, %rcx
+	andl	$(4 * VEC_SIZE - 1), %ecx
+	andq	$-(4 * VEC_SIZE), %rdi
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqa	(%rdi), %ymm5
+	vmovdqa	VEC_SIZE(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+
+	VPCMPEQ %ymm5, %ymm0, %ymm1
+	VPCMPEQ %ymm6, %ymm0, %ymm2
+	VPCMPEQ %ymm7, %ymm0, %ymm3
+	VPCMPEQ %ymm8, %ymm0, %ymm4
+
+	VPCMPEQ %ymm5, %ymm9, %ymm5
+	VPCMPEQ %ymm6, %ymm9, %ymm6
+	VPCMPEQ %ymm7, %ymm9, %ymm7
+	VPCMPEQ %ymm8, %ymm9, %ymm8
+
+	vpor	%ymm1, %ymm5, %ymm1
+	vpor	%ymm2, %ymm6, %ymm2
+	vpor	%ymm3, %ymm7, %ymm3
+	vpor	%ymm4, %ymm8, %ymm4
+
+	vpor	%ymm1, %ymm2, %ymm5
+	vpor	%ymm3, %ymm4, %ymm6
+
+	vpor	%ymm5, %ymm6, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+	jmp	L(loop_4x_vec)
+
+	.p2align 4
+L(first_vec_x0):
+	/* Found CHAR or the null byte.  */
+	tzcntl	%eax, %eax
+# ifdef USE_AS_STRCHRNUL
+	addq	%rdi, %rax
+# else
+	xorl	%edx, %edx
+	leaq	(%rdi, %rax), %rax
+	cmp	(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_STRCHRNUL
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+# else
+	xorl	%edx, %edx
+	leaq	VEC_SIZE(%rdi, %rax), %rax
+	cmp	(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_STRCHRNUL
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+# else
+	xorl	%edx, %edx
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	cmp	(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+	vpmovmskb %ymm2, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+	vpmovmskb %ymm4, %eax
+	testl	%eax, %eax
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_STRCHRNUL
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+# else
+	xorl	%edx, %edx
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	cmp	(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index c9f54ca..f83ba6e 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -26,6 +26,15 @@
 ENTRY(strchr)
 	.type	strchr, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__strchr_avx2(%rip), %rax
+	ret
+1:
 	leaq	__strchr_sse2(%rip), %rax
 2:	HAS_ARCH_FEATURE (Slow_BSF)
 	jz	3f
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2.S b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
new file mode 100644
index 0000000..fa0cc09
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchrnul.S
similarity index 51%
copy from sysdeps/x86_64/multiarch/strchr.S
copy to sysdeps/x86_64/multiarch/strchrnul.S
index c9f54ca..4df9c39 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchrnul.S
@@ -1,5 +1,6 @@
-/* Multiple versions of strchr
-   Copyright (C) 2009-2017 Free Software Foundation, Inc.
+/* Multiple versions of strchrnul
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -19,39 +20,36 @@
 #include <sysdep.h>
 #include <init-arch.h>
 
-
-/* Define multiple versions only for the definition in libc.  */
+/* Define multiple versions only for the definition in libc. */
 #if IS_IN (libc)
 	.text
-ENTRY(strchr)
-	.type	strchr, @gnu_indirect_function
+ENTRY(__strchrnul)
+	.type	__strchrnul, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-	leaq	__strchr_sse2(%rip), %rax
-2:	HAS_ARCH_FEATURE (Slow_BSF)
-	jz	3f
-	leaq    __strchr_sse2_no_bsf(%rip), %rax
-3:	ret
-END(strchr)
-
-
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__strchrnul_avx2(%rip), %rax
+	ret
+
+1:	leaq	__strchrnul_sse2(%rip), %rax
+	ret
+END(__strchrnul)
 
 # undef ENTRY
 # define ENTRY(name) \
-	.type __strchr_sse2, @function; \
-	.align 16; \
-	.globl __strchr_sse2; \
-	.hidden __strchr_sse2; \
-	__strchr_sse2: cfi_startproc; \
+	.type __strchrnul_sse2, @function; \
+	.p2align 4; \
+	.globl __strchrnul_sse2; \
+	.hidden __strchrnul_sse2; \
+	__strchrnul_sse2: cfi_startproc; \
 	CALL_MCOUNT
 # undef END
 # define END(name) \
-	cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strchr calls through a PLT.
-   The speedup we get from using SSE4.2 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_strchr; __GI_strchr = __strchr_sse2
+	cfi_endproc; .size __strchrnul_sse2, .-__strchrnul_sse2
 #endif
 
-#include "../strchr.S"
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2.S b/sysdeps/x86_64/multiarch/wcschr-avx2.S
new file mode 100644
index 0000000..67726b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr.S b/sysdeps/x86_64/multiarch/wcschr.S
new file mode 100644
index 0000000..e132acc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr.S
@@ -0,0 +1,67 @@
+/* Multiple versions of wcschr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(__wcschr)
+	.type	__wcschr, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__wcschr_avx2(%rip), %rax
+	ret
+
+1:	leaq	__wcschr_sse2(%rip), %rax
+	ret
+END(__wcschr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __wcschr_sse2, @function; \
+	.p2align 4; \
+	.globl __wcschr_sse2; \
+	.hidden __wcschr_sse2; \
+	__wcschr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __wcschr_sse2, .-__wcschr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal wcschr calls through a PLT.
+   The speedup we get from using AVX2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI___wcschr; __GI___wcschr = __wcschr_sse2
+#  undef libc_hidden_weak
+#  define libc_hidden_weak(name) \
+	.weak __GI_wcschr; __GI_wcschr = __wcschr_sse2
+# endif
+#endif
+
+#include "../wcschr.S"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=bdfa304cd82cded5441b921c9cabb7f0a5624835

commit bdfa304cd82cded5441b921c9cabb7f0a5624835
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri May 19 12:19:42 2017 -0700

    x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
    
    Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with
    a single vector compare instruction.  It is as fast as SSE2 versions for
    size <= 16 bytes and up to 1X faster for size > 16 bytes on Haswell.
    Select the AVX2 version on AVX2 machines where vzeroupper is preferred and
    AVX unaligned load is fast.
    
    NB: It uses TZCNT instead of BSF since TZCNT produces the same result
    as BSF for non-zero input.  TZCNT is faster than BSF and is executed
    as BSF if the machine doesn't support TZCNT.
    
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	strlen-avx2, strnlen-avx2 and wcslen-avx2 and wcsnlen-avx2.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Add tests for __strlen_avx2,
    	__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2,
    	__wcslen_sse2, __wcsnlen_avx2 and __wcsnlen_sse2.
    	* sysdeps/x86_64/multiarch/strlen-avx2.S: New file.
    	* sysdeps/x86_64/multiarch/strlen.S: Likewise.
    	* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/strnlen.S: Likewise.
    	* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/wcslen.S: Likewise.
    	* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/wcsnlen.S: Likewise.
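
All of the strlen/strnlen/wcslen/wcsnlen variants below share one core
pattern: compare 32 bytes against a zero vector, convert the result to a bit
mask with vpmovmskb, and once the mask is non-zero take tzcnt to find the
terminator.  A rough C-intrinsics rendering of the plain strlen case (a sketch
only: it handles the first vector with the align-down-and-shift-the-mask trick
used in the L(cros_page_boundary) path, and leaves out the strnlen length
bookkeeping and the vpminub 4-vector main loop) is:

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

/* Sketch of the AVX2 strlen idiom: 32 bytes per compare, NUL located
   with movemask + count-trailing-zeros.  The aligned-down first load
   may read bytes before S, but an aligned 32-byte load never crosses
   a page, so it is safe at the hardware level.  */
static size_t
strlen_avx2_sketch (const char *s)
{
  const __m256i zero = _mm256_setzero_si256 ();
  uintptr_t misalign = (uintptr_t) s & 31;
  const char *p = s - misalign;

  __m256i data = _mm256_load_si256 ((const __m256i *) p);
  unsigned int mask
    = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, zero));
  mask >>= misalign;       /* Drop the bytes before S, as the shift by %cl does.  */
  if (mask != 0)
    return __builtin_ctz (mask);

  for (;;)
    {
      p += 32;
      data = _mm256_load_si256 ((const __m256i *) p);
      mask = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, zero));
      if (mask != 0)
        return (size_t) (p - s) + __builtin_ctz (mask);
    }
}

The wide-character variants do the same thing with vpcmpeqd and convert the
byte count back to wchar_t units at the end, which is what the shrq $2
instructions in the USE_AS_WCSLEN paths are for; wcsnlen additionally scales
the incoming length by 4 (shl $2, %rsi) before the scan.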

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 68ab559..4ba527c 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memcpy-ssse3-back \
 		   memmove-ssse3-back \
 		   memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+		   strlen-avx2 strnlen-avx2 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
@@ -35,5 +36,6 @@ ifeq ($(subdir),wcsmbs)
 sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 		   wmemchr-avx2 \
 		   wmemcmp-avx2-movbe \
-		   wcscpy-ssse3 wcscpy-c
+		   wcscpy-ssse3 wcscpy-c \
+		   wcslen-avx2 wcsnlen-avx2
 endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f60535b..f139efc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -166,6 +166,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __rawmemchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/strlen.S.  */
+  IFUNC_IMPL (i, name, strlen,
+	      IFUNC_IMPL_ADD (array, i, strlen,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strlen_avx2)
+	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/strnlen.S.  */
+  IFUNC_IMPL (i, name, strnlen,
+	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strnlen_avx2)
+	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+
   /* Support sysdeps/x86_64/multiarch/stpncpy.S.  */
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
@@ -310,6 +324,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __wcscpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/wcslen.S.  */
+  IFUNC_IMPL (i, name, wcslen,
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wcslen_avx2)
+	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/wcsnlen.S.  */
+  IFUNC_IMPL (i, name, wcsnlen,
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wcsnlen_avx2)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
+
   /* Support sysdeps/x86_64/multiarch/wmemchr.S.  */
   IFUNC_IMPL (i, name, wmemchr,
 	      IFUNC_IMPL_ADD (array, i, wmemchr,
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
new file mode 100644
index 0000000..1dc823a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -0,0 +1,394 @@
+/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+#  define STRLEN	__strlen_avx2
+# endif
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+	.section .text.avx,"ax",@progbits
+ENTRY (STRLEN)
+# ifdef USE_AS_STRNLEN
+	/* Check for zero length.  */
+	testq	%rsi, %rsi
+	jz	L(zero)
+#  ifdef USE_AS_WCSLEN
+	shl	$2, %rsi
+#  endif
+	movq	%rsi, %r8
+# endif
+	movl	%edi, %ecx
+	movq	%rdi, %rdx
+	vpxor	%xmm0, %xmm0, %xmm0
+
+	/* Check if we may cross page boundary with one vector load.  */
+	andl	$(2 * VEC_SIZE - 1), %ecx
+	cmpl	$VEC_SIZE, %ecx
+	ja	L(cros_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  */
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+# ifdef USE_AS_STRNLEN
+	jnz	L(first_vec_x0_check)
+	/* Adjust length and check the end of data.  */
+	subq	$VEC_SIZE, %rsi
+	jbe	L(max)
+# else
+	jnz	L(first_vec_x0)
+# endif
+
+	/* Align data for aligned loads in the loop.  */
+	addq	$VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+	/* Adjust length.  */
+	addq	%rcx, %rsi
+
+	subq	$(VEC_SIZE * 4), %rsi
+	jbe	L(last_4x_vec_or_less)
+# endif
+	jmp	L(more_4x_vec)
+
+	.p2align 4
+L(cros_page_boundary):
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bytes.  */
+	sarl	%cl, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_STRNLEN
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+# endif
+	addq	%rdi, %rax
+	addq	%rcx, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(aligned_more):
+# ifdef USE_AS_STRNLEN
+        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+	    to avoid possible addition overflow.  */
+	negq	%rcx
+	addq	$VEC_SIZE, %rcx
+
+	/* Check the end of data.  */
+	subq	%rcx, %rsi
+	jbe	L(max)
+# endif
+
+	addq	$VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+	subq	$(VEC_SIZE * 4), %rsi
+	jbe	L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_STRNLEN
+	subq	$(VEC_SIZE * 4), %rsi
+	jbe	L(last_4x_vec_or_less)
+# endif
+
+	/* Align data to 4 * VEC_SIZE.  */
+	movq	%rdi, %rcx
+	andl	$(4 * VEC_SIZE - 1), %ecx
+	andq	$-(4 * VEC_SIZE), %rdi
+
+# ifdef USE_AS_STRNLEN
+	/* Adjust length.  */
+	addq	%rcx, %rsi
+# endif
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqa (%rdi), %ymm1
+	vmovdqa	VEC_SIZE(%rdi), %ymm2
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
+	VPMINU	%ymm1, %ymm2, %ymm5
+	VPMINU	%ymm3, %ymm4, %ymm6
+	VPMINU	%ymm5, %ymm6, %ymm5
+
+	VPCMPEQ	%ymm5, %ymm0, %ymm5
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_STRNLEN
+	jmp	L(loop_4x_vec)
+# else
+	subq	$(VEC_SIZE * 4), %rsi
+	ja	L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+	addl	$(VEC_SIZE * 2), %esi
+	jle	L(last_2x_vec)
+
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x2_check)
+	subl	$VEC_SIZE, %esi
+	jle	L(max)
+
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x3_check)
+	movq	%r8, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	addl	$(VEC_SIZE * 2), %esi
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x0_check)
+	subl	$VEC_SIZE, %esi
+	jle	L(max)
+
+	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+	movq	%r8, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x0_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x3_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	jbe	L(max)
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(max):
+	movq	%r8, %rax
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+#  endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+	.p2align 4
+L(first_vec_x0):
+	tzcntl	%eax, %eax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+	VPCMPEQ %ymm2, %ymm0, %ymm2
+	vpmovmskb %ymm2, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+	VPCMPEQ %ymm3, %ymm0, %ymm3
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+	VPCMPEQ %ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %eax
+	testl	%eax, %eax
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+	subq	%rdx, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	VZEROUPPER
+	ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
new file mode 100644
index 0000000..2847440
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -0,0 +1,64 @@
+/* Multiple versions of strlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(strlen)
+	.type	strlen, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__strlen_avx2(%rip), %rax
+	ret
+
+1:	leaq	__strlen_sse2(%rip), %rax
+	ret
+END(strlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strlen_sse2, @function; \
+	.p2align 4; \
+	.globl __strlen_sse2; \
+	.hidden __strlen_sse2; \
+	__strlen_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strlen_sse2, .-__strlen_sse2
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strlen calls through a PLT.
+   The speedup we get from using AVX2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_strlen; __GI_strlen = __strlen_sse2
+# endif
+#endif
+
+#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2.S b/sysdeps/x86_64/multiarch/strnlen-avx2.S
new file mode 100644
index 0000000..c4062b2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-avx2.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_avx2
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S
new file mode 100644
index 0000000..0c2289a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen.S
@@ -0,0 +1,65 @@
+/* Multiple versions of strnlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(__strnlen)
+	.type	__strnlen, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__strnlen_avx2(%rip), %rax
+	ret
+
+1:	leaq	__strnlen_sse2(%rip), %rax
+	ret
+END(__strnlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strnlen_sse2, @function; \
+	.p2align 4; \
+	.globl __strnlen_sse2; \
+	.hidden __strnlen_sse2; \
+	__strnlen_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal strnlen calls through a PLT.
+   The speedup we get from using AVX2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI_strnlen; __GI_strnlen = __strnlen_sse2; \
+	.globl __GI___strnlen; __GI___strnlen = __strnlen_sse2
+# endif
+#endif
+
+#include "../strnlen.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2.S b/sysdeps/x86_64/multiarch/wcslen-avx2.S
new file mode 100644
index 0000000..c9224f1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-avx2.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_avx2
+#define USE_AS_WCSLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen.S b/sysdeps/x86_64/multiarch/wcslen.S
new file mode 100644
index 0000000..04369b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen.S
@@ -0,0 +1,55 @@
+/* Multiple versions of wcslen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(__wcslen)
+	.type	__wcslen, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__wcslen_avx2(%rip), %rax
+	ret
+
+1:	leaq	__wcslen_sse2(%rip), %rax
+	ret
+END(__wcslen)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __wcslen_sse2, @function; \
+	.p2align 4; \
+	.globl __wcslen_sse2; \
+	.hidden __wcslen_sse2; \
+	__wcslen_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __wcslen_sse2, .-__wcslen_sse2
+#endif
+
+#include "../wcslen.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2.S
new file mode 100644
index 0000000..fac8354
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_avx2
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.S b/sysdeps/x86_64/multiarch/wcsnlen.S
new file mode 100644
index 0000000..0893bea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen.S
@@ -0,0 +1,55 @@
+/* Multiple versions of wcsnlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(__wcsnlen)
+	.type	__wcsnlen, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__wcsnlen_avx2(%rip), %rax
+	ret
+
+1:	leaq	__wcsnlen_sse2(%rip), %rax
+	ret
+END(__wcsnlen)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __wcsnlen_sse2, @function; \
+	.p2align 4; \
+	.globl __wcsnlen_sse2; \
+	.hidden __wcsnlen_sse2; \
+	__wcsnlen_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __wcsnlen_sse2, .-__wcsnlen_sse2
+#endif
+
+#include "../wcsnlen.S"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=218709ea8905b294b9c7475820ef518631d2858b

commit 218709ea8905b294b9c7475820ef518631d2858b
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Thu May 18 11:10:09 2017 -0700

    x86-64: Optimize memchr/rawmemchr/wmemchr with SSE2/AVX2
    
    SSE2 memchr is extended to support wmemchr.  AVX2 memchr/rawmemchr/wmemchr
    are added to search 32 bytes with a single vector compare instruction.
    AVX2 memchr/rawmemchr/wmemchr are as fast as SSE2 memchr/rawmemchr/wmemchr
    for small sizes and up to 1.5X faster for larger sizes on Haswell and
    Skylake.  Select AVX2 memchr/rawmemchr/wmemchr on AVX2 machines where
    vzeroupper is preferred and AVX unaligned load is fast.
    
    NB: It uses TZCNT instead of BSF since TZCNT produces the same result
    as BSF for non-zero input.  TZCNT is faster than BSF and is executed
    as BSF if the machine doesn't support TZCNT.
    
    	* sysdeps/x86_64/memchr.S (MEMCHR): New.  Depending on if
    	USE_AS_WMEMCHR is defined.
    	(PCMPEQ): Likewise.
    	(memchr): Renamed to ...
    	(MEMCHR): This.  Support wmemchr if USE_AS_WMEMCHR is defined.
    	Replace pcmpeqb with PCMPEQ.
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	memchr-avx2, rawmemchr-avx2 and wmemchr-avx2.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Test __memchr_avx2, __memchr_sse2,
    	__rawmemchr_avx2, __rawmemchr_sse2, __wmemchr_avx2 and
    	__wmemchr_sse2.
    	* sysdeps/x86_64/multiarch/memchr-avx2.S: New file.
    	* sysdeps/x86_64/multiarch/memchr.S: Likewise.
    	* sysdeps/x86_64/multiarch/rawmemchr-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/rawmemchr.S: Likewise.
    	* sysdeps/x86_64/multiarch/wmemchr-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/wmemchr.S: Likewise.
    	* sysdeps/x86_64/wmemchr.S: Likewise.
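
The bounded search below is again the same compare/movemask/tzcnt pattern,
plus an end-of-data check whenever a candidate match is found (the *_check
labels).  A simplified C-intrinsics sketch (illustrative only: unaligned
loads, a scalar tail instead of the masked vector handling, and none of the
rawmemchr/wmemchr conditionals) would be:

#include <immintrin.h>
#include <stddef.h>

/* Sketch of the AVX2 memchr idiom: 32-byte compares against the
   broadcast byte, first match located with movemask + ctz, remaining
   n % 32 bytes handled with a plain loop for brevity.  */
static void *
memchr_avx2_sketch (const void *s, int c, size_t n)
{
  const unsigned char *p = s;
  const __m256i vc = _mm256_set1_epi8 ((char) c);

  while (n >= 32)
    {
      __m256i data = _mm256_loadu_si256 ((const __m256i *) p);
      unsigned int mask
        = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, vc));
      if (mask != 0)
        return (void *) (p + __builtin_ctz (mask));
      p += 32;
      n -= 32;
    }
  for (; n != 0; --n, ++p)
    if (*p == (unsigned char) c)
      return (void *) p;
  return NULL;
}

rawmemchr is the same scan without any length checks, and wmemchr scales the
element count to bytes (shl $2, %rdx) and compares 4-byte elements with
vpcmpeqd, which is all the USE_AS_RAWMEMCHR/USE_AS_WMEMCHR conditionals in
memchr-avx2.S change.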

diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index d3be012..3167cd8 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -18,17 +18,31 @@
 
 #include <sysdep.h>
 
+#ifdef USE_AS_WMEMCHR
+# define MEMCHR		wmemchr
+# define PCMPEQ		pcmpeqd
+#else
+# define MEMCHR		memchr
+# define PCMPEQ		pcmpeqb
+#endif
+
 /* fast SSE2 version with using pmaxub and 64 byte loop */
 
 	.text
-ENTRY(memchr)
+ENTRY(MEMCHR)
 	movd	%esi, %xmm1
 	mov	%edi, %ecx
 
+#ifdef USE_AS_WMEMCHR
+	test	%rdx, %rdx
+	jz	L(return_null)
+	shl	$2, %rdx
+#else
 	punpcklbw %xmm1, %xmm1
 	test	%rdx, %rdx
 	jz	L(return_null)
 	punpcklbw %xmm1, %xmm1
+#endif
 
 	and	$63, %ecx
 	pshufd	$0, %xmm1, %xmm1
@@ -37,7 +51,7 @@ ENTRY(memchr)
 	ja	L(crosscache)
 
 	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
+	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 
@@ -58,7 +72,7 @@ L(crosscache):
 	and	$-16, %rdi
 	movdqa	(%rdi), %xmm0
 
-	pcmpeqb	%xmm1, %xmm0
+	PCMPEQ	%xmm1, %xmm0
 /* Check if there is a match.  */
 	pmovmskb %xmm0, %eax
 /* Remove the leading bytes.  */
@@ -90,25 +104,25 @@ L(unaligned_no_match):
 	.p2align 4
 L(loop_prolog):
 	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
+	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches)
 
 	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
+	PCMPEQ	%xmm1, %xmm2
 	pmovmskb %xmm2, %eax
 	test	%eax, %eax
 	jnz	L(matches16)
 
 	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
+	PCMPEQ	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32)
 
 	movdqa	48(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
+	PCMPEQ	%xmm1, %xmm4
 	add	$64, %rdi
 	pmovmskb %xmm4, %eax
 	test	%eax, %eax
@@ -121,25 +135,25 @@ L(loop_prolog):
 	jbe	L(exit_loop)
 
 	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
+	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches)
 
 	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
+	PCMPEQ	%xmm1, %xmm2
 	pmovmskb %xmm2, %eax
 	test	%eax, %eax
 	jnz	L(matches16)
 
 	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
+	PCMPEQ	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32)
 
 	movdqa	48(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
+	PCMPEQ	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
 
 	add	$64, %rdi
@@ -160,10 +174,10 @@ L(align64_loop):
 	movdqa	32(%rdi), %xmm3
 	movdqa	48(%rdi), %xmm4
 
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
+	PCMPEQ	%xmm1, %xmm0
+	PCMPEQ	%xmm1, %xmm2
+	PCMPEQ	%xmm1, %xmm3
+	PCMPEQ	%xmm1, %xmm4
 
 	pmaxub	%xmm0, %xmm3
 	pmaxub	%xmm2, %xmm4
@@ -186,9 +200,9 @@ L(align64_loop):
 	jnz	L(matches16)
 
 	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
+	PCMPEQ	%xmm1, %xmm3
 
-	pcmpeqb	48(%rdi), %xmm1
+	PCMPEQ	48(%rdi), %xmm1
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32)
@@ -204,26 +218,26 @@ L(exit_loop):
 	jle	L(exit_loop_32)
 
 	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
+	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches)
 
 	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
+	PCMPEQ	%xmm1, %xmm2
 	pmovmskb %xmm2, %eax
 	test	%eax, %eax
 	jnz	L(matches16)
 
 	movdqa	32(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
+	PCMPEQ	%xmm1, %xmm3
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
 	sub	$16, %edx
 	jle	L(return_null)
 
-	pcmpeqb	48(%rdi), %xmm1
+	PCMPEQ	48(%rdi), %xmm1
 	pmovmskb %xmm1, %eax
 	test	%eax, %eax
 	jnz	L(matches48_1)
@@ -234,14 +248,14 @@ L(exit_loop):
 L(exit_loop_32):
 	add	$32, %edx
 	movdqa	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
+	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
 	sub	$16, %edx
 	jbe	L(return_null)
 
-	pcmpeqb	16(%rdi), %xmm1
+	PCMPEQ	16(%rdi), %xmm1
 	pmovmskb %xmm1, %eax
 	test	%eax, %eax
 	jnz	L(matches16_1)
@@ -308,8 +322,13 @@ L(matches48_1):
 L(return_null):
 	xor	%eax, %eax
 	ret
-END(memchr)
+END(MEMCHR)
 
+#ifdef USE_AS_WMEMCHR
+libc_hidden_def (__wmemchr)
+weak_alias (__wmemchr, wmemchr)
+libc_hidden_weak (wmemchr)
+#else
 strong_alias (memchr, __memchr)
-
 libc_hidden_builtin_def(memchr)
+#endif
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 8697ac3..68ab559 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,6 +6,7 @@ ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
+		   memchr-avx2 rawmemchr-avx2 \
 		   memcmp-avx2-movbe \
 		   memcmp-sse4 memcpy-ssse3 \
 		   memmove-ssse3 \
@@ -32,6 +33,7 @@ endif
 
 ifeq ($(subdir),wcsmbs)
 sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+		   wmemchr-avx2 \
 		   wmemcmp-avx2-movbe \
 		   wcscpy-ssse3 wcscpy-c
 endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b61bc9f..f60535b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -38,6 +38,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   size_t i = 0;
 
+  /* Support sysdeps/x86_64/multiarch/memchr.S.  */
+  IFUNC_IMPL (i, name, memchr,
+	      IFUNC_IMPL_ADD (array, i, memchr,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __memchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+
   /* Support sysdeps/x86_64/multiarch/memcmp.S.  */
   IFUNC_IMPL (i, name, memcmp,
 	      IFUNC_IMPL_ADD (array, i, memcmp,
@@ -152,6 +159,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memset_avx512_no_vzeroupper)
 	     )
 
+  /* Support sysdeps/x86_64/multiarch/rawmemchr.S.  */
+  IFUNC_IMPL (i, name, rawmemchr,
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __rawmemchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+
   /* Support sysdeps/x86_64/multiarch/stpncpy.S.  */
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
@@ -296,6 +310,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __wcscpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/wmemchr.S.  */
+  IFUNC_IMPL (i, name, wmemchr,
+	      IFUNC_IMPL_ADD (array, i, wmemchr,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemchr_avx2)
+	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+
   /* Support sysdeps/x86_64/multiarch/wmemcmp.S.  */
   IFUNC_IMPL (i, name, wmemcmp,
 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
new file mode 100644
index 0000000..a7275ed
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -0,0 +1,340 @@
+/* memchr/wmemchr optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCHR
+#  define MEMCHR	__memchr_avx2
+# endif
+
+# ifdef USE_AS_WMEMCHR
+#  define VPCMPEQ	vpcmpeqd
+# else
+#  define VPCMPEQ	vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+	.section .text.avx,"ax",@progbits
+ENTRY (MEMCHR)
+# ifndef USE_AS_RAWMEMCHR
+	/* Check for zero length.  */
+	testq	%rdx, %rdx
+	jz	L(null)
+# endif
+	movl	%edi, %ecx
+	/* Broadcast CHAR to YMM0.  */
+	vmovd	%esi, %xmm0
+# ifdef USE_AS_WMEMCHR
+	shl	$2, %rdx
+	vpbroadcastd %xmm0, %ymm0
+# else
+	vpbroadcastb %xmm0, %ymm0
+# endif
+	/* Check if we may cross page boundary with one vector load.  */
+	andl	$(2 * VEC_SIZE - 1), %ecx
+	cmpl	$VEC_SIZE, %ecx
+	ja	L(cros_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  */
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+	jnz	L(first_vec_x0_check)
+	/* Adjust length and check the end of data.  */
+	subq	$VEC_SIZE, %rdx
+	jbe	L(zero)
+# else
+	jnz	L(first_vec_x0)
+# endif
+
+	/* Align data for aligned loads in the loop.  */
+	addq	$VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length.  */
+	addq	%rcx, %rdx
+
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+	jmp	L(more_4x_vec)
+
+	.p2align 4
+L(cros_page_boundary):
+	andl	$(VEC_SIZE - 1), %ecx
+	andq	$-VEC_SIZE, %rdi
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bytes.  */
+	sarl	%cl, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Check the end of data.  */
+	cmpq	%rax, %rdx
+	jbe	L(zero)
+# endif
+	addq	%rdi, %rax
+	addq	%rcx, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(aligned_more):
+# ifndef USE_AS_RAWMEMCHR
+        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+	   instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
+	   overflow.  */
+	negq	%rcx
+	addq	$VEC_SIZE, %rcx
+
+	/* Check the end of data.  */
+	subq	%rcx, %rdx
+	jbe	L(zero)
+# endif
+
+	addq	$VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+
+	/* Align data to 4 * VEC_SIZE.  */
+	movq	%rdi, %rcx
+	andl	$(4 * VEC_SIZE - 1), %ecx
+	andq	$-(4 * VEC_SIZE), %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length.  */
+	addq	%rcx, %rdx
+# endif
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+
+	vpor	%ymm1, %ymm2, %ymm5
+	vpor	%ymm3, %ymm4, %ymm6
+	vpor	%ymm5, %ymm6, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_RAWMEMCHR
+	jmp	L(loop_4x_vec)
+# else
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jle	L(last_2x_vec)
+
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+
+	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x2_check)
+	subl	$VEC_SIZE, %edx
+	jle	L(zero)
+
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x3_check)
+	xorl	%eax, %eax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	addl	$(VEC_SIZE * 2), %edx
+	VPCMPEQ (%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+
+	jnz	L(first_vec_x0_check)
+	subl	$VEC_SIZE, %edx
+	jle	L(zero)
+
+	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+	xorl	%eax, %eax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x0_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rdx
+	jbe	L(zero)
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rdx
+	jbe	L(zero)
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rdx
+	jbe	L(zero)
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x3_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rdx
+	jbe	L(zero)
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(zero):
+	VZEROUPPER
+L(null):
+	xorl	%eax, %eax
+	ret
+# endif
+
+	.p2align 4
+L(first_vec_x0):
+	tzcntl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	addq	$VEC_SIZE, %rax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2), %rax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x0)
+	vpmovmskb %ymm2, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+	vpmovmskb %ymm3, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+	vpmovmskb %ymm4, %eax
+	testl	%eax, %eax
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3), %rax
+	addq	%rdi, %rax
+	VZEROUPPER
+	ret
+
+END (MEMCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memchr.S b/sysdeps/x86_64/multiarch/memchr.S
new file mode 100644
index 0000000..dee3fd1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr.S
@@ -0,0 +1,64 @@
+/* Multiple versions of memchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(memchr)
+	.type	memchr, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__memchr_avx2(%rip), %rax
+	ret
+
+1:	leaq	__memchr_sse2(%rip), %rax
+	ret
+END(memchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __memchr_sse2, @function; \
+	.p2align 4; \
+	.globl __memchr_sse2; \
+	.hidden __memchr_sse2; \
+	__memchr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __memchr_sse2, .-__memchr_sse2
+
+# ifdef SHARED
+#  undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memchr calls through a PLT.
+   The speedup we get from using AVX2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memchr; __GI_memchr = __memchr_sse2
+# endif
+#endif
+
+#include "../memchr.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S
new file mode 100644
index 0000000..128f9ea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_avx2
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
new file mode 100644
index 0000000..b938f13
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr.S
@@ -0,0 +1,64 @@
+/* Multiple versions of rawmemchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(__rawmemchr)
+	.type	__rawmemchr, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__rawmemchr_avx2(%rip), %rax
+	ret
+
+1:	leaq	__rawmemchr_sse2(%rip), %rax
+	ret
+END(__rawmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __rawmemchr_sse2, @function; \
+	.p2align 4; \
+	.globl __rawmemchr_sse2; \
+	.hidden __rawmemchr_sse2; \
+	__rawmemchr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __rawmemchr_sse2, .-__rawmemchr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal rawmemchr calls through a
+   PLT.  The speedup we get from using AVX2 instructions is likely eaten
+   away by the indirect call in the PLT.  */
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
+# endif
+#endif
+
+#include "../rawmemchr.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S
new file mode 100644
index 0000000..282854f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_avx2
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr.S b/sysdeps/x86_64/multiarch/wmemchr.S
new file mode 100644
index 0000000..b79df23
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr.S
@@ -0,0 +1,67 @@
+/* Multiple versions of wmemchr
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+	.text
+ENTRY(__wmemchr)
+	.type	__wmemchr, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__wmemchr_avx2(%rip), %rax
+	ret
+
+1:	leaq	__wmemchr_sse2(%rip), %rax
+	ret
+END(__wmemchr)
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __wmemchr_sse2, @function; \
+	.p2align 4; \
+	.globl __wmemchr_sse2; \
+	.hidden __wmemchr_sse2; \
+	__wmemchr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __wmemchr_sse2, .-__wmemchr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal wmemchr calls through a PLT.
+   The speedup we get from using AVX2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+#  undef libc_hidden_def
+#  define libc_hidden_def(name) \
+	.globl __GI___wmemchr; __GI___wmemchr = __wmemchr_sse2
+#  undef libc_hidden_weak
+#  define libc_hidden_weak(name) \
+	.weak __GI_wmemchr; __GI_wmemchr = __wmemchr_sse2
+# endif
+#endif
+
+#include "../wmemchr.S"
diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S
new file mode 100644
index 0000000..9d8079b
--- /dev/null
+++ b/sysdeps/x86_64/wmemchr.S
@@ -0,0 +1,3 @@
+#define USE_AS_WMEMCHR 1
+
+#include "memchr.S"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0cbd97cccf0aadfcda6beb0c8b30accbd4f2237a

commit 0cbd97cccf0aadfcda6beb0c8b30accbd4f2237a
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Jul 19 09:44:05 2016 -0700

    x86-64: Optimize memcmp/wmemcmp with AVX2 and MOVBE
    
    Optimize x86-64 memcmp/wmemcmp with AVX2.  It uses vector compare as
    much as possible.  It is as fast as SSE4 memcmp for size <= 16 bytes
    and up to 2X faster for size > 16 bytes on Haswell and Skylake.  Select
    AVX2 memcmp/wmemcmp on AVX2 machines where vzeroupper is preferred and
    AVX unaligned load is fast.
    
    NB: It uses TZCNT instead of BSF since TZCNT produces the same result
    as BSF for non-zero input.  TZCNT is faster than BSF and executes as
    BSF on machines that do not support TZCNT.
    
    Key features:
    
    1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
       to avoid branches.
    2. Use overlapping compare to avoid branch.
    3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
       bytes for wmemcmp.
    4. If size is 8 * VEC_SIZE or less, unroll the loop.
    5. Compare 4 * VEC_SIZE at a time with the aligned first memory area.
    6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
    7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
    8. Use 8 vector compares when size is 8 * VEC_SIZE or less.
    
    	* sysdeps/x86/cpu-features.h (index_cpu_MOVBE): New.
    	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
    	memcmp-avx2 and wmemcmp-avx2.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Test __memcmp_avx2 and __wmemcmp_avx2.
    	* sysdeps/x86_64/multiarch/memcmp-avx2.S: New file.
    	* sysdeps/x86_64/multiarch/wmemcmp-avx2.S: Likewise.
    	* sysdeps/x86_64/multiarch/memcmp.S: Use __memcmp_avx2 on AVX
    	2 machines if AVX unaligned load is fast and vzeroupper is
    	preferred.
    	* sysdeps/x86_64/multiarch/wmemcmp.S: Use __wmemcmp_avx2 on AVX
    	2 machines if AVX unaligned load is fast and vzeroupper is
    	preferred.
    
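Items 1 and 2 of the key features above are the least obvious, so here is a
minimal C sketch of them (not the glibc code; load32_be and cmp_4_to_7 are
invented names and __builtin_bswap32 stands in for movbe/bswap): two
overlapping 32-bit loads, viewed big-endian, compare 4 to 7 bytes with one
64-bit comparison and no length-dependent branch.

#include <stdint.h>
#include <string.h>

/* Big-endian view of 4 bytes, like a movbe load in the assembly.  */
static inline uint32_t
load32_be (const unsigned char *p)
{
  uint32_t v;
  memcpy (&v, p, 4);
  return __builtin_bswap32 (v);
}

/* Compare 4 <= n <= 7 bytes.  The windows [0,4) and [n-4,n) overlap and
   cover the whole range, and concatenating their big-endian values
   preserves memcmp ordering, so one unsigned comparison decides.  */
static int
cmp_4_to_7 (const unsigned char *s1, const unsigned char *s2, size_t n)
{
  uint64_t a = ((uint64_t) load32_be (s1) << 32) | load32_be (s1 + n - 4);
  uint64_t b = ((uint64_t) load32_be (s2) << 32) | load32_be (s2 + n - 4);
  return (a > b) - (a < b);
}

The L(between_4_7) code in the diff below reaches the same -1/0/1 result
with a subq/sbbl/orl sequence instead of the relational operators used here.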

diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 31c7c80..f2329ea 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -102,6 +102,7 @@
 # define index_cpu_AVX	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 # define index_cpu_AVX2	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
 # define index_cpu_ERMS	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
+# define index_cpu_MOVBE COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
 
 # define index_arch_Fast_Rep_String	FEATURE_INDEX_1*FEATURE_SIZE
 # define index_arch_Fast_Copy_Backward	FEATURE_INDEX_1*FEATURE_SIZE
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 3736f54..8697ac3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,6 +6,7 @@ ifeq ($(subdir),string)
 
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcmp-sse2-unaligned strncmp-ssse3 \
+		   memcmp-avx2-movbe \
 		   memcmp-sse4 memcpy-ssse3 \
 		   memmove-ssse3 \
 		   memcpy-ssse3-back \
@@ -30,5 +31,7 @@ CFLAGS-strspn-c.c += -msse4
 endif
 
 ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+		   wmemcmp-avx2-movbe \
+		   wcscpy-ssse3 wcscpy-c
 endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a91d2f9..b61bc9f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -40,6 +40,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memcmp.S.  */
   IFUNC_IMPL (i, name, memcmp,
+	      IFUNC_IMPL_ADD (array, i, memcmp,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (MOVBE)),
+			      __memcmp_avx2_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSE4_1),
 			      __memcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, memcmp, HAS_CPU_FEATURE (SSSE3),
@@ -294,6 +298,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/wmemcmp.S.  */
   IFUNC_IMPL (i, name, wmemcmp,
+	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (MOVBE)),
+			      __wmemcmp_avx2_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSE4_1),
 			      __wmemcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
new file mode 100644
index 0000000..47630dd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -0,0 +1,425 @@
+/* memcmp/wmemcmp optimized with AVX2.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+      to avoid branches.
+   2. Use overlapping compare to avoid branch.
+   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+      bytes for wmemcmp.
+   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+      area.
+   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_avx2_movbe
+# endif
+
+# ifdef USE_AS_WMEMCMP
+#  define VPCMPEQ	vpcmpeqd
+# else
+#  define VPCMPEQ	vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define VEC_SIZE 32
+# define VEC_MASK ((1 << VEC_SIZE) - 1)
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.avx,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %rdx
+# endif
+	cmpq	$VEC_SIZE, %rdx
+	jb	L(less_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+
+L(last_2x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+L(last_vec):
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec):
+	/* A byte or int32 is different within 16 or 32 bytes.  */
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(%rdi, %rcx), %edx
+	cmpl	(%rsi, %rcx), %edx
+L(wmemcmp_return):
+	setl	%al
+	negl	%eax
+	orl	$1, %eax
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+# ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(4):
+	xorl	%eax, %eax
+	movl	(%rdi), %edx
+	cmpl	(%rsi), %edx
+	jne	L(wmemcmp_return)
+	ret
+# else
+	.p2align 4
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.  */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	je	L(exit)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	ret
+
+	.p2align 4
+L(exit):
+	ret
+
+	.p2align 4
+L(between_2_3):
+	/* Load as big endian with overlapping loads and bswap to avoid
+	   branches.  */
+	movzwl	-2(%rdi, %rdx), %eax
+	movzwl	-2(%rsi, %rdx), %ecx
+	shll	$16, %eax
+	shll	$16, %ecx
+	movzwl	(%rdi), %edi
+	movzwl	(%rsi), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	bswap	%eax
+	bswap	%ecx
+	subl	%ecx, %eax
+	ret
+
+	.p2align 4
+L(1):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	subl	%ecx, %eax
+	ret
+# endif
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+	cmpb	$4, %dl
+	je	L(4)
+	jb	L(zero)
+# else
+	cmpb	$1, %dl
+	je	L(1)
+	jb	L(zero)
+	cmpb	$4, %dl
+	jb	L(between_2_3)
+	cmpb	$8, %dl
+	jb	L(between_4_7)
+# endif
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+	/* It is between 8 and 15 bytes.  */
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMPEQ %xmm1, %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-8(%rdi, %rdx), %rdi
+	leaq	-8(%rsi, %rdx), %rsi
+	vmovq	(%rdi), %xmm1
+	vmovq	(%rsi), %xmm2
+	VPCMPEQ %xmm1, %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	vmovdqu	(%rsi), %xmm2
+	VPCMPEQ (%rdi), %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-16(%rdi, %rdx), %rdi
+	leaq	-16(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %xmm2
+	VPCMPEQ (%rdi), %xmm2, %xmm2
+	vpmovmskb %xmm2, %eax
+	subl    $0xffff, %eax
+	jnz	L(first_vec)
+	ret
+
+	.p2align 4
+L(more_2x_vec):
+	/* More than 2 * VEC.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+
+	/* From 4 * VEC to 8 * VEC, inclusively. */
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ (%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi), %ymm2
+	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(4x_vec_end)
+
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ (%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi), %ymm2
+	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+	vpand	%ymm2, %ymm1, %ymm5
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	vpand	%ymm3, %ymm5, %ymm5
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	vpand	%ymm4, %ymm5, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(4x_vec_end)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(more_8x_vec):
+	/* More than 8 * VEC.  Check the first VEC.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Align the first memory area for aligned loads in the loop.
+	   Compute how much the first memory area is misaligned.  */
+	movq	%rdi, %rcx
+	andl	$(VEC_SIZE - 1), %ecx
+	/* Get the negative of offset for alignment.  */
+	subq	$VEC_SIZE, %rcx
+	/* Adjust the second memory area.  */
+	subq	%rcx, %rsi
+	/* Adjust the first memory area which should be aligned now.  */
+	subq	%rcx, %rdi
+	/* Adjust length.  */
+	addq	%rcx, %rdx
+
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ (%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi), %ymm2
+	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+	vpand	%ymm2, %ymm1, %ymm5
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	vpand	%ymm3, %ymm5, %ymm5
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	vpand	%ymm4, %ymm5, %ymm5
+
+	vpmovmskb %ymm5, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(4x_vec_end)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rsi
+
+	subq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jae	L(loop_4x_vec)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(last_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_2x_vec)
+
+L(last_4x_vec):
+	/* From 2 * VEC to 4 * VEC. */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi
+	addq	$VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ (%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(4x_vec_end):
+	vpmovmskb %ymm1, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	vpmovmskb %ymm2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x1)
+	vpmovmskb %ymm3, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x2)
+	vpmovmskb %ymm4, %eax
+	subl	$VEC_MASK, %eax
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	VEC_SIZE(%rdi, %rcx), %edx
+	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	sub	%edx, %eax
+# endif
+	VZEROUPPER
+	ret
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index 6129820..0c9804b 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -27,7 +27,18 @@
 ENTRY(memcmp)
 	.type	memcmp, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_CPU_FEATURE (SSSE3)
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_CPU_FEATURE (MOVBE)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__memcmp_avx2_movbe(%rip), %rax
+	ret
+
+1:	HAS_CPU_FEATURE (SSSE3)
 	jnz	2f
 	leaq	__memcmp_sse2(%rip), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S
new file mode 100644
index 0000000..bfa1a16
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_avx2_movbe
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-avx2-movbe.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index 5dc54d7..94b25a2 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -27,7 +27,18 @@
 ENTRY(wmemcmp)
 	.type	wmemcmp, @gnu_indirect_function
 	LOAD_RTLD_GLOBAL_RO_RDX
-	HAS_CPU_FEATURE (SSSE3)
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	HAS_CPU_FEATURE (MOVBE)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	leaq	__wmemcmp_avx2_movbe(%rip), %rax
+	ret
+
+1:	HAS_CPU_FEATURE (SSSE3)
 	jnz	2f
 	leaq	__wmemcmp_sse2(%rip), %rax
 	ret

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9ecca7ff33631721e925236d0ea8e760852378a0

commit 9ecca7ff33631721e925236d0ea8e760852378a0
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun May 21 09:22:44 2017 -0700

    x86-64: Optimize wmemset with SSE2/AVX2/AVX512
    
    The difference between memset and wmemset is byte vs int.  Add stubs
    to SSE2/AVX2/AVX512 memset for wmemset with updated constant and size:
    
    SSE2 wmemset:
    	shl    $0x2,%rdx
    	movd   %esi,%xmm0
    	mov    %rdi,%rax
    	pshufd $0x0,%xmm0,%xmm0
    	jmp	entry_from_wmemset
    
    SSE2 memset:
    	movd   %esi,%xmm0
    	mov    %rdi,%rax
    	punpcklbw %xmm0,%xmm0
    	punpcklwd %xmm0,%xmm0
    	pshufd $0x0,%xmm0,%xmm0
    entry_from_wmemset:
    
    Since the ERMS version of wmemset requires "rep stosl" instead of
    "rep stosb", only the vector store stubs of SSE2/AVX2/AVX512 wmemset
    are added.  The SSE2 wmemset is about 3X faster and the AVX2 wmemset
    is about 6X faster on Haswell.
    
    	* include/wchar.h (__wmemset_chk): New.
    	* sysdeps/x86_64/memset.S (VDUP_TO_VEC0_AND_SET_RETURN): Renamed
    	to MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
    	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
    	(WMEMSET_CHK_SYMBOL): Likewise.
    	(WMEMSET_SYMBOL): Likewise.
    	(__wmemset): Add hidden definition.
    	(wmemset): Add weak hidden definition.
    	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
    	(__libc_ifunc_impl_list): Add __wmemset_sse2_unaligned,
    	__wmemset_avx2_unaligned, __wmemset_avx512_unaligned,
    	__wmemset_chk_sse2_unaligned, __wmemset_chk_avx2_unaligned
    	and __wmemset_chk_avx512_unaligned.
    	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
    	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
    	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
    	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
    	(WMEMSET_SYMBOL): Likewise.
    	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
    	(VDUP_TO_VEC0_AND_SET_RETURN): Renamed to ...
    	(MEMSET_VDUP_TO_VEC0_AND_SET_RETURN): This.
    	(WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN): New.
    	(WMEMSET_SYMBOL): Likewise.
    	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Updated.
    	(WMEMSET_CHK_SYMBOL): New.
    	(WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)): Likewise.
    	(WMEMSET_SYMBOL (__wmemset, unaligned)): Likewise.
    	* sysdeps/x86_64/multiarch/memset.S (WMEMSET_SYMBOL): New.
    	(libc_hidden_builtin_def): Also define __GI_wmemset and
    	__GI___wmemset.
    	(weak_alias): New.
    	* sysdeps/x86_64/multiarch/wmemset.S: New file.
    	* sysdeps/x86_64/multiarch/wmemset_chk.S: Likewise.
    	* sysdeps/x86_64/wmemset.S: Likewise.
    	* sysdeps/x86_64/wmemset_chk.S: Likewise.
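
To make the stub description above concrete, here is a rough C/intrinsics
sketch (not the glibc source; wmemset_avx2_sketch is an invented name and a
4-byte wchar_t is assumed) of the only two places where the wmemset entry
differs from memset: the element count is scaled to a byte count, and the
fill value is broadcast per 32-bit lane instead of per byte.

#include <immintrin.h>
#include <stddef.h>

static void
wmemset_avx2_sketch (wchar_t *s, wchar_t c, size_t n)
{
  size_t nbytes = n * sizeof (wchar_t);      /* shl $0x2, %rdx in the stub */
  __m256i v = _mm256_set1_epi32 ((int) c);   /* vpbroadcastd, not vpbroadcastb */
  char *p = (char *) s;
  size_t i = 0;

  /* Simplified main loop; the real code additionally handles small sizes,
     alignment and the ERMS path, which wmemset cannot reuse because it
     would need "rep stosl" rather than "rep stosb".  */
  for (; i + 32 <= nbytes; i += 32)
    _mm256_storeu_si256 ((__m256i *) (p + i), v);
  for (; i < nbytes; i += sizeof (wchar_t))  /* crude scalar tail */
    *(wchar_t *) (p + i) = c;
}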

diff --git a/include/wchar.h b/include/wchar.h
index e2579a1..a773d56 100644
--- a/include/wchar.h
+++ b/include/wchar.h
@@ -157,6 +157,9 @@ extern wchar_t *__wmemmove (wchar_t *__s1, const wchar_t *__s2,
 extern wchar_t *__wcschrnul (const wchar_t *__s, wchar_t __wc)
      __attribute_pure__;
 
+extern wchar_t *__wmemset_chk (wchar_t *__s, wchar_t __c, size_t __n,
+			       size_t __ns) __THROW;
+
 extern int __vfwscanf (__FILE *__restrict __s,
 		       const wchar_t *__restrict __format,
 		       __gnuc_va_list __arg)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 69ed509..4127878 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -26,13 +26,18 @@
 #define VMOVU		movdqu
 #define VMOVA		movdqa
 
-#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   movq r, %rax; \
   punpcklbw %xmm0, %xmm0; \
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
+#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movd d, %xmm0; \
+  movq r, %rax; \
+  pshufd $0, %xmm0, %xmm0
+
 #define SECTION(p)		p
 
 #ifndef MEMSET_SYMBOL
@@ -40,10 +45,21 @@
 # define MEMSET_SYMBOL(p,s)	memset
 #endif
 
+#ifndef WMEMSET_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s) p
+# define WMEMSET_SYMBOL(p,s)	__wmemset
+#endif
+
 #include "multiarch/memset-vec-unaligned-erms.S"
 
 libc_hidden_builtin_def (memset)
 
+#if IS_IN (libc)
+libc_hidden_def (__wmemset)
+weak_alias (__wmemset, wmemset)
+libc_hidden_weak (wmemset)
+#endif
+
 #if defined SHARED && IS_IN (libc) && !defined USE_MULTIARCH
 strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
 	.section .gnu.warning.__memset_zero_constant_len_parameter
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 06d9a9d..a91d2f9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -300,6 +300,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
 
+  /* Support sysdeps/x86_64/multiarch/wmemset.S.  */
+  IFUNC_IMPL (i, name, wmemset,
+	      IFUNC_IMPL_ADD (array, i, wmemset, 1,
+			      __wmemset_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __wmemset_avx512_unaligned))
+
 #ifdef SHARED
   /* Support sysdeps/x86_64/multiarch/memcpy_chk.S.  */
   IFUNC_IMPL (i, name, __memcpy_chk,
@@ -417,6 +428,17 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
 			      __strncmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
+
+  /* Support sysdeps/x86_64/multiarch/wmemset_chk.S.  */
+  IFUNC_IMPL (i, name, __wmemset_chk,
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk, 1,
+			      __wmemset_chk_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      __wmemset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      HAS_ARCH_FEATURE (AVX512F_Usable),
+			      __wmemset_chk_avx512_unaligned))
 #endif
 
   return i;
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 79975e0..7ab3d89 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -4,13 +4,19 @@
 # define VMOVU		vmovdqu
 # define VMOVA		vmovdqa
 
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
   movq r, %rax; \
   vpbroadcastb %xmm0, %ymm0
 
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastd %xmm0, %ymm0
+
 # define SECTION(p)		p##.avx
 # define MEMSET_SYMBOL(p,s)	p##_avx2_##s
+# define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index a5ec349..0783979 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -4,14 +4,21 @@
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 
-# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
   movq r, %rax; \
   vpbroadcastb %xmm0, %xmm0; \
   vpbroadcastq %xmm0, %zmm0
 
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  vmovd d, %xmm0; \
+  movq r, %rax; \
+  vpbroadcastd %xmm0, %xmm0; \
+  vpbroadcastq %xmm0, %zmm0
+
 # define SECTION(p)		p##.avx512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+# define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
 
 # include "memset-vec-unaligned-erms.S"
 #endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 704eed9..2eb9e37 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -30,6 +30,10 @@
 # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
 #endif
 
+#ifndef WMEMSET_CHK_SYMBOL
+# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
+#endif
+
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -79,6 +83,21 @@ END (__bzero)
 weak_alias (__bzero, bzero)
 #endif
 
+#if IS_IN (libc)
+# if defined SHARED
+ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+# endif
+
+ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+	shlq	$2, %rdx
+	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	jmp	L(entry_from_bzero)
+END (WMEMSET_SYMBOL (__wmemset, unaligned))
+#endif
+
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 	cmpq	%rdx, %rcx
@@ -87,8 +106,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
-L(memset_entry):
-	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
@@ -132,7 +150,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
-	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
 	cmpq	$(VEC_SIZE * 2), %rdx
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 9d33118..11f2737 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -58,16 +58,23 @@ END(memset)
 
 #if IS_IN (libc)
 # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+# define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
 
 # ifdef SHARED
-# undef libc_hidden_builtin_def
+#  undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal memset calls through a PLT.
    The speedup we get from using SSE2 instructions is likely eaten away
    by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_memset; __GI_memset = __memset_sse2_unaligned
+#  define libc_hidden_builtin_def(name) \
+	.globl __GI_memset; __GI_memset = __memset_sse2_unaligned; \
+	.globl __GI_wmemset; __GI_wmemset = __wmemset_sse2_unaligned; \
+	.globl __GI___wmemset; __GI___wmemset = __wmemset_sse2_unaligned
 # endif
 
+# undef weak_alias
+# define weak_alias(original, alias) \
+	.weak bzero; bzero = __bzero
+
 # undef strong_alias
 # define strong_alias(original, alias)
 #endif
diff --git a/sysdeps/x86_64/multiarch/wmemset.S b/sysdeps/x86_64/multiarch/wmemset.S
new file mode 100644
index 0000000..3bd7ca2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset.S
@@ -0,0 +1,47 @@
+/* Multiple versions of wmemset
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+ENTRY(__wmemset)
+	.type	__wmemset, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	lea	__wmemset_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+	jz	1f
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	lea	__wmemset_avx2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	lea	__wmemset_avx512_unaligned(%rip), %RAX_LP
+1:	ret
+END(__wmemset)
+
+weak_alias (__wmemset, wmemset)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemset_chk.S b/sysdeps/x86_64/multiarch/wmemset_chk.S
new file mode 100644
index 0000000..c76fcb1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemset_chk.S
@@ -0,0 +1,46 @@
+/* Multiple versions of wmemset_chk
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include <shlib-compat.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# ifdef SHARED
+ENTRY(__wmemset_chk)
+	.type	__wmemset_chk, @gnu_indirect_function
+	LOAD_RTLD_GLOBAL_RO_RDX
+	lea	__wmemset_chk_sse2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (AVX2_Usable)
+	jz	1f
+	lea	__wmemset_chk_avx2_unaligned(%rip), %RAX_LP
+	HAS_ARCH_FEATURE (Prefer_No_AVX512)
+	jnz	1f
+	HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+	jnz	1f
+	HAS_ARCH_FEATURE (AVX512F_Usable)
+	jz	1f
+	lea	__wmemset_chk_avx512_unaligned(%rip), %RAX_LP
+1:	ret
+END(__wmemset_chk)
+# else
+#  include "../wmemset_chk.S"
+# endif
+#endif
diff --git a/sysdeps/x86_64/wmemset.S b/sysdeps/x86_64/wmemset.S
new file mode 100644
index 0000000..f96d567
--- /dev/null
+++ b/sysdeps/x86_64/wmemset.S
@@ -0,0 +1 @@
+/* Implemented in memset.S.  */
diff --git a/sysdeps/x86_64/wmemset_chk.S b/sysdeps/x86_64/wmemset_chk.S
new file mode 100644
index 0000000..64c2774
--- /dev/null
+++ b/sysdeps/x86_64/wmemset_chk.S
@@ -0,0 +1,33 @@
+/* Checking wmemset for x86-64.
+   Copyright (C) 2004-2017 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
+#ifndef SHARED
+	/* For libc.so this is defined in wmemset.S.
+	   For libc.a, this is a separate source to avoid
+	   wmemset bringing in __chk_fail and all routines
+	   it calls.  */
+        .text
+ENTRY (__wmemset_chk)
+	cmpq	%rdx, %rcx
+	jb	__chk_fail
+	jmp	wmemset
+END (__wmemset_chk)
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8e457b6d134cd90f1c9062792074273951e68d8f

commit 8e457b6d134cd90f1c9062792074273951e68d8f
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat May 20 06:48:04 2017 -0700

    x86-64: Update strlen.S to support wcslen/wcsnlen
    
    The difference between strlen and wcslen is byte vs int.  We can
    replace pminub and pcmpeqb with pminud and pcmpeqd to turn strlen
    into wcslen.
    
    	* sysdeps/x86_64/strlen.S (PMINU): New.
    	(PCMPEQ): Likewise.
    	(SHIFT_RETURN): Likewise.
    	(FIND_ZERO): Replace pcmpeqb with PCMPEQ.
    	(strlen): Add SHIFT_RETURN before ret.  Replace pcmpeqb and
    	pminub with PCMPEQ and PMINU.
    	* sysdeps/x86_64/wcsnlen.S: New file.
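
The same point in code form, as a hedged SSE2-intrinsics sketch (not the
glibc source; wcslen_sse2_sketch is an invented name, S is assumed to be
16-byte aligned, and wchar_t is assumed to be 4 bytes): only the compare
width (pcmpeqd instead of pcmpeqb) and the final SHIFT_RETURN-style shift
distinguish the wcslen form from strlen.

#include <emmintrin.h>
#include <stddef.h>

static size_t
wcslen_sse2_sketch (const wchar_t *s)
{
  const __m128i zero = _mm_setzero_si128 ();
  size_t i = 0;                      /* index in wchar_t units */

  /* Aligned 16-byte loads never cross a page boundary, so reading up to
     the block that contains the terminator is safe.  */
  for (;;)
    {
      __m128i chunk = _mm_load_si128 ((const __m128i *) (s + i));
      __m128i eq = _mm_cmpeq_epi32 (chunk, zero);   /* PCMPEQ -> pcmpeqd */
      int mask = _mm_movemask_epi8 (eq);            /* byte mask, as pmovmskb */
      if (mask != 0)
        /* SHIFT_RETURN: convert the byte offset into a wchar_t count.  */
        return i + ((size_t) __builtin_ctz (mask) >> 2);
      i += 4;                                       /* 4 wide chars per 16 bytes */
    }
}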

diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 5896e6b..b5ab117 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,4 +1,4 @@
-/* SSE2 version of strlen.
+/* SSE2 version of strlen/wcslen.
    Copyright (C) 2012-2017 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -18,6 +18,16 @@
 
 #include <sysdep.h>
 
+#ifdef AS_WCSLEN
+# define PMINU		pminud
+# define PCMPEQ		pcmpeqd
+# define SHIFT_RETURN	shrq $2, %rax
+#else
+# define PMINU		pminub
+# define PCMPEQ		pcmpeqb
+# define SHIFT_RETURN
+#endif
+
 /* Long lived register in strlen(s), strnlen(s, n) are:
 
 	%xmm3 - zero
@@ -32,10 +42,10 @@ ENTRY(strlen)
 
 /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
 #define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm0;	\
-	pcmpeqb	16(%rax), %xmm1;	\
-	pcmpeqb	32(%rax), %xmm2;	\
-	pcmpeqb	48(%rax), %xmm3;	\
+	PCMPEQ	(%rax), %xmm0;	\
+	PCMPEQ	16(%rax), %xmm1;	\
+	PCMPEQ	32(%rax), %xmm2;	\
+	PCMPEQ	48(%rax), %xmm3;	\
 	pmovmskb	%xmm0, %esi;	\
 	pmovmskb	%xmm1, %edx;	\
 	pmovmskb	%xmm2, %r8d;	\
@@ -54,6 +64,9 @@ ENTRY(strlen)
 	xor	%rax, %rax
 	ret
 L(n_nonzero):
+# ifdef AS_WCSLEN
+	shlq	$2, %rsi
+# endif
 
 /* Initialize long lived registers.  */
 
@@ -96,6 +109,7 @@ L(n_nonzero):
 	test	%rdx, %rdx;	\
 	je	L(lab);	\
 	bsfq	%rdx, %rax;	\
+	SHIFT_RETURN;		\
 	ret
 
 #ifdef AS_STRNLEN
@@ -104,19 +118,20 @@ L(n_nonzero):
 #else
 	/* Test first 16 bytes unaligned.  */
 	movdqu	(%rax), %xmm4
-	pcmpeqb	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm4
 	pmovmskb	%xmm4, %edx
 	test	%edx, %edx
 	je 	L(next48_bytes)
 	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+	SHIFT_RETURN
 	ret
 
 L(next48_bytes):
 /* Same as FIND_ZERO except we do not check first 16 bytes.  */
 	andq	$-16, %rax
-	pcmpeqb 16(%rax), %xmm1
-	pcmpeqb 32(%rax), %xmm2
-	pcmpeqb 48(%rax), %xmm3
+	PCMPEQ 16(%rax), %xmm1
+	PCMPEQ 32(%rax), %xmm2
+	PCMPEQ 48(%rax), %xmm3
 	pmovmskb	%xmm1, %edx
 	pmovmskb	%xmm2, %r8d
 	pmovmskb	%xmm3, %ecx
@@ -145,6 +160,7 @@ L(strnlen_ret):
 	test	%rdx, %rdx
 	je	L(loop_init)
 	bsfq	%rdx, %rax
+	SHIFT_RETURN
 	ret
 #endif
 	.p2align 4
@@ -161,10 +177,10 @@ L(loop):
 	je	L(exit_end)
 
 	movdqa	(%rax), %xmm0
-	pminub	16(%rax), %xmm0
-	pminub	32(%rax), %xmm0
-	pminub	48(%rax), %xmm0
-	pcmpeqb	%xmm3, %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
 	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit)
@@ -182,6 +198,7 @@ L(first):
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
 	subq	%rdi, %rax
+	SHIFT_RETURN
 	ret
 
 	.p2align 4
@@ -192,6 +209,7 @@ L(exit):
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
 	subq	%rdi, %rax
+	SHIFT_RETURN
 	ret
 
 #else
@@ -201,10 +219,10 @@ L(exit):
 L(loop):
 
 	movdqa	64(%rax), %xmm0
-	pminub	80(%rax), %xmm0
-	pminub	96(%rax), %xmm0
-	pminub	112(%rax), %xmm0
-	pcmpeqb	%xmm3, %xmm0
+	PMINU	80(%rax), %xmm0
+	PMINU	96(%rax), %xmm0
+	PMINU	112(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
 	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit64)
@@ -212,10 +230,10 @@ L(loop):
 	subq	$-128, %rax
 
 	movdqa	(%rax), %xmm0
-	pminub	16(%rax), %xmm0
-	pminub	32(%rax), %xmm0
-	pminub	48(%rax), %xmm0
-	pcmpeqb	%xmm3, %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
 	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit0)
@@ -231,6 +249,7 @@ L(exit0):
 	bsfq	%rdx, %rdx
 	addq	%rdx, %rax
 	subq	%rdi, %rax
+	SHIFT_RETURN
 	ret
 
 #endif
diff --git a/sysdeps/x86_64/wcsnlen.S b/sysdeps/x86_64/wcsnlen.S
new file mode 100644
index 0000000..968bb69
--- /dev/null
+++ b/sysdeps/x86_64/wcsnlen.S
@@ -0,0 +1,7 @@
+#define AS_WCSLEN
+#define AS_STRNLEN
+#define strlen	__wcsnlen
+
+#include "strlen.S"
+
+weak_alias(__wcsnlen, wcsnlen)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5b7c9ebf151a6200e2a0f8910484cec8e4c14f25

commit 5b7c9ebf151a6200e2a0f8910484cec8e4c14f25
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue May 23 08:43:52 2017 -0700

    x86_64: Remove redundant REX bytes from memrchr.S
    
    By the x86-64 specification, 32-bit destination registers are
    zero-extended to 64 bits.  There is no need to use 64-bit registers
    when only the lower 32 bits are non-zero.  Also, 2 of the instructions in:
    
    	mov	%rdi, %rcx
    	and	$15, %rcx
    	jz	L(length_less16_offset0)
    
    	mov	%rdi, %rcx		<<< redundant
    	and	$15, %rcx		<<< redundant
    
    are redundant.
    
    	* sysdeps/x86_64/memrchr.S (__memrchr): Use 32-bit registers for
    	the lower 32 bits.  Remove redundant instructions.
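
A small self-contained C program (ours, for illustration; it assumes GCC or
a compatible compiler on x86-64) demonstrates the zero-extension rule this
commit relies on: the inline asm writes only the 32-bit alias of the
register, yet the full 64-bit value reads back as 0x12345678.

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t out = 0xdeadbeefcafebabeULL;
  /* %k0 selects the 32-bit name of the register holding OUT; the 32-bit
     write clears the upper 32 bits, so the shorter encoding without the
     REX.W prefix is enough when only the low bits matter.  */
  __asm__ ("movl $0x12345678, %k0" : "+r" (out));
  printf ("%#llx\n", (unsigned long long) out);   /* prints 0x12345678 */
  return 0;
}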

diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index aab1a4a..5fa0fe9 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -22,7 +22,7 @@
 
 	.text
 ENTRY (__memrchr)
-	movd	%rsi, %xmm1
+	movd	%esi, %xmm1
 
 	sub	$16, %rdx
 	jbe	L(length_less16)
@@ -42,8 +42,8 @@ ENTRY (__memrchr)
 	jnz	L(matches0)
 
 	sub	$64, %rdi
-	mov	%rdi, %rcx
-	and	$15, %rcx
+	mov	%edi, %ecx
+	and	$15, %ecx
 	jz	L(loop_prolog)
 
 	add	$16, %rdi
@@ -108,8 +108,8 @@ L(loop_prolog):
 	test	%eax, %eax
 	jnz	L(matches0)
 
-	mov	%rdi, %rcx
-	and	$63, %rcx
+	mov	%edi, %ecx
+	and	$63, %ecx
 	jz	L(align64_loop)
 
 	add	$64, %rdi
@@ -166,8 +166,8 @@ L(align64_loop):
 
 	.p2align 4
 L(exit_loop):
-	add	$64, %rdx
-	cmp	$32, %rdx
+	add	$64, %edx
+	cmp	$32, %edx
 	jbe	L(exit_loop_32)
 
 	movdqa	48(%rdi), %xmm0
@@ -187,7 +187,7 @@ L(exit_loop):
 	pmovmskb	%xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches16_1)
-	cmp	$48, %rdx
+	cmp	$48, %edx
 	jbe	L(return_null)
 
 	pcmpeqb	(%rdi), %xmm1
@@ -204,7 +204,7 @@ L(exit_loop_32):
 	pmovmskb	%xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches48_1)
-	cmp	$16, %rdx
+	cmp	$16, %edx
 	jbe	L(return_null)
 
 	pcmpeqb	32(%rdi), %xmm1
@@ -276,7 +276,7 @@ L(matches48_1):
 
 	.p2align 4
 L(return_null):
-	xor	%rax, %rax
+	xor	%eax, %eax
 	ret
 
 	.p2align 4
@@ -306,18 +306,16 @@ L(length_less16):
 	punpcklbw	%xmm1, %xmm1
 	punpcklbw	%xmm1, %xmm1
 
-	add	$16, %rdx
+	add	$16, %edx
 
 	pshufd	$0, %xmm1, %xmm1
 
-	mov	%rdi, %rcx
-	and	$15, %rcx
+	mov	%edi, %ecx
+	and	$15, %ecx
 	jz	L(length_less16_offset0)
 
-	mov	%rdi, %rcx
-	and	$15, %rcx
 	mov	%cl, %dh
-	mov	%rcx, %r8
+	mov	%ecx, %esi
 	add	%dl, %dh
 	and	$-16, %rdi
 
@@ -340,7 +338,7 @@ L(length_less16):
 
 	bsr	%eax, %eax
 	add	%rdi, %rax
-	add	%r8, %rax
+	add	%rsi, %rax
 	ret
 
 	.p2align 4
@@ -362,14 +360,14 @@ L(length_less16_part2):
 	pcmpeqb	(%rdi), %xmm1
 	pmovmskb	%xmm1, %eax
 
-	mov	%r8, %rcx
+	mov	%esi, %ecx
 	sar	%cl, %eax
 	test	%eax, %eax
 	jz	L(return_null)
 
 	bsr	%eax, %eax
 	add	%rdi, %rax
-	add	%r8, %rax
+	add	%rsi, %rax
 	ret
 
 	.p2align 4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4a5d68e35d9a975efbeaa2e9a433172214c1ddd9

commit 4a5d68e35d9a975efbeaa2e9a433172214c1ddd9
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed May 24 08:41:23 2017 -0700

    Add more tests for memrchr
    
    This patch adds tests for len == 0 and tests for positions close to the
    beginning, which are equivalent to positions close to the end for memchr.
    
    	* string/test-memrchr.c (test_main): Add tests for len == 0
    	and tests for positions close to the beginning, which are
    	equivalent to positions close to the end for memchr.

diff --git a/string/test-memrchr.c b/string/test-memrchr.c
index bfc9920..15483f5 100644
--- a/string/test-memrchr.c
+++ b/string/test-memrchr.c
@@ -151,15 +151,32 @@ test_main (void)
 
   for (i = 1; i < 8; ++i)
     {
+      /* Test len == 0.  */
+      do_test (i, i, 0, 0);
+      do_test (i, i, 0, 23);
+
       do_test (0, 16 << i, 2048, 23);
       do_test (i, 64, 256, 23);
       do_test (0, 16 << i, 2048, 0);
       do_test (i, 64, 256, 0);
+
+      do_test (0, i, 256, 23);
+      do_test (0, i, 256, 0);
+      do_test (i, i, 256, 23);
+      do_test (i, i, 256, 0);
+
     }
   for (i = 1; i < 32; ++i)
     {
       do_test (0, i, i + 1, 23);
       do_test (0, i, i + 1, 0);
+      do_test (i, i, i + 1, 23);
+      do_test (i, i, i + 1, 0);
+
+      do_test (0, 1, i + 1, 23);
+      do_test (0, 2, i + 1, 0);
+      do_test (i, 1, i + 1, 23);
+      do_test (i, 2, i + 1, 0);
     }
 
   do_random_tests ();

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3997ae5d0b51e1318a5bd838497f11fa43c128f0

commit 3997ae5d0b51e1318a5bd838497f11fa43c128f0
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed May 24 08:35:59 2017 -0700

    benchtests: Add more tests for memrchr
    
    bench-memchr.c is shared with bench-memrchr.c.  This patch adds some
    tests for positions close to the beginning for memrchr, which are
    equivalent to positions close to the end for memchr.
    
    	* benchtests/bench-memchr.c (do_test): Print out both length
    	and position.
    	(test_main): Also test the position close to the beginning for
    	memrchr.

diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
index 16099ac..c95cc70 100644
--- a/benchtests/bench-memchr.c
+++ b/benchtests/bench-memchr.c
@@ -117,7 +117,8 @@ do_test (size_t align, size_t pos, size_t len, int seek_char)
       buf[align + len] = seek_char;
     }
 
-  printf ("Length %4zd, alignment %2zd:", pos, align);
+  printf ("Length %4zd, position %4zd, alignment %2zd:",
+	  len, pos, align);
 
   FOR_EACH_IMPL (impl, 0)
     do_one_test (impl, (CHAR *) (buf + align), seek_char, len, result);
@@ -143,11 +144,27 @@ test_main (void)
       do_test (i, 64, 256, 23);
       do_test (0, 16 << i, 2048, 0);
       do_test (i, 64, 256, 0);
+#ifdef USE_AS_MEMRCHR
+      /* Also test the position close to the beginning for memrchr.  */
+      do_test (0, i, 256, 23);
+      do_test (0, i, 256, 0);
+      do_test (i, i, 256, 23);
+      do_test (i, i, 256, 0);
+#endif
     }
   for (i = 1; i < 32; ++i)
     {
       do_test (0, i, i + 1, 23);
       do_test (0, i, i + 1, 0);
+      do_test (i, i, i + 1, 23);
+      do_test (i, i, i + 1, 0);
+#ifdef USE_AS_MEMRCHR
+      /* Also test the position close to the beginning for memrchr.  */
+      do_test (0, 1, i + 1, 23);
+      do_test (0, 2, i + 1, 0);
+      do_test (i, i, i + 1, 23);
+      do_test (i, i, i + 1, 0);
+#endif
     }
 
   return ret;

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources

