This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
GNU C Library master sources branch hjl/avx2/c created. glibc-2.25-411-g9593e23
- From: hjl@sourceware.org
- To: glibc-cvs@sourceware.org
- Date: 5 Jun 2017 22:10:49 -0000
- Subject: GNU C Library master sources branch hjl/avx2/c created. glibc-2.25-411-g9593e23
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/avx2/c has been created
at 9593e235c2401156e9f50ca4b88c4f6b194d61f5 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9593e235c2401156e9f50ca4b88c4f6b194d61f5
commit 9593e235c2401156e9f50ca4b88c4f6b194d61f5
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 26 12:21:55 2017 -0700
x86-64: Optimize strrchr/wcsrchr with AVX2
Optimize strrchr/wcsrchr with AVX2 to check 32 bytes with vector
instructions. It is as fast as the SSE2 version for small data sizes
and up to 1X faster (i.e., up to twice as fast) for large data sizes
on Haswell. Select the AVX2 version on AVX2 machines where vzeroupper
is preferred and AVX unaligned load is fast.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strrchr-sse2, strrchr-avx2, wcsrchr-sse2 and wcsrchr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strrchr_avx2,
__strrchr_sse2, __wcsrchr_avx2 and __wcsrchr_sse2.
* sysdeps/x86_64/multiarch/strrchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/strrchr-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strrchr.c: Likewise.
* sysdeps/x86_64/multiarch/wcsrchr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsrchr-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsrchr.c: Likewise.
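The scanning idea behind strrchr-avx2.S, as a rough C intrinsics sketch (a hypothetical illustration under made-up names, not code from this commit; the committed assembly additionally aligns its loads to VEC_SIZE so it never reads across an unmapped page, unrolls the loop 4x, and covers the wcsrchr case):

#include <immintrin.h>
#include <stddef.h>

/* Compare 32 bytes at a time against both CHAR and the nul
   terminator, remembering the last match seen before the nul.
   NB: unlike the committed code, these unaligned loads may read
   past the terminator.  */
static const char *
strrchr_sketch (const char *s, int c)
{
  __m256i vc = _mm256_set1_epi8 ((char) c);
  __m256i vz = _mm256_setzero_si256 ();
  const char *last = NULL;

  for (;; s += 32)
    {
      __m256i data = _mm256_loadu_si256 ((const __m256i *) s);
      unsigned int mc = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, vc));
      unsigned int mz = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, vz));
      if (mz != 0)
        {
          /* (mz - 1) ^ mz keeps the bits at and below the lowest nul
             bit, masking out CHAR matches past the terminator; this
             is the subl/xorl/andl sequence in the assembly.  */
          mc &= (mz - 1) ^ mz;
          if (mc != 0)
            return s + (31 - __builtin_clz (mc));  /* like bsr.  */
          return last;
        }
      if (mc != 0)
        last = s + (31 - __builtin_clz (mc));
    }
}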
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 60c586c..6baf2db 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -15,6 +15,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
+ strrchr-sse2 strrchr-avx2 \
strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -40,6 +41,7 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemcmp-avx2-movbe \
wcscpy-ssse3 wcscpy-c \
wcschr-sse2 wcschr-avx2 \
+ wcsrchr-sse2 wcsrchr-avx2 \
wcslen-sse2 wcsnlen-sse2 wcslen-avx2 wcsnlen-avx2
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 86bd4ab..c2578cb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -251,6 +251,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strchrnul_avx2)
IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+ /* Support sysdeps/x86_64/multiarch/strrchr.S. */
+ IFUNC_IMPL (i, name, strrchr,
+ IFUNC_IMPL_ADD (array, i, strrchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/strcmp.S. */
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
@@ -342,6 +349,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wcschr_avx2)
IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcsrchr.S. */
+ IFUNC_IMPL (i, name, wcsrchr,
+ IFUNC_IMPL_ADD (array, i, wcsrchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcsrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
IFUNC_IMPL (i, name, wcscpy,
IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
new file mode 100644
index 0000000..36ef660
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -0,0 +1,235 @@
+/* strrchr/wcsrchr optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRRCHR
+# define STRRCHR __strrchr_avx2
+# endif
+
+# ifdef USE_AS_WCSRCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRRCHR)
+ movd %esi, %xmm4
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM4. */
+ VPBROADCAST %xmm4, %ymm4
+ vpxor %ymm0, %ymm0, %ymm0
+
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cross_page_boundary)
+
+ vmovdqu (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ addq $VEC_SIZE, %rdi
+
+ testl %eax, %eax
+ jnz L(first_vec)
+
+ testl %ecx, %ecx
+ jnz L(return_null)
+
+ andq $-VEC_SIZE, %rdi
+ xorl %edx, %edx
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(first_vec):
+ /* Check if there is a nul CHAR. */
+ testl %ecx, %ecx
+ jnz L(char_and_nul_in_first_vec)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(cross_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %edx
+ vpmovmskb %ymm3, %eax
+ shrl %cl, %edx
+ shrl %cl, %eax
+ addq $VEC_SIZE, %rdi
+
+ /* Check if there is a CHAR. */
+ testl %eax, %eax
+ jnz L(found_char)
+
+ testl %edx, %edx
+ jnz L(return_null)
+
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(found_char):
+ testl %edx, %edx
+ jnz L(char_and_nul)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ leaq (%rdi, %rcx), %rsi
+
+ .p2align 4
+L(aligned_loop):
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ addq $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jnz L(char_or_null)
+
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ add $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jnz L(char_or_null)
+
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ addq $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jnz L(char_or_null)
+
+ vmovdqa (%rdi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm2
+ addq $VEC_SIZE, %rdi
+ VPCMPEQ %ymm1, %ymm4, %ymm3
+ vpmovmskb %ymm2, %ecx
+ vpmovmskb %ymm3, %eax
+ orl %eax, %ecx
+ jz L(aligned_loop)
+
+ .p2align 4
+L(char_or_null):
+ /* The loop found a CHAR or a nul CHAR. */
+ testl %eax, %eax
+ jnz L(match)
+L(return_value):
+ testl %edx, %edx
+ jz L(return_null)
+ movl %edx, %eax
+ movq %rsi, %rdi
+
+# ifdef USE_AS_WCSRCHR
+ /* Keep the first bit for each matching CHAR for bsr. */
+ andl $0x11111111, %eax
+# endif
+ bsrl %eax, %eax
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(match):
+ /* Find a CHAR. Check if there is a nul CHAR. */
+ vpmovmskb %ymm2, %ecx
+ testl %ecx, %ecx
+ jnz L(find_nul)
+
+ /* Remember the match and keep searching. */
+ movl %eax, %edx
+ movq %rdi, %rsi
+ jmp L(aligned_loop)
+
+ .p2align 4
+L(find_nul):
+# ifdef USE_AS_WCSRCHR
+ /* Keep the first bit for each matching CHAR for bsr. */
+ andl $0x11111111, %ecx
+ andl $0x11111111, %eax
+# endif
+ /* Mask out any matching bits after the nul CHAR. */
+ movl %ecx, %r8d
+ subl $1, %r8d
+ xorl %ecx, %r8d
+ andl %r8d, %eax
+ testl %eax, %eax
+ /* If there is no CHAR here, return the remembered one. */
+ jz L(return_value)
+ bsrl %eax, %eax
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(char_and_nul):
+ /* Find both a CHAR and a nul CHAR. */
+ addq %rcx, %rdi
+ movl %edx, %ecx
+L(char_and_nul_in_first_vec):
+# ifdef USE_AS_WCSRCHR
+ /* Keep the first bit for each matching CHAR for bsr. */
+ andl $0x11111111, %ecx
+ andl $0x11111111, %eax
+# endif
+ /* Mask out any matching bits after the nul CHAR. */
+ movl %ecx, %r8d
+ subl $1, %r8d
+ xorl %ecx, %r8d
+ andl %r8d, %eax
+ testl %eax, %eax
+ /* Return null pointer if the nul CHAR comes first. */
+ jz L(return_null)
+ bsrl %eax, %eax
+ leaq -VEC_SIZE(%rdi, %rax), %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(return_null):
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+END (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
new file mode 100644
index 0000000..9e8dc2a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -0,0 +1,35 @@
+/* strrchr optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define strrchr __strrchr_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
+# endif
+
+# undef weak_alias
+# define weak_alias(strrchr, rindex)
+#endif
+
+#include "../strrchr.S"
diff --git a/sysdeps/x86_64/multiarch/strrchr.c b/sysdeps/x86_64/multiarch/strrchr.c
new file mode 100644
index 0000000..fbb9c84
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr.c
@@ -0,0 +1,30 @@
+/* Multiple versions of strrchr.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strrchr __redirect_strrchr
+# include <string.h>
+# undef strrchr
+
+# define SYMBOL_NAME strrchr
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_strrchr, strrchr, IFUNC_SELECTOR ());
+weak_alias (strrchr, rindex);
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2.S
new file mode 100644
index 0000000..cf8a239
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2.S
@@ -0,0 +1,3 @@
+#define STRRCHR __wcsrchr_avx2
+#define USE_AS_WCSRCHR 1
+#include "strrchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
new file mode 100644
index 0000000..0ac1b13
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -0,0 +1,23 @@
+/* wcsrchr optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define wcsrchr __wcsrchr_sse2
+#endif
+
+#include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr.c b/sysdeps/x86_64/multiarch/wcsrchr.c
new file mode 100644
index 0000000..0413bd2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr.c
@@ -0,0 +1,29 @@
+/* Multiple versions of wcsrchr.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define wcsrchr __redirect_wcsrchr
+# include <wchar.h>
+# undef wcsrchr
+
+# define SYMBOL_NAME wcsrchr
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_wcsrchr, wcsrchr, IFUNC_SELECTOR ());
+#endif
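The new strrchr.c and wcsrchr.c both include ifunc-sse2-avx2.h, which is not shown in this diff. Judging from the strchr.c selector later in this mail, its IFUNC_SELECTOR presumably reduces to something like the following sketch:

# include <init-arch.h>

extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;

static inline void *
IFUNC_SELECTOR (void)
{
  const struct cpu_features *cpu_features = __get_cpu_features ();

  /* Pick AVX2 only where vzeroupper is preferred and AVX unaligned
     loads are fast, as the commit message describes.  */
  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
    return OPTIMIZE (avx2);

  return OPTIMIZE (sse2);
}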
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ce40306fcc3edb2baade47e8050c975c5ecba980
commit ce40306fcc3edb2baade47e8050c975c5ecba980
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue May 23 11:25:19 2017 -0700
x86-64: Optimize memrchr with AVX2
Optimize memrchr with AVX2 to search 32 bytes with a single vector
compare instruction. It is as fast as SSE2 memrchr for small data
sizes and up to 1X faster (i.e., up to twice as fast) for large data
sizes on Haswell. Select AVX2 memrchr on AVX2 machines where
vzeroupper is preferred and AVX unaligned load is fast.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memrchr-sse2 and memrchr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __memrchr_avx2 and
__memrchr_sse2.
* sysdeps/x86_64/multiarch/memrchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/memrchr-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/memrchr.c: Likewise.
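The backward scan, as a rough C intrinsics sketch (a hypothetical illustration, not code from this commit; the committed assembly aligns its loads and handles the short head of the buffer with vector code rather than the scalar tail used here):

#include <immintrin.h>
#include <stddef.h>

/* Walk backward from the end, testing 32 bytes per vector compare;
   the highest set mask bit (bsr) is the last occurrence.  */
static const void *
memrchr_sketch (const void *s, int c, size_t n)
{
  const unsigned char *p = (const unsigned char *) s;
  __m256i vc = _mm256_set1_epi8 ((char) c);

  while (n >= 32)
    {
      __m256i data = _mm256_loadu_si256 ((const __m256i *) (p + n - 32));
      unsigned int m = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, vc));
      if (m != 0)
        return p + n - 32 + (31 - __builtin_clz (m));  /* like bsr.  */
      n -= 32;
    }
  while (n-- > 0)  /* scalar tail for the remaining head bytes */
    if (p[n] == (unsigned char) c)
      return p + n;
  return NULL;
}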
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index fd4baf3..60c586c 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,6 +7,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
+ memrchr-sse2 memrchr-avx2 \
memcmp-avx2-movbe \
memcmp-sse4 memcpy-ssse3 \
memmove-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 36f14a8..86bd4ab 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -112,6 +112,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove, 1,
__memmove_sse2_unaligned_erms))
+ /* Support sysdeps/x86_64/multiarch/memrchr.S. */
+ IFUNC_IMPL (i, name, memrchr,
+ IFUNC_IMPL_ADD (array, i, memrchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memrchr_avx2)
+ IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/memset_chk.S. */
IFUNC_IMPL (i, name, __memset_chk,
IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
new file mode 100644
index 0000000..3ee02e1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -0,0 +1,359 @@
+/* memrchr optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (__memrchr_avx2)
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+ vpbroadcastb %xmm0, %ymm0
+
+ subq $VEC_SIZE, %rdx
+ jbe L(last_vec_or_less)
+
+ addq %rdx, %rdi
+
+ /* Check the last VEC_SIZE bytes. */
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x0)
+
+ subq $(VEC_SIZE * 4), %rdi
+ movl %edi, %ecx
+ andl $(VEC_SIZE - 1), %ecx
+ jz L(aligned_more)
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rdx
+ andq $-VEC_SIZE, %rdi
+ subq %rcx, %rdx
+
+ .p2align 4
+L(aligned_more):
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+
+ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ vpcmpeqb (%rdi), %ymm0, %ymm4
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x0)
+
+ /* Align data to 4 * VEC_SIZE for the loop with fewer branches.
+ There is some overlap with the checks above if data isn't
+ aligned to 4 * VEC_SIZE. */
+ movl %edi, %ecx
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ jz L(loop_4x_vec)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rdx
+ andq $-(VEC_SIZE * 4), %rdi
+ subq %rcx, %rdx
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ subq $(VEC_SIZE * 4), %rdi
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+
+ vmovdqa (%rdi), %ymm1
+ vmovdqa VEC_SIZE(%rdi), %ymm2
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
+
+ vpcmpeqb %ymm1, %ymm0, %ymm1
+ vpcmpeqb %ymm2, %ymm0, %ymm2
+ vpcmpeqb %ymm3, %ymm0, %ymm3
+ vpcmpeqb %ymm4, %ymm0, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+ vpor %ymm5, %ymm6, %ymm5
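+ /* ymm5 is nonzero iff any of the four compares had a match. */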
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jz L(loop_4x_vec)
+
+ /* There is a match. */
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ vpmovmskb %ymm1, %eax
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_4x_vec_or_less):
+ addl $(VEC_SIZE * 4), %edx
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
+
+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x1_check)
+ cmpl $(VEC_SIZE * 3), %edx
+ jbe L(zero)
+
+ vpcmpeqb (%rdi), %ymm0, %ymm4
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+ jz L(zero)
+ bsrl %eax, %eax
+ subq $(VEC_SIZE * 4), %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3_check)
+ cmpl $VEC_SIZE, %edx
+ jbe L(zero)
+
+ vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jz L(zero)
+ bsrl %eax, %eax
+ subq $(VEC_SIZE * 2), %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addl $(VEC_SIZE * 2), %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x0):
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x1):
+ bsrl %eax, %eax
+ addl $VEC_SIZE, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x2):
+ bsrl %eax, %eax
+ addl $(VEC_SIZE * 2), %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x3):
+ bsrl %eax, %eax
+ addl $(VEC_SIZE * 3), %eax
+ addq %rdi, %rax
+ ret
+
+ .p2align 4
+L(last_vec_x1_check):
+ bsrl %eax, %eax
+ subq $(VEC_SIZE * 3), %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addl $VEC_SIZE, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_x3_check):
+ bsrl %eax, %eax
+ subq $VEC_SIZE, %rdx
+ addq %rax, %rdx
+ jl L(zero)
+ addl $(VEC_SIZE * 3), %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(zero):
+ VZEROUPPER
+L(null):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(last_vec_or_less_aligned):
+ movl %edx, %ecx
+
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+
+ movl $1, %edx
+ /* The shift count in CL can be as large as VEC_SIZE (32); use a
+ 64-bit shift so the count isn't truncated modulo 32. */
+ salq %cl, %rdx
+ subq $1, %rdx
+
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the trailing bytes. */
+ andl %edx, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_or_less):
+ addl $VEC_SIZE, %edx
+
+ /* Check for zero length. */
+ testl %edx, %edx
+ jz L(null)
+
+ movl %edi, %ecx
+ andl $(VEC_SIZE - 1), %ecx
+ jz L(last_vec_or_less_aligned)
+
+ movl %ecx, %esi
+ movl %ecx, %r8d
+ addl %edx, %esi
+ andq $-VEC_SIZE, %rdi
+
+ subl $VEC_SIZE, %esi
+ ja L(last_vec_2x_aligned)
+
+ /* Check the last VEC. */
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the leading and trailing bytes. */
+ sarl %cl, %eax
+ movl %edx, %ecx
+
+ movl $1, %edx
+ sall %cl, %edx
+ subl $1, %edx
+
+ andl %edx, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ addq %r8, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_vec_2x_aligned):
+ movl %esi, %ecx
+
+ /* Check the last VEC. */
+ vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+
+ movl $1, %edx
+ sall %cl, %edx
+ subl $1, %edx
+
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the trailing bytes. */
+ andl %edx, %eax
+
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ /* Check the second last VEC. */
+ vpcmpeqb (%rdi), %ymm0, %ymm1
+
+ movl %r8d, %ecx
+
+ vpmovmskb %ymm1, %eax
+
+ /* Remove the leading bytes. Must use unsigned right shift for
+ bsrl below. */
+ shrl %cl, %eax
+ testl %eax, %eax
+ jz L(zero)
+
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ addq %r8, %rax
+ VZEROUPPER
+ ret
+END (__memrchr_avx2)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
new file mode 100644
index 0000000..f518819
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
@@ -0,0 +1,26 @@
+/* memrchr optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define __memrchr __memrchr_sse2
+
+# undef weak_alias
+# define weak_alias(__memrchr, memrchr)
+#endif
+
+#include "../memrchr.S"
diff --git a/sysdeps/x86_64/multiarch/memrchr.c b/sysdeps/x86_64/multiarch/memrchr.c
new file mode 100644
index 0000000..a947e93
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memrchr.c
@@ -0,0 +1,31 @@
+/* Multiple versions of memrchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define memrchr __redirect_memrchr
+# include <string.h>
+# undef memrchr
+
+# define SYMBOL_NAME memrchr
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_memrchr, __memrchr, IFUNC_SELECTOR ());
+weak_alias (__memrchr, memrchr)
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2aa22acfbbbb26a2e585ff62fef1ebdd290d9d85
commit 2aa22acfbbbb26a2e585ff62fef1ebdd290d9d85
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon May 22 15:09:50 2017 -0700
x86-64: Optimize strchr/strchrnul/wcschr with AVX2
Optimize strchr/strchrnul/wcschr with AVX2 to search 32 bytes with vector
instructions. It is as fast as the SSE2 versions for size <= 16 bytes and
up to 1X faster (i.e., up to twice as fast) for size > 16 bytes on Haswell.
Select the AVX2 version on AVX2 machines where vzeroupper is preferred and
AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strchr-sse2, strchrnul-sse2, strchr-avx2, strchrnul-avx2,
wcschr-sse2 and wcschr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strchr_avx2,
__strchrnul_avx2, __strchrnul_sse2, __wcschr_avx2 and
__wcschr_sse2.
* sysdeps/x86_64/multiarch/strchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/strchr-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strchr.c: New file.
* sysdeps/x86_64/multiarch/strchrnul-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strchrnul-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strchrnul.c: Likewise.
* sysdeps/x86_64/multiarch/wcschr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcschr-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcschr.c: Likewise.
* sysdeps/x86_64/multiarch/strchr.S: Removed.
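The forward scan, as a rough C intrinsics sketch (a hypothetical illustration, not code from this commit; alignment handling, the 4x-unrolled loop, and the macro plumbing that shares one body among strchr/strchrnul/wcschr are omitted):

#include <immintrin.h>
#include <stddef.h>

/* OR the CHAR-match and nul-match vectors so a single movemask
   covers both, then take the lowest set bit with TZCNT.  NB: unlike
   the committed code, these unaligned loads may read past the
   terminator.  */
static const char *
strchr_sketch (const char *s, int c)
{
  __m256i vc = _mm256_set1_epi8 ((char) c);
  __m256i vz = _mm256_setzero_si256 ();

  for (;; s += 32)
    {
      __m256i data = _mm256_loadu_si256 ((const __m256i *) s);
      __m256i hit = _mm256_or_si256 (_mm256_cmpeq_epi8 (data, vc),
                                     _mm256_cmpeq_epi8 (data, vz));
      unsigned int m = _mm256_movemask_epi8 (hit);
      if (m != 0)
        {
          const char *r = s + __builtin_ctz (m);  /* tzcnt.  */
          /* strchrnul would return r unconditionally.  */
          return *r == (char) c ? r : NULL;
        }
    }
}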
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 915b44f..fd4baf3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcpy-ssse3-back \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+ strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -37,6 +38,7 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemchr-sse2 wmemchr-avx2 \
wmemcmp-avx2-movbe \
wcscpy-ssse3 wcscpy-c \
+ wcschr-sse2 wcschr-avx2 \
wcslen-sse2 wcsnlen-sse2 wcslen-avx2 wcsnlen-avx2
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f139efc..36f14a8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -231,9 +231,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.S. */
IFUNC_IMPL (i, name, strchr,
+ IFUNC_IMPL_ADD (array, i, strchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strchr_avx2)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
+ /* Support sysdeps/x86_64/multiarch/strchrnul.S. */
+ IFUNC_IMPL (i, name, strchrnul,
+ IFUNC_IMPL_ADD (array, i, strchrnul,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strchrnul_avx2)
+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+
/* Support sysdeps/x86_64/multiarch/strcmp.S. */
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
@@ -318,6 +328,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcschr.S. */
+ IFUNC_IMPL (i, name, wcschr,
+ IFUNC_IMPL_ADD (array, i, wcschr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcschr_avx2)
+ IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
IFUNC_IMPL (i, name, wcscpy,
IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
new file mode 100644
index 0000000..e4292d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -0,0 +1,254 @@
+/* strchr/strchrnul optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCHR
+# define STRCHR __strchr_avx2
+# endif
+
+# ifdef USE_AS_WCSCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_REG esi
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_REG sil
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRCHR)
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+ vpxor %xmm9, %xmm9, %xmm9
+ VPBROADCAST %xmm0, %ymm0
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cross_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ null byte. */
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cross_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ /* Found CHAR or the null byte. */
+ tzcntl %eax, %eax
+ addq %rcx, %rax
+# ifdef USE_AS_STRCHRNUL
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+ addq $VEC_SIZE, %rdi
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (%rdi), %ymm5
+ vmovdqa VEC_SIZE(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+
+ VPCMPEQ %ymm5, %ymm0, %ymm1
+ VPCMPEQ %ymm6, %ymm0, %ymm2
+ VPCMPEQ %ymm7, %ymm0, %ymm3
+ VPCMPEQ %ymm8, %ymm0, %ymm4
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ VPCMPEQ %ymm6, %ymm9, %ymm6
+ VPCMPEQ %ymm7, %ymm9, %ymm7
+ VPCMPEQ %ymm8, %ymm9, %ymm8
+
+ vpor %ymm1, %ymm5, %ymm1
+ vpor %ymm2, %ymm6, %ymm2
+ vpor %ymm3, %ymm7, %ymm3
+ vpor %ymm4, %ymm8, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+
+ vpor %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ jmp L(loop_4x_vec)
+
+ .p2align 4
+L(first_vec_x0):
+ /* Found CHAR or the null byte. */
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq VEC_SIZE(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr-sse2.S
similarity index 60%
rename from sysdeps/x86_64/multiarch/strchr.S
rename to sysdeps/x86_64/multiarch/strchr-sse2.S
index c9f54ca..44eb07e 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr-sse2.S
@@ -1,4 +1,4 @@
-/* Multiple versions of strchr
+/* strchr optimized with SSE2.
Copyright (C) 2009-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,42 +16,20 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc. */
#if IS_IN (libc)
- .text
-ENTRY(strchr)
- .type strchr, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strchr_sse2(%rip), %rax
-2: HAS_ARCH_FEATURE (Slow_BSF)
- jz 3f
- leaq __strchr_sse2_no_bsf(%rip), %rax
-3: ret
-END(strchr)
-
+# define strchr __strchr_sse2
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strchr_sse2, @function; \
- .align 16; \
- .globl __strchr_sse2; \
- .hidden __strchr_sse2; \
- __strchr_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
-# undef libc_hidden_builtin_def
+# ifdef SHARED
+# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal strchr calls through a PLT.
The speedup we get from using AVX2 instructions is likely eaten away
by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
+# define libc_hidden_builtin_def(name) \
.globl __GI_strchr; __GI_strchr = __strchr_sse2
+# endif
+
+# undef weak_alias
+# define weak_alias(strchr, index)
#endif
#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
new file mode 100644
index 0000000..31dc583
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -0,0 +1,51 @@
+/* Multiple versions of strchr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strchr __redirect_strchr
+# include <string.h>
+# undef strchr
+
+# define SYMBOL_NAME strchr
+# include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ return OPTIMIZE (avx2);
+
+ if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF))
+ return OPTIMIZE (sse2_no_bsf);
+
+ return OPTIMIZE (sse2);
+}
+
+libc_ifunc_redirected (__redirect_strchr, strchr, IFUNC_SELECTOR ());
+weak_alias (strchr, index)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2.S b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
new file mode 100644
index 0000000..fa0cc09
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
new file mode 100644
index 0000000..4d199b3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
@@ -0,0 +1,26 @@
+/* strchrnul optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define __strchrnul __strchrnul_sse2
+
+# undef weak_alias
+# define weak_alias(__strchrnul, strchrnul)
+#endif
+
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul.c b/sysdeps/x86_64/multiarch/strchrnul.c
new file mode 100644
index 0000000..95b6222
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul.c
@@ -0,0 +1,34 @@
+/* Multiple versions of strchrnul.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strchrnul __redirect_strchrnul
+# define __strchrnul __redirect___strchrnul
+# include <string.h>
+# undef __strchrnul
+# undef strchrnul
+
+# define SYMBOL_NAME strchrnul
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_strchrnul, __strchrnul,
+ IFUNC_SELECTOR ());
+weak_alias (__strchrnul, strchrnul)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2.S b/sysdeps/x86_64/multiarch/wcschr-avx2.S
new file mode 100644
index 0000000..67726b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S
new file mode 100644
index 0000000..e5fa517
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S
@@ -0,0 +1,38 @@
+/* wcschr optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define __wcschr __wcschr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal wcschr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___wcschr; __GI___wcschr = __wcschr_sse2
+# undef libc_hidden_weak
+# define libc_hidden_weak(name) \
+ .weak __GI_wcschr; __GI_wcschr = __wcschr_sse2
+# endif
+
+# undef weak_alias
+# define weak_alias(__wcschr, wcschr)
+#endif
+
+#include "../wcschr.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr.c b/sysdeps/x86_64/multiarch/wcschr.c
new file mode 100644
index 0000000..910468f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr.c
@@ -0,0 +1,33 @@
+/* Multiple versions of wcschr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define wcschr __redirect_wcschr
+# define __wcschr __redirect___wcschr
+# include <wchar.h>
+# undef wcschr
+# undef __wcschr
+
+# define SYMBOL_NAME wcschr
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_wcschr, __wcschr, IFUNC_SELECTOR ());
+weak_alias (__wcschr, wcschr);
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b38361c9a6da5aea0234a9c31ce63fec93d0fc86
commit b38361c9a6da5aea0234a9c31ce63fec93d0fc86
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 19 12:19:42 2017 -0700
x86-64: Optimize strlen/strnlen/wcslen/wcsnlen with AVX2
Optimize strlen/strnlen/wcslen/wcsnlen with AVX2 to check 32 bytes with
a single vector compare instruction. It is as fast as the SSE2 versions
for size <= 16 bytes and up to 1X faster (i.e., up to twice as fast) for
size > 16 bytes on Haswell. Select the AVX2 version on AVX2 machines
where vzeroupper is preferred and AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strlen-sse2, strnlen-sse2, strlen-avx2, strnlen-avx2,
wcslen-sse2, wcsnlen-sse2, wcslen-avx2 and wcsnlen-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strlen_avx2,
__strlen_sse2, __strnlen_avx2, __strnlen_sse2, __wcslen_avx2,
__wcslen_sse2, __wcsnlen_avx2 and __wcsnlen_sse2.
* sysdeps/x86_64/multiarch/strlen-avx2.S: New file.
* sysdeps/x86_64/multiarch/strlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strlen.c: Likewise.
* sysdeps/x86_64/multiarch/strnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strnlen.c: Likewise.
* sysdeps/x86_64/multiarch/wcslen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcslen.c: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsnlen.c: Likewise.
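The core scan, as a rough C intrinsics sketch (a hypothetical illustration, not code from this commit; alignment, the strnlen length checks, and the wide-character variants are omitted):

#include <immintrin.h>
#include <stddef.h>

/* Compare 32 bytes against zero and TZCNT the mask.  The committed
   loop loads four vectors per iteration and combines them with
   VPMINU so one compare detects a nul in any of the four.  NB:
   unlike the committed code, these unaligned loads may read past
   the terminator.  */
static size_t
strlen_sketch (const char *s)
{
  const char *p = s;
  __m256i vz = _mm256_setzero_si256 ();

  for (;; p += 32)
    {
      __m256i data = _mm256_loadu_si256 ((const __m256i *) p);
      unsigned int m = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, vz));
      if (m != 0)
        return (size_t) (p - s) + __builtin_ctz (m);
    }
}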
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index eb42b19..915b44f 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcpy-ssse3-back \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+ strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
@@ -35,7 +36,8 @@ ifeq ($(subdir),wcsmbs)
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemchr-sse2 wmemchr-avx2 \
wmemcmp-avx2-movbe \
- wcscpy-ssse3 wcscpy-c
+ wcscpy-ssse3 wcscpy-c \
+ wcslen-sse2 wcsnlen-sse2 wcslen-avx2 wcsnlen-avx2
endif
ifeq ($(subdir),debug)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f60535b..f139efc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -166,6 +166,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__rawmemchr_avx2)
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+ /* Support sysdeps/x86_64/multiarch/strlen.S. */
+ IFUNC_IMPL (i, name, strlen,
+ IFUNC_IMPL_ADD (array, i, strlen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strlen_avx2)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+
+ /* Support sysdeps/x86_64/multiarch/strnlen.S. */
+ IFUNC_IMPL (i, name, strnlen,
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strnlen_avx2)
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
@@ -310,6 +324,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wcscpy_ssse3)
IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcslen.S. */
+ IFUNC_IMPL (i, name, wcslen,
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcslen_avx2)
+ IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+
+ /* Support sysdeps/x86_64/multiarch/wcsnlen.S. */
+ IFUNC_IMPL (i, name, wcsnlen,
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcsnlen_avx2)
+ IFUNC_IMPL_ADD (array, i, wcsnlen, 1, __wcsnlen_sse2))
+
/* Support sysdeps/x86_64/multiarch/wmemchr.S. */
IFUNC_IMPL (i, name, wmemchr,
IFUNC_IMPL_ADD (array, i, wmemchr,
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
new file mode 100644
index 0000000..1dc823a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -0,0 +1,394 @@
+/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+# define STRLEN __strlen_avx2
+# endif
+
+# ifdef USE_AS_WCSLEN
+# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
+# else
+# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRLEN)
+# ifdef USE_AS_STRNLEN
+ /* Check for zero length. */
+ testq %rsi, %rsi
+ jz L(zero)
+# ifdef USE_AS_WCSLEN
+ shl $2, %rsi
+# endif
+ movq %rsi, %r8
+# endif
+ movl %edi, %ecx
+ movq %rdi, %rdx
+ vpxor %xmm0, %xmm0, %xmm0
+
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cross_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+# ifdef USE_AS_STRNLEN
+ jnz L(first_vec_x0_check)
+ /* Adjust length and check the end of data. */
+ subq $VEC_SIZE, %rsi
+ jbe L(max)
+# else
+ jnz L(first_vec_x0)
+# endif
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. */
+ addq %rcx, %rsi
+
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cross_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRNLEN
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+# endif
+ addq %rdi, %rax
+ addq %rcx, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+# ifdef USE_AS_STRNLEN
+ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
+ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+ to avoid possible addition overflow. */
+ negq %rcx
+ addq $VEC_SIZE, %rcx
+
+ /* Check the end of data. */
+ subq %rcx, %rsi
+ jbe L(max)
+# endif
+
+ addq $VEC_SIZE, %rdi
+
+# ifdef USE_AS_STRNLEN
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_STRNLEN
+ subq $(VEC_SIZE * 4), %rsi
+ jbe L(last_4x_vec_or_less)
+# endif
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. */
+ addq %rcx, %rsi
+# endif
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (%rdi), %ymm1
+ vmovdqa VEC_SIZE(%rdi), %ymm2
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
+ VPMINU %ymm5, %ymm6, %ymm5
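+ /* Each byte of ymm5 is the unsigned minimum of the corresponding
+ bytes of ymm1..ymm4, so ymm5 has a zero byte iff one of the four
+ vectors does; the single compare below detects a nul anywhere. */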
+
+ VPCMPEQ %ymm5, %ymm0, %ymm5
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_STRNLEN
+ jmp L(loop_4x_vec)
+# else
+ subq $(VEC_SIZE * 4), %rsi
+ ja L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+ /* Less than 4 * VEC and aligned to VEC_SIZE. */
+ addl $(VEC_SIZE * 2), %esi
+ jle L(last_2x_vec)
+
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x2_check)
+ subl $VEC_SIZE, %esi
+ jle L(max)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x3_check)
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ addl $(VEC_SIZE * 2), %esi
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x0_check)
+ subl $VEC_SIZE, %esi
+ jle L(max)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x0_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x3_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ jbe L(max)
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(max):
+ movq %r8, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+# endif
+
+ .p2align 4
+L(first_vec_x0):
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ VPCMPEQ %ymm2, %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ VPCMPEQ %ymm3, %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ subq %rdx, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ VZEROUPPER
+ ret
+
+END (STRLEN)
+#endif
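The loop above pays for only one test per 128 bytes: VPMINU folds four
32-byte blocks into their unsigned byte-wise minimum, and that minimum
has a zero byte exactly when one of the blocks does.  A minimal
intrinsics sketch of the same idea (illustrative names, not the glibc
code; the byte flavor shown corresponds to vpminub/vpcmpeqb):

#include <immintrin.h>

/* Return nonzero if any of the 128 bytes at p is NUL.  p is assumed
   to be 128-byte aligned, as after the "andq $-(4 * VEC_SIZE)" step
   above.  */
static int
block_has_nul (const char *p)
{
  __m256i v0 = _mm256_load_si256 ((const __m256i *) p);
  __m256i v1 = _mm256_load_si256 ((const __m256i *) (p + 32));
  __m256i v2 = _mm256_load_si256 ((const __m256i *) (p + 64));
  __m256i v3 = _mm256_load_si256 ((const __m256i *) (p + 96));
  /* The minimum has a zero byte iff some input byte is zero.  */
  __m256i m = _mm256_min_epu8 (_mm256_min_epu8 (v0, v1),
                               _mm256_min_epu8 (v2, v3));
  __m256i eq = _mm256_cmpeq_epi8 (m, _mm256_setzero_si256 ());
  return _mm256_movemask_epi8 (eq) != 0;
}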
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
new file mode 100644
index 0000000..d0c2991
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -0,0 +1,32 @@
+/* strlen optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define strlen __strlen_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strlen calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_strlen; __GI_strlen = __strlen_sse2
+# endif
+#endif
+
+#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/strlen.c b/sysdeps/x86_64/multiarch/strlen.c
new file mode 100644
index 0000000..8384035
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen.c
@@ -0,0 +1,30 @@
+/* Multiple versions of strlen.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strlen __redirect_strlen
+# include <string.h>
+# undef strlen
+
+# define SYMBOL_NAME strlen
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ());
+#endif
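The redirect dance above is what lets strlen itself be an IFUNC:
defining strlen to __redirect_strlen before including <string.h>
renames the prototype, so libc_ifunc_redirected can re-declare the
real strlen as an indirect-function symbol whose resolver is the
IFUNC_SELECTOR from ifunc-sse2-avx2.h (introduced by the memchr commit
further down this mail).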
diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2.S b/sysdeps/x86_64/multiarch/strnlen-avx2.S
new file mode 100644
index 0000000..c4062b2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-avx2.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_avx2
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2.S b/sysdeps/x86_64/multiarch/strnlen-sse2.S
new file mode 100644
index 0000000..7db8821
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-sse2.S
@@ -0,0 +1,36 @@
+/* strnlen optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define __strnlen __strnlen_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal strnlen calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI_strnlen; __GI_strnlen = __strnlen_sse2; \
+ .globl __GI___strnlen; __GI___strnlen = __strnlen_sse2
+# endif
+
+# undef weak_alias
+# define weak_alias(__strnlen, strnlen)
+#endif
+
+#include "../strnlen.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.c b/sysdeps/x86_64/multiarch/strnlen.c
new file mode 100644
index 0000000..bb09b83
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen.c
@@ -0,0 +1,33 @@
+/* Multiple versions of strnlen.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strnlen __redirect_strnlen
+# define __strnlen __redirect___strnlen
+# include <string.h>
+# undef __strnlen
+# undef strnlen
+
+# define SYMBOL_NAME strnlen
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_strnlen, __strnlen, IFUNC_SELECTOR ());
+weak_alias (__strnlen, strnlen);
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2.S b/sysdeps/x86_64/multiarch/wcslen-avx2.S
new file mode 100644
index 0000000..c9224f1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-avx2.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_avx2
+#define USE_AS_WCSLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S
new file mode 100644
index 0000000..e7b24ea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S
@@ -0,0 +1,26 @@
+/* wcslen optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define __wcslen __wcslen_sse2
+
+# undef weak_alias
+# define weak_alias(__wcslen, wcslen)
+#endif
+
+#include "../wcslen.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
new file mode 100644
index 0000000..3f8ad87
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen.c
@@ -0,0 +1,31 @@
+/* Multiple versions of wcslen.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define __wcslen __redirect_wcslen
+# include <wchar.h>
+# undef __wcslen
+
+# define SYMBOL_NAME wcslen
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
+weak_alias (__wcslen, wcslen);
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2.S
new file mode 100644
index 0000000..fac8354
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_avx2
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse2.S b/sysdeps/x86_64/multiarch/wcsnlen-sse2.S
new file mode 100644
index 0000000..846466b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse2.S
@@ -0,0 +1,26 @@
+/* wcsnlen optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define __wcsnlen __wcsnlen_sse2
+
+# undef weak_alias
+# define weak_alias(__wcsnlen, wcsnlen)
+#endif
+
+#include "../wcsnlen.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
new file mode 100644
index 0000000..35541b1
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -0,0 +1,31 @@
+/* Multiple versions of wcsnlen.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define __wcsnlen __redirect_wcsnlen
+# include <wchar.h>
+# undef __wcsnlen
+
+# define SYMBOL_NAME wcsnlen
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
+weak_alias (__wcsnlen, wcsnlen);
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=01abda31adb56b756721332f8811c0bdc1a715c4
commit 01abda31adb56b756721332f8811c0bdc1a715c4
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu May 18 11:10:09 2017 -0700
x86-64: Optimize memchr/rawmemchr/wmemchr with SSE2/AVX2
SSE2 memchr is extended to support wmemchr. AVX2 memchr/rawmemchr/wmemchr
are added to search 32 bytes with a single vector compare instruction.
AVX2 memchr/rawmemchr/wmemchr are as fast as SSE2 memchr/rawmemchr/wmemchr
for small sizes and up to 1.5X faster for larger sizes on Haswell and
Skylake. Select AVX2 memchr/rawmemchr/wmemchr on AVX2 machines where
vzeroupper is preferred and AVX unaligned load is fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
* sysdeps/x86_64/memchr.S (MEMCHR): New. Defined as wmemchr or
memchr, depending on whether USE_AS_WMEMCHR is defined.
(PCMPEQ): Likewise.
(memchr): Renamed to ...
(MEMCHR): This. Support wmemchr if USE_AS_WMEMCHR is defined.
Replace pcmpeqb with PCMPEQ.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memchr-sse2, rawmemchr-sse2, memchr-avx2, rawmemchr-avx2,
wmemchr-sse2 and wmemchr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test __memchr_avx2, __memchr_sse2,
__rawmemchr_avx2, __rawmemchr_sse2, __wmemchr_avx2 and
__wmemchr_sse2.
* sysdeps/x86_64/multiarch/ifunc-sse2-avx2.h: New file.
* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/memchr-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/memchr.c: Likewise.
* sysdeps/x86_64/multiarch/rawmemchr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/rawmemchr-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/rawmemchr.c: Likewise.
* sysdeps/x86_64/multiarch/wmemchr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wmemchr-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wmemchr.c: Likewise.
* sysdeps/x86_64/wmemchr.S: Likewise.
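A short illustration of the TZCNT note above (not glibc code): both
instructions count the trailing zero bits of a non-zero mask; they
disagree only on a zero input, where TZCNT returns the operand width
while BSF leaves its destination undefined -- hence the testl guards
before the tzcntl uses in these routines.

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  uint32_t mask = 0x00100400;  /* match bits at positions 10 and 20 */
  /* __builtin_ctz lowers to TZCNT on BMI machines and to BSF
     elsewhere; it is undefined for 0, so test the mask first, just
     as the assembly does.  */
  assert (mask != 0 && __builtin_ctz (mask) == 10);
  return 0;
}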
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index d3be012..3167cd8 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -18,17 +18,31 @@
#include <sysdep.h>
+#ifdef USE_AS_WMEMCHR
+# define MEMCHR wmemchr
+# define PCMPEQ pcmpeqd
+#else
+# define MEMCHR memchr
+# define PCMPEQ pcmpeqb
+#endif
+
/* fast SSE2 version using pmaxub and a 64-byte loop */
.text
-ENTRY(memchr)
+ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx
+#ifdef USE_AS_WMEMCHR
+ test %rdx, %rdx
+ jz L(return_null)
+ shl $2, %rdx
+#else
punpcklbw %xmm1, %xmm1
test %rdx, %rdx
jz L(return_null)
punpcklbw %xmm1, %xmm1
+#endif
and $63, %ecx
pshufd $0, %xmm1, %xmm1
@@ -37,7 +51,7 @@ ENTRY(memchr)
ja L(crosscache)
movdqu (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
@@ -58,7 +72,7 @@ L(crosscache):
and $-16, %rdi
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
/* Check if there is a match. */
pmovmskb %xmm0, %eax
/* Remove the leading bytes. */
@@ -90,25 +104,25 @@ L(unaligned_no_match):
.p2align 4
L(loop_prolog):
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
movdqa 48(%rdi), %xmm4
- pcmpeqb %xmm1, %xmm4
+ PCMPEQ %xmm1, %xmm4
add $64, %rdi
pmovmskb %xmm4, %eax
test %eax, %eax
@@ -121,25 +135,25 @@ L(loop_prolog):
jbe L(exit_loop)
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
movdqa 48(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
add $64, %rdi
@@ -160,10 +174,10 @@ L(align64_loop):
movdqa 32(%rdi), %xmm3
movdqa 48(%rdi), %xmm4
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm1, %xmm2
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm1, %xmm4
+ PCMPEQ %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm4
pmaxub %xmm0, %xmm3
pmaxub %xmm2, %xmm4
@@ -186,9 +200,9 @@ L(align64_loop):
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
- pcmpeqb 48(%rdi), %xmm1
+ PCMPEQ 48(%rdi), %xmm1
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32)
@@ -204,26 +218,26 @@ L(exit_loop):
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches)
movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
+ PCMPEQ %xmm1, %xmm2
pmovmskb %xmm2, %eax
test %eax, %eax
jnz L(matches16)
movdqa 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
+ PCMPEQ %xmm1, %xmm3
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
sub $16, %edx
jle L(return_null)
- pcmpeqb 48(%rdi), %xmm1
+ PCMPEQ 48(%rdi), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
jnz L(matches48_1)
@@ -234,14 +248,14 @@ L(exit_loop):
L(exit_loop_32):
add $32, %edx
movdqa (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
+ PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
sub $16, %edx
jbe L(return_null)
- pcmpeqb 16(%rdi), %xmm1
+ PCMPEQ 16(%rdi), %xmm1
pmovmskb %xmm1, %eax
test %eax, %eax
jnz L(matches16_1)
@@ -308,8 +322,13 @@ L(matches48_1):
L(return_null):
xor %eax, %eax
ret
-END(memchr)
+END(MEMCHR)
+#ifdef USE_AS_WMEMCHR
+libc_hidden_def (__wmemchr)
+weak_alias (__wmemchr, wmemchr)
+libc_hidden_weak (wmemchr)
+#else
strong_alias (memchr, __memchr)
-
libc_hidden_builtin_def(memchr)
+#endif
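The wmemchr support above is a reduction to the byte engine: the
element count is scaled to bytes (shl $2, %rdx) and the PCMPEQ macro
swaps pcmpeqb for the 4-byte pcmpeqd.  For reference, the semantics
being implemented (naive sketch, illustrative name):

#include <stddef.h>
#include <wchar.h>

wchar_t *
ref_wmemchr (const wchar_t *s, wchar_t c, size_t n)
{
  /* First of n wide characters equal to c; the vector version scans
     the same n * sizeof (wchar_t) bytes with dword compares.  */
  for (size_t i = 0; i < n; i++)
    if (s[i] == c)
      return (wchar_t *) &s[i];
  return NULL;
}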
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index b040288..eb42b19 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,6 +6,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
+ memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
memcmp-avx2-movbe \
memcmp-sse4 memcpy-ssse3 \
memmove-ssse3 \
@@ -32,6 +33,7 @@ endif
ifeq ($(subdir),wcsmbs)
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ wmemchr-sse2 wmemchr-avx2 \
wmemcmp-avx2-movbe \
wcscpy-ssse3 wcscpy-c
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index b61bc9f..f60535b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -38,6 +38,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t i = 0;
+ /* Support sysdeps/x86_64/multiarch/memchr.S. */
+ IFUNC_IMPL (i, name, memchr,
+ IFUNC_IMPL_ADD (array, i, memchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __memchr_avx2)
+ IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/memcmp.S. */
IFUNC_IMPL (i, name, memcmp,
IFUNC_IMPL_ADD (array, i, memcmp,
@@ -152,6 +159,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memset_avx512_no_vzeroupper)
)
+ /* Support sysdeps/x86_64/multiarch/rawmemchr.S. */
+ IFUNC_IMPL (i, name, rawmemchr,
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __rawmemchr_avx2)
+ IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/stpncpy.S. */
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
@@ -296,6 +310,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wcscpy_ssse3)
IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_sse2))
+ /* Support sysdeps/x86_64/multiarch/wmemchr.S. */
+ IFUNC_IMPL (i, name, wmemchr,
+ IFUNC_IMPL_ADD (array, i, wmemchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wmemchr_avx2)
+ IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wmemcmp.S. */
IFUNC_IMPL (i, name, wmemcmp,
IFUNC_IMPL_ADD (array, i, wmemcmp,
diff --git a/sysdeps/x86_64/multiarch/ifunc-sse2-avx2.h b/sysdeps/x86_64/multiarch/ifunc-sse2-avx2.h
new file mode 100644
index 0000000..25432b8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-sse2-avx2.h
@@ -0,0 +1,36 @@
+/* Common definition for ifunc selections optimized with SSE2 and AVX2.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ return OPTIMIZE (avx2);
+
+ return OPTIMIZE (sse2);
+}
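This header is the single policy point every wrapper .c in the commit
includes: prefer AVX2 only when vzeroupper is not penalized and
unaligned 256-bit loads are fast.  A self-contained sketch of the same
dispatch idea using the plain GCC ifunc attribute (hypothetical names;
__builtin_cpu_supports has no equivalent of the Prefer_No_VZEROUPPER
and AVX_Fast_Unaligned_Load tuning bits, so a bare AVX2 check stands
in for them):

#include <stddef.h>

typedef size_t len_fn (const char *);

static size_t len_sse2 (const char *s);
static size_t len_avx2 (const char *s);

/* Like IFUNC_SELECTOR, the resolver runs once, at relocation time.  */
static len_fn *
resolve_len (void)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    return len_avx2;
  return len_sse2;
}

size_t my_len (const char *) __attribute__ ((ifunc ("resolve_len")));

static size_t
len_sse2 (const char *s)
{
  size_t n = 0;
  while (s[n] != 0)
    n++;
  return n;
}

static size_t
len_avx2 (const char *s)
{
  return len_sse2 (s);  /* stand-in; a real build would vectorize */
}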
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
new file mode 100644
index 0000000..a7275ed
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -0,0 +1,340 @@
+/* memchr/wmemchr optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef MEMCHR
+# define MEMCHR __memchr_avx2
+# endif
+
+# ifdef USE_AS_WMEMCHR
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (MEMCHR)
+# ifndef USE_AS_RAWMEMCHR
+ /* Check for zero length. */
+ testq %rdx, %rdx
+ jz L(null)
+# endif
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+# ifdef USE_AS_WMEMCHR
+ shl $2, %rdx
+ vpbroadcastd %xmm0, %ymm0
+# else
+ vpbroadcastb %xmm0, %ymm0
+# endif
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+	ja	L(cross_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+# ifndef USE_AS_RAWMEMCHR
+ jnz L(first_vec_x0_check)
+ /* Adjust length and check the end of data. */
+ subq $VEC_SIZE, %rdx
+ jbe L(zero)
+# else
+ jnz L(first_vec_x0)
+# endif
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. */
+ addq %rcx, %rdx
+
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cross_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+# endif
+ addq %rdi, %rax
+ addq %rcx, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+ instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+ overflow. */
+ negq %rcx
+ addq $VEC_SIZE, %rcx
+
+ /* Check the end of data. */
+ subq %rcx, %rdx
+ jbe L(zero)
+# endif
+
+ addq $VEC_SIZE, %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. */
+ addq %rcx, %rdx
+# endif
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+ vpor %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+# ifdef USE_AS_RAWMEMCHR
+ jmp L(loop_4x_vec)
+# else
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec)
+
+L(last_4x_vec_or_less):
+ /* Less than 4 * VEC and aligned to VEC_SIZE. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
+
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x2_check)
+ subl $VEC_SIZE, %edx
+ jle L(zero)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x3_check)
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ addl $(VEC_SIZE * 2), %edx
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+
+ jnz L(first_vec_x0_check)
+ subl $VEC_SIZE, %edx
+ jle L(zero)
+
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x0_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x3_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rdx
+ jbe L(zero)
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(zero):
+ VZEROUPPER
+L(null):
+ xorl %eax, %eax
+ ret
+# endif
+
+ .p2align 4
+L(first_vec_x0):
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+ VZEROUPPER
+ ret
+
+END (MEMCHR)
+#endif
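Two details of the AVX2 entry path above deserve a gloss.  Unlike the
strlen loop, the main loop here ORs the four compare masks together
(vpor) rather than taking a byte minimum, since it combines compare
results, not raw data.  And the page-cross guard treats any pointer
whose offset inside a 64-byte window exceeds VEC_SIZE as risky, then
loads from the aligned address below and shifts the pre-pointer bytes
out of the mask.  A scalar sketch of the guard (illustrative, 32-byte
vectors assumed):

#include <stdint.h>

enum { VEC_SIZE = 32 };

/* Mirrors "andl $(2 * VEC_SIZE - 1), %ecx; cmpl $VEC_SIZE; ja": an
   unaligned 32-byte load from p could touch the next page only when
   p's offset within a 64-byte window exceeds VEC_SIZE.  */
static int
may_cross_page (const void *p)
{
  return ((uintptr_t) p & (2 * VEC_SIZE - 1)) > VEC_SIZE;
}

/* Mirrors the "sarl %cl, %eax" fixup after the aligned load: drop
   match bits for bytes before p.  (The assembly uses an arithmetic
   shift; for locating the lowest set bit the difference is
   immaterial.)  */
static uint32_t
trim_leading (uint32_t raw_mask, const void *p)
{
  return raw_mask >> ((uintptr_t) p & (VEC_SIZE - 1));
}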
diff --git a/sysdeps/x86_64/multiarch/memchr-sse2.S b/sysdeps/x86_64/multiarch/memchr-sse2.S
new file mode 100644
index 0000000..12c9c70
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-sse2.S
@@ -0,0 +1,35 @@
+/* memchr optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define memchr __memchr_sse2
+
+# ifdef SHARED
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal memchr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# define libc_hidden_builtin_def(name) \
+ .globl __GI_memchr; __GI_memchr = __memchr_sse2
+# endif
+
+# undef strong_alias
+# define strong_alias(memchr, __memchr)
+#endif
+
+#include "../memchr.S"
diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
new file mode 100644
index 0000000..aabad61
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr.c
@@ -0,0 +1,31 @@
+/* Multiple versions of memchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define memchr __redirect_memchr
+# include <string.h>
+# undef memchr
+
+# define SYMBOL_NAME memchr
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
+strong_alias (memchr, __memchr)
+#endif
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S
new file mode 100644
index 0000000..128f9ea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2.S
@@ -0,0 +1,4 @@
+#define MEMCHR __rawmemchr_avx2
+#define USE_AS_RAWMEMCHR 1
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-sse2.S b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S
new file mode 100644
index 0000000..a44a7ff
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-sse2.S
@@ -0,0 +1,36 @@
+/* rawmemchr optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define __rawmemchr __rawmemchr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal rawmemchr calls through a
+ PLT. The speedup we get from using AVX2 instructions is likely eaten
+ away by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___rawmemchr; __GI___rawmemchr = __rawmemchr_sse2
+# endif
+
+# undef weak_alias
+# define weak_alias(__rawmemchr, rawmemchr)
+#endif
+
+#include "../rawmemchr.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
new file mode 100644
index 0000000..e1afa57
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
@@ -0,0 +1,34 @@
+/* Multiple versions of rawmemchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define rawmemchr __redirect_rawmemchr
+# define __rawmemchr __redirect___rawmemchr
+# include <string.h>
+# undef rawmemchr
+# undef __rawmemchr
+
+# define SYMBOL_NAME rawmemchr
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
+ IFUNC_SELECTOR ());
+weak_alias (__rawmemchr, rawmemchr)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2.S b/sysdeps/x86_64/multiarch/wmemchr-avx2.S
new file mode 100644
index 0000000..282854f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-avx2.S
@@ -0,0 +1,4 @@
+#define MEMCHR __wmemchr_avx2
+#define USE_AS_WMEMCHR 1
+
+#include "memchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr-sse2.S b/sysdeps/x86_64/multiarch/wmemchr-sse2.S
new file mode 100644
index 0000000..49cd86a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-sse2.S
@@ -0,0 +1,39 @@
+/* wmemchr optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define wmemchr __wmemchr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal wmemchr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___wmemchr; __GI___wmemchr = __wmemchr_sse2
+# undef libc_hidden_weak
+# define libc_hidden_weak(name) \
+ .weak __GI_wmemchr; __GI_wmemchr = __wmemchr_sse2
+# endif
+
+# undef weak_alias
+# define weak_alias(__wmemchr, wmemchr)
+#endif
+
+#include "../wmemchr.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
new file mode 100644
index 0000000..1825ad7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
@@ -0,0 +1,33 @@
+/* Multiple versions of wmemchr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define wmemchr __redirect_wmemchr
+# define __wmemchr __redirect___wmemchr
+# include <wchar.h>
+# undef wmemchr
+# undef __wmemchr
+
+# define SYMBOL_NAME wmemchr
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
+weak_alias (__wmemchr, wmemchr)
+#endif
diff --git a/sysdeps/x86_64/wmemchr.S b/sysdeps/x86_64/wmemchr.S
new file mode 100644
index 0000000..9d8079b
--- /dev/null
+++ b/sysdeps/x86_64/wmemchr.S
@@ -0,0 +1,3 @@
+#define USE_AS_WMEMCHR 1
+
+#include "memchr.S"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d9b18e30d9f44c1c27487fb2d80866ba93d0d93a
commit d9b18e30d9f44c1c27487fb2d80866ba93d0d93a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 24 08:41:23 2017 -0700
Add more tests for memrchr
This patch adds tests for len == 0 and tests for positions close to the
beginning, which are equivalent to positions close to the end for memchr.
* string/test-memrchr.c (test_main): Add tests for len == 0
and tests for positions close to the beginning, which are
equivalent to positions close to the end for memchr.
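A compilable illustration of the two behaviors these cases pin down
(hypothetical buffer contents):

#define _GNU_SOURCE     /* memrchr is a GNU extension */
#include <assert.h>
#include <string.h>

int
main (void)
{
  char buf[256] = { 0 };
  buf[3] = 23;
  /* A match near the beginning is the late-found case for the
     backward-scanning memrchr, just as a match near the end is for
     memchr.  */
  assert (memrchr (buf, 23, sizeof buf) == &buf[3]);
  /* len == 0 must find nothing, even though the byte is present.  */
  assert (memrchr (buf, 23, 0) == NULL);
  return 0;
}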
diff --git a/string/test-memrchr.c b/string/test-memrchr.c
index bfc9920..15483f5 100644
--- a/string/test-memrchr.c
+++ b/string/test-memrchr.c
@@ -151,15 +151,32 @@ test_main (void)
for (i = 1; i < 8; ++i)
{
+ /* Test len == 0. */
+ do_test (i, i, 0, 0);
+ do_test (i, i, 0, 23);
+
do_test (0, 16 << i, 2048, 23);
do_test (i, 64, 256, 23);
do_test (0, 16 << i, 2048, 0);
do_test (i, 64, 256, 0);
+
+ do_test (0, i, 256, 23);
+ do_test (0, i, 256, 0);
+ do_test (i, i, 256, 23);
+ do_test (i, i, 256, 0);
+
}
for (i = 1; i < 32; ++i)
{
do_test (0, i, i + 1, 23);
do_test (0, i, i + 1, 0);
+ do_test (i, i, i + 1, 23);
+ do_test (i, i, i + 1, 0);
+
+ do_test (0, 1, i + 1, 23);
+ do_test (0, 2, i + 1, 0);
+ do_test (i, 1, i + 1, 23);
+ do_test (i, 2, i + 1, 0);
}
do_random_tests ();
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources