This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch master updated. glibc-2.18-622-g584b18e


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  584b18eb4df61ccd447db2dfe8c8a7901f8c8598 (commit)
      from  8a5c7897dd1c52ca74b06aaf5a3bacf0919c97aa (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=584b18eb4df61ccd447db2dfe8c8a7901f8c8598

commit 584b18eb4df61ccd447db2dfe8c8a7901f8c8598
Author: OndÅ?ej Bílka <neleai@seznam.cz>
Date:   Sat Dec 14 19:33:56 2013 +0100

    Add strstr with unaligned loads. Fixes bug 12100.
    
    A sse42 version of strstr used pcmpistr instruction which is quite
    ineffective. A faster way is look for pairs of characters which is uses
    sse2, is faster than pcmpistr and for real strings a pairs we look for
    are relatively rare.
    
    For linear time complexity we use buy or rent technique which switches
    to two-way algorithm when superlinear behaviour is detected.

diff --git a/ChangeLog b/ChangeLog
index 3de3901..811f4c9 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2013-12-14  OndÅ?ej Bílka  <neleai@seznam.cz>
+
+	[BZ #12100]
+	* sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S: New file
+	* sysdeps/x86_64/multiarch/strstr-c.c: Moved to ...
+	* sysdeps/x86_64/multiarch/strstr.c: ... here.
+	(strstr): Add __strstr_sse2_unaligned ifunc.
+	* sysdeps/x86_64/multiarch/strcasestr-c.c: Moved to ...
+	* sysdeps/x86_64/multiarch/strcasestr.c ... here.
+	(strcasestr): Remove __strcasestr_sse42 ifunc.
+	* sysdeps/x86_64/multiarch/strcasestr-nonascii.c: Remove.
+	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update.
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update.
+
 2013-12-14  Kaz Kojima  <kkojima@rr.iij4u.or.jp>
 
 	* sysdeps/sh/sh4/fpu/bits/fenv.h: Move to ...
diff --git a/NEWS b/NEWS
index 5815d82..ebe8197 100644
--- a/NEWS
+++ b/NEWS
@@ -10,18 +10,18 @@ Version 2.19
 * The following bugs are resolved with this release:
 
   156, 387, 431, 832, 926, 2801, 4772, 6786, 6787, 6807, 6810, 7003, 9954,
-  10253, 10278, 11087, 11157, 11214, 12486, 13028, 13982, 13985, 14029,
-  14032, 14143, 14155, 14547, 14699, 14752, 14876, 14910, 15004, 15048,
-  15089, 15218, 15268, 15277, 15308, 15362, 15374, 15400, 15425, 15427,
-  15483, 15522, 15531, 15532, 15593, 15601, 15608, 15609, 15610, 15632,
-  15640, 15670, 15672, 15680, 15681, 15723, 15734, 15735, 15736, 15748,
-  15749, 15754, 15760, 15763, 15764, 15797, 15799, 15825, 15843, 15844,
-  15847, 15849, 15855, 15856, 15857, 15859, 15867, 15886, 15887, 15890,
-  15892, 15893, 15895, 15897, 15901, 15905, 15909, 15915, 15917, 15919,
-  15921, 15923, 15939, 15941, 15948, 15963, 15966, 15985, 15988, 15997,
-  16032, 16034, 16036, 16037, 16038, 16041, 16055, 16071, 16072, 16074,
-  16077, 16078, 16103, 16112, 16143, 16144, 16146, 16150, 16151, 16153,
-  16167, 16172, 16195, 16214, 16245, 16271, 16274, 16283, 16289.
+  10253, 10278, 11087, 11157, 11214, 12100, 12486, 13028, 13982, 13985,
+  14029, 14032, 14143, 14155, 14547, 14699, 14752, 14876, 14910, 15004,
+  15048, 15089, 15218, 15268, 15277, 15308, 15362, 15374, 15400, 15425,
+  15427, 15483, 15522, 15531, 15532, 15593, 15601, 15608, 15609, 15610,
+  15632, 15640, 15670, 15672, 15680, 15681, 15723, 15734, 15735, 15736,
+  15748, 15749, 15754, 15760, 15763, 15764, 15797, 15799, 15825, 15843,
+  15844, 15847, 15849, 15855, 15856, 15857, 15859, 15867, 15886, 15887,
+  15890, 15892, 15893, 15895, 15897, 15901, 15905, 15909, 15915, 15917,
+  15919, 15921, 15923, 15939, 15941, 15948, 15963, 15966, 15985, 15988,
+  15997, 16032, 16034, 16036, 16037, 16038, 16041, 16055, 16071, 16072,
+  16074, 16077, 16078, 16103, 16112, 16143, 16144, 16146, 16150, 16151,
+  16153, 16167, 16172, 16195, 16214, 16245, 16271, 16274, 16283, 16289.
 
 * The public headers no longer use __unused nor __block.  This change is to
   support compiling programs that are derived from BSD sources and use
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 9fd0fd6..57a3c13 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,22 +11,19 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memcmp-sse4 memcpy-ssse3 \
 		   memcpy-sse2-unaligned mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
-		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+		   memmove-ssse3-back strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strchr-sse2-no-bsf memcmp-ssse3
+		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned
 ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
+sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
-CFLAGS-strstr.c += -msse4
-CFLAGS-strcasestr.c += -msse4
-CFLAGS-strcasestr-nonascii.c += -msse4
 endif
 endif
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 71beab8..3e2cad5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasestr.c.  */
   IFUNC_IMPL (i, name, strcasestr,
-	      IFUNC_IMPL_ADD (array, i, strcasestr, HAS_SSE4_2,
-			      __strcasestr_sse42)
 	      IFUNC_IMPL_ADD (array, i, strcasestr, 1, __strcasestr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strcat.S.  */
@@ -184,7 +182,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strstr-c.c.  */
   IFUNC_IMPL (i, name, strstr,
-	      IFUNC_IMPL_ADD (array, i, strstr, HAS_SSE4_2, __strstr_sse42)
+	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wcscpy.S.  */
diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c
deleted file mode 100644
index c13a4c4..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr-c.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Multiple versions of strcasestr
-   All versions must be listed in ifunc-impl-list.c.  */
-
-#include "init-arch.h"
-
-#define STRCASESTR __strcasestr_sse2
-
-#include "string/strcasestr.c"
-
-extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
-
-#if 1
-libc_ifunc (__strcasestr,
-	    HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
-#else
-libc_ifunc (__strcasestr,
-	    0 ? __strcasestr_sse42 : __strcasestr_sse2);
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
deleted file mode 100644
index 032a642..0000000
--- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/* strstr with SSE4.2 intrinsics
-   Copyright (C) 2010-2013 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <ctype.h>
-#include <xmmintrin.h>
-
-
-/* Similar to __m128i_strloadu.  Convert to lower case for none-POSIX/C
-   locale.  */
-static __m128i
-__m128i_strloadu_tolower (const unsigned char *p)
-{
-  union
-    {
-      char b[16];
-      __m128i x;
-    } u;
-
-  for (int i = 0; i < 16; ++i)
-    if (p[i] == 0)
-      {
-	u.b[i] = 0;
-	break;
-      }
-    else
-      u.b[i] = tolower (p[i]);
-
-  return u.x;
-}
-
-
-#define STRCASESTR_NONASCII
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42_nonascii
-#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
index d1cfb3b..834e656 100644
--- a/sysdeps/x86_64/multiarch/strcasestr.c
+++ b/sysdeps/x86_64/multiarch/strcasestr.c
@@ -1,7 +1,13 @@
-extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
-					  const unsigned char *s2)
-  attribute_hidden;
+/* Multiple versions of strcasestr
+   All versions must be listed in ifunc-impl-list.c.  */
 
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42
-#include "strstr.c"
+#include "init-arch.h"
+
+#define STRCASESTR __strcasestr_sse2
+
+#include "string/strcasestr.c"
+
+extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
+
+libc_ifunc (__strcasestr,
+	    __strcasestr_sse2);
diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
deleted file mode 100644
index 42bbe48..0000000
--- a/sysdeps/x86_64/multiarch/strstr-c.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Multiple versions of strstr.
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2012-2013 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* Redefine strstr so that the compiler won't complain about the type
-   mismatch with the IFUNC selector in strong_alias, below.  */
-#undef  strstr
-#define strstr __redirect_strstr
-#include <string.h>
-#undef  strstr
-
-#define STRSTR __strstr_sse2
-#ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
-#endif
-
-#include "string/strstr.c"
-
-extern __typeof (__redirect_strstr) __strstr_sse42 attribute_hidden;
-extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
-
-#include "init-arch.h"
-
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-   ifunc symbol properly.  */
-extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2)
-
-#undef strstr
-strong_alias (__libc_strstr, strstr)
diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
new file mode 100644
index 0000000..99bae2c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
@@ -0,0 +1,374 @@
+/* strstr with unaligned loads
+   Copyright (C) 2009-2013 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+ENTRY(__strstr_sse2_unaligned)
+	movzbl	(%rsi), %eax
+	testb	%al, %al
+	je	L(empty)
+	movzbl	1(%rsi), %edx
+	testb	%dl, %dl
+	je	L(strchr)
+	movd	%eax, %xmm1
+	movd	%edx, %xmm2
+	movq	%rdi, %rax
+	andl	$4095, %eax
+	punpcklbw	%xmm1, %xmm1
+	cmpq	$4031, %rax
+	punpcklbw	%xmm2, %xmm2
+	punpcklwd	%xmm1, %xmm1
+	punpcklwd	%xmm2, %xmm2
+	pshufd	$0, %xmm1, %xmm1
+	pshufd	$0, %xmm2, %xmm2
+	ja	L(cross_page)
+	movdqu	(%rdi), %xmm3
+	pxor	%xmm5, %xmm5
+	movdqu	1(%rdi), %xmm4
+	movdqa	%xmm3, %xmm6
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	movdqu	16(%rdi), %xmm0
+	pcmpeqb	%xmm5, %xmm6
+	pminub	%xmm4, %xmm3
+	movdqa	%xmm3, %xmm4
+	movdqu	17(%rdi), %xmm3
+	pcmpeqb	%xmm0, %xmm5
+	pcmpeqb	%xmm2, %xmm3
+	por	%xmm6, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pminub	%xmm3, %xmm0
+	por	%xmm5, %xmm0
+	pmovmskb	%xmm4, %r8d
+	pmovmskb	%xmm0, %eax
+	salq	$16, %rax
+	orq	%rax, %r8
+	je	L(next_32_bytes)
+L(next_pair_index):
+	bsf	%r8, %rax
+	addq	%rdi, %rax
+	cmpb	$0, (%rax)
+	je	L(zero1)
+	movzbl	2(%rsi), %edx
+	testb	%dl, %dl
+	je	L(found1)
+	cmpb	2(%rax), %dl
+	jne	L(next_pair)
+	xorl	%edx, %edx
+	jmp	L(pair_loop_start)
+
+	.p2align 4
+L(strchr):
+	movzbl	%al, %esi
+	jmp	__strchr_sse2
+
+	.p2align 4
+L(pair_loop):
+	addq	$1, %rdx
+	cmpb	2(%rax,%rdx), %cl
+	jne	L(next_pair)
+L(pair_loop_start):
+	movzbl	3(%rsi,%rdx), %ecx
+	testb	%cl, %cl
+	jne	L(pair_loop)
+L(found1):
+	ret
+L(zero1):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(next_pair):
+	leaq	-1(%r8), %rax
+	andq	%rax, %r8
+	jne	L(next_pair_index)
+
+	.p2align 4
+L(next_32_bytes):
+	movdqu	32(%rdi), %xmm3
+	pxor	%xmm5, %xmm5
+	movdqu	33(%rdi), %xmm4
+	movdqa	%xmm3, %xmm6
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	movdqu	48(%rdi), %xmm0
+	pcmpeqb	%xmm5, %xmm6
+	pminub	%xmm4, %xmm3
+	movdqa	%xmm3, %xmm4
+	movdqu	49(%rdi), %xmm3
+	pcmpeqb	%xmm0, %xmm5
+	pcmpeqb	%xmm2, %xmm3
+	por	%xmm6, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pminub	%xmm3, %xmm0
+	por	%xmm5, %xmm0
+	pmovmskb	%xmm4, %eax
+	salq	$32, %rax
+	pmovmskb	%xmm0, %r8d
+	salq	$48, %r8
+	orq	%rax, %r8
+	je	L(loop_header)
+L(next_pair2_index):
+	bsfq	%r8, %rax
+	addq	%rdi, %rax
+	cmpb	$0, (%rax)
+	je	L(zero2)
+	movzbl	2(%rsi), %edx
+	testb	%dl, %dl
+	je	L(found2)
+	cmpb	2(%rax), %dl
+	jne	L(next_pair2)
+	xorl	%edx, %edx
+	jmp	L(pair_loop2_start)
+
+	.p2align 4
+L(pair_loop2):
+	addq	$1, %rdx
+	cmpb	2(%rax,%rdx), %cl
+	jne	L(next_pair2)
+L(pair_loop2_start):
+	movzbl	3(%rsi,%rdx), %ecx
+	testb	%cl, %cl
+	jne	L(pair_loop2)
+L(found2):
+	ret
+	L(zero2):
+	xorl	%eax, %eax
+	ret
+L(empty):
+	mov %rdi, %rax
+	ret
+
+	.p2align 4
+L(next_pair2):
+	leaq	-1(%r8), %rax
+	andq	%rax, %r8
+	jne	L(next_pair2_index)
+L(loop_header):
+	movq	$-512, %r11
+	movq	%rdi, %r9
+
+	pxor	%xmm7, %xmm7
+	andq	$-64, %rdi
+
+	.p2align 4
+L(loop):
+	movdqa	64(%rdi), %xmm3
+	movdqu	63(%rdi), %xmm6
+	movdqa	%xmm3, %xmm0
+	pxor	%xmm2, %xmm3
+	pxor	%xmm1, %xmm6
+	movdqa	80(%rdi), %xmm10
+	por	%xmm3, %xmm6
+	pminub	%xmm10, %xmm0
+	movdqu	79(%rdi), %xmm3
+	pxor	%xmm2, %xmm10
+	pxor	%xmm1, %xmm3
+	movdqa	96(%rdi), %xmm9
+	por	%xmm10, %xmm3
+	pminub	%xmm9, %xmm0
+	pxor	%xmm2, %xmm9
+	movdqa	112(%rdi), %xmm8
+	addq	$64, %rdi
+	pminub	%xmm6, %xmm3
+	movdqu	31(%rdi), %xmm4
+	pminub	%xmm8, %xmm0
+	pxor	%xmm2, %xmm8
+	pxor	%xmm1, %xmm4
+	por	%xmm9, %xmm4
+	pminub	%xmm4, %xmm3
+	movdqu	47(%rdi), %xmm5
+	pxor	%xmm1, %xmm5
+	por	%xmm8, %xmm5
+	pminub	%xmm5, %xmm3
+	pminub	%xmm3, %xmm0
+	pcmpeqb	%xmm7, %xmm0
+	pmovmskb	%xmm0, %eax
+	testl	%eax, %eax
+	je	L(loop)
+	pminub (%rdi), %xmm6
+	pminub 32(%rdi),%xmm4
+	pminub 48(%rdi),%xmm5
+	pcmpeqb %xmm7, %xmm6
+	pcmpeqb %xmm7, %xmm5
+	pmovmskb	%xmm6, %edx
+	movdqa	16(%rdi), %xmm8
+	pcmpeqb %xmm7, %xmm4
+	movdqu  15(%rdi), %xmm0
+	pmovmskb	%xmm5, %r8d
+	movdqa  %xmm8, %xmm3
+	pmovmskb	%xmm4, %ecx
+	pcmpeqb %xmm1,%xmm0
+	pcmpeqb %xmm2,%xmm3
+	salq	$32, %rcx
+	pcmpeqb %xmm7,%xmm8
+	salq	$48, %r8
+	pminub  %xmm0,%xmm3
+	orq	%rcx, %rdx
+	por	%xmm3,%xmm8
+	orq	%rdx, %r8
+	pmovmskb	%xmm8, %eax
+	salq	$16, %rax
+	orq	%rax, %r8
+	je	L(loop)
+L(next_pair_index3):
+	bsfq	%r8, %rcx
+	addq	%rdi, %rcx
+	cmpb	$0, (%rcx)
+	je	L(zero)
+	xorl	%eax, %eax
+	movzbl	2(%rsi), %edx
+	testb	%dl, %dl
+	je	L(success3)
+	cmpb	1(%rcx), %dl
+	jne	L(next_pair3)
+	jmp	L(pair_loop_start3)
+
+	.p2align 4
+L(pair_loop3):
+	addq	$1, %rax
+	cmpb	1(%rcx,%rax), %dl
+	jne	L(next_pair3)
+L(pair_loop_start3):
+	movzbl	3(%rsi,%rax), %edx
+	testb	%dl, %dl
+	jne	L(pair_loop3)
+L(success3):
+	lea	-1(%rcx), %rax
+	ret
+
+	.p2align 4
+L(next_pair3):
+	addq	%rax, %r11
+	movq	%rdi,  %rax
+	subq	%r9, %rax
+	cmpq	%r11, %rax
+	jl	L(switch_strstr)
+	leaq	-1(%r8), %rax
+	andq	%rax, %r8
+	jne	L(next_pair_index3)
+	jmp	L(loop)
+
+	.p2align 4
+L(switch_strstr):
+	movq	%rdi, %rdi
+	jmp	__strstr_sse2
+
+	.p2align 4
+L(cross_page):
+
+	movq	%rdi, %rax
+	pxor	%xmm0, %xmm0
+	andq	$-64, %rax
+	movdqa	(%rax), %xmm3
+	movdqu	-1(%rax), %xmm4
+	movdqa	%xmm3, %xmm8
+	movdqa	16(%rax), %xmm5
+	pcmpeqb	%xmm1, %xmm4
+	pcmpeqb	%xmm0, %xmm8
+	pcmpeqb	%xmm2, %xmm3
+	movdqa	%xmm5, %xmm7
+	pminub	%xmm4, %xmm3
+	movdqu	15(%rax), %xmm4
+	pcmpeqb	%xmm0, %xmm7
+	por	%xmm3, %xmm8
+	movdqa	%xmm5, %xmm3
+	movdqa	32(%rax), %xmm5
+	pcmpeqb	%xmm1, %xmm4
+	pcmpeqb	%xmm2, %xmm3
+	movdqa	%xmm5, %xmm6
+	pmovmskb	%xmm8, %ecx
+	pminub	%xmm4, %xmm3
+	movdqu	31(%rax), %xmm4
+	por	%xmm3, %xmm7
+	movdqa	%xmm5, %xmm3
+	pcmpeqb	%xmm0, %xmm6
+	movdqa	48(%rax), %xmm5
+	pcmpeqb	%xmm1, %xmm4
+	pmovmskb	%xmm7, %r8d
+	pcmpeqb	%xmm2, %xmm3
+	pcmpeqb	%xmm5, %xmm0
+	pminub	%xmm4, %xmm3
+	movdqu	47(%rax), %xmm4
+	por	%xmm3, %xmm6
+	movdqa	%xmm5, %xmm3
+	salq	$16, %r8
+	pcmpeqb	%xmm1, %xmm4
+	pcmpeqb	%xmm2, %xmm3
+	pmovmskb	%xmm6, %r10d
+	pminub	%xmm4, %xmm3
+	por	%xmm3, %xmm0
+	salq	$32, %r10
+	orq	%r10, %r8
+	orq	%rcx, %r8
+	movl	%edi, %ecx
+	pmovmskb	%xmm0, %edx
+	subl	%eax, %ecx
+	salq	$48, %rdx
+	orq	%rdx, %r8
+	shrq	%cl, %r8
+	je	L(loop_header)
+L(next_pair_index4):
+	bsfq	%r8, %rax
+	addq	%rdi, %rax
+	cmpb	$0, (%rax)
+	je	L(zero)
+
+	cmpq	%rax,%rdi
+	je	L(next_pair4)
+
+	movzbl	2(%rsi), %edx
+	testb	%dl, %dl
+	je	L(found3)
+	cmpb	1(%rax), %dl
+	jne	L(next_pair4)
+	xorl	%edx, %edx
+	jmp	L(pair_loop_start4)
+
+	.p2align 4
+L(pair_loop4):
+	addq	$1, %rdx
+	cmpb	1(%rax,%rdx), %cl
+	jne	L(next_pair4)
+L(pair_loop_start4):
+	movzbl	3(%rsi,%rdx), %ecx
+	testb	%cl, %cl
+	jne	L(pair_loop4)
+L(found3):
+	subq $1, %rax
+	ret
+
+	.p2align 4
+L(next_pair4):
+	leaq	-1(%r8), %rax
+	andq	%rax, %r8
+	jne	L(next_pair_index4)
+	jmp	L(loop_header)
+
+	.p2align 4
+L(found):
+	rep
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+
+END(__strstr_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index cd63b68..fbff3a8 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -1,6 +1,6 @@
-/* strstr with SSE4.2 intrinsics
-   Copyright (C) 2009-2013 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
+/* Multiple versions of strstr.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2012-2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -17,369 +17,31 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <nmmintrin.h>
-#include "varshift.h"
-
-#ifndef STRSTR_SSE42
-# define STRSTR_SSE42 __strstr_sse42
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# include <ctype.h>
-# include <locale/localeinfo.h>
-
-# define LOADBYTE(C)		tolower (C)
-# define CMPBYTE(C1, C2)	(tolower (C1) == tolower (C2))
-#else
-# define LOADBYTE(C)		(C)
-# define CMPBYTE(C1, C2)	((C1) == (C2))
-#endif
-
-/* We use 0xe ordered-compare:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_ORDER
-	| _SIDD_LEAST_SIGNIFICANT
-   on pcmpistri to do the scanning and string comparsion requirements of
-   sub-string match.  In the scanning phase, we process Cflag and ECX
-   index to locate the first fragment match; once the first fragment
-   match position has been identified, we do comparison of subsequent
-   string fragments until we can conclude false or true match; whe
-   n concluding a false match, we may need to repeat scanning process
-   from next relevant offset in the target string.
-
-   In the scanning phase we have 4 cases:
-   case		ECX	CFlag	ZFlag	SFlag
-    1		16	  0	  0	  0
-    2a		16	  0	  0	  1
-    2b		16	  0	  1	  0
-    2c		16	  0	  1	  1
-
-   1. No ordered-comparison match, both 16B fragments are valid, so
-      continue to next fragment.
-   2. No ordered-comparison match, there is EOS in either fragment,
-   2a. Zflg = 0, Sflg = 1, we continue
-   2b. Zflg = 1, Sflg = 0, we conclude no match and return.
-   2c. Zflg = 1, sflg = 1, lenth determine match or no match
-
-   In the string comparison phase, the 1st fragment match is fixed up
-   to produce ECX = 0.  Subsequent fragment compare of nonzero index
-   and no match conclude a false match.
-
-   case		ECX	CFlag	ZFlag	SFlag
-    3		 X	  1	  0	  0/1
-    4a		 0	  1	  0	  0
-    4b		 0	  1	  0	  1
-    4c		0 < X	  1	  0	  0/1
-    5		16	  0	  1	  0
-
-   3. An initial ordered-comparison fragment match, we fix up to do
-      subsequent string comparison
-   4a. Continuation of fragment comparison of a string compare.
-   4b. EOS reached in the reference string, we conclude true match and
-       return
-   4c. String compare failed if index is nonzero, we need to go back to
-       scanning
-   5.  failed string compare, go back to scanning
- */
-
-#if !(defined USE_AS_STRCASESTR && defined STRCASESTR_NONASCII)
-/* Simple replacement of movdqu to address 4KB boundary cross issue.
-   If EOS occurs within less than 16B before 4KB boundary, we don't
-   cross to next page.  */
-static __m128i
-__m128i_strloadu (const unsigned char * p, __m128i zero)
-{
-  if (__builtin_expect ((int) ((size_t) p & 0xfff) > 0xff0, 0))
-    {
-      size_t offset = ((size_t) p & (16 - 1));
-      __m128i a = _mm_load_si128 ((__m128i *) (p - offset));
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
-      if ((bmsk >> offset) != 0)
-	return __m128i_shift_right (a, offset);
-    }
-  return _mm_loadu_si128 ((__m128i *) p);
-}
-#endif
-
-#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII
-
-/* Similar to __m128i_strloadu.  Convert to lower case for POSIX/C
-   locale and other which have single-byte letters only in the ASCII
-   range.  */
-static __m128i
-__m128i_strloadu_tolower (const unsigned char *p, __m128i zero, __m128i uclow,
-			  __m128i uchigh, __m128i lcqword)
-{
-  __m128i frag = __m128i_strloadu (p, zero);
-
-  /* Compare if 'Z' > bytes. Inverted way to get a mask for byte <= 'Z'.  */
-  __m128i r2 = _mm_cmpgt_epi8 (uchigh, frag);
-  /* Compare if bytes are > 'A' - 1.  */
-  __m128i r1 = _mm_cmpgt_epi8 (frag, uclow);
-  /* Mask byte == ff if byte(r2) <= 'Z' and byte(r1) > 'A' - 1.  */
-  __m128i mask = _mm_and_si128 (r2, r1);
-  /* Apply lowercase bit 6 mask for above mask bytes == ff.  */
-  return _mm_or_si128 (frag, _mm_and_si128 (mask, lcqword));
-}
-
-#endif
-
-/* Calculate Knuth-Morris-Pratt string searching algorithm (or KMP
-   algorithm) overlap for a fully populated 16B vector.
-   Input parameter: 1st 16Byte loaded from the reference string of a
-		    strstr function.
-   We don't use KMP algorithm if reference string is less than 16B.  */
-static int
-__inline__ __attribute__ ((__always_inline__,))
-KMP16Bovrlap (__m128i s2)
-{
-  __m128i b = _mm_unpacklo_epi8 (s2, s2);
-  __m128i a = _mm_unpacklo_epi8 (b, b);
-  a = _mm_shuffle_epi32 (a, 0);
-  b = _mm_srli_si128 (s2, sizeof (char));
-  int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));
-
-  /* _BitScanForward(&k1, bmsk); */
-  int k1;
-  __asm ("bsfl %[bmsk], %[k1]" : [k1] "=r" (k1) : [bmsk] "r" (bmsk));
-  if (!bmsk)
-    return 16;
-  else if (bmsk == 0x7fff)
-    return 1;
-  else if (!k1)
-    {
-      /* There are al least two distinct chars in s2.  If byte 0 and 1 are
-	 idential and the distinct value lies farther down, we can deduce
-	 the next byte offset to restart full compare is least no earlier
-	 than byte 3.  */
-      return 3;
-    }
-  else
-    {
-      /* Byte 1 is not degenerated to byte 0.  */
-      return k1 + 1;
-    }
-}
-
-char *
-__attribute__ ((section (".text.sse4.2")))
-STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
-{
-#define p1 s1
-  const unsigned char *p2 = s2;
-
-#ifndef STRCASESTR_NONASCII
-  if (__builtin_expect (p2[0] == '\0', 0))
-    return (char *) p1;
-
-  if (__builtin_expect (p1[0] == '\0', 0))
-    return NULL;
-
-  /* Check if p1 length is 1 byte long.  */
-  if (__builtin_expect (p1[1] == '\0', 0))
-    return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# ifndef STRCASESTR_NONASCII
-  if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
-			!= 0, 0))
-    return __strcasestr_sse42_nonascii (s1, s2);
-
-  const __m128i uclow = _mm_set1_epi8 (0x40);
-  const __m128i uchigh = _mm_set1_epi8 (0x5b);
-  const __m128i lcqword = _mm_set1_epi8 (0x20);
-  const __m128i zero = _mm_setzero_si128 ();
-#  define strloadu(p) __m128i_strloadu_tolower (p, zero, uclow, uchigh, lcqword)
-# else
-#  define strloadu __m128i_strloadu_tolower
-#  define zero _mm_setzero_si128 ()
-# endif
-#else
-# define strloadu(p) __m128i_strloadu (p, zero)
-  const __m128i zero = _mm_setzero_si128 ();
+/* Redefine strstr so that the compiler won't complain about the type
+   mismatch with the IFUNC selector in strong_alias, below.  */
+#undef  strstr
+#define strstr __redirect_strstr
+#include <string.h>
+#undef  strstr
+
+#define STRSTR __strstr_sse2
+#ifdef SHARED
+# undef libc_hidden_builtin_def
+# define libc_hidden_builtin_def(name) \
+  __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
 #endif
 
-  /* p1 > 1 byte long.  Load up to 16 bytes of fragment.  */
-  __m128i frag1 = strloadu (p1);
-
-  __m128i frag2;
-  if (p2[1] != '\0')
-    /* p2 is > 1 byte long.  */
-    frag2 = strloadu (p2);
-  else
-    frag2 = _mm_insert_epi8 (zero, LOADBYTE (p2[0]), 0);
-
-  /* Unsigned bytes, equal order, does frag2 has null?  */
-  int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-  if (cmp_s & cmp_c)
-    {
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2, zero));
-      int len;
-      __asm ("bsfl %[bmsk], %[len]"
-	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
-      p1 += cmp;
-      if ((len + cmp) <= 16)
-	return (char *) p1;
-
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      if ((len + cmp) <= 16)
-	return (char *) p1 + cmp;
-    }
-
-  if (cmp_s)
-    {
-      /* Adjust addr for 16B alginment in ensuing loop.  */
-      while (!cmp_z)
-	{
-	  p1 += cmp;
-	  /* Load up to 16 bytes of fragment.  */
-	  frag1 = strloadu (p1);
-	  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-	  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-	  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-	  /* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
-	     once already, this time cmp will be zero and we can exit.  */
-	  if ((!cmp) & cmp_c)
-	    break;
-	}
-
-      if (!cmp_c)
-	return NULL;
-
-      /* Since s2 is less than 16 bytes, com_c is definitive
-	 determination of full match.  */
-      return (char *) p1 + cmp;
-    }
-
-  /* General case, s2 is at least 16 bytes or more.
-     First, the common case of false-match at first byte of p2.  */
-  const unsigned char *pt = NULL;
-  int kmp_fwd = 0;
-re_trace:
-  while (!cmp_c)
-    {
-      /* frag1 has null. */
-      if (cmp_z)
-	return NULL;
-
-      /* frag 1 has no null, advance 16 bytes.  */
-      p1 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, is there a partial match?  */
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-    }
-
-  /* Next, handle initial positive match as first byte of p2.  We have
-     a partial fragment match, make full determination until we reached
-     end of s2.  */
-  if (!cmp)
-    {
-      if (cmp_z)
-	return (char *) p1;
-
-      pt = p1;
-      p1 += 16;
-      p2 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag2 = strloadu (p2);
-    }
-  else
-    {
-      /* Adjust 16B alignment.  */
-      p1 += cmp;
-      pt = p1;
-    }
-
-  /* Load up to 16 bytes of fragment.  */
-  frag1 = strloadu (p1);
-
-  /* Unsigned bytes, equal order, does frag2 has null?  */
-  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-  while (!(cmp | cmp_z | cmp_s))
-    {
-      p1 += 16;
-      p2 += 16;
-      /* Load up to 16 bytes of fragment.  */
-      frag2 = strloadu (p2);
-      /* Load up to 16 bytes of fragment.  */
-      frag1 = strloadu (p1);
-      /* Unsigned bytes, equal order, does frag2 has null?  */
-      cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-      cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-      cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-      cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
-    }
-
-  /* Full determination yielded a false result, retrace s1 to next
-     starting position.
-     Zflg	1      0      1			0/1
-     Sflg	0      1      1			0/1
-     cmp	na     0      0			>0
-     action   done   done   continue    continue if s2 < s1
-	      false  match  retrace s1     else false
-   */
-
-  if (cmp_s & !cmp)
-    return (char *) pt;
-  if (cmp_z)
-    {
-      if (!cmp_s)
-	return NULL;
-
-      /* Handle both zero and sign flag set and s1 is shorter in
-	 length.  */
-      int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
-      int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
-      int len;
-      int len1;
-      __asm ("bsfl %[bmsk], %[len]"
-	     : [len] "=r" (len) : [bmsk] "r" (bmsk));
-      __asm ("bsfl %[bmsk1], %[len1]"
-	     : [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
-      if (len >= len1)
-	return NULL;
-    }
-  else if (!cmp)
-    return (char *) pt;
-
-  /* Otherwise, we have to retrace and continue.  Default of multiple
-     paths that need to retrace from next byte in s1.  */
-  p2 = s2;
-  frag2 = strloadu (p2);
-
-  if (!kmp_fwd)
-    kmp_fwd = KMP16Bovrlap (frag2);
+#include "string/strstr.c"
 
-  /* KMP algorithm predicted overlap needs to be corrected for
-     partial fragment compare.  */
-  p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
+extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
 
-  /* Since s2 is at least 16 bytes long, we're certain there is no
-     match.  */
-  if (p1[0] == '\0')
-    return NULL;
+#include "init-arch.h"
 
-  /* Load up to 16 bytes of fragment.  */
-  frag1 = strloadu (p1);
+/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
+   ifunc symbol properly.  */
+extern __typeof (__redirect_strstr) __libc_strstr;
+libc_ifunc (__libc_strstr, HAS_FAST_UNALIGNED_LOAD ? __strstr_sse2_unaligned : __strstr_sse2)
 
-  /* Unsigned bytes, equal order, is there a partial match?  */
-  cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
-  cmp = _mm_cmpistri (frag2, frag1, 0x0c);
-  cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
-  goto re_trace;
-}
+#undef strstr
+strong_alias (__libc_strstr, strstr)

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                        |   14 +
 NEWS                                             |   24 +-
 sysdeps/x86_64/multiarch/Makefile                |    9 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c       |    4 +-
 sysdeps/x86_64/multiarch/strcasestr-c.c          |   19 -
 sysdeps/x86_64/multiarch/strcasestr-nonascii.c   |   50 ---
 sysdeps/x86_64/multiarch/strcasestr.c            |   18 +-
 sysdeps/x86_64/multiarch/strstr-c.c              |   47 ---
 sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S |  374 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strstr.c                |  388 ++--------------------
 10 files changed, 441 insertions(+), 506 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-c.c
 delete mode 100644 sysdeps/x86_64/multiarch/strcasestr-nonascii.c
 delete mode 100644 sysdeps/x86_64/multiarch/strstr-c.c
 create mode 100644 sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]