This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
GNU C Library master sources branch master updated. glibc-2.18-108-g5905e7b

From: neleai at sourceware dot org
To: glibc-cvs at sourceware dot org
Date: 11 Sep 2013 15:08:54 -0000
Subject: GNU C Library master sources branch master updated. glibc-2.18-108-g5905e7b
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  5905e7b3e29139dbef84c065ca39315485f497e1 (commit)
      from  b987c776723eea2f223885289c466386f12e823a (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5905e7b3e29139dbef84c065ca39315485f497e1

commit 5905e7b3e29139dbef84c065ca39315485f497e1
Author: Ondřej Bílka <neleai@seznam.cz>
Date:   Wed Sep 11 17:07:38 2013 +0200

    Faster strchr implementation.

diff --git a/ChangeLog b/ChangeLog
index 6a54979..cb1b403 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2013-09-11  Ondřej Bílka  <neleai@seznam.cz>
+
+	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Remove: __strchr_sse42.
+	* sysdeps/x86_64/multiarch/strchr.S (__strchr_sse42): Remove.
+	(strchr): Remove __strchr_sse42 ifunc selection.
+	* sysdeps/x86_64/strchr.S (strchr): Use optimized implementation.
+	* sysdeps/x86_64/strchrnul.S: Include sysdeps/x86_64/strchr.S.
+
 2013-09-11  Will Newton  <will.newton@linaro.org>
 
 	* benchtests/bench-timing.h (TIMING_INIT): Rename ITERS
@@ -35,7 +44,7 @@
 	* malloc/malloc.c (__libc_pvalloc): Check the value of bytes
 	does not overflow.
 
-2013-09-10   Ondřej Bílka  <neleai@seznam.cz>
+2013-09-10  Ondřej Bílka  <neleai@seznam.cz>
 
 	* sysdeps/ieee754/dbl-64/e_j0.c: Remove DO_NOT_USE_THIS conditionals.
 	* sysdeps/ieee754/dbl-64/e_j1.c: Likewise.
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f8756d7..1a65ac0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -110,7 +110,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strchr.S.  */
   IFUNC_IMPL (i, name, strchr,
-	      IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
 	      IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
 
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index f170238..3f0b2c5 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -29,12 +29,6 @@ ENTRY(strchr)
 	jne	1f
 	call	__init_cpu_features
 1:	leaq	__strchr_sse2(%rip), %rax
-	testl	$bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
-	jnz	2f
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-	jz	2f
-	leaq	__strchr_sse42(%rip), %rax
-	ret
 2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
 	jz	3f
 	leaq    __strchr_sse2_no_bsf(%rip), %rax
@@ -42,127 +36,6 @@ ENTRY(strchr)
 END(strchr)
 
 
-/*
-   This implementation uses SSE4 instructions to compare up to 16 bytes
-   at a time looking for the first occurrence of the character c in the
-   string s:
-
-   char *strchr (const char *s, int c);
-
-   We use 0xa:
-	_SIDD_SBYTE_OPS
-	| _SIDD_CMP_EQUAL_EACH
-	| _SIDD_LEAST_SIGNIFICANT
-   on pcmpistri to compare xmm/mem128
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   X X X X X X X X X X X X X X X X
-
-   against xmm
-
-   0 1 2 3 4 5 6 7 8 9 A B C D E F
-   C C C C C C C C C C C C C C C C
-
-   to find out if the first 16byte data element has a byte C and the
-   offset of the first byte.  There are 3 cases:
-
-   1. The first 16byte data element has the byte C at the offset X.
-   2. The first 16byte data element has EOS and doesn't have the byte C.
-   3. The first 16byte data element is valid and doesn't have the byte C.
-
-   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
-   case		ECX	CFlag	ZFlag	SFlag
-    1		 X	  1	 0/1	  0
-    2		16	  0	  1	  0
-    3		16	  0	  0	  0
-
-   We exit from the loop for cases 1 and 2 with jbe which branches
-   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
-   X for case 1.  */
-
-	.section .text.sse4.2,"ax",@progbits
-	.align	16
-	.type	__strchr_sse42, @function
-	.globl	__strchr_sse42
-	.hidden	__strchr_sse42
-__strchr_sse42:
-	cfi_startproc
-	CALL_MCOUNT
-	testb	%sil, %sil
-	je	__strend_sse4
-	pxor	%xmm2, %xmm2
-	movd	%esi, %xmm1
-	movl	%edi, %ecx
-	pshufb  %xmm2, %xmm1
-	andl	$15, %ecx
-	movq	%rdi, %r8
-	je	L(aligned_start)
-
-/* Handle unaligned string.  */
-	andq	$-16, %r8
-	movdqa	(%r8), %xmm0
-	pcmpeqb	 %xmm0, %xmm2
-	pcmpeqb	 %xmm1, %xmm0
-	/* Find where NULL is.  */
-	pmovmskb %xmm2, %edx
-	/* Check if there is a match.  */
-	pmovmskb %xmm0, %esi
-	/* Remove the leading  bytes.  */
-	sarl	%cl, %edx
-	sarl	%cl, %esi
-	testl	%esi, %esi
-	je	L(unaligned_no_match)
-	/* Check which byte is a match.  */
-	bsfl	%esi, %eax
-	/* Is there a NULL? */
-	testl	%edx, %edx
-	je      L(unaligned_match)
-	bsfl	%edx, %esi
-	cmpl	%esi, %eax
-	/* Return NULL if NULL comes first.  */
-	ja	L(return_null)
-L(unaligned_match):
-	addq	%rdi, %rax
-	ret
-
-	.p2align 4
-L(unaligned_no_match):
-	testl	%edx, %edx
-	jne	L(return_null)
-
-/* Loop start on aligned string.  */
-L(loop):
-	addq	$16, %r8
-L(aligned_start):
-	pcmpistri	$0x2, (%r8), %xmm1
-	jbe	L(wrap)
-	addq	$16, %r8
-	pcmpistri	$0x2, (%r8), %xmm1
-	jbe	L(wrap)
-	addq	$16, %r8
-	pcmpistri       $0x2, (%r8), %xmm1
-	jbe     L(wrap)
-	addq	$16, %r8
-	pcmpistri	$0x2, (%r8), %xmm1
-	jbe	L(wrap)
-	jmp	L(loop)
-L(wrap):
-	jc	L(loop_exit)
-
-/* Return NULL.  */
-L(return_null):
-	xorl	%eax, %eax
-	ret
-
-/* Loop exit.  */
-	.p2align 4
-L(loop_exit):
-	leaq	(%r8,%rcx), %rax
-	ret
-	cfi_endproc
-	.size	__strchr_sse42, .-__strchr_sse42
-
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index d89f1eb..1900b37 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -19,51 +19,174 @@
 
 #include <sysdep.h>
 
+# ifndef ALIGN
+#  define ALIGN(n)	.p2align n
+# endif
+
 
 	.text
 ENTRY (strchr)
 	movd	%esi, %xmm1
-	movq	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	andq	$~15, %rdi
-	pxor	%xmm2, %xmm2
+	movl	%edi, %eax
+	andl	$4095, %eax
 	punpcklbw %xmm1, %xmm1
-	orl	$0xffffffff, %esi
-	movdqa	(%rdi), %xmm0
+	cmpl	$4032, %eax
+	punpcklwd %xmm1, %xmm1
 	pshufd	$0, %xmm1, %xmm1
-	subq	%rdi, %rcx
-	movdqa	%xmm0, %xmm3
-	leaq	16(%rdi), %rdi
+	jg	L(cross_page)
+	movdqu	(%rdi), %xmm0
+	pxor	%xmm3, %xmm3
+	movdqa	%xmm0, %xmm4
 	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	shl	%cl, %esi
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	andl	%esi, %edx
-	andl	%esi, %ecx
-	orl	%edx, %ecx
-	jnz	1f
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	je	L(next_48_bytes)
+	bsf	%eax, %eax
+#ifdef AS_STRCHRNUL
+	leaq	(%rdi,%rax), %rax
+#else
+	movl	$0, %edx
+	leaq	(%rdi,%rax), %rax
+	cmpb	%sil, (%rax)
+	cmovne	%rdx, %rax
+#endif
+	ret
 
-2:	movdqa	(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	movdqa	%xmm0, %xmm3
+	ALIGN(3)
+	L(next_48_bytes):
+	movdqu	16(%rdi), %xmm0
+	movdqa	%xmm0, %xmm4
 	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	orl	%edx, %ecx
-	jz	2b
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %ecx
+	movdqu	32(%rdi), %xmm0
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	salq	$16, %rcx
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	48(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	salq	$32, %rax
+	pcmpeqb	%xmm1, %xmm0
+	orq	%rcx, %rax
+	por	%xmm3, %xmm0
+	pmovmskb %xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rcx, %rax
+	testq	%rax, %rax
+	jne	L(return)
+L(loop_start):
+	/* We use this alignment to force the loop to be aligned to 8 but
+	   not 16 bytes.  This gives better scheduling on AMD processors.  */
+	ALIGN(4)
+	pxor	%xmm6, %xmm6
+	andq	$-64, %rdi
+	ALIGN(3)
+L(loop64):
+	addq	$64, %rdi
+	movdqa	(%rdi), %xmm5
+	movdqa	16(%rdi), %xmm2
+	movdqa	32(%rdi), %xmm3
+	pxor	%xmm1, %xmm5
+	movdqa	48(%rdi), %xmm4
+	pxor	%xmm1, %xmm2
+	pxor	%xmm1, %xmm3
+	pminub	(%rdi), %xmm5
+	pxor	%xmm1, %xmm4
+	pminub	16(%rdi), %xmm2
+	pminub	32(%rdi), %xmm3
+	pminub	%xmm2, %xmm5
+	pminub	48(%rdi), %xmm4
+	pminub	%xmm3, %xmm5
+	pminub	%xmm4, %xmm5
+	pcmpeqb %xmm6, %xmm5
+	pmovmskb %xmm5, %eax
+
+	testl	%eax, %eax
+	je	L(loop64)
+
+	movdqa	(%rdi), %xmm5
+	movdqa	%xmm5, %xmm0
+	pcmpeqb	%xmm1, %xmm5
+	pcmpeqb	%xmm6, %xmm0
+	por	%xmm0, %xmm5
+	pcmpeqb %xmm6, %xmm2
+	pcmpeqb %xmm6, %xmm3
+	pcmpeqb %xmm6, %xmm4
+
+	pmovmskb %xmm5, %ecx
+	pmovmskb %xmm2, %eax
+	salq	$16, %rax
+	pmovmskb %xmm3, %r8d
+	pmovmskb %xmm4, %edx
+	salq	$32, %r8
+	orq	%r8, %rax
+	orq	%rcx, %rax
+	salq	$48, %rdx
+	orq	%rdx, %rax
+	ALIGN(3)
+L(return):
+	bsfq	%rax, %rax
+#ifdef AS_STRCHRNUL
+	leaq	(%rdi,%rax), %rax
+#else
+	movl	$0, %edx
+	leaq	(%rdi,%rax), %rax
+	cmpb	%sil, (%rax)
+	cmovne	%rdx, %rax
+#endif
+	ret
+	ALIGN(4)
+
+L(cross_page):
+	movq	%rdi, %rdx
+	pxor	%xmm2, %xmm2
+	andq	$-64, %rdx
+	movdqa	%xmm1, %xmm0
+	movdqa	(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %r8d
+	movdqa	16(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %eax
+	movdqa	32(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	salq	$16, %rax
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %r9d
+	movdqa	48(%rdx), %xmm3
+	pcmpeqb	%xmm3, %xmm2
+	salq	$32, %r9
+	pcmpeqb	%xmm3, %xmm0
+	orq	%r9, %rax
+	orq	%r8, %rax
+	por	%xmm2, %xmm0
+	pmovmskb %xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rcx, %rax
+	movl	%edi, %ecx
+	subb	%dl, %cl
+	shrq	%cl, %rax
+	testq	%rax, %rax
+	jne	L(return)
+	jmp	L(loop_start)
 
-1:	bsfl	%edx, %edx
-	jz	4f
-	bsfl	%ecx, %ecx
-	leaq	-16(%rdi,%rdx), %rax
-	cmpl	%edx, %ecx
-	je	5f
-4:	xorl	%eax, %eax
-5:	ret
 END (strchr)
 
+#ifndef AS_STRCHRNUL
 weak_alias (strchr, index)
 libc_hidden_builtin_def (strchr)
-
+#endif
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
index d8c345b..bceeb61 100644
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -20,43 +20,8 @@
 
 #include <sysdep.h>
 
-
-	.text
-ENTRY (__strchrnul)
-	movd	%esi, %xmm1
-	movq	%rdi, %rcx
-	punpcklbw %xmm1, %xmm1
-	andq	$~15, %rdi
-	pxor	%xmm2, %xmm2
-	punpcklbw %xmm1, %xmm1
-	orl	$0xffffffff, %esi
-	movdqa	(%rdi), %xmm0
-	pshufd	$0, %xmm1, %xmm1
-	subq	%rdi, %rcx
-	movdqa	%xmm0, %xmm3
-	leaq	16(%rdi), %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	shl	%cl, %esi
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	orl	%edx, %ecx
-	andl	%esi, %ecx
-	jnz	1f
-
-2:	movdqa	(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	movdqa	%xmm0, %xmm3
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm2, %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm3, %ecx
-	orl	%edx, %ecx
-	jz	2b
-
-1:	bsfl	%ecx, %edx
-	leaq	-16(%rdi,%rdx), %rax
-	ret
-END (__strchrnul)
+#define strchr __strchrnul
+#define AS_STRCHRNUL
+#include "strchr.S"
 
 weak_alias (__strchrnul, strchrnul)

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                  |   11 ++-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |    1 -
 sysdeps/x86_64/multiarch/strchr.S          |  127 ------------------
 sysdeps/x86_64/strchr.S                    |  191 +++++++++++++++++++++++-----
 sysdeps/x86_64/strchrnul.S                 |   41 +------
 5 files changed, 170 insertions(+), 201 deletions(-)


hooks/post-receive
-- 
GNU C Library master sources
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]