Re: [PATCH v2] Faster strchr implementation.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>
- Cc: GNU C Library <libc-alpha at sourceware dot org>
- Date: Mon, 2 Sep 2013 23:01:52 +0200
- Subject: Re: [PATCH v2] Faster strchr implementation.
- Authentication-results: sourceware.org; auth=none
- References: <20130807140911 dot GA31968 at domone dot kolej dot mff dot cuni dot cz> <CAHjhQ926EE-MYDJR5Eftf+DUefBg-Gox0pw57vZ7XUwsO3OPJg at mail dot gmail dot com> <20130816095908 dot GA15776 at domone dot kolej dot mff dot cuni dot cz> <20130816121533 dot GB26552 at domone dot kolej dot mff dot cuni dot cz>
ping,

As this is similar to the rawmemchr implementation, these patches should be
read together.
On Fri, Aug 16, 2013 at 02:15:33PM +0200, Ondřej Bílka wrote:
>
> A correct version is here.
>
> > Hello,
> >
> > I will run your tests for atom, silvermont, haswell tomorrow morning.
> > I should check strcmp, strchr, strrchr you wrote about. Right? Or
> > something else?
> >
> For strchr you will need to rerun the tests.
>
> Hi, I tuned my implementation to decrease loop overhead. It decreases the
> loop overhead by a significant constant factor over the previous
> implementation.
>
> There are architectures that I do not cover:
>
> haswell - the avx2 implementation that I posted is better, and it is
> best submitted separately.
>
> atom - The loop causes big overhead for sizes around 64 bytes and we need
> to work on a more effective header; we keep the no-bsf implementation for
> now.
>
> silvermont - similar issues as atom, but we need a separate IFUNC case to
> choose the no-bsf variant (a sketch follows after this list).
>
> athlon, athlon x2 - Same situation, and we also need a flag to choose the
> other variant.
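[Annotation for this ping: as a rough illustration of what such an IFUNC
case amounts to, here is a hypothetical C sketch. The real selector in
sysdeps/x86_64/multiarch/strchr.S is hand-written assembly testing
__cpu_features directly; cpu_has_slow_bsf below is a made-up stand-in for
the bit_Slow_BSF check, not a glibc function.

extern char *__strchr_sse2 (const char *, int);
extern char *__strchr_sse2_no_bsf (const char *, int);

static int
cpu_has_slow_bsf (void)
{
  return 0;  /* placeholder: would query the CPU feature bits */
}

/* Resolver returning the variant to use; run once at relocation time.  */
static char *(*
resolve_strchr (void)) (const char *, int)
{
  return cpu_has_slow_bsf () ? __strchr_sse2_no_bsf : __strchr_sse2;
}

char *strchr (const char *, int)
     __attribute__ ((ifunc ("resolve_strchr")));
]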
>
> I updated the results of my profiler.
> In my previous random test strchr would always find the terminating zero.
> This does not happen in practice, so strchr now finds the character with
> 50% probability.
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile160813.tar.bz2
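[Annotation for this ping: input generation for that 50% case now looks
roughly like the sketch below. This is an illustration only - the real
generator ships in the profile tarball above - and it assumes the
searched-for character c lies outside the filler range 'a'..'y' (say
c == 'z') and that len > 0.

#include <stdlib.h>

static void
make_input (char *buf, size_t len, int c)
{
  for (size_t i = 0; i < len; i++)
    buf[i] = 'a' + rand () % 25;  /* filler bytes, never equal to c */
  buf[len] = '\0';
  if (rand () % 2)                /* plant c with 50% probability */
    buf[rand () % len] = c;
}
]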
>
>
> OK to commit this iteration?
>
> 	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
> 	(__libc_ifunc_impl_list): Remove __strchr_sse42.
> * sysdeps/x86_64/multiarch/strchr.S (__strchr_sse42): Remove.
> (strchr): Remove __strchr_sse42 ifunc selection.
> * sysdeps/x86_64/strchr.S (strchr): Use optimized implementation.
> * sysdeps/x86_64/strchrnul.S: Include sysdeps/x86_64/strchr.S.
>
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 28d3579..8486294 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -116,7 +116,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/strchr.S. */
> IFUNC_IMPL (i, name, strchr,
> - IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
> IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
> IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
>
> diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
> index f170238..3f0b2c5 100644
> --- a/sysdeps/x86_64/multiarch/strchr.S
> +++ b/sysdeps/x86_64/multiarch/strchr.S
> @@ -29,12 +29,6 @@ ENTRY(strchr)
> jne 1f
> call __init_cpu_features
> 1: leaq __strchr_sse2(%rip), %rax
> - testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
> - jnz 2f
> - testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
> - jz 2f
> - leaq __strchr_sse42(%rip), %rax
> - ret
> 2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
> jz 3f
> leaq __strchr_sse2_no_bsf(%rip), %rax
> @@ -42,127 +36,6 @@ ENTRY(strchr)
> END(strchr)
>
>
> -/*
> - This implementation uses SSE4 instructions to compare up to 16 bytes
> - at a time looking for the first occurrence of the character c in the
> - string s:
> -
> - char *strchr (const char *s, int c);
> -
> - We use 0xa:
> - _SIDD_SBYTE_OPS
> - | _SIDD_CMP_EQUAL_EACH
> - | _SIDD_LEAST_SIGNIFICANT
> - on pcmpistri to compare xmm/mem128
> -
> - 0 1 2 3 4 5 6 7 8 9 A B C D E F
> - X X X X X X X X X X X X X X X X
> -
> - against xmm
> -
> - 0 1 2 3 4 5 6 7 8 9 A B C D E F
> - C C C C C C C C C C C C C C C C
> -
> - to find out if the first 16byte data element has a byte C and the
> - offset of the first byte. There are 3 cases:
> -
> - 1. The first 16byte data element has the byte C at the offset X.
> - 2. The first 16byte data element has EOS and doesn't have the byte C.
> - 3. The first 16byte data element is valid and doesn't have the byte C.
> -
> - Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
> -
> - case ECX CFlag ZFlag SFlag
> - 1 X 1 0/1 0
> - 2 16 0 1 0
> - 3 16 0 0 0
> -
> - We exit from the loop for cases 1 and 2 with jbe which branches
> - when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
> - X for case 1. */
> -
> - .section .text.sse4.2,"ax",@progbits
> - .align 16
> - .type __strchr_sse42, @function
> - .globl __strchr_sse42
> - .hidden __strchr_sse42
> -__strchr_sse42:
> - cfi_startproc
> - CALL_MCOUNT
> - testb %sil, %sil
> - je __strend_sse4
> - pxor %xmm2, %xmm2
> - movd %esi, %xmm1
> - movl %edi, %ecx
> - pshufb %xmm2, %xmm1
> - andl $15, %ecx
> - movq %rdi, %r8
> - je L(aligned_start)
> -
> -/* Handle unaligned string. */
> - andq $-16, %r8
> - movdqa (%r8), %xmm0
> - pcmpeqb %xmm0, %xmm2
> - pcmpeqb %xmm1, %xmm0
> - /* Find where NULL is. */
> - pmovmskb %xmm2, %edx
> - /* Check if there is a match. */
> - pmovmskb %xmm0, %esi
> - /* Remove the leading bytes. */
> - sarl %cl, %edx
> - sarl %cl, %esi
> - testl %esi, %esi
> - je L(unaligned_no_match)
> - /* Check which byte is a match. */
> - bsfl %esi, %eax
> - /* Is there a NULL? */
> - testl %edx, %edx
> - je L(unaligned_match)
> - bsfl %edx, %esi
> - cmpl %esi, %eax
> - /* Return NULL if NULL comes first. */
> - ja L(return_null)
> -L(unaligned_match):
> - addq %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(unaligned_no_match):
> - testl %edx, %edx
> - jne L(return_null)
> -
> -/* Loop start on aligned string. */
> -L(loop):
> - addq $16, %r8
> -L(aligned_start):
> - pcmpistri $0x2, (%r8), %xmm1
> - jbe L(wrap)
> - addq $16, %r8
> - pcmpistri $0x2, (%r8), %xmm1
> - jbe L(wrap)
> - addq $16, %r8
> - pcmpistri $0x2, (%r8), %xmm1
> - jbe L(wrap)
> - addq $16, %r8
> - pcmpistri $0x2, (%r8), %xmm1
> - jbe L(wrap)
> - jmp L(loop)
> -L(wrap):
> - jc L(loop_exit)
> -
> -/* Return NULL. */
> -L(return_null):
> - xorl %eax, %eax
> - ret
> -
> -/* Loop exit. */
> - .p2align 4
> -L(loop_exit):
> - leaq (%r8,%rcx), %rax
> - ret
> - cfi_endproc
> - .size __strchr_sse42, .-__strchr_sse42
> -
>
> # undef ENTRY
> # define ENTRY(name) \
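[Annotation for this ping: for reviewers who do not read pcmpistri
fluently, the removed __strchr_sse42 loop above corresponds roughly to
the C intrinsics sketch below. It is a hand-written approximation, not
derived mechanically from the assembly; it omits the unaligned header
and assumes c != 0 (the real code dispatches c == 0 to __strend_sse4).

#include <stddef.h>
#include <nmmintrin.h>

/* needle holds the search character broadcast to all 16 bytes and p is
   16-byte aligned.  pcmpistri treats bytes after the NUL as invalid,
   so CF can only be set for a match that precedes end-of-string.  */
static const char *
strchr_sse42_loop (const char *p, __m128i needle)
{
  for (;;)
    {
      __m128i chunk = _mm_load_si128 ((const __m128i *) p);
      if (_mm_cmpistrc (needle, chunk, 0x02))           /* CF: match */
        return p + _mm_cmpistri (needle, chunk, 0x02);  /* ECX: offset */
      if (_mm_cmpistrz (needle, chunk, 0x02))           /* ZF: saw NUL */
        return NULL;  /* end of string, no match */
      p += 16;
    }
}
]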
> diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
> index d89f1eb..e6b705c 100644
> --- a/sysdeps/x86_64/strchr.S
> +++ b/sysdeps/x86_64/strchr.S
> @@ -19,51 +19,174 @@
>
> #include <sysdep.h>
>
> +# ifndef ALIGN
> +# define ALIGN(n) .p2align n
> +# endif
> +
>
> .text
> ENTRY (strchr)
> movd %esi, %xmm1
> - movq %rdi, %rcx
> - punpcklbw %xmm1, %xmm1
> - andq $~15, %rdi
> - pxor %xmm2, %xmm2
> - punpcklbw %xmm1, %xmm1
> - orl $0xffffffff, %esi
> - movdqa (%rdi), %xmm0
> + movl %edi, %eax
> + andl $4095, %eax
> + punpcklbw %xmm1, %xmm1
> + cmpl $4031, %eax
> + punpcklwd %xmm1, %xmm1
> pshufd $0, %xmm1, %xmm1
> - subq %rdi, %rcx
> - movdqa %xmm0, %xmm3
> - leaq 16(%rdi), %rdi
> + jg L(cross_page)
> + movdqu (%rdi), %xmm0
> + pxor %xmm3, %xmm3
> + movdqa %xmm0, %xmm4
> pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - shl %cl, %esi
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm3, %ecx
> - andl %esi, %edx
> - andl %esi, %ecx
> - orl %edx, %ecx
> - jnz 1f
> + pcmpeqb %xmm3, %xmm4
> + por %xmm4, %xmm0
> + pmovmskb %xmm0, %eax
> + test %eax, %eax
> + je L(next_48_bytes)
> + bsf %eax, %eax
> +#ifdef AS_STRCHRNUL
> + leaq (%rdi,%rax), %rax
> +#else
> + movl $0, %edx
> + leaq (%rdi,%rax), %rax
> + cmpb %sil, (%rax)
> + cmovne %rdx, %rax
> +#endif
> + ret
>
> -2: movdqa (%rdi), %xmm0
> - leaq 16(%rdi), %rdi
> - movdqa %xmm0, %xmm3
> + ALIGN(3)
> + L(next_48_bytes):
> + movdqu 16(%rdi), %xmm0
> + movdqa %xmm0, %xmm4
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm3, %xmm4
> + por %xmm4, %xmm0
> + pmovmskb %xmm0, %ecx
> + movdqu 32(%rdi), %xmm0
> + movdqa %xmm0, %xmm4
> pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm3, %ecx
> - orl %edx, %ecx
> - jz 2b
> + salq $16, %rcx
> + pcmpeqb %xmm3, %xmm4
> + por %xmm4, %xmm0
> + pmovmskb %xmm0, %eax
> + movdqu 48(%rdi), %xmm0
> + pcmpeqb %xmm0, %xmm3
> + salq $32, %rax
> + pcmpeqb %xmm1, %xmm0
> + orq %rcx, %rax
> + por %xmm3, %xmm0
> + pmovmskb %xmm0, %ecx
> + salq $48, %rcx
> + orq %rcx, %rax
> + testq %rax,%rax
> + jne L(return)
> +L(loop_start):
> + /* We use this alignment to force the loop to be aligned to 8 but not
> + 16 bytes. This gives better scheduling on AMD processors. */
> + ALIGN(4)
> + pxor %xmm6, %xmm6
> + andq $-64, %rdi
> + ALIGN(3)
> +L(loop64):
> + addq $64, %rdi
> + movdqa (%rdi), %xmm5
> + movdqa 16(%rdi), %xmm2
> + movdqa 32(%rdi), %xmm3
> + pxor %xmm1, %xmm5
> + movdqa 48(%rdi), %xmm4
> + pxor %xmm1, %xmm2
> + pxor %xmm1, %xmm3
> + pminub (%rdi), %xmm5
> + pxor %xmm1, %xmm4
> + pminub 16(%rdi), %xmm2
> + pminub 32(%rdi), %xmm3
> + pminub %xmm2, %xmm5
> + pminub 48(%rdi), %xmm4
> + pminub %xmm3, %xmm5
> + pminub %xmm4, %xmm5
> + pcmpeqb %xmm6, %xmm5
> + pmovmskb %xmm5, %eax
> +
> + testl %eax, %eax
> + je L(loop64)
> +
> + movdqa (%rdi), %xmm5
> + movdqa %xmm5, %xmm0
> + pcmpeqb %xmm1, %xmm5
> + pcmpeqb %xmm6, %xmm0
> + por %xmm0, %xmm5
> + pcmpeqb %xmm6, %xmm2
> + pcmpeqb %xmm6, %xmm3
> + pcmpeqb %xmm6, %xmm4
> +
> + pmovmskb %xmm5, %ecx
> + pmovmskb %xmm2, %eax
> + salq $16, %rax
> + pmovmskb %xmm3, %r8d
> + pmovmskb %xmm4, %edx
> + salq $32, %r8
> + orq %r8, %rax
> + orq %rcx, %rax
> + salq $48, %rdx
> + orq %rdx, %rax
> + ALIGN(3)
> +L(return):
> + bsfq %rax, %rax
> +#ifdef AS_STRCHRNUL
> + leaq (%rdi,%rax), %rax
> +#else
> + movl $0, %edx
> + leaq (%rdi,%rax), %rax
> + cmpb %sil, (%rax)
> + cmovne %rdx, %rax
> +#endif
> + ret
> + ALIGN(4)
> +
> +L(cross_page):
> + movq %rdi, %rdx
> + pxor %xmm2, %xmm2
> + andq $-64, %rdx
> + movdqa %xmm1, %xmm0
> + movdqa (%rdx), %xmm3
> + movdqa %xmm3, %xmm4
> + pcmpeqb %xmm1, %xmm3
> + pcmpeqb %xmm2, %xmm4
> + por %xmm4, %xmm3
> + pmovmskb %xmm3, %r8d
> + movdqa 16(%rdx), %xmm3
> + movdqa %xmm3, %xmm4
> + pcmpeqb %xmm1, %xmm3
> + pcmpeqb %xmm2, %xmm4
> + por %xmm4, %xmm3
> + pmovmskb %xmm3, %eax
> + movdqa 32(%rdx), %xmm3
> + movdqa %xmm3, %xmm4
> + pcmpeqb %xmm1, %xmm3
> + salq $16, %rax
> + pcmpeqb %xmm2, %xmm4
> + por %xmm4, %xmm3
> + pmovmskb %xmm3, %r9d
> + movdqa 48(%rdx), %xmm3
> + pcmpeqb %xmm3, %xmm2
> + salq $32, %r9
> + pcmpeqb %xmm3, %xmm0
> + orq %r9, %rax
> + orq %r8, %rax
> + por %xmm2, %xmm0
> + pmovmskb %xmm0, %ecx
> + salq $48, %rcx
> + orq %rcx, %rax
> + movl %edi, %ecx
> + subb %dl, %cl
> + shrq %cl, %rax
> + testq %rax, %rax
> + jne L(return)
> + jmp L(loop_start)
>
> -1: bsfl %edx, %edx
> - jz 4f
> - bsfl %ecx, %ecx
> - leaq -16(%rdi,%rdx), %rax
> - cmpl %edx, %ecx
> - je 5f
> -4: xorl %eax, %eax
> -5: ret
> END (strchr)
>
> +#ifndef AS_STRCHRNUL
> weak_alias (strchr, index)
> libc_hidden_builtin_def (strchr)
> -
> +#endif
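[Annotation for this ping: two notes on the new code. The header checks
the offset within the page (andl $4095) and takes L(cross_page) whenever
the first 64 bytes cannot safely be read with unaligned loads, and
L(loop64) detects both the character and the terminating zero with a
single test per 16-byte chunk via the pxor/pminub combination. A minimal
SSE2 intrinsics sketch of that trick (illustrative, not the committed
code):

#include <emmintrin.h>

/* min(x ^ c, x) is zero exactly where a byte equals c or equals 0, so
   one compare against zero catches both a match and the terminator.  */
static int
has_c_or_nul (__m128i chunk, __m128i cbroadcast)
{
  __m128i t = _mm_xor_si128 (chunk, cbroadcast); /* 0 where byte == c */
  t = _mm_min_epu8 (t, chunk);                   /* also 0 where byte == 0 */
  t = _mm_cmpeq_epi8 (t, _mm_setzero_si128 ());
  return _mm_movemask_epi8 (t);                  /* nonzero iff a hit */
}
]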
> diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
> index d8c345b..bceeb61 100644
> --- a/sysdeps/x86_64/strchrnul.S
> +++ b/sysdeps/x86_64/strchrnul.S
> @@ -20,43 +20,8 @@
>
> #include <sysdep.h>
>
> -
> - .text
> -ENTRY (__strchrnul)
> - movd %esi, %xmm1
> - movq %rdi, %rcx
> - punpcklbw %xmm1, %xmm1
> - andq $~15, %rdi
> - pxor %xmm2, %xmm2
> - punpcklbw %xmm1, %xmm1
> - orl $0xffffffff, %esi
> - movdqa (%rdi), %xmm0
> - pshufd $0, %xmm1, %xmm1
> - subq %rdi, %rcx
> - movdqa %xmm0, %xmm3
> - leaq 16(%rdi), %rdi
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - shl %cl, %esi
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm3, %ecx
> - orl %edx, %ecx
> - andl %esi, %ecx
> - jnz 1f
> -
> -2: movdqa (%rdi), %xmm0
> - leaq 16(%rdi), %rdi
> - movdqa %xmm0, %xmm3
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm3
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm3, %ecx
> - orl %edx, %ecx
> - jz 2b
> -
> -1: bsfl %ecx, %edx
> - leaq -16(%rdi,%rdx), %rax
> - ret
> -END (__strchrnul)
> +#define strchr __strchrnul
> +#define AS_STRCHRNUL
> +#include "strchr.S"
>
> weak_alias (__strchrnul, strchrnul)
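
[Annotation for this ping: the strchrnul rewrite is now just a macro
include of strchr.S; the only behavioral difference AS_STRCHRNUL selects
is the return path, roughly as in this C sketch of what the cmovne in
the assembly implements.

/* hit points at the first byte equal to either c or the terminator.  */
static char *
return_path (char *hit, int c)
{
#ifdef AS_STRCHRNUL
  return hit;                         /* strchrnul: return it either way */
#else
  return *hit == (char) c ? hit : 0;  /* strchr: terminator => no match */
#endif
}
]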