This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



[PING][PATCH] Optimize strchr more.


ping
On Fri, Oct 04, 2013 at 11:10:10PM +0200, Ondřej Bílka wrote:
> Hi,
> I improved strchr in a similar way to strrchr. One difference is that,
> since an ifunc is available, we can use pshufb to broadcast the byte.
> This gains us a few cycles almost for free.
> 
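> For reference, a rough intrinsics sketch of the two broadcast sequences
> (not part of the patch, illustration only; the function names are made
> up, and the SSSE3 one needs -mssse3):
> 
>   #include <emmintrin.h>
>   #include <tmmintrin.h>
> 
>   /* SSSE3: one pshufb with an all-zero control broadcasts byte 0.  */
>   static __m128i
>   broadcast_ssse3 (int c)
>   {
>     return _mm_shuffle_epi8 (_mm_cvtsi32_si128 (c), _mm_setzero_si128 ());
>   }
> 
>   /* SSE2 fallback: punpcklbw + punpcklwd + pshufd, as in the generic code.  */
>   static __m128i
>   broadcast_sse2 (int c)
>   {
>     __m128i v = _mm_cvtsi32_si128 (c);
>     v = _mm_unpacklo_epi8 (v, v);
>     v = _mm_unpacklo_epi16 (v, v);
>     return _mm_shuffle_epi32 (v, 0);
>   }
> 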
> I applied the same optimizations as in strrchr, plus one more. The loop
> checks whether any of four 16-byte blocks matches and computes a mask
> for them. As x86-64 SSE instructions are destructive, it looked faster
> to recalculate one block than to add moves that would slow us down, as
> here:
> 
> uint64_t m0, m1, m2, m3;
> if (m0 | m1 | m2 | m3)
>   {
>     /* recalculate m0 */
>     mask = m0 | m1 << 16 | m2 << 32 | m3 << 48;
>     ...
> 
> But this is not necessary when we are only interested in the first
> nonzero bit. We can use the combined mask at the end; it only comes
> into play when m0, m1 and m2 are zero, in which case it equals m3, as
> in the following pseudocode:
> 
> uint64_t m0, m1, m2, m3;
> if (m0 | m1 | m2 | m3)
>   {
>     mask = m0 | m1 << 16 | m2 << 32 | (m0 | m1 | m2 | m3) << 48;
>     ...
> 
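> To make the trick concrete, here is a C sketch of the combined-mask
> idea (illustrative only, not the code in the patch; m0..m3 stand for
> the 16-bit pmovmskb results of the four blocks and first_match is a
> made-up name):
> 
>   #include <stdint.h>
> 
>   /* Index of the first match in a 64-byte chunk, or -1 if none.  */
>   static inline long
>   first_match (uint64_t m0, uint64_t m1, uint64_t m2, uint64_t m3)
>   {
>     uint64_t combined = m0 | m1 | m2 | m3;
>     if (combined == 0)
>       return -1;
>     /* When m0, m1 and m2 are all zero the low 48 bits below are zero
>        and combined == m3, so the top 16 bits are exact; otherwise the
>        first set bit lies below bit 48 and the top bits are ignored by
>        the count-trailing-zeros (bsf).  */
>     uint64_t mask = m0 | m1 << 16 | m2 << 32 | combined << 48;
>     return __builtin_ctzll (mask);
>   }
> 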
> These modifications give us around 4%.  Benchmarks are here:
> 
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strchr_profile.html
> 
> 	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
> 	Add strchr-ssse3.
> 	* sysdeps/x86_64/multiarch/strchr-ssse3.S: New file.
> 	* sysdeps/x86_64/multiarch/strchr.S: Add ifunc.
> 	* sysdeps/x86_64/strchr.S: Optimize implementation.
> 
> ---
>  sysdeps/x86_64/multiarch/Makefile       |   2 +-
>  sysdeps/x86_64/multiarch/strchr-ssse3.S |   3 +
>  sysdeps/x86_64/multiarch/strchr.S       |   4 +
>  sysdeps/x86_64/strchr.S                 | 228 ++++++++++++++++----------------
>  4 files changed, 122 insertions(+), 115 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/strchr-ssse3.S
> 
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f910fb2..042f501 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -16,7 +16,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>  		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
>  		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
>  		   strcat-sse2-unaligned strncat-sse2-unaligned \
> -		   strchr-sse2-no-bsf memcmp-ssse3
> +		   strchr-sse2-no-bsf strchr-ssse3 memcmp-ssse3
>  ifeq (yes,$(config-cflags-sse4))
>  sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
>  CFLAGS-varshift.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/strchr-ssse3.S b/sysdeps/x86_64/multiarch/strchr-ssse3.S
> new file mode 100644
> index 0000000..6d09015
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strchr-ssse3.S
> @@ -0,0 +1,3 @@
> +#define USE_SSSE3
> +#define strchr __strchr_ssse3
> +#include "../strchr.S"
> diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
> index 3f0b2c5..2b67ca4 100644
> --- a/sysdeps/x86_64/multiarch/strchr.S
> +++ b/sysdeps/x86_64/multiarch/strchr.S
> @@ -29,6 +29,10 @@ ENTRY(strchr)
>  	jne	1f
>  	call	__init_cpu_features
>  1:	leaq	__strchr_sse2(%rip), %rax
> +	testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
> +	je	2f
> +	leaq    __strchr_ssse3(%rip), %rax
> +	ret
>  2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
>  	jz	3f
>  	leaq    __strchr_sse2_no_bsf(%rip), %rax
> diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
> index 1900b37..0bb45d2 100644
> --- a/sysdeps/x86_64/strchr.S
> +++ b/sysdeps/x86_64/strchr.S
> @@ -19,174 +19,174 @@
>  
>  #include <sysdep.h>
>  
> -# ifndef ALIGN
> -#  define ALIGN(n)	.p2align n
> -# endif
> -
> -
>  	.text
>  ENTRY (strchr)
>  	movd	%esi, %xmm1
>  	movl	%edi, %eax
> +	pxor	%xmm4, %xmm4
>  	andl	$4095, %eax
> +	pxor	%xmm5, %xmm5
> +#ifdef USE_SSSE3
> +	pshufb	%xmm4, %xmm1
> +#else
>  	punpcklbw %xmm1, %xmm1
> -	cmpl	$4032, %eax
>  	punpcklwd %xmm1, %xmm1
>  	pshufd	$0, %xmm1, %xmm1
> +#endif
> +	cmpl	$4032, %eax
>  	jg	L(cross_page)
>  	movdqu	(%rdi), %xmm0
> -	pxor	%xmm3, %xmm3
> -	movdqa	%xmm0, %xmm4
> +	pcmpeqb	%xmm0, %xmm4
>  	pcmpeqb	%xmm1, %xmm0
> -	pcmpeqb	%xmm3, %xmm4
>  	por	%xmm4, %xmm0
>  	pmovmskb %xmm0, %eax
>  	test	%eax, %eax
>  	je	L(next_48_bytes)
> -	bsf	%eax, %eax
> +	bsf	%eax, %eax #| bsf %ax, %ax #| bsf %rax, %rax
>  #ifdef AS_STRCHRNUL
> -	leaq	(%rdi,%rax), %rax
> +	addq	%rdi, %rax
>  #else
> -	movl	$0, %edx
> -	leaq	(%rdi,%rax), %rax
> +	xor	%edx, %edx
> +	addq	%rdi, %rax
>  	cmpb	%sil, (%rax)
>  	cmovne	%rdx, %rax
>  #endif
>  	ret
>  
> -	ALIGN(3)
> -	L(next_48_bytes):
> +	.p2align 3
> +L(next_48_bytes):
> +	pxor	%xmm6, %xmm6
> +	pxor	%xmm7, %xmm7
> +	movdqu	48(%rdi), %xmm3
> +	movdqu	32(%rdi), %xmm2
> +	pcmpeqb	%xmm3, %xmm7
>  	movdqu	16(%rdi), %xmm0
> -	movdqa	%xmm0, %xmm4
> -	pcmpeqb	%xmm1, %xmm0
> -	pcmpeqb	%xmm3, %xmm4
> -	por	%xmm4, %xmm0
> -	pmovmskb %xmm0, %ecx
> -	movdqu	32(%rdi), %xmm0
> -	movdqa	%xmm0, %xmm4
> +	movq	%rdi, %r8
> +	pcmpeqb	%xmm1, %xmm3
> +	andq	$-64, %rdi
> +	por	%xmm7, %xmm3
> +	pcmpeqb	%xmm2, %xmm6
> +	pcmpeqb	%xmm1, %xmm2
> +	pcmpeqb	%xmm0, %xmm5
>  	pcmpeqb	%xmm1, %xmm0
> -	salq	$16, %rcx
> -	pcmpeqb	%xmm3, %xmm4
> -	por	%xmm4, %xmm0
> +	por	%xmm6, %xmm2
> +	pmovmskb %xmm3, %edx
> +	pmovmskb %xmm2, %ecx
> +	shlq	$32, %rdx
> +	shlq	$16, %rcx
> +	por	%xmm5, %xmm0
>  	pmovmskb %xmm0, %eax
> -	movdqu	48(%rdi), %xmm0
> -	pcmpeqb	%xmm0, %xmm3
> -	salq	$32, %rax
> -	pcmpeqb	%xmm1, %xmm0
> -	orq	%rcx, %rax
> -	por	%xmm3, %xmm0
> -	pmovmskb %xmm0, %ecx
> -	salq	$48, %rcx
>  	orq	%rcx, %rax
> -	testq	%rax, %rax
> -	jne	L(return)
> -L(loop_start):
> -	/* We use this alignment to force loop be aligned to 8 but not
> -	   16 bytes.  This gives better sheduling on AMD processors.  */
> -	ALIGN(4)
>  	pxor	%xmm6, %xmm6
> -	andq	$-64, %rdi
> -	ALIGN(3)
> +	orq	%rdx, %rax
> +	je	L(loop64)
> +	bsfq	%rax, %rax
> +#ifdef AS_STRCHRNUL
> +	leaq	16(%r8, %rax), %rax
> +#else
> +	xor	%edx, %edx
> +	leaq	16(%r8, %rax), %rax
> +	cmpb	%sil, (%rax)
> +	cmovne	%rdx, %rax
> +#endif
> +	ret
> +
> +	.p2align 4
>  L(loop64):
> -	addq	$64, %rdi
> -	movdqa	(%rdi), %xmm5
> -	movdqa	16(%rdi), %xmm2
> -	movdqa	32(%rdi), %xmm3
> +	movdqa	96(%rdi), %xmm3
> +	movdqa	64(%rdi), %xmm5
>  	pxor	%xmm1, %xmm5
> -	movdqa	48(%rdi), %xmm4
> -	pxor	%xmm1, %xmm2
> -	pxor	%xmm1, %xmm3
> -	pminub	(%rdi), %xmm5
> +	movdqa	112(%rdi), %xmm4
> +	movdqa	80(%rdi), %xmm2
>  	pxor	%xmm1, %xmm4
> -	pminub	16(%rdi), %xmm2
> -	pminub	32(%rdi), %xmm3
> -	pminub	%xmm2, %xmm5
> -	pminub	48(%rdi), %xmm4
> -	pminub	%xmm3, %xmm5
> -	pminub	%xmm4, %xmm5
> -	pcmpeqb %xmm6, %xmm5
> -	pmovmskb %xmm5, %eax
> -
> -	testl	%eax, %eax
> +	pminub	112(%rdi), %xmm4
> +	pminub	64(%rdi), %xmm5
> +	pxor	%xmm1, %xmm3
> +	pxor	%xmm1, %xmm2
> +	pminub	%xmm5, %xmm4
> +	pminub	96(%rdi), %xmm3
> +	pminub	80(%rdi), %xmm2
> +	addq	$64, %rdi
> +	pminub	%xmm3, %xmm4
> +	pminub	%xmm2, %xmm4
> +	pcmpeqb	%xmm6, %xmm4
> +	pmovmskb %xmm4, %edx
> +	testl	%edx, %edx
>  	je	L(loop64)
> -
> -	movdqa	(%rdi), %xmm5
> -	movdqa	%xmm5, %xmm0
> -	pcmpeqb	%xmm1, %xmm5
> -	pcmpeqb	%xmm6, %xmm0
> -	por	%xmm0, %xmm5
> -	pcmpeqb %xmm6, %xmm2
> -	pcmpeqb %xmm6, %xmm3
> -	pcmpeqb %xmm6, %xmm4
> -
> -	pmovmskb %xmm5, %ecx
> +	pcmpeqb	%xmm6, %xmm3
> +	pmovmskb %xmm3, %r8d
> +	pcmpeqb	%xmm6, %xmm2
> +	pcmpeqb	%xmm6, %xmm5
>  	pmovmskb %xmm2, %eax
> +	pmovmskb %xmm5, %ecx
>  	salq	$16, %rax
> -	pmovmskb %xmm3, %r8d
> -	pmovmskb %xmm4, %edx
>  	salq	$32, %r8
> +	salq	$48, %rdx
>  	orq	%r8, %rax
>  	orq	%rcx, %rax
> -	salq	$48, %rdx
>  	orq	%rdx, %rax
> -	ALIGN(3)
> -L(return):
>  	bsfq	%rax, %rax
>  #ifdef AS_STRCHRNUL
> -	leaq	(%rdi,%rax), %rax
> +	leaq	(%rdi, %rax), %rax
>  #else
> -	movl	$0, %edx
> -	leaq	(%rdi,%rax), %rax
> +	xor	%edx, %edx
> +	leaq	(%rdi, %rax), %rax
>  	cmpb	%sil, (%rax)
>  	cmovne	%rdx, %rax
>  #endif
>  	ret
> -	ALIGN(4)
>  
> +	.p2align 4
>  L(cross_page):
> -	movq	%rdi, %rdx
> -	pxor	%xmm2, %xmm2
> -	andq	$-64, %rdx
> -	movdqa	%xmm1, %xmm0
> -	movdqa	(%rdx), %xmm3
> -	movdqa	%xmm3, %xmm4
> -	pcmpeqb	%xmm1, %xmm3
> -	pcmpeqb	%xmm2, %xmm4
> -	por	%xmm4, %xmm3
> -	pmovmskb %xmm3, %r8d
> -	movdqa	16(%rdx), %xmm3
> -	movdqa	%xmm3, %xmm4
> -	pcmpeqb	%xmm1, %xmm3
> -	pcmpeqb	%xmm2, %xmm4
> -	por	%xmm4, %xmm3
> -	pmovmskb %xmm3, %eax
> -	movdqa	32(%rdx), %xmm3
> -	movdqa	%xmm3, %xmm4
> +	pxor	%xmm7, %xmm7
> +	movq	%rdi, %rcx
> +	andq	$-64, %rdi
> +	movdqu	48(%rdi), %xmm3
> +	movdqu	(%rdi), %xmm8
> +	movdqu	32(%rdi), %xmm2
> +	pxor	%xmm6, %xmm6
> +	pcmpeqb	%xmm8, %xmm4
> +	movdqu	16(%rdi), %xmm0
> +	pcmpeqb	%xmm1, %xmm8
> +	pcmpeqb	%xmm0, %xmm5
> +	pcmpeqb	%xmm1, %xmm0
> +	por	%xmm5, %xmm0
> +	por	%xmm4, %xmm8
> +	pcmpeqb	%xmm2, %xmm6
> +	pcmpeqb	%xmm1, %xmm2
> +	por	%xmm6, %xmm2
> +	pmovmskb %xmm8, %edx
> +	pcmpeqb	%xmm3, %xmm7
>  	pcmpeqb	%xmm1, %xmm3
> -	salq	$16, %rax
> -	pcmpeqb	%xmm2, %xmm4
> -	por	%xmm4, %xmm3
> +	pmovmskb %xmm0, %eax
> +	pmovmskb %xmm2, %r8d
> +	por	%xmm7, %xmm3
>  	pmovmskb %xmm3, %r9d
> -	movdqa	48(%rdx), %xmm3
> -	pcmpeqb	%xmm3, %xmm2
> -	salq	$32, %r9
> -	pcmpeqb	%xmm3, %xmm0
> -	orq	%r9, %rax
> +	shlq	$48, %r9
> +	shlq	$32, %r8
> +	shlq	$16, %rax
> +	orq	%rdx, %rax
>  	orq	%r8, %rax
> -	por	%xmm2, %xmm0
> -	pmovmskb %xmm0, %ecx
> -	salq	$48, %rcx
> -	orq	%rcx, %rax
> -	movl	%edi, %ecx
> -	subb	%dl, %cl
> +	orq	%r9, %rax
> +	pxor	%xmm6, %xmm6
>  	shrq	%cl, %rax
>  	testq	%rax, %rax
> -	jne	L(return)
> -	jmp	L(loop_start)
> -
> +	je	L(loop64)
> +	bsfq	%rax, %rax
> +#ifdef AS_STRCHRNUL
> +	addq	%rcx, %rax
> +#else
> +	xor	%edx, %edx
> +	addq	%rcx, %rax
> +	cmpb	%sil, (%rax)
> +	cmovne	%rdx, %rax
> +#endif
> +	ret
>  END (strchr)
>  
>  #ifndef AS_STRCHRNUL
> +# ifndef USE_SSSE3
>  weak_alias (strchr, index)
>  libc_hidden_builtin_def (strchr)
> +# endif
>  #endif
> -- 
> 1.8.4.rc3

-- 

Incorrect time synchronization

