This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] faster strlen implementation


Here is an optimized SSE2 strlen implementation. It is about twice as fast
on an i7.

Benchmarks are here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7/strlen/html/test_r.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/xeon/strlen/html/test_r.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/phenomII/strlen/html/test_r.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/opteron/strlen/html/test_r.html

The most important trick is the fact that pcmpeqb is good for pipelining.

It was derived from the C implementation I sent before; the changes were
the following:
1. gcc does not know that bsfq and pmovmskb always return a positive
result, so I removed the sign extensions.
2. It is faster to recalculate the mask at the end than to save it to a
separate register in the loop. I do not know how to explain this to a C compiler.
3. Reorganized the jump structure.


How important is it today to handle SLOW_BSF? A Google search showed
that the K8 had a slow bsf.


I also added a slightly faster version that uses the SSE4.1 instruction ptest
instead of the pmovmskb/testl pair.


---
 ChangeLog                                |    6 ++
 sysdeps/x86_64/multiarch/Makefile        |    4 +-
 sysdeps/x86_64/multiarch/strlen-sse4.S   |   84 -----------------
 sysdeps/x86_64/multiarch/strlen.S        |   15 +--
 sysdeps/x86_64/multiarch/strlen_sse4_1.S |    3 +
 sysdeps/x86_64/strlen.S                  |  146 +++++++++++++++---------------
 6 files changed, 88 insertions(+), 170 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse4.S
 create mode 100644 sysdeps/x86_64/multiarch/strlen_sse4_1.S

diff --git a/ChangeLog b/ChangeLog
index 2ba4508..359c3d3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2012-07-06  Ondrej Bilka  <neleai@seznam.cz>
+
+	* sysdeps/x86_64/strlen.S: faster implementation
+	* sysdeps/x86_64/multiarch/strlen.S: choose sse2/sse4_1
+	* sysdeps/x86_64/multiarch/strlen_sse4_1.S: sse4_1 version
+	* sysdeps/x86_64/multiarch/strlen-sse4.S: no longer needed
+ * sysdeps/x86_64/multiarch/Makefile: update
+
 2012-07-06  Joseph Myers  <joseph@codesourcery.com>
 
 	[BZ #6778]
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..f54fe0e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -10,12 +10,12 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
 		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
-		   strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
+		   strncase_l-ssse3 strlen_sse4_1 memset-x86-64 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
+		   strcat-ssse3 strncat-ssse3  \
 		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
 		   memcmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S
deleted file mode 100644
index ea5b783..0000000
--- a/sysdeps/x86_64/multiarch/strlen-sse4.S
+++ /dev/null
@@ -1,84 +0,0 @@
-/* strlen with SSE4
-   Copyright (C) 2009, 2010 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-#include <sysdep.h>
-
-	.section .text.sse4.2,"ax",@progbits
-ENTRY (__strlen_sse42)
-	pxor	%xmm1, %xmm1
-	movl	%edi, %ecx
-	movq	%rdi, %r8
-	andq	$~15, %rdi
-	xor	%edi, %ecx
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb %xmm1, %edx
-	shrl	%cl, %edx
-	shll	%cl, %edx
-	andl	%edx, %edx
-	jnz	L(less16bytes)
-	pxor	%xmm1, %xmm1
-
-	.p2align 4
-L(more64bytes_loop):
-	pcmpistri $0x08, 16(%rdi), %xmm1
-	jz	L(more32bytes)
-
-	pcmpistri $0x08, 32(%rdi), %xmm1
-	jz	L(more48bytes)
-
-	pcmpistri $0x08, 48(%rdi), %xmm1
-	jz	L(more64bytes)
-
-	add	$64, %rdi
-	pcmpistri $0x08, (%rdi), %xmm1
-	jnz	L(more64bytes_loop)
-	leaq	(%rdi,%rcx), %rax
-	subq	%r8, %rax
-	ret
-
-	.p2align 4
-L(more32bytes):
-	leaq	16(%rdi,%rcx, 1), %rax
-	subq	%r8, %rax
-	ret
-
-	.p2align 4
-L(more48bytes):
-	leaq	32(%rdi,%rcx, 1), %rax
-	subq	%r8, %rax
-	ret
-
-	.p2align 4
-L(more64bytes):
-	leaq	48(%rdi,%rcx, 1), %rax
-	subq	%r8, %rax
-	ret
-
-	.p2align 4
-L(less16bytes):
-	subq	%r8, %rdi
-	bsfl	%edx, %eax
-	addq	%rdi, %rax
-	ret
-
-END (__strlen_sse42)
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 0c46b4f..bdfe546 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -31,18 +31,13 @@ ENTRY(strlen)
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
 	jne	1f
 	call	__init_cpu_features
-1:	leaq	__strlen_sse2_pminub(%rip), %rax
-	testl	$bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
-	jnz	2f
-	leaq	__strlen_sse2(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+1:	
+	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
 	jz	2f
-	leaq	__strlen_sse42(%rip), %rax
+	leaq	__strlen_sse4_1(%rip), %rax
+	ret
+2:leaq    __strlen_sse2(%rip), %rax
 	ret
-2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
-	jz	3f
-	leaq    __strlen_sse2_no_bsf(%rip), %rax
-3:	ret
 END(strlen)
 
 # undef ENTRY
diff --git a/sysdeps/x86_64/multiarch/strlen_sse4_1.S b/sysdeps/x86_64/multiarch/strlen_sse4_1.S
new file mode 100644
index 0000000..be8a42c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen_sse4_1.S
@@ -0,0 +1,3 @@
+#define USE_SSE4_1
+#define strlen __strlen_sse4_1
+#include "../strlen.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index f83d857..e3f0675 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -22,80 +22,78 @@
 
 	.text
 ENTRY(strlen)
-	xor	%rax, %rax
-	mov	%edi, %ecx
-	and	$0x3f, %ecx
-	pxor	%xmm0, %xmm0
-	cmp	$0x30, %ecx
-	ja	L(next)
-	movdqu	(%rdi), %xmm1
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_less16)
-	mov	%rdi, %rax
-	and	$-16, %rax
-	jmp	L(align16_start)
-L(next):
-	mov	%rdi, %rax
-	and	$-16, %rax
-	pcmpeqb	(%rax), %xmm0
-	mov	$-1, %esi
-	sub	%rax, %rcx
-	shl	%cl, %esi
-	pmovmskb %xmm0, %edx
-	and	%esi, %edx
-	jnz	L(exit)
-L(align16_start):
-	pxor	%xmm0, %xmm0
-	pxor	%xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	pxor	%xmm3, %xmm3
-	.p2align 4
-L(align16_loop):
-	pcmpeqb	16(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
+  movq  %rdi, %rax
+  pxor  %xmm0, %xmm0
+  andq  $-64, %rax
+  movl  %edi, %ecx
+  movdqa  (%rax), %xmm1
+  andl  $63, %ecx
+  movdqa  %xmm0, %xmm2
+  pcmpeqb %xmm0, %xmm1
+  por %xmm0, %xmm1
+  pmovmskb  %xmm1, %r9d
+  movdqa  16(%rax), %xmm1
+  pcmpeqb %xmm0, %xmm1
+  por %xmm0, %xmm1
+  pmovmskb  %xmm1, %edx
+  movdqa  32(%rax), %xmm1
+  pcmpeqb %xmm0, %xmm1
+  salq  $16, %rdx
+  por %xmm0, %xmm1
+  orq %r9, %rdx
+  pmovmskb  %xmm1, %r8d
+  movdqa  48(%rax), %xmm1
+  pcmpeqb %xmm0, %xmm1
+  por %xmm0, %xmm1
+  pmovmskb  %xmm1, %esi
+  salq  $16, %rsi
+  orq %r8, %rsi
+  salq  $32, %rsi
+  orq %rsi, %rdx
+  movq  $-1, %rsi
+  salq  %cl, %rsi
+  andq  %rsi, %rdx
+  jne  .L3
+.L9:
+  addq  $64, %rax
+  prefetcht0  512(%rax)
+  movdqa  (%rax), %xmm6
+  movdqa  16(%rax), %xmm5
+  pcmpeqb %xmm0, %xmm6
+  movdqa  32(%rax), %xmm4
+  pcmpeqb %xmm0, %xmm5
+  movdqa  48(%rax), %xmm3
+  por %xmm6, %xmm5
+  pcmpeqb %xmm0, %xmm4
+  pcmpeqb %xmm0, %xmm3
+  por %xmm5, %xmm3
+  por %xmm4, %xmm3
+#ifdef USE_SSE4_1
+  ptest %xmm3, %xmm3
+#else
+  pmovmskb %xmm3, %edx
+  testl %edx,%edx  
+#endif
+  je  .L9
+  movdqa  16(%rax), %xmm5
+  pcmpeqb %xmm0, %xmm5
+  movdqa  48(%rax), %xmm3
+  pcmpeqb %xmm0, %xmm3
+  pmovmskb  %xmm3, %edx
+  pmovmskb  %xmm5, %esi
+  pmovmskb  %xmm6, %r8d
+  pmovmskb  %xmm4, %ecx
+  salq  $16, %rdx
+  salq  $16, %rsi
+  orq %rcx, %rdx
+  orq %r8, %rsi
+  salq  $32, %rdx
+  orq %rsi, %rdx
+.L3:
+  bsfq  %rdx, %rdx
+  addq  %rdx, %rax
+  subq  %rdi, %rax
+  ret
 
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	lea	64(%rax), %rax
-	test	%edx, %edx
-	jz	L(align16_loop)
-L(exit):
-	sub	%rdi, %rax
-L(exit_less16):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	ret
-	.p2align 4
-L(exit16):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	lea	16(%rdx,%rax), %rax
-	ret
-	.p2align 4
-L(exit32):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	lea	32(%rdx,%rax), %rax
-	ret
-	.p2align 4
-L(exit48):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	lea	48(%rdx,%rax), %rax
-	ret
 END(strlen)
 libc_hidden_builtin_def (strlen)
-- 
1.7.4.4


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]