This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PATCH] faster strlen implementation
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sat, 7 Jul 2012 20:45:29 +0200
- Subject: [PATCH] faster strlen implementation
Here is an optimized strlen SSE2 implementation. It is about twice as fast
on an i7.
Benchmarks are here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/i7/strlen/html/test_r.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/xeon/strlen/html/test_r.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/phenomII/strlen/html/test_r.html
http://kam.mff.cuni.cz/~ondra/benchmark_string/opteron/strlen/html/test_r.html
The most important trick is the fact that pcmpeqb is good for pipelining.
It was derived from the C implementation I sent before; the changes were
the following:
1. gcc does not know that bsfq and pmovmskb always return a positive
result, so I removed the sign extensions.
2. It is faster to recalculate the mask at the end than to save it to a
separate register in the loop. I do not know how to explain this to a C compiler.
3. Reorganize jump structure.
How important is it today to handle SLOW_BSF? A Google search showed
that the K8 had a slow bsf.
I also added a slightly faster version that uses the sse4.1 instruction ptest instead
of the pmovmskb / testl pair.
---
ChangeLog | 6 ++
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/strlen-sse4.S | 84 -----------------
sysdeps/x86_64/multiarch/strlen.S | 15 +--
sysdeps/x86_64/multiarch/strlen_sse4_1.S | 3 +
sysdeps/x86_64/strlen.S | 146 +++++++++++++++---------------
6 files changed, 88 insertions(+), 170 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse4.S
create mode 100644 sysdeps/x86_64/multiarch/strlen_sse4_1.S
diff --git a/ChangeLog b/ChangeLog
index 2ba4508..359c3d3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2012-07-06 Ondrej Bilka <neleai@seznam.cz>
+
+ * sysdeps/x86_64/strlen.S: faster implementation
+ * sysdeps/x86_64/multiarch/strlen.S: choose sse2/sse4_1
+ * sysdeps/x86_64/multiarch/strlen_sse4_1.S: sse4_1 version
+ * sysdeps/x86_64/multiarch/strlen-sse4.S: no longer needed
+ * sysdeps/x86_64/multiarch/Makefile: update
+
2012-07-06 Joseph Myers <joseph@codesourcery.com>
[BZ #6778]
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index dd6c27d..f54fe0e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -10,12 +10,12 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
- strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
+ strncase_l-ssse3 strlen_sse4_1 memset-x86-64 \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
+ strcat-ssse3 strncat-ssse3 \
strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
memcmp-ssse3
ifeq (yes,$(config-cflags-sse4))
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S
deleted file mode 100644
index ea5b783..0000000
--- a/sysdeps/x86_64/multiarch/strlen-sse4.S
+++ /dev/null
@@ -1,84 +0,0 @@
-/* strlen with SSE4
- Copyright (C) 2009, 2010 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-#include <sysdep.h>
-
- .section .text.sse4.2,"ax",@progbits
-ENTRY (__strlen_sse42)
- pxor %xmm1, %xmm1
- movl %edi, %ecx
- movq %rdi, %r8
- andq $~15, %rdi
- xor %edi, %ecx
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %edx
- shrl %cl, %edx
- shll %cl, %edx
- andl %edx, %edx
- jnz L(less16bytes)
- pxor %xmm1, %xmm1
-
- .p2align 4
-L(more64bytes_loop):
- pcmpistri $0x08, 16(%rdi), %xmm1
- jz L(more32bytes)
-
- pcmpistri $0x08, 32(%rdi), %xmm1
- jz L(more48bytes)
-
- pcmpistri $0x08, 48(%rdi), %xmm1
- jz L(more64bytes)
-
- add $64, %rdi
- pcmpistri $0x08, (%rdi), %xmm1
- jnz L(more64bytes_loop)
- leaq (%rdi,%rcx), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more32bytes):
- leaq 16(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more48bytes):
- leaq 32(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(more64bytes):
- leaq 48(%rdi,%rcx, 1), %rax
- subq %r8, %rax
- ret
-
- .p2align 4
-L(less16bytes):
- subq %r8, %rdi
- bsfl %edx, %eax
- addq %rdi, %rax
- ret
-
-END (__strlen_sse42)
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
index 0c46b4f..bdfe546 100644
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ b/sysdeps/x86_64/multiarch/strlen.S
@@ -31,18 +31,13 @@ ENTRY(strlen)
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
-1: leaq __strlen_sse2_pminub(%rip), %rax
- testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
- jnz 2f
- leaq __strlen_sse2(%rip), %rax
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+1:
+ testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
jz 2f
- leaq __strlen_sse42(%rip), %rax
+ leaq __strlen_sse4_1(%rip), %rax
+ ret
+2:leaq __strlen_sse2(%rip), %rax
ret
-2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
- jz 3f
- leaq __strlen_sse2_no_bsf(%rip), %rax
-3: ret
END(strlen)
# undef ENTRY
diff --git a/sysdeps/x86_64/multiarch/strlen_sse4_1.S b/sysdeps/x86_64/multiarch/strlen_sse4_1.S
new file mode 100644
index 0000000..be8a42c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen_sse4_1.S
@@ -0,0 +1,3 @@
+#define USE_SSE4_1
+#define strlen __strlen_sse4_1
+#include "../strlen.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index f83d857..e3f0675 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -22,80 +22,78 @@
.text
ENTRY(strlen)
- xor %rax, %rax
- mov %edi, %ecx
- and $0x3f, %ecx
- pxor %xmm0, %xmm0
- cmp $0x30, %ecx
- ja L(next)
- movdqu (%rdi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit_less16)
- mov %rdi, %rax
- and $-16, %rax
- jmp L(align16_start)
-L(next):
- mov %rdi, %rax
- and $-16, %rax
- pcmpeqb (%rax), %xmm0
- mov $-1, %esi
- sub %rax, %rcx
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- and %esi, %edx
- jnz L(exit)
-L(align16_start):
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- .p2align 4
-L(align16_loop):
- pcmpeqb 16(%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- jnz L(exit16)
+ movq %rdi, %rax
+ pxor %xmm0, %xmm0
+ andq $-64, %rax
+ movl %edi, %ecx
+ movdqa (%rax), %xmm1
+ andl $63, %ecx
+ movdqa %xmm0, %xmm2
+ pcmpeqb %xmm0, %xmm1
+ por %xmm0, %xmm1
+ pmovmskb %xmm1, %r9d
+ movdqa 16(%rax), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ por %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ movdqa 32(%rax), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ salq $16, %rdx
+ por %xmm0, %xmm1
+ orq %r9, %rdx
+ pmovmskb %xmm1, %r8d
+ movdqa 48(%rax), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ por %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ salq $16, %rsi
+ orq %r8, %rsi
+ salq $32, %rsi
+ orq %rsi, %rdx
+ movq $-1, %rsi
+ salq %cl, %rsi
+ andq %rsi, %rdx
+ jne .L3
+.L9:
+ addq $64, %rax
+ prefetcht0 512(%rax)
+ movdqa (%rax), %xmm6
+ movdqa 16(%rax), %xmm5
+ pcmpeqb %xmm0, %xmm6
+ movdqa 32(%rax), %xmm4
+ pcmpeqb %xmm0, %xmm5
+ movdqa 48(%rax), %xmm3
+ por %xmm6, %xmm5
+ pcmpeqb %xmm0, %xmm4
+ pcmpeqb %xmm0, %xmm3
+ por %xmm5, %xmm3
+ por %xmm4, %xmm3
+#ifdef USE_SSE4_1
+ ptest %xmm3, %xmm3
+#else
+ pmovmskb %xmm3, %edx
+ testl %edx,%edx
+#endif
+ je .L9
+ movdqa 16(%rax), %xmm5
+ pcmpeqb %xmm0, %xmm5
+ movdqa 48(%rax), %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ pmovmskb %xmm5, %esi
+ pmovmskb %xmm6, %r8d
+ pmovmskb %xmm4, %ecx
+ salq $16, %rdx
+ salq $16, %rsi
+ orq %rcx, %rdx
+ orq %r8, %rsi
+ salq $32, %rdx
+ orq %rsi, %rdx
+.L3:
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ ret
- pcmpeqb 32(%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- jnz L(exit32)
-
- pcmpeqb 48(%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- jnz L(exit48)
-
- pcmpeqb 64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- lea 64(%rax), %rax
- test %edx, %edx
- jz L(align16_loop)
-L(exit):
- sub %rdi, %rax
-L(exit_less16):
- bsf %rdx, %rdx
- add %rdx, %rax
- ret
- .p2align 4
-L(exit16):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 16(%rdx,%rax), %rax
- ret
- .p2align 4
-L(exit32):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 32(%rdx,%rax), %rax
- ret
- .p2align 4
-L(exit48):
- sub %rdi, %rax
- bsf %rdx, %rdx
- lea 48(%rdx,%rax), %rax
- ret
END(strlen)
libc_hidden_builtin_def (strlen)
--
1.7.4.4