This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch master updated. glibc-2.18-108-g5905e7b
- From: neleai at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 11 Sep 2013 15:08:54 -0000
- Subject: GNU C Library master sources branch master updated. glibc-2.18-108-g5905e7b
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 5905e7b3e29139dbef84c065ca39315485f497e1 (commit)
from b987c776723eea2f223885289c466386f12e823a (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5905e7b3e29139dbef84c065ca39315485f497e1
commit 5905e7b3e29139dbef84c065ca39315485f497e1
Author: OndÅ?ej BÃlka <neleai@seznam.cz>
Date: Wed Sep 11 17:07:38 2013 +0200
Faster strchr implementation.
diff --git a/ChangeLog b/ChangeLog
index 6a54979..cb1b403 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2013-09-11 OndÅ?ej BÃlka <neleai@seznam.cz>
+
+ * sysdeps/x86_64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Remove: __strchr_sse42.
+ * sysdeps/x86_64/multiarch/strchr.S (__strchr_sse42): Remove.
+ (strchr): Remove __strchr_sse42 ifunc selection.
+ * sysdeps/x86_64/strchr.S (strchr): Use optimized implementation.
+ * sysdeps/x86_64/strchrnul.S: Include sysdeps/x86_64/strchr.S.
+
2013-09-11 Will Newton <will.newton@linaro.org>
* benchtests/bench-timing.h (TIMING_INIT): Rename ITERS
@@ -35,7 +44,7 @@
* malloc/malloc.c (__libc_pvalloc): Check the value of bytes
does not overflow.
-2013-09-10 OndÅ?ej BÃlka <neleai@seznam.cz>
+2013-09-10 OndÅ?ej BÃlka <neleai@seznam.cz>
* sysdeps/ieee754/dbl-64/e_j0.c: Remove DO_NOT_USE_THIS conditionals.
* sysdeps/ieee754/dbl-64/e_j1.c: Likewise.
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f8756d7..1a65ac0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -110,7 +110,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.S. */
IFUNC_IMPL (i, name, strchr,
- IFUNC_IMPL_ADD (array, i, strchr, HAS_SSE4_2, __strchr_sse42)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index f170238..3f0b2c5 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -29,12 +29,6 @@ ENTRY(strchr)
jne 1f
call __init_cpu_features
1: leaq __strchr_sse2(%rip), %rax
- testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
- jnz 2f
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jz 2f
- leaq __strchr_sse42(%rip), %rax
- ret
2: testl $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
jz 3f
leaq __strchr_sse2_no_bsf(%rip), %rax
@@ -42,127 +36,6 @@ ENTRY(strchr)
END(strchr)
-/*
- This implementation uses SSE4 instructions to compare up to 16 bytes
- at a time looking for the first occurrence of the character c in the
- string s:
-
- char *strchr (const char *s, int c);
-
- We use 0xa:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_EACH
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to compare xmm/mem128
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- X X X X X X X X X X X X X X X X
-
- against xmm
-
- 0 1 2 3 4 5 6 7 8 9 A B C D E F
- C C C C C C C C C C C C C C C C
-
- to find out if the first 16byte data element has a byte C and the
- offset of the first byte. There are 3 cases:
-
- 1. The first 16byte data element has the byte C at the offset X.
- 2. The first 16byte data element has EOS and doesn't have the byte C.
- 3. The first 16byte data element is valid and doesn't have the byte C.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 X 1 0/1 0
- 2 16 0 1 0
- 3 16 0 0 0
-
- We exit from the loop for cases 1 and 2 with jbe which branches
- when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
- X for case 1. */
-
- .section .text.sse4.2,"ax",@progbits
- .align 16
- .type __strchr_sse42, @function
- .globl __strchr_sse42
- .hidden __strchr_sse42
-__strchr_sse42:
- cfi_startproc
- CALL_MCOUNT
- testb %sil, %sil
- je __strend_sse4
- pxor %xmm2, %xmm2
- movd %esi, %xmm1
- movl %edi, %ecx
- pshufb %xmm2, %xmm1
- andl $15, %ecx
- movq %rdi, %r8
- je L(aligned_start)
-
-/* Handle unaligned string. */
- andq $-16, %r8
- movdqa (%r8), %xmm0
- pcmpeqb %xmm0, %xmm2
- pcmpeqb %xmm1, %xmm0
- /* Find where NULL is. */
- pmovmskb %xmm2, %edx
- /* Check if there is a match. */
- pmovmskb %xmm0, %esi
- /* Remove the leading bytes. */
- sarl %cl, %edx
- sarl %cl, %esi
- testl %esi, %esi
- je L(unaligned_no_match)
- /* Check which byte is a match. */
- bsfl %esi, %eax
- /* Is there a NULL? */
- testl %edx, %edx
- je L(unaligned_match)
- bsfl %edx, %esi
- cmpl %esi, %eax
- /* Return NULL if NULL comes first. */
- ja L(return_null)
-L(unaligned_match):
- addq %rdi, %rax
- ret
-
- .p2align 4
-L(unaligned_no_match):
- testl %edx, %edx
- jne L(return_null)
-
-/* Loop start on aligned string. */
-L(loop):
- addq $16, %r8
-L(aligned_start):
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- addq $16, %r8
- pcmpistri $0x2, (%r8), %xmm1
- jbe L(wrap)
- jmp L(loop)
-L(wrap):
- jc L(loop_exit)
-
-/* Return NULL. */
-L(return_null):
- xorl %eax, %eax
- ret
-
-/* Loop exit. */
- .p2align 4
-L(loop_exit):
- leaq (%r8,%rcx), %rax
- ret
- cfi_endproc
- .size __strchr_sse42, .-__strchr_sse42
-
# undef ENTRY
# define ENTRY(name) \
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index d89f1eb..1900b37 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -19,51 +19,174 @@
#include <sysdep.h>
+# ifndef ALIGN
+# define ALIGN(n) .p2align n
+# endif
+
.text
ENTRY (strchr)
movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- pxor %xmm2, %xmm2
+ movl %edi, %eax
+ andl $4095, %eax
punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
+ cmpl $4032, %eax
+ punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
+ jg L(cross_page)
+ movdqu (%rdi), %xmm0
+ pxor %xmm3, %xmm3
+ movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- andl %esi, %edx
- andl %esi, %ecx
- orl %edx, %ecx
- jnz 1f
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ je L(next_48_bytes)
+ bsf %eax, %eax
+#ifdef AS_STRCHRNUL
+ leaq (%rdi,%rax), %rax
+#else
+ movl $0, %edx
+ leaq (%rdi,%rax), %rax
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+#endif
+ ret
-2: movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
+ ALIGN(3)
+ L(next_48_bytes):
+ movdqu 16(%rdi), %xmm0
+ movdqa %xmm0, %xmm4
pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- jz 2b
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %ecx
+ movdqu 32(%rdi), %xmm0
+ movdqa %xmm0, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ salq $16, %rcx
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ movdqu 48(%rdi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ salq $32, %rax
+ pcmpeqb %xmm1, %xmm0
+ orq %rcx, %rax
+ por %xmm3, %xmm0
+ pmovmskb %xmm0, %ecx
+ salq $48, %rcx
+ orq %rcx, %rax
+ testq %rax, %rax
+ jne L(return)
+L(loop_start):
+ /* We use this alignment to force loop be aligned to 8 but not
+ 16 bytes. This gives better sheduling on AMD processors. */
+ ALIGN(4)
+ pxor %xmm6, %xmm6
+ andq $-64, %rdi
+ ALIGN(3)
+L(loop64):
+ addq $64, %rdi
+ movdqa (%rdi), %xmm5
+ movdqa 16(%rdi), %xmm2
+ movdqa 32(%rdi), %xmm3
+ pxor %xmm1, %xmm5
+ movdqa 48(%rdi), %xmm4
+ pxor %xmm1, %xmm2
+ pxor %xmm1, %xmm3
+ pminub (%rdi), %xmm5
+ pxor %xmm1, %xmm4
+ pminub 16(%rdi), %xmm2
+ pminub 32(%rdi), %xmm3
+ pminub %xmm2, %xmm5
+ pminub 48(%rdi), %xmm4
+ pminub %xmm3, %xmm5
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm6, %xmm5
+ pmovmskb %xmm5, %eax
+
+ testl %eax, %eax
+ je L(loop64)
+
+ movdqa (%rdi), %xmm5
+ movdqa %xmm5, %xmm0
+ pcmpeqb %xmm1, %xmm5
+ pcmpeqb %xmm6, %xmm0
+ por %xmm0, %xmm5
+ pcmpeqb %xmm6, %xmm2
+ pcmpeqb %xmm6, %xmm3
+ pcmpeqb %xmm6, %xmm4
+
+ pmovmskb %xmm5, %ecx
+ pmovmskb %xmm2, %eax
+ salq $16, %rax
+ pmovmskb %xmm3, %r8d
+ pmovmskb %xmm4, %edx
+ salq $32, %r8
+ orq %r8, %rax
+ orq %rcx, %rax
+ salq $48, %rdx
+ orq %rdx, %rax
+ ALIGN(3)
+L(return):
+ bsfq %rax, %rax
+#ifdef AS_STRCHRNUL
+ leaq (%rdi,%rax), %rax
+#else
+ movl $0, %edx
+ leaq (%rdi,%rax), %rax
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+#endif
+ ret
+ ALIGN(4)
+
+L(cross_page):
+ movq %rdi, %rdx
+ pxor %xmm2, %xmm2
+ andq $-64, %rdx
+ movdqa %xmm1, %xmm0
+ movdqa (%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %r8d
+ movdqa 16(%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %eax
+ movdqa 32(%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ pcmpeqb %xmm1, %xmm3
+ salq $16, %rax
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %r9d
+ movdqa 48(%rdx), %xmm3
+ pcmpeqb %xmm3, %xmm2
+ salq $32, %r9
+ pcmpeqb %xmm3, %xmm0
+ orq %r9, %rax
+ orq %r8, %rax
+ por %xmm2, %xmm0
+ pmovmskb %xmm0, %ecx
+ salq $48, %rcx
+ orq %rcx, %rax
+ movl %edi, %ecx
+ subb %dl, %cl
+ shrq %cl, %rax
+ testq %rax, %rax
+ jne L(return)
+ jmp L(loop_start)
-1: bsfl %edx, %edx
- jz 4f
- bsfl %ecx, %ecx
- leaq -16(%rdi,%rdx), %rax
- cmpl %edx, %ecx
- je 5f
-4: xorl %eax, %eax
-5: ret
END (strchr)
+#ifndef AS_STRCHRNUL
weak_alias (strchr, index)
libc_hidden_builtin_def (strchr)
-
+#endif
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
index d8c345b..bceeb61 100644
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -20,43 +20,8 @@
#include <sysdep.h>
-
- .text
-ENTRY (__strchrnul)
- movd %esi, %xmm1
- movq %rdi, %rcx
- punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- pxor %xmm2, %xmm2
- punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
- pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
- movdqa %xmm0, %xmm3
- leaq 16(%rdi), %rdi
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- shl %cl, %esi
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- andl %esi, %ecx
- jnz 1f
-
-2: movdqa (%rdi), %xmm0
- leaq 16(%rdi), %rdi
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm3, %ecx
- orl %edx, %ecx
- jz 2b
-
-1: bsfl %ecx, %edx
- leaq -16(%rdi,%rdx), %rax
- ret
-END (__strchrnul)
+#define strchr __strchrnul
+#define AS_STRCHRNUL
+#include "strchr.S"
weak_alias (__strchrnul, strchrnul)
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 11 ++-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 -
sysdeps/x86_64/multiarch/strchr.S | 127 ------------------
sysdeps/x86_64/strchr.S | 191 +++++++++++++++++++++++-----
sysdeps/x86_64/strchrnul.S | 41 +------
5 files changed, 170 insertions(+), 201 deletions(-)
hooks/post-receive
--
GNU C Library master sources