This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch, master, updated. glibc-2.14-330-g66fb11b
- From: drepper at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 7 Oct 2011 15:50:31 -0000
- Subject: GNU C Library master sources branch, master, updated. glibc-2.14-330-g66fb11b
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 66fb11b1da6b56a78c09e8b5802ace4f10189dca (commit)
via 093ecf92998de275820296058ad5648e354b9e0d (commit)
from fde56e5cc5011d8c0de39290af0e76d884d07624 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=66fb11b1da6b56a78c09e8b5802ace4f10189dca
commit 66fb11b1da6b56a78c09e8b5802ace4f10189dca
Author: Ulrich Drepper <drepper@gmail.com>
Date: Fri Oct 7 11:50:21 2011 -0400
Fix whitespace
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index a85dc6b..81b7a1c 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -111,7 +111,7 @@ L(loop_prolog):
mov %rdi, %rcx
and $63, %rcx
- jz L(align64_loop)
+ jz L(align64_loop)
add $64, %rdi
add $64, %rdx
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=093ecf92998de275820296058ad5648e354b9e0d
commit 093ecf92998de275820296058ad5648e354b9e0d
Author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
Date: Fri Oct 7 11:49:10 2011 -0400
Improve 64 bit memchr, memrchr, rawmemchr with SSE2
diff --git a/ChangeLog b/ChangeLog
index c369539..a5b5a53 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+2011-08-31 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
+
+ * sysdeps/x86_64/multiarch/rawmemchr.S: Update.
+ Use new sse2 version for core i3 - i7 as it's faster
+ than sse42 version.
+ (bit_Prefer_PMINUB_for_stringop): New.
+ * sysdeps/x86_64/rawmemchr.S: Update.
+ Replace with faster SSE2 version.
+ * sysdeps/x86_64/memrchr.S: New file.
+ * sysdeps/x86_64/memchr.S: Update.
+ Replace with faster SSE2 version.
+
2011-09-12 Marek Polacek <mpolacek@redhat.com>
* elf/dl-load.c (lose): Add cast to avoid warning.
diff --git a/NEWS b/NEWS
index 1af566f..73552e6 100644
--- a/NEWS
+++ b/NEWS
@@ -33,6 +33,9 @@ Version 2.15
* Optimized strchr and strrchr for SSE on x86-32.
Contributed by Liubov Dmitrieva.
+* Optimized memchr, memrchr, rawmemchr for x86-64.
+ Contributed by Liubov Dmitrieva.
+
* New interfaces: scandirat, scandirat64
Implemented by Ulrich Drepper.
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index 6082aa7..895a014 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -1,5 +1,5 @@
-/* Copyright (C) 2009 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
+/* Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,50 +19,294 @@
#include <sysdep.h>
+/* Fast SSE2 version using pmaxub and a 64-byte loop.  */
.text
-ENTRY (memchr)
- movd %esi, %xmm1
- movq %rdi, %rcx
+ENTRY(memchr)
+ movd %rsi, %xmm1
+ mov %rdi, %rcx
+
punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
- testq %rdx, %rdx
+ test %rdx, %rdx
+ jz L(return_null)
punpcklbw %xmm1, %xmm1
- jz 3f
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
+
+ and $63, %rcx
pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
+
+ cmp $48, %rcx
+ ja L(crosscache)
+
+ movdqu (%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+
+ jnz L(matches_1)
+ sub $16, %rdx
+ jbe L(return_null)
+ add $16, %rdi
+ and $15, %rcx
+ and $-16, %rdi
+ add %rcx, %rdx
+ sub $64, %rdx
+ jbe L(exit_loop)
+ jmp L(loop_prolog)
+
+ .p2align 4
+L(crosscache):
+ and $15, %rcx
+ and $-16, %rdi
+ movdqa (%rdi), %xmm0
+
+ pcmpeqb %xmm1, %xmm0
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+/* Remove the leading bytes. */
+ sar %cl, %eax
+ test %eax, %eax
+ je L(unaligned_no_match)
+/* Check which byte is a match. */
+ bsf %eax, %eax
+
+ sub %rax, %rdx
+ jbe L(return_null)
+ add %rdi, %rax
+ add %rcx, %rax
+ ret
+
+ .p2align 4
+L(unaligned_no_match):
+ add %rcx, %rdx
+ sub $16, %rdx
+ jbe L(return_null)
+ add $16, %rdi
+ sub $64, %rdx
+ jbe L(exit_loop)
+
+ .p2align 4
+L(loop_prolog):
+ movdqa (%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ movdqa 16(%rdi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa 32(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 48(%rdi), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ add $64, %rdi
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ test $0x3f, %rdi
+ jz L(align64_loop)
+
+ sub $64, %rdx
+ jbe L(exit_loop)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ movdqa 16(%rdi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa 32(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 48(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+
+ add $64, %rdi
+ test %eax, %eax
+ jnz L(matches0)
+
+ mov %rdi, %rcx
+ and $-64, %rdi
+ and $63, %rcx
+ add %rcx, %rdx
+
+ .p2align 4
+L(align64_loop):
+ sub $64, %rdx
+ jbe L(exit_loop)
+ movdqa (%rdi), %xmm0
+ movdqa 16(%rdi), %xmm2
+ movdqa 32(%rdi), %xmm3
+ movdqa 48(%rdi), %xmm4
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm0, %xmm3
+ pmaxub %xmm2, %xmm4
+ pmaxub %xmm3, %xmm4
+ pmovmskb %xmm4, %eax
+
+ add $64, %rdi
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+ sub $64, %rdi
+
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa 32(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+
+ pcmpeqb 48(%rdi), %xmm1
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ pmovmskb %xmm1, %eax
+ bsf %eax, %eax
+ lea 48(%rdi, %rax), %rax
+ ret
+
+ .p2align 4
+L(exit_loop):
+ add $32, %rdx
+ jle L(exit_loop_32)
+
+ movdqa (%rdi), %xmm0
pcmpeqb %xmm1, %xmm0
- addq %rcx, %rdx
- shl %cl, %esi
- pmovmskb %xmm0, %ecx
- andl %esi, %ecx
- movl $16, %esi
- jnz 1f
- cmpq %rsi, %rdx
- jbe 3f
-
-2: movdqa (%rdi,%rsi), %xmm0
- leaq 16(%rsi), %rsi
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ movdqa 16(%rdi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa 32(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ sub $16, %rdx
+ jle L(return_null)
+
+ pcmpeqb 48(%rdi), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(exit_loop_32):
+ add $32, %rdx
+ movdqa (%rdi), %xmm0
pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %ecx
- testl %ecx, %ecx
- jnz 1f
- cmpq %rsi, %rdx
- ja 2b
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches_1)
+ sub $16, %rdx
+ jbe L(return_null)
-3: xorl %eax, %eax
+ pcmpeqb 16(%rdi), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ xor %rax, %rax
ret
-1: leaq -16(%rdi,%rsi), %rax
- bsfl %ecx, %ecx
- addq %rcx, %rax
- leaq -16(%rsi,%rcx), %rsi
- cmpq %rsi, %rdx
- jbe 3b
+ .p2align 4
+L(matches0):
+ bsf %eax, %eax
+ lea -16(%rax, %rdi), %rax
ret
-END (memchr)
+
+ .p2align 4
+L(matches):
+ bsf %eax, %eax
+ add %rdi, %rax
+ ret
+
+ .p2align 4
+L(matches16):
+ bsf %eax, %eax
+ lea 16(%rax, %rdi), %rax
+ ret
+
+ .p2align 4
+L(matches32):
+ bsf %eax, %eax
+ lea 32(%rax, %rdi), %rax
+ ret
+
+ .p2align 4
+L(matches_1):
+ bsf %eax, %eax
+ sub %rax, %rdx
+ jbe L(return_null)
+ add %rdi, %rax
+ ret
+
+ .p2align 4
+L(matches16_1):
+ bsf %eax, %eax
+ sub %rax, %rdx
+ jbe L(return_null)
+ lea 16(%rdi, %rax), %rax
+ ret
+
+ .p2align 4
+L(matches32_1):
+ bsf %eax, %eax
+ sub %rax, %rdx
+ jbe L(return_null)
+ lea 32(%rdi, %rax), %rax
+ ret
+
+ .p2align 4
+L(matches48_1):
+ bsf %eax, %eax
+ sub %rax, %rdx
+ jbe L(return_null)
+ lea 48(%rdi, %rax), %rax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %rax, %rax
+ ret
+END(memchr)
strong_alias (memchr, __memchr)
-libc_hidden_builtin_def (memchr)
+
+libc_hidden_builtin_def(memchr)
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
new file mode 100644
index 0000000..a85dc6b
--- /dev/null
+++ b/sysdeps/x86_64/memrchr.S
@@ -0,0 +1,380 @@
+/* Fast SSE2 memrchr using a 64-byte loop and the pmaxub instruction.
+
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+ .text
+ENTRY (memrchr)
+ movd %rsi, %xmm1
+
+ sub $16, %rdx
+ jbe L(length_less16)
+
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+
+ add %rdx, %rdi
+ pshufd $0, %xmm1, %xmm1
+
+ movdqu (%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ sub $64, %rdi
+ mov %rdi, %rcx
+ and $15, %rcx
+ jz L(loop_prolog)
+
+ add $16, %rdi
+ add $16, %rdx
+ and $-16, %rdi
+ sub %rcx, %rdx
+
+ .p2align 4
+L(loop_prolog):
+ sub $64, %rdx
+ jbe L(exit_loop)
+
+ movdqa 48(%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%rdi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%rdi), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ sub $64, %rdi
+ sub $64, %rdx
+ jbe L(exit_loop)
+
+ movdqa 48(%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%rdi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa (%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ mov %rdi, %rcx
+ and $63, %rcx
+ jz L(align64_loop)
+
+ add $64, %rdi
+ add $64, %rdx
+ and $-64, %rdi
+ sub %rcx, %rdx
+
+ .p2align 4
+L(align64_loop):
+ sub $64, %rdi
+ sub $64, %rdx
+ jbe L(exit_loop)
+
+ movdqa (%rdi), %xmm0
+ movdqa 16(%rdi), %xmm2
+ movdqa 32(%rdi), %xmm3
+ movdqa 48(%rdi), %xmm4
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm3, %xmm0
+ pmaxub %xmm4, %xmm2
+ pmaxub %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%rdi), %xmm2
+
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb (%rdi), %xmm1
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ pmovmskb %xmm1, %eax
+ bsr %eax, %eax
+
+ add %rdi, %rax
+ ret
+
+ .p2align 4
+L(exit_loop):
+ add $64, %rdx
+ cmp $32, %rdx
+ jbe L(exit_loop_32)
+
+ movdqa 48(%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48)
+
+ movdqa 32(%rdi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 16(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches16_1)
+ cmp $48, %rdx
+ jbe L(return_null)
+
+ pcmpeqb (%rdi), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches0_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(exit_loop_32):
+ movdqa 48(%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches48_1)
+ cmp $16, %rdx
+ jbe L(return_null)
+
+ pcmpeqb 32(%rdi), %xmm1
+ pmovmskb %xmm1, %eax
+ test %eax, %eax
+ jnz L(matches32_1)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(matches0):
+ bsr %eax, %eax
+ add %rdi, %rax
+ ret
+
+ .p2align 4
+L(matches16):
+ bsr %eax, %eax
+ lea 16(%rax, %rdi), %rax
+ ret
+
+ .p2align 4
+L(matches32):
+ bsr %eax, %eax
+ lea 32(%rax, %rdi), %rax
+ ret
+
+ .p2align 4
+L(matches48):
+ bsr %eax, %eax
+ lea 48(%rax, %rdi), %rax
+ ret
+
+ .p2align 4
+L(matches0_1):
+ bsr %eax, %eax
+ sub $64, %rdx
+ add %rax, %rdx
+ jl L(return_null)
+ add %rdi, %rax
+ ret
+
+ .p2align 4
+L(matches16_1):
+ bsr %eax, %eax
+ sub $48, %rdx
+ add %rax, %rdx
+ jl L(return_null)
+ lea 16(%rdi, %rax), %rax
+ ret
+
+ .p2align 4
+L(matches32_1):
+ bsr %eax, %eax
+ sub $32, %rdx
+ add %rax, %rdx
+ jl L(return_null)
+ lea 32(%rdi, %rax), %rax
+ ret
+
+ .p2align 4
+L(matches48_1):
+ bsr %eax, %eax
+ sub $16, %rdx
+ add %rax, %rdx
+ jl L(return_null)
+ lea 48(%rdi, %rax), %rax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %rax, %rax
+ ret
+
+ .p2align 4
+L(length_less16_offset0):
+ mov %dl, %cl
+ pcmpeqb (%rdi), %xmm1
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ pmovmskb %xmm1, %eax
+
+ and %edx, %eax
+ test %eax, %eax
+ jz L(return_null)
+
+ bsr %eax, %eax
+ add %rdi, %rax
+ ret
+
+ .p2align 4
+L(length_less16):
+ punpcklbw %xmm1, %xmm1
+ punpcklbw %xmm1, %xmm1
+
+ add $16, %rdx
+
+ pshufd $0, %xmm1, %xmm1
+
+ mov %rdi, %rcx
+ and $15, %rcx
+ jz L(length_less16_offset0)
+
+ mov %rdi, %rcx
+ and $15, %rcx
+ mov %cl, %dh
+ mov %rcx, %r8
+ add %dl, %dh
+ and $-16, %rdi
+
+ sub $16, %dh
+ ja L(length_less16_part2)
+
+ pcmpeqb (%rdi), %xmm1
+ pmovmskb %xmm1, %eax
+
+ sar %cl, %eax
+ mov %dl, %cl
+
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %eax
+ test %eax, %eax
+ jz L(return_null)
+
+ bsr %eax, %eax
+ add %rdi, %rax
+ add %r8, %rax
+ ret
+
+ .p2align 4
+L(length_less16_part2):
+ movdqa 16(%rdi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+
+ mov %dh, %cl
+ mov $1, %edx
+ sal %cl, %edx
+ sub $1, %edx
+
+ and %edx, %eax
+
+ test %eax, %eax
+ jnz L(length_less16_part2_return)
+
+ pcmpeqb (%rdi), %xmm1
+ pmovmskb %xmm1, %eax
+
+ mov %r8, %rcx
+ sar %cl, %eax
+ test %eax, %eax
+ jz L(return_null)
+
+ bsr %eax, %eax
+ add %rdi, %rax
+ add %r8, %rax
+ ret
+
+ .p2align 4
+L(length_less16_part2_return):
+ bsr %eax, %eax
+ lea 16(%rax, %rdi), %rax
+ ret
+
+END (memrchr)
+strong_alias (memrchr, __memrchr)
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.S b/sysdeps/x86_64/multiarch/rawmemchr.S
index 2a8a690..a8933fb 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr.S
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009 Free Software Foundation, Inc.
+/* Copyright (C) 2009, 2011 Free Software Foundation, Inc.
Contributed by Ulrich Drepper <drepper@redhat.com>.
This file is part of the GNU C Library.
@@ -29,11 +29,15 @@ ENTRY(rawmemchr)
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
-1: leaq __rawmemchr_sse2(%rip), %rax
+1: testl $bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
+ jnz 2f
testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
jz 2f
leaq __rawmemchr_sse42(%rip), %rax
-2: ret
+ ret
+2: leaq __rawmemchr_sse2(%rip), %rax
+ ret
+
END(rawmemchr)
strong_alias (rawmemchr, __rawmemchr)
diff --git a/sysdeps/x86_64/rawmemchr.S b/sysdeps/x86_64/rawmemchr.S
index cfb4ceb..a68b52c 100644
--- a/sysdeps/x86_64/rawmemchr.S
+++ b/sysdeps/x86_64/rawmemchr.S
@@ -1,5 +1,7 @@
-/* Copyright (C) 2009 Free Software Foundation, Inc.
- Contributed by Ulrich Drepper <drepper@redhat.com>.
+/* Fast SSE2 rawmemchr using a 64-byte loop and the pmaxub instruction.
+
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -19,34 +21,187 @@
#include <sysdep.h>
-
.text
ENTRY (rawmemchr)
- movd %esi, %xmm1
- movq %rdi, %rcx
+ movd %rsi, %xmm1
+ mov %rdi, %rcx
+
punpcklbw %xmm1, %xmm1
- andq $~15, %rdi
punpcklbw %xmm1, %xmm1
- orl $0xffffffff, %esi
- movdqa (%rdi), %xmm0
+
+ and $63, %rcx
pshufd $0, %xmm1, %xmm1
- subq %rdi, %rcx
+
+ cmp $48, %rcx
+ ja L(crosscache)
+
+ movdqu (%rdi), %xmm0
pcmpeqb %xmm1, %xmm0
- shl %cl, %esi
- pmovmskb %xmm0, %ecx
- andl %esi, %ecx
- jnz 1f
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+
+ jnz L(matches)
+ add $16, %rdi
+ and $-16, %rdi
+ jmp L(loop_prolog)
+
+ .p2align 4
+L(crosscache):
+ and $15, %rcx
+ and $-16, %rdi
+ movdqa (%rdi), %xmm0
-2: movdqa 16(%rdi), %xmm0
- leaq 16(%rdi), %rdi
pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %ecx
- testl %ecx, %ecx
- jz 2b
+/* Check if there is a match. */
+ pmovmskb %xmm0, %eax
+/* Remove the leading bytes. */
+ sar %cl, %eax
+ test %eax, %eax
+ je L(unaligned_no_match)
+/* Check which byte is a match. */
+ bsf %eax, %eax
-1: bsfl %ecx, %eax
- addq %rdi, %rax
+ add %rdi, %rax
+ add %rcx, %rax
ret
+
+ .p2align 4
+L(unaligned_no_match):
+ add $16, %rdi
+
+ .p2align 4
+L(loop_prolog):
+ movdqa (%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ movdqa 16(%rdi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa 32(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 48(%rdi), %xmm4
+ pcmpeqb %xmm1, %xmm4
+ add $64, %rdi
+ pmovmskb %xmm4, %eax
+ test %eax, %eax
+ jnz L(matches0)
+
+ test $0x3f, %rdi
+ jz L(align64_loop)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ movdqa 16(%rdi), %xmm2
+ pcmpeqb %xmm1, %xmm2
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa 32(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ movdqa 48(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+
+ add $64, %rdi
+ test %eax, %eax
+ jnz L(matches0)
+
+ and $-64, %rdi
+
+ .p2align 4
+L(align64_loop):
+ movdqa (%rdi), %xmm0
+ movdqa 16(%rdi), %xmm2
+ movdqa 32(%rdi), %xmm3
+ movdqa 48(%rdi), %xmm4
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm1, %xmm4
+
+ pmaxub %xmm0, %xmm3
+ pmaxub %xmm2, %xmm4
+ pmaxub %xmm3, %xmm4
+ pmovmskb %xmm4, %eax
+
+ add $64, %rdi
+
+ test %eax, %eax
+ jz L(align64_loop)
+
+ sub $64, %rdi
+
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ jnz L(matches)
+
+ pmovmskb %xmm2, %eax
+ test %eax, %eax
+ jnz L(matches16)
+
+ movdqa 32(%rdi), %xmm3
+ pcmpeqb %xmm1, %xmm3
+
+ pcmpeqb 48(%rdi), %xmm1
+ pmovmskb %xmm3, %eax
+ test %eax, %eax
+ jnz L(matches32)
+
+ pmovmskb %xmm1, %eax
+ bsf %eax, %eax
+ lea 48(%rdi, %rax), %rax
+ ret
+
+ .p2align 4
+L(matches0):
+ bsf %eax, %eax
+ lea -16(%rax, %rdi), %rax
+ ret
+
+ .p2align 4
+L(matches):
+ bsf %eax, %eax
+ add %rdi, %rax
+ ret
+
+ .p2align 4
+L(matches16):
+ bsf %eax, %eax
+ lea 16(%rax, %rdi), %rax
+ ret
+
+ .p2align 4
+L(matches32):
+ bsf %eax, %eax
+ lea 32(%rax, %rdi), %rax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %rax, %rax
+ ret
+
END (rawmemchr)
strong_alias (rawmemchr, __rawmemchr)
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 12 +
NEWS | 3 +
sysdeps/x86_64/memchr.S | 316 +++++++++++++++++++++++++----
sysdeps/x86_64/memrchr.S | 380 ++++++++++++++++++++++++++++++++++
sysdeps/x86_64/multiarch/rawmemchr.S | 10 +-
sysdeps/x86_64/rawmemchr.S | 195 ++++++++++++++++--
6 files changed, 857 insertions(+), 59 deletions(-)
create mode 100644 sysdeps/x86_64/memrchr.S
hooks/post-receive
--
GNU C Library master sources