This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



PATCH: Optimize strchr/strrchr with SSE4.2


Hi,

This patch adds SSE4.2 optimized strchr/strrchr. It can speed up
strchr/strrchr by up to 2X on Intel Core i7.
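
Roughly, the aligned scan loop in the new strchr is equivalent to the
following C intrinsics sketch (illustration only, not part of the patch;
the function name is made up, and the real code also handles the
unaligned head and the c == 0 case separately):

#include <nmmintrin.h>
#include <stddef.h>

/* Check 16 bytes per iteration with pcmpistri.  S must be 16-byte
   aligned here.  0x02 is _SIDD_SBYTE_OPS | _SIDD_CMP_EQUAL_ANY
   | _SIDD_LEAST_SIGNIFICANT.  */
static const char *
strchr_sse42_sketch (const char *s, int c)
{
  const __m128i needle = _mm_set1_epi8 ((char) c);
  for (;;)
    {
      __m128i chunk = _mm_load_si128 ((const __m128i *) s);
      if (_mm_cmpistrc (needle, chunk, 0x02))   /* CFlag: C was found.  */
        return s + _mm_cmpistri (needle, chunk, 0x02);  /* ECX: offset.  */
      if (_mm_cmpistrz (needle, chunk, 0x02))   /* ZFlag: EOS was found.  */
        return NULL;
      s += 16;
    }
}

Compile with -msse4.2; a compiler can typically fold the _mm_cmpistr*
calls with identical arguments into one pcmpistri and just test the
flags, which is what the hand-written assembly does explicitly.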

Thanks.

H.J.
---
2009-10-21  H.J. Lu  <hongjiu.lu@intel.com>

	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	strend-sse4.

	* sysdeps/x86_64/multiarch/strchr.S: New.
	* sysdeps/x86_64/multiarch/strend-sse4.S: Likewise.
	* sysdeps/x86_64/multiarch/strrchr.S: Likewise.

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 0ded3b3..364e7bb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -4,7 +4,8 @@ gen-as-const-headers += ifunc-defines.sym
 endif
 
 ifeq ($(subdir),string)
-sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3
+sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
+		   strend-sse4
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
 CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
new file mode 100644
index 0000000..a77cc1c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -0,0 +1,177 @@
+/* strchr with SSE4.2
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+
+/* Define multiple versions only for the definition in libc.  */
+#ifndef NOT_IN_libc
+	.text
+ENTRY(strchr)
+	.type	strchr, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__strchr_sse2(%rip), %rax
+	/* Bit 20 of ECX from CPUID leaf 1 is set when SSE4.2 is supported.  */
+	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jz	2f
+	leaq	__strchr_sse42(%rip), %rax
+2:	ret
+END(strchr)
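+
+/* The ENTRY above is an STT_GNU_IFUNC resolver: the dynamic linker
+   calls it once and binds strchr to the address it returns.  Roughly,
+   in C (illustration only; has_sse42 () stands for the CPUID ECX bit
+   20 test used above):
+
+	void *strchr_resolver (void)
+	{
+	  if (__cpu_features.kind == 0)
+	    __init_cpu_features ();
+	  return has_sse42 () ? __strchr_sse42 : __strchr_sse2;
+	}
+*/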
+
+
+/*
+   This implementation uses SSE4.2 instructions to compare up to 16 bytes
+   at a time looking for the first occurrence of the character c in the
+   string s:
+
+   char *strchr (const char *s, int c);
+
+   We use 0x2:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_ANY
+	| _SIDD_LEAST_SIGNIFICANT
+   on pcmpistri to compare the 16 bytes at xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against the set holding the search byte C in xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   C C C C C C C C C C C C C C C C
+
+   to find out whether the first 16-byte data element contains the byte
+   C and, if so, the offset of its first occurrence.  There are 3 cases:
+
+   1. The first 16-byte data element has the byte C at offset X.
+   2. The first 16-byte data element has EOS and doesn't have the byte C.
+   3. The first 16-byte data element is valid and doesn't have the byte C.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
+
+   case		ECX	CFlag	ZFlag	SFlag
+    1		 X	  1	 0/1	  0
+    2		16	  0	  1	  0
+    3		16	  0	  0	  0
+
+   We exit from the loop for cases 1 and 2 with jbe which branches
+   when either CFlag or ZFlag is 1.  If CFlag == 1, ECX has the offset
+   X for case 1.  */
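+
+/* For example, scanning the 16-byte chunk "ab\0............." for
+   C = 'b' is case 1: ECX = 1, CFlag = 1 and ZFlag = 1 (the chunk also
+   contains EOS), so jbe leaves the loop and jc branches to
+   L(loop_exit), which returns a pointer to the 'b'.  Scanning the same
+   chunk for C = 'd' is case 2: ECX = 16, CFlag = 0, ZFlag = 1, so jbe
+   leaves the loop, jc is not taken and we fall through to return
+   NULL.  */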
+
+	.section .text.sse4.2,"ax",@progbits
+	.align 	16
+	.type	__strchr_sse42, @function
+__strchr_sse42:
+	cfi_startproc
+	CALL_MCOUNT
+	testb	%sil, %sil
+	je	__strend_sse4	/* strchr (s, 0) == pointer to EOS.  */
+	pxor	%xmm2, %xmm2
+	movd	%esi, %xmm1
+	movl	%edi, %ecx
+	andl	$15, %ecx
+	movq	%rdi, %r8
+	je	L(aligned_start)
+
+/* Handle unaligned string.  */
+	andq	$-16, %r8
+	pshufb  %xmm2, %xmm1
+	movdqa	(%r8), %xmm0
+	pcmpeqb	 %xmm0, %xmm2
+	pcmpeqb	 %xmm1, %xmm0
+	/* Find where NULL is.  */
+	pmovmskb %xmm2, %edx
+	/* Check if there is a match.  */
+	pmovmskb %xmm0, %esi
+	/* Discard the bits for the bytes that precede the start of the string.  */
+	sarl	%cl, %edx
+	sarl	%cl, %esi
+	testl	%esi, %esi
+	je	L(unaligned_no_match)
+	/* Check which byte is a match.  */
+	bsfl	%esi, %eax
+	/* Is there a NULL? */
+	testl	%edx, %edx
+	je      L(unaligned_match)
+	bsfl	%edx, %esi
+	cmpl	%esi, %eax
+	/* Return NULL if NULL comes first.  */
+	ja	L(return_null)
+L(unaligned_match):
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4
+L(unaligned_no_match):
+	testl	%edx, %edx
+	jne	L(return_null)
+
+/* Loop start on aligned string.  */
+L(loop):
+	addq	$16, %r8
+L(aligned_start):
+	pcmpistri	$0x2, (%r8), %xmm1
+	jbe	L(wrap)
+	addq	$16, %r8
+	pcmpistri	$0x2, (%r8), %xmm1
+	jbe	L(wrap)
+	addq	$16, %r8
+	pcmpistri	$0x2, (%r8), %xmm1
+	jbe	L(wrap)
+	addq	$16, %r8
+	pcmpistri	$0x2, (%r8), %xmm1
+	jbe	L(wrap)
+	jmp	L(loop)
+L(wrap):
+	jc	L(loop_exit)
+
+/* Return NULL.  */
+L(return_null):
+	xorl	%eax, %eax
+	ret
+
+/* Loop exit.  */
+	.p2align 4
+L(loop_exit):
+	leaq	(%r8,%rcx), %rax
+	ret
+	cfi_endproc
+	.size	__strchr_sse42, .-__strchr_sse42
+
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strchr_sse2, @function; \
+	.align 16; \
+	__strchr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strchr calls through a PLT.
+   The speedup we get from using SSE4.2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strchr; __GI_strchr = __strchr_sse2
+#endif
+
+#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/strend-sse4.S b/sysdeps/x86_64/multiarch/strend-sse4.S
new file mode 100644
index 0000000..c3220b3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strend-sse4.S
@@ -0,0 +1,49 @@
+/* Return the pointer to the end of string, using SSE4.2
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include "asm-syntax.h"
+
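+/* Roughly: return a pointer to the terminating NUL of the string in
+   RDI, i.e. s + strlen (s).  The first, possibly misaligned, 16-byte
+   chunk is checked with pcmpeqb, masking off the bits that belong to
+   bytes before the start of the string.  The aligned remainder is then
+   scanned with pcmpistri against an empty (all-zero) XMM register,
+   which sets ZFlag once a chunk contains EOS and leaves the offset of
+   the NUL within that chunk in ECX.  */
+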
+	.section .text.sse4.2,"ax",@progbits
+ENTRY (__strend_sse4)
+	pxor	%xmm2, %xmm2
+	movq	%rdi, %rcx
+	andq	$~15, %rdi
+	movdqa	%xmm2, %xmm1
+	pcmpeqb	(%rdi), %xmm2
+	orl	$0xffffffff, %esi	/* ESI = all-ones bit mask.  */
+	subq	%rdi, %rcx		/* RCX = misalignment of s (0..15).  */
+	shll	%cl, %esi		/* Clear mask bits below the start of s.  */
+	pmovmskb %xmm2, %edx		/* EDX = bit map of NUL positions.  */
+	andl	%esi, %edx		/* Ignore NULs before the start of s.  */
+	jnz	1f
+
+2:	pcmpistri $0x08, 16(%rdi), %xmm1
+	leaq	16(%rdi), %rdi
+	jnz	2b
+
+	leaq	(%rdi,%rcx), %rax
+	ret
+
+1:	bsfl	%edx, %eax
+	addq	%rdi, %rax
+	ret
+
+END (__strend_sse4)
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
new file mode 100644
index 0000000..a8c28a4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr.S
@@ -0,0 +1,278 @@
+/* strrchr with SSE4.2
+   Copyright (C) 2009 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <sysdep.h>
+#include <ifunc-defines.h>
+
+
+/* Define multiple versions only for the definition in libc and for
+   the DSO.  In static binaries we need strrchr before the
+   initialization has happened.  */
+#if defined SHARED && !defined NOT_IN_libc
+	.text
+ENTRY(strrchr)
+	.type	strrchr, @gnu_indirect_function
+	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
+	jne	1f
+	call	__init_cpu_features
+1:	leaq	__strrchr_sse2(%rip), %rax
+	/* Bit 20 of ECX from CPUID leaf 1 is set when SSE4.2 is supported.  */
+	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
+	jz	2f
+	leaq	__strrchr_sse42(%rip), %rax
+2:	ret
+END(strrchr)
+
+/*
+   This implementation uses SSE4.2 instructions to compare up to 16 bytes
+   at a time looking for the last occurrence of the character c in the
+   string s:
+
+   char *strrchr (const char *s, int c);
+
+   We use 0x4a:
+	_SIDD_SBYTE_OPS
+	| _SIDD_CMP_EQUAL_EACH
+	| _SIDD_MOST_SIGNIFICANT
+   on pcmpistri to compare xmm/mem128
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   X X X X X X X X X X X X X X X X
+
+   against xmm
+
+   0 1 2 3 4 5 6 7 8 9 A B C D E F
+   C C C C C C C C C C C C C C C C
+
+   to find out whether the first 16-byte data element contains the byte
+   C and the offset of its last occurrence.  There are 4 cases:
+
+   1. The first 16-byte data element has EOS and has the byte C at the
+      last offset X.
+   2. The first 16-byte data element is valid and has the byte C at the
+      last offset X.
+   3. The first 16-byte data element has EOS and doesn't have the byte C.
+   4. The first 16-byte data element is valid and doesn't have the byte C.
+
+   Here is the table of ECX, CFlag, ZFlag and SFlag for the 4 cases:
+
+   case		ECX	CFlag	ZFlag	SFlag
+    1		 X	  1	  1	  0
+    2		 X	  1	  0	  0
+    3		16	  0	  1	  0
+    4		16	  0	  0	  0
+
+   We leave the scan for cases 1 and 3 with je (jz), which branches
+   when ZFlag is 1; for case 2 we record the match and keep looping.
+   If CFlag == 1, ECX has the offset X of the last C in the element.  */
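+
+/* In rough C terms (illustration only) the aligned loop below does:
+
+	for (;;)
+	  {
+	    i = pcmpistri (0x4a, chunk, needle);
+	    if (CFlag)
+	      result = chunk_base + i;	(i is the offset of the last C)
+	    if (ZFlag)
+	      return result;		(the chunk contains EOS)
+	    chunk_base += 16;
+	  }
+
+   with result initialized to NULL by the xor below.  */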
+
+
+	.section .text.sse4.2,"ax",@progbits
+	.align 	16
+	.type	__strrchr_sse42, @function
+__strrchr_sse42:
+	cfi_startproc
+	CALL_MCOUNT
+	testb	%sil, %sil
+	je	__strend_sse4	/* strrchr (s, 0) == pointer to EOS.  */
+	xor	%eax,%eax	/* RAX holds the last occurrence of C found so far (NULL if none).  */
+	movd	%esi, %xmm1
+	punpcklbw	%xmm1, %xmm1
+	movl	%edi, %esi
+	punpcklbw	%xmm1, %xmm1
+	andl	$15, %esi
+	pshufd	$0, %xmm1, %xmm1
+	movq	%rdi, %r8
+	je	L(loop)
+
+/* Handle unaligned string using psrldq.  */
+	leaq	L(psrldq_table)(%rip), %rdx
+	andq	$-16, %r8
+	movslq	(%rdx,%rsi,4),%r9
+	movdqa	(%r8), %xmm0
+	addq	%rdx, %r9
+	jmp	*%r9
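+
+	/* L(psrldq_table) below stores each L(psrldq_N) label as a 32-bit
+	   offset from the start of the table: movslq loads the entry
+	   selected by the misalignment in RSI, addq turns it back into an
+	   absolute address and the indirect jump picks the psrldq that
+	   shifts out the bytes preceding the start of the string.  */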
+
+/* Handle unaligned string with offset 1 using psrldq.  */
+	.p2align 4
+L(psrldq_1):
+	psrldq	$1, %xmm0
+
+	.p2align 4
+L(unaligned_pcmpistri):
+	pcmpistri	$0x4a, %xmm1, %xmm0
+	jnc	L(unaligned_no_byte)
+	leaq	(%rdi,%rcx), %rax
+L(unaligned_no_byte):
+	/* Find the length of the unaligned string.  */
+	pcmpistri	$0x3a, %xmm0, %xmm0
+	movl	$16, %edx
+	subl	%esi, %edx
+	cmpl	%ecx, %edx
+	/* Return RAX if the unaligned fragment up to the next 16-byte
+	   boundary already contains the NULL terminator.  */
+	jg	L(exit)
+	addq	$16, %r8
+
+/* Loop start on aligned string.  */
+	.p2align 4
+L(loop):
+	pcmpistri	$0x4a, (%r8), %xmm1
+	jbe	L(match_or_eos)
+	addq	$16, %r8
+	jmp	L(loop)
+	.p2align 4
+L(match_or_eos):
+	je	L(had_eos)
+L(match_no_eos):
+	leaq	(%r8,%rcx), %rax
+	addq	$16, %r8
+	jmp	L(loop)
+	.p2align 4
+L(had_eos):
+	jnc	L(exit)
+	leaq	(%r8,%rcx), %rax
+	.p2align 4
+L(exit):
+	ret
+
+/* Handle unaligned string with offset 15 using psrldq.  */
+	.p2align 4
+L(psrldq_15):
+	psrldq	$15, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 14 using psrldq.  */
+	.p2align 4
+L(psrldq_14):
+	psrldq	$14, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 13 using psrldq.  */
+	.p2align 4
+L(psrldq_13):
+	psrldq	$13, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 12 using psrldq.  */
+	.p2align 4
+L(psrldq_12):
+	psrldq	$12, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 11 using psrldq.  */
+	.p2align 4
+L(psrldq_11):
+	psrldq	$11, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 10 using psrldq.  */
+	.p2align 4
+L(psrldq_10):
+	psrldq	$10, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 9 using psrldq.  */
+	.p2align 4
+L(psrldq_9):
+	psrldq	$9, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 8 using psrldq.  */
+	.p2align 4
+L(psrldq_8):
+	psrldq	$8, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 7 using psrldq.  */
+	.p2align 4
+L(psrldq_7):
+	psrldq	$7, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 6 using psrldq.  */
+	.p2align 4
+L(psrldq_6):
+	psrldq	$6, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 5 using psrldq.  */
+	.p2align 4
+L(psrldq_5):
+	psrldq	$5, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 4 using psrldq.  */
+	.p2align 4
+L(psrldq_4):
+	psrldq	$4, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 3 using psrldq.  */
+	.p2align 4
+L(psrldq_3):
+	psrldq	$3, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+/* Handle unaligned string with offset 2 using psrldq.  */
+	.p2align 4
+L(psrldq_2):
+	psrldq	$2, %xmm0
+	jmp	L(unaligned_pcmpistri)
+
+	cfi_endproc
+	.size	__strrchr_sse42, .-__strrchr_sse42
+
+	.section .rodata.sse4.2,"a",@progbits
+	.p2align 4
+L(psrldq_table):
+	.int	L(loop) - L(psrldq_table)
+	.int	L(psrldq_1) - L(psrldq_table)
+	.int	L(psrldq_2) - L(psrldq_table)
+	.int	L(psrldq_3) - L(psrldq_table)
+	.int	L(psrldq_4) - L(psrldq_table)
+	.int	L(psrldq_5) - L(psrldq_table)
+	.int	L(psrldq_6) - L(psrldq_table)
+	.int	L(psrldq_7) - L(psrldq_table)
+	.int	L(psrldq_8) - L(psrldq_table)
+	.int	L(psrldq_9) - L(psrldq_table)
+	.int	L(psrldq_10) - L(psrldq_table)
+	.int	L(psrldq_11) - L(psrldq_table)
+	.int	L(psrldq_12) - L(psrldq_table)
+	.int	L(psrldq_13) - L(psrldq_table)
+	.int	L(psrldq_14) - L(psrldq_table)
+	.int	L(psrldq_15) - L(psrldq_table)
+
+
+# undef ENTRY
+# define ENTRY(name) \
+	.type __strrchr_sse2, @function; \
+	.align 16; \
+	__strrchr_sse2: cfi_startproc; \
+	CALL_MCOUNT
+# undef END
+# define END(name) \
+	cfi_endproc; .size __strrchr_sse2, .-__strrchr_sse2
+# undef libc_hidden_builtin_def
+/* It doesn't make sense to send libc-internal strrchr calls through a PLT.
+   The speedup we get from using SSE4.2 instructions is likely eaten away
+   by the indirect call in the PLT.  */
+# define libc_hidden_builtin_def(name) \
+	.globl __GI_strrchr; __GI_strrchr = __strrchr_sse2
+#endif
+
+#include "../strrchr.S"

