This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH]: sparc: Optimize strlen using techniques from powerpcimplementation.


This is an improved strlen for sparc that I blogged about the other
day:

	http://vger.kernel.org/~davem/cgi-bin/blog.cgi/index.html

It basically makes use of the scheme powerpc does to avoid sub-word
loads in handling the first few unaligned bytes.

Since, unlike powerpc, sparc lacks a count-leading-bits instruction
(we could use 'popc', but no cpu actually implements the 'popc'
instruction in hardware), we use divide and conquer plus some
conditional move tricks to find the zero byte in the final word.

This code is as fast, or faster, than the existing code for aligned
cases.  It is significantly faster for unaligned cases.  It is also
more deterministic than the existing code, since there is only one
conditional branch, and that's for the main loop.  Finally, it is
denser than the existing code and thus takes up less I-cache space.

Committed to mainline.

sparc: Optimize strlen using techniques from powerpc implementation.

---
 ChangeLog                              |    5 +
 sysdeps/sparc/sparc32/sparcv9/strlen.S |    3 -
 sysdeps/sparc/sparc32/strlen.S         |  128 ++++++++------------
 sysdeps/sparc/sparc64/strlen.S         |  210 +++++++++----------------------
 4 files changed, 115 insertions(+), 231 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index afb549b..7e5e7bb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -6,6 +6,11 @@
 
 	* sysdeps/sparc/sparc32/memcpy.S: Fix build.
 
+	* sysdeps/sparc/sparc32/strlen.S: Optimize.
+	* sysdeps/sparc/sparc64/strlen.S: Likewise.
+	* sysdeps/sparc/sparc32/sparcv9/strlen.S (ASI_PNF, ASI_BLK_P,
+	XCC): Delete definitions, not needed.
+
 2010-03-07  Ulrich Drepper  <drepper@redhat.com>
 
 	* sysdeps/unix/sysv/linux/internal_statvfs.c (__statvfs_getflags):
diff --git a/sysdeps/sparc/sparc32/sparcv9/strlen.S b/sysdeps/sparc/sparc32/sparcv9/strlen.S
index b8f4dba..28a216c 100644
--- a/sysdeps/sparc/sparc32/sparcv9/strlen.S
+++ b/sysdeps/sparc/sparc32/sparcv9/strlen.S
@@ -1,4 +1 @@
-#define ASI_PNF     0x82
-#define ASI_BLK_P   0xf0
-#define XCC icc
 #include <sparc64/strlen.S>
diff --git a/sysdeps/sparc/sparc32/strlen.S b/sysdeps/sparc/sparc32/strlen.S
index ed92f20..2945bb5 100644
--- a/sysdeps/sparc/sparc32/strlen.S
+++ b/sysdeps/sparc/sparc32/strlen.S
@@ -1,8 +1,9 @@
 /* Determine the length of a string.
    For SPARC v7.
-   Copyright (C) 1996, 1999, 2003 Free Software Foundation, Inc.
+   Copyright (C) 1996, 1999, 2003, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Jakub Jelinek <jj@ultra.linux.cz>.
+   Contributed by Jakub Jelinek <jj@ultra.linux.cz> and
+                  David S. Miller <davem@davemloft.net>.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -21,86 +22,55 @@
 
 #include <sysdep.h>
 
-	/* Normally, this uses ((xword - 0x01010101) & 0x80808080) test
-	   to find out if any byte in xword could be zero. This is fast, but
-	   also gives false alarm for any byte in range 0x81-0xff. It does
-	   not matter for correctness, as if this test tells us there could
-	   be some zero byte, we check it byte by byte, but if bytes with
-	   high bits set are common in the strings, then this will give poor
-	   performance. You can #define EIGHTBIT_NOT_RARE and the algorithm
-	   will use one tick slower, but more precise test
-	   ((xword - 0x01010101) & (~xword) & 0x80808080),
-	   which does not give any false alarms (but if some bits are set,
-	   one cannot assume from it which bytes are zero and which are not).
-	   It is yet to be measured, what is the correct default for glibc
-	   in these days for an average user.
-	 */
-
 	.text
 	.align		4
 
 ENTRY(strlen)
-	mov		%o0, %o1
-	andcc		%o0, 3, %g0
-	be		20f
-	 sethi		%hi(0x80808080), %o4
-
-	ldub		[%o0], %o5
-	cmp		%o5, 0
-	be		21f
-	 add		%o0, 1, %o0
-	andcc		%o0, 3, %g0
-	be		4f
-	 or		%o4, %lo(0x80808080), %o3
-	ldub		[%o0], %o5
-	cmp		%o5, 0
-	be		22f
-	 add		%o0, 1, %o0
-	andcc		%o0, 3, %g0
-	be		5f
-	 sethi		%hi(0x01010101), %o4
-	ldub		[%o0], %o5
-	cmp		%o5, 0
-	be		23f
-	 add		%o0, 1, %o0
-	b		11f
-	 or		%o4, %lo(0x01010101), %o2
-21:	retl
-	 mov		0, %o0
-22:	retl
-	 mov		1, %o0
-23:	retl
-	 mov		2, %o0
-
-20:	or		%o4, %lo(0x80808080), %o3
-4:	sethi		%hi(0x01010101), %o4
-5:	or		%o4, %lo(0x01010101), %o2
-11:	ld		[%o0], %o5
-12:	sub		%o5, %o2, %o4
-#ifdef EIGHTBIT_NOT_RARE
-	andn		%o4, %o5, %o4
-#endif
-	andcc		%o4, %o3, %g0
-	be		11b
-	 add		%o0, 4, %o0
-
-	srl		%o5, 24, %g5
-	andcc		%g5, 0xff, %g0
-	be		13f
-	 add		%o0, -4, %o4
-	srl		%o5, 16, %g5
-	andcc		%g5, 0xff, %g0
-	be		13f
-	 add		%o4, 1, %o4
-	srl		%o5, 8, %g5
-	andcc		%g5, 0xff, %g0
-	be		13f
-	 add		%o4, 1, %o4
-	andcc		%o5, 0xff, %g0
-	bne,a		12b
-	 ld		[%o0], %o5
-	add		%o4, 1, %o4
-13:	retl
-	 sub		%o4, %o1, %o0
+	mov	%o0, %o1
+	andn	%o0, 0x3, %o0
+
+	ld	[%o0], %o5
+	and	%o1, 0x3, %g1
+	mov	-1, %g5
+
+	sethi	%hi(0x01010101), %o2
+	sll	%g1, 3, %g1
+
+	or	%o2, %lo(0x01010101), %o2
+	srl	%g5, %g1, %g2
+
+	orn	%o5, %g2, %o5
+	sll	%o2, 7, %o3
+10:	add	%o0, 4, %o0
+
+	andn	%o3, %o5, %g1
+	sub	%o5, %o2, %g2
+
+	andcc	%g1, %g2, %g0
+	be,a	10b
+	 ld	[%o0], %o5
+
+	srl	%o5, 24, %g1
+
+	andcc	%g1, 0xff, %g0
+	be	90f
+	 sub	%o0, 4, %o0
+
+	srl	%o5, 16, %g2
+
+	andcc	%g2, 0xff, %g0
+	be	90f
+	 add	%o0, 1, %o0
+
+	srl	%o5, 8, %g1
+
+	andcc	%g1, 0xff, %g0
+	be	90f
+	 add	%o0, 1, %o0
+
+	add	%o0, 1, %o0
+
+90:	retl
+	 sub	%o0, %o1, %o0
 END(strlen)
 libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/sparc/sparc64/strlen.S b/sysdeps/sparc/sparc64/strlen.S
index cc15e4e..64350fb 100644
--- a/sysdeps/sparc/sparc64/strlen.S
+++ b/sysdeps/sparc/sparc64/strlen.S
@@ -1,8 +1,9 @@
 /* Determine the length of a string.  For SPARC v9.
-   Copyright (C) 1998, 1999, 2003 Free Software Foundation, Inc.
+   Copyright (C) 1998, 1999, 2003, 2010 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz> and
-                  Jakub Jelinek <jj@ultra.linux.cz>.
+   Contributed by Jan Vondrak <jvon4518@ss1000.ms.mff.cuni.cz>,
+                  Jakub Jelinek <jj@ultra.linux.cz>, and
+                  David S. Miller <davem@davemloft.net>.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -20,155 +21,66 @@
    02111-1307 USA.  */
 
 #include <sysdep.h>
-#include <asm/asi.h>
-
-	/* Normally, this uses
-	   ((xword - 0x0101010101010101) & 0x8080808080808080) test
-	   to find out if any byte in xword could be zero. This is fast, but
-	   also gives false alarm for any byte in range 0x81-0xff. It does
-	   not matter for correctness, as if this test tells us there could
-	   be some zero byte, we check it byte by byte, but if bytes with
-	   high bits set are common in the strings, then this will give poor
-	   performance. You can #define EIGHTBIT_NOT_RARE and the algorithm
-	   will use one tick slower, but more precise test
-	   ((xword - 0x0101010101010101) & (~xword) & 0x8080808080808080),
-	   which does not give any false alarms (but if some bits are set,
-	   one cannot assume from it which bytes are zero and which are not).
-	   It is yet to be measured, what is the correct default for glibc
-	   in these days for an average user.
-	 */
+
+	.register	%g2, #scratch
+	.register	%g3, #scratch
 
 	.text
 	.align		32
 ENTRY(strlen)
-	sethi		%hi(0x01010101), %g1		/* IEU0		Group		*/
-	ldub		[%o0], %o3			/* Load				*/
-	or		%g1, %lo(0x01010101), %g1	/* IEU0		Group		*/
-	mov		%o0, %o1			/* IEU1				*/
-
-	sllx		%g1, 32, %g4			/* IEU0		Group 		*/
-	andcc		%o0, 7, %g0			/* IEU1				*/
-	or		%g1, %g4, %g1			/* IEU0		Group		*/
-	brz,pn		%o3, 13f			/* CTI+IEU1			*/
-
-	 sllx		%g1, 7, %g4			/* IEU0		Group		*/
-	bne,a,pn	%icc, 15f			/* CTI				*/
-	 add		%o0, 1, %o0			/* IEU1				*/
-							/* %g1 = 0x0101010101010101	*
-							 * %g4 = 0x8080808080808080	*
-							 * %o0 = string pointer		*
-							 * %o1 = start of string	*/
-1:	ldx		[%o0], %o3			/* Load		Group		*/
-
-	add		%o0, 8, %o0			/* IEU1				*/
-2:	sub		%o3, %g1, %o2			/* IEU0		Group		*/
-#ifdef EIGHTBIT_NOT_RARE
-	andn		%o2, %o3, %o5			/* IEU0		Group		*/
-	ldxa		[%o0] ASI_PNF, %o3		/* Load				*/
-	andcc		%o5, %g4, %g0			/* IEU1		Group		*/
-#else
-	ldxa		[%o0] ASI_PNF, %o3		/* Load				*/
-	andcc		%o2, %g4, %g0			/* IEU1		Group		*/
-#endif
-
-	be,pt		%xcc, 2b			/* CTI				*/
-	 add		%o0, 8, %o0			/* IEU0				*/
- 	addcc		%o2, %g1, %g5			/* IEU1		Group		*/
-#ifdef EIGHTBIT_NOT_RARE
-	srlx		%o5, 32, %o5			/* IEU0				*/
-
-3:	andcc		%o5, %g4, %g0			/* IEU1		Group		*/
-#else
-	srlx		%o2, 32, %o2			/* IEU0				*/
-
-3:	andcc		%o2, %g4, %g0			/* IEU1		Group		*/
-#endif
-	be,pn		%xcc, 4f			/* CTI				*/
-	 srlx		%g5, 56, %o2			/* IEU0				*/
-	andcc		%o2, 0xff, %g0			/* IEU1		Group		*/
-
-	be,pn		%icc, 12f			/* CTI				*/
-	 srlx		%g5, 48, %o2			/* IEU0				*/
-	andcc		%o2, 0xff, %g0			/* IEU1		Group		*/
-	be,pn		%icc, 11f			/* CTI				*/
-
-	 srlx		%g5, 40, %o2			/* IEU0				*/
-	andcc		%o2, 0xff, %g0			/* IEU1		Group		*/
-	be,pn		%icc, 10f			/* CTI				*/
-	 srlx		%g5, 32, %o2			/* IEU0				*/
-
-	andcc		%o2, 0xff, %g0			/* IEU1		Group		*/
-	be,pn		%icc, 9f			/* CTI				*/
-4:	 srlx		%g5, 24, %o2			/* IEU0				*/
-	andcc		%o2, 0xff, %g0			/* IEU1		Group		*/
-
-	be,pn		%icc, 8f			/* CTI				*/
-	 srlx		%g5, 16, %o2			/* IEU0				*/
-	andcc		%o2, 0xff, %g0			/* IEU1		Group		*/
-	be,pn		%icc, 7f			/* CTI				*/
-
-	 srlx		%g5, 8, %o2			/* IEU0				*/
-	andcc		%o2, 0xff, %g0			/* IEU1		Group		*/
-	be,pn		%icc, 6f			/* CTI				*/
-	 sub		%o3, %g1, %o2			/* IEU0				*/
-
-	andcc		%g5, 0xff, %g0			/* IEU1		Group		*/
-	be,pn		%icc, 5f			/* CTI				*/
-	 ldxa		[%o0] ASI_PNF, %o3		/* Load				*/
-	andcc		%o2, %g4, %g0			/* IEU1		Group		*/
-
-	be,pt		%xcc, 2b			/* CTI				*/
-	 add		%o0, 8, %o0			/* IEU0				*/
-	addcc		%o2, %g1, %g5			/* IEU1		Group		*/
-	ba,pt		%xcc, 3b			/* CTI				*/
-
-	 srlx		%o2, 32, %o2			/* IEU0				*/
-5:	add		%o0, -9, %o0			/* IEU0		Group		*/
-	retl						/* CTI+IEU1	Group		*/
-	 sub		%o0, %o1, %o0			/* IEU0				*/
-
-6:	add		%o0, -10, %o0			/* IEU0		Group		*/
-	retl						/* CTI+IEU1	Group		*/
-	 sub		%o0, %o1, %o0			/* IEU0				*/
-7:	add		%o0, -11, %o0			/* IEU0		Group		*/
-
-	retl						/* CTI+IEU1	Group		*/
-	 sub		%o0, %o1, %o0			/* IEU0				*/
-8:	add		%o0, -12, %o0			/* IEU0		Group		*/
-	retl						/* CTI+IEU1	Group		*/
-
-	 sub		%o0, %o1, %o0			/* IEU0				*/
-9:	add		%o0, -13, %o0			/* IEU0		Group		*/
-	retl						/* CTI+IEU1	Group		*/
-	 sub		%o0, %o1, %o0			/* IEU0				*/
-
-10:	add		%o0, -14, %o0			/* IEU0		Group		*/
-	retl						/* CTI+IEU1	Group		*/
-	 sub		%o0, %o1, %o0			/* IEU0				*/
-11:	add		%o0, -15, %o0			/* IEU0		Group		*/
-
-	retl						/* CTI+IEU1	Group		*/
-	 sub		%o0, %o1, %o0			/* IEU0				*/
-12:	add		%o0, -16, %o0			/* IEU0		Group		*/
-	retl						/* CTI+IEU1	Group		*/
-
-	 sub		%o0, %o1, %o0			/* IEU0				*/
-13:	retl						/* CTI+IEU1	Group		*/
-	 mov		0, %o0				/* IEU0				*/
-	nop
-
-15:	ldub		[%o0], %o3			/* Load		Group		*/
-16:	andcc		%o0, 7, %g0			/* IEU1				*/
-	be,pn		%icc, 1b			/* CTI				*/
-	 nop						/* IEU0		Group		*/
-
-	add		%o0, 1, %o0			/* IEU1				*/
-	andcc		%o3, 0xff, %g0			/* IEU1		Group		*/
-	bne,a,pt	%icc, 16b			/* CTI				*/
-	 lduba		[%o0] ASI_PNF, %o3		/* Load				*/
-
-	add		%o0, -1, %o0			/* IEU0		Group		*/
-	retl						/* CTI+IEU1	Group		*/
-	 sub		%o0, %o1, %o0			/* IEU0				*/
+	mov	%o0, %o1
+	andn	%o0, 0x7, %o0
+
+	ldx	[%o0], %o5
+	and	%o1, 0x7, %g1
+	mov	-1, %g5
+
+	sethi	%hi(0x01010101), %o2
+	sll	%g1, 3, %g1
+
+	or	%o2, %lo(0x01010101), %o2
+	srlx	%g5, %g1, %o3
+
+	sllx	%o2, 32, %g1
+	sethi	%hi(0x0000ff00), %g5
+
+	orn	%o5, %o3, %o5
+	or	%o2, %g1, %o2
+
+	sllx	%o2, 7, %o3
+10:	add	%o0, 8, %o0
+
+	andn	%o3, %o5, %g1
+	sub	%o5, %o2, %g2
+
+	andcc	%g1, %g2, %g0
+	be,a,pt	%xcc, 10b
+	 ldx	[%o0], %o5
+	srlx	%o5, 32, %g1
+
+	andn	%o3, %g1, %o4
+	sub	%g1, %o2, %g2
+
+	add	%o0, 4, %g3
+	andcc	%o4, %g2, %g0
+	movne	%icc, %g1, %o5
+
+	move	%icc, %g3, %o0
+	or	%g5, %lo(0x0000ff00), %g5
+	mov	3 - 8, %g2
+
+	andcc	%o5, %g5, %g0
+	srlx	%o5, 16, %g1
+	move	%icc, 2 - 8, %g2
+
+	andcc	%g1, 0xff, %g0
+	srl	%o5, 24, %o5
+	move	%icc, 1 - 8, %g2
+
+	movrz	%o5, 0 - 8, %g2
+	sub	%o0, %o1, %o0
+
+	retl
+	 add	%o0, %g2, %o0
 END(strlen)
 libc_hidden_builtin_def (strlen)
-- 
1.6.6.1


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]