This is the mail archive of the
libc-ports@sources.redhat.com
mailing list for the libc-ports project.
[ARM] Optimised strchr and strlen
- From: "Dr. David Alan Gilbert" <david dot gilbert at linaro dot org>
- To: libc-ports at sourceware dot org
- Cc: joseph at codesourcery dot com, patches at linaro dot org
- Date: Mon, 19 Dec 2011 17:21:23 +0000
- Subject: [ARM] Optimised strchr and strlen
This is a strchr and strlen optimised for ARM v6t2 or v7. It's against svn rev r15869
with my previous memchr patch. Tested both little & big endian.
(I've checked it still applies on svn trunk, but not done a retest on that; nothing
seems to have changed around there).
Dave
2011-12-19 Dr. David Alan Gilbert <david.gilbert@linaro.org>
* sysdeps/arm/eabi/armv6t2/strchr.S: New file
* sysdeps/arm/eabi/armv6t2/strlen.S: New file
diff -urN ports/sysdeps/arm/eabi/armv6t2/strchr.S src/ports/sysdeps/arm/eabi/armv6t2/strchr.S
--- ports/sysdeps/arm/eabi/armv6t2/strchr.S 1970-01-01 01:00:00.000000000 +0100
+++ ports/sysdeps/arm/eabi/armv6t2/strchr.S 2011-12-16 13:43:56.704694919 +0000
@@ -0,0 +1,71 @@
+/* Copyright (C) 2011 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Code contributed by Dave Gilbert <david.gilbert@linaro.org>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+@ A very simple strchr routine, from benchmarks on A9 it's a bit faster than
+@ the current version in eglibc.
+@ While I have a version that does 8 bytes/loop and is a lot faster on long
+@ strings, it is slower on short strings, and short strings seem more common
+@ in strchr usage.
+@ Note: The use of cbz/cbnz means it's Thumb only
+
+@ 2011-02-07 david.gilbert@linaro.org
+@ Extracted from local git a5b438d861
+@ 2011-12-16 david.gilbert@linaro.org
+@ Copy from Cortex strings rev 65 and change license
+
+ .syntax unified
+
+ .text
+ .thumb
+
+@ ---------------------------------------------------------------------------
+@ strchr: scan a NUL-terminated string one byte at a time for a character.
+ .thumb_func
+ .global strchr
+ .type strchr,%function
+ENTRY(strchr)
+ @ In: r0 = start of string
+ @ In: r1 = character to match (only the low 8 bits are compared)
+ @ Out: r0 = NULL for no match, or a pointer to the match; r2 and flags are scratch
+ and r1,r1, #255 @ keep only the low byte of the character
+
+1:
+ ldrb r2,[r0],#1 @ r2 = next byte, r0 post-incremented past it
+ cmp r2,r1 @ sets the flags consumed by the bne below
+ cbz r2,10f @ reached the terminator (cbz does not change the cmp flags)
+ bne 1b @ no match yet - keep scanning
+
+ @ Matched: r0 is one past the matching byte, so step back onto it
+5:
+ subs r0,r0,#1
+ DO_RET(lr)
+
+10:
+ @ Reached the terminator without an earlier match
+ cmp r1, #0 @ Corner case - you can search for the nil and get a pointer to it
+ beq 5b @ messy, if common we should branch at the start to a special loop
+ mov r0,#0 @ not found: return NULL
+ DO_RET(lr)
+
+END(strchr)
+
+weak_alias (strchr, index)
+libc_hidden_builtin_def(strchr)
diff -urN ports/sysdeps/arm/eabi/armv6t2/strlen.S src/ports/sysdeps/arm/eabi/armv6t2/strlen.S
--- ports/sysdeps/arm/eabi/armv6t2/strlen.S 1970-01-01 01:00:00.000000000 +0100
+++ ports/sysdeps/arm/eabi/armv6t2/strlen.S 2011-12-16 13:43:01.991130183 +0000
@@ -0,0 +1,118 @@
+/* Copyright (C) 2011 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Code contributed by Dave Gilbert <david.gilbert@linaro.org>
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+@ This strlen routine is optimised on a Cortex-A9 and should work on
+@ all ARMv7 processors. This routine is reasonably fast for short
+@ strings, but is probably slower than a simple implementation if all
+@ your strings are very short
+@ Note: The use of cbz/cbnz means it's Thumb only
+
+@ 2011-02-08 david.gilbert@linaro.org
+@ Extracted from local git 6848613a
+@ 2011-12-16 david.gilbert@linaro.org
+@ Copy from Cortex strings rev 65 and change license
+@ Add cfi magic, switch to ldrd
+
+
+@ CHARTSTMASK(c): one-bit mask inside byte c (0 = lowest address) of a loaded word, so a flag in a 00/ff byte can be tested in either endianness; argument and result are parenthesised so the macro is safe inside larger expressions
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) (1<<(31-((c)*8)))
+#else
+#define CHARTSTMASK(c) (1<<((c)*8))
+#endif
+
+@-----------------------------------------------------------------------------
+ .syntax unified
+
+ .text
+ .thumb
+@ strlen: byte-at-a-time until the pointer is 8-byte aligned, then 8 bytes per loop iteration
+ .thumb_func
+ .global strlen
+ .type strlen,%function
+ENTRY(strlen)
+ @ In: r0 = string
+ @ Out: r0 = count of bytes in string not including terminator
+ mov r1, r0 @ r1 = scan pointer; r0 keeps the start for the final subtraction
+ push { r4,r6 } @ r4 and r6 hold the constants set up below; popped before return
+ cfi_adjust_cfa_offset (8)
+ cfi_rel_offset (r4, 0)
+ cfi_rel_offset (r6, 4)
+
+ cfi_remember_state
+
+ mvns r6, #0 @ all F
+ movs r4, #0 @ all 0
+ tst r0, #7 @ already 8-byte aligned?
+ beq 2f @ yes - skip the byte-at-a-time loop
+
+1:
+ ldrb r2, [r1], #1
+ tst r1, #7 @ Hit alignment yet? (flags are consumed by the bne below)
+ cbz r2, 10f @ Exit if we found the 0
+ bne 1b
+
+ @ So we are now 8-byte aligned
+2:
+ ldrd r2,r3,[r1],#8 @ load two words and advance r1 by 8
+ uadd8 r2, r2, r6 @ Parallel add 0xff - sets the GE bits for anything that was not 0
+ sel r2, r4, r6 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r3, r3, r6 @ Parallel add 0xff - sets the GE bits for anything that was not 0
+ sel r3, r2, r6 @ combined mask: ff where this word has a 00 byte, else the byte from the first-word mask in r2
+ cmp r3, #0
+ beq 2b @ no 00 byte in either word - go round for the next 8 bytes
+
+strlenendtmp:
+ @ One (or more) of the bytes we loaded was 0 - but which one?
+ @ r2 has the mask corresponding to the first loaded word
+ @ r3 has a combined mask of the two words - but if r2 was all-non 0
+ @ then it is just the mask of the 2nd word
+ cmp r2, #0
+ itte eq
+ moveq r2, r3 @ the end is in the 2nd word
+ subeq r1,r1,#3 @ r1 was 8 past the first word: now 2nd byte of the 2nd word
+ subne r1,r1,#7 @ r1 was 8 past the first word: now 2nd byte of the 1st word
+
+ @ r1 currently points to the 2nd byte of the word containing the 0
+ tst r2, # CHARTSTMASK(0) @ 1st character
+ bne 10f
+ adds r1,r1,#1
+ tst r2, # CHARTSTMASK(1) @ 2nd character
+ ittt eq
+ addeq r1,r1,#1
+ tsteq r2, # (3<<15) @ bits 15 and 16 straddle the 2nd/3rd byte boundary; the 2nd byte mask is 00 here, so this tests the 3rd character in either endianness
+ @ If not the 3rd must be the last one
+ addeq r1,r1,#1
+
+10:
+ @ r0 is still at the beginning, r1 is pointing 1 byte after terminator
+ sub r0, r1, r0
+ subs r0, r0, #1
+ pop { r4, r6 }
+
+ cfi_adjust_cfa_offset (-8)
+ cfi_restore (r4)
+ cfi_restore (r6)
+
+ DO_RET(lr)
+
+END(strlen)
+libc_hidden_builtin_def (strlen)