This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH 1/* v3] Generic string function optimization: Add skeleton


On Wed, May 27, 2015 at 08:01:21AM +0200, Ondřej Bílka wrote:
> Hi,
> 
> As I mentioned when discussing improvements to the generic string
> functions, this is my second attempt. As a lot of functions will be
> optimized in the same way, this uses a generic code flow. Functions
> will vary only in the expression to check and in how the return value
> is interpreted.
> 
> The performance gains come from loop unrolling and a better header
> that doesn't have to check the start byte-by-byte.
> 
> The unrolling might be excessive, but it's better to tune it in the
> skeleton than to do it manually and risk introducing errors.
> 
> This implementation would probably be faster than assembly on other
> architectures, as it should beat it unless you use hardware-specific
> instructions or gcc messes up the compilation and generates suboptimal
> code.
> 
> Comments?
> 

Here is a new version of the skeleton. I added big endian support. This
reminded me that when I first wrote it I wanted to use operations that
don't cause a carry, then forgot about it. As that is needed only for
the first aligned load, or always on big endian, you need to supply the
expression twice; one version must not cause carry propagation.
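
To illustrate how a function would plug into this skeleton, here is a
rawmemchr-style sketch (illustrative only; the names are mine and this
is not part of the patch). A byte matches when XOR with the repeated
needle byte yields zero:

#include "common.h"

/* With the default BOUND the search is unbounded, as in rawmemchr.  */
#define EXPRESSION(x, cmask) contains_zero ((x) ^ (cmask))
#define EXPRESSION_NOCARRY(x, cmask) contains_zero_nocarry ((x) ^ (cmask))

#include "skeleton.h"

char *
my_rawmemchr (const char *s, int c)
{
  /* The end argument is unused with the default BOUND.  */
  return string_skeleton (s, c, NULL);
}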

 	* string/common.h: New file.
 	* string/skeleton.h: Likewise.

diff --git a/string/common.h b/string/common.h
new file mode 100644
index 0000000..481c99f
--- /dev/null
+++ b/string/common.h
@@ -0,0 +1,90 @@
+#include <stdint.h>
+
+static const unsigned long int ones = (~0UL / 255); /* 0x0101...*/
+static const unsigned long int add = 127 * (~0UL / 255);
+static const unsigned long int high_bits = 128 * (~0UL / 255);
+
+/* Use vector arithmetic tricks.  The idea is to take an expression that
+   works on an unsigned byte and evaluates to zero for a nonzero byte and
+   to nonzero for a zero byte.  Our expression is ((s - 1) & ~s) & 128.
+   We evaluate this expression on each byte in parallel, and for the first
+   zero byte our expression will have a nonzero value.
+
+   We also need to provide a version of the expression that doesn't cause
+   carry propagation, so that the bytes are processed independently.  It
+   is not needed on little endian architectures (except for the first
+   aligned load), as we stop at the first byte that matches and don't
+   care that the following bytes could be corrupted.  */
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+static __always_inline
+unsigned long int 
+contains_zero (unsigned long int s)
+{
+  return (s - ones) & ~s & high_bits;
+}
+#else
+#define contains_zero contains_zero_nocarry
+#endif
+
+static __always_inline
+unsigned long int 
+contains_zero_nocarry (unsigned long int s)
+{
+  return (((s | high_bits) - ones) ^ high_bits) & ~s & high_bits;
+}
+
+#define LSIZE sizeof (unsigned long int)
+/* True when an n-byte access starting at x would cross a 4096-byte page
+   boundary.  */
+#define CROSS_PAGE(x, n) (((uintptr_t) (x)) % 4096 > 4096 - (n))
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define SHIFT_BYTES(x, n) ((x) << (8 * (n)))
+#else
+#define SHIFT_BYTES(x, n) ((x) >> (8 * (n)))
+#endif
+
+static __always_inline
+size_t
+first_nonzero_byte (unsigned long int u)
+{
+#if __BYTE_ORDER == __BIG_ENDIAN
+# ifdef FAST_CLZ
+  return __builtin_clzl (u) / 8;
+# else
+#  ifdef NEED_BITWISE
+  u = u | (u >> 1);
+  u = u | (u >> 2);
+  u = u | (u >> 4);
+#  else
+  u = u >> 7;
+#  endif
+  u = u | (u >> 8);
+  u = u | (u >> 16);
+  /* The shift amount is zero when unsigned long int is 32-bit, avoiding
+     an undefined shift by the full type width.  */
+  u = u | (u >> (8 * (LSIZE - 4)));
+#  ifdef NEED_BITWISE
+  u = u & ones;
+#  endif
+  /* Multiplying by 0x0101...01 sums the per-byte bits into the top byte,
+     counting the bytes at and below the first nonzero one.  */
+  u = u * ones;
+  return LSIZE - (u >> (8 * LSIZE - 8));
+# endif
+#else
+# ifdef FAST_FFS
+  return (ffsl (u) - 1) / 8;
+# else
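+  /* XOR with u - 1 sets every bit up to and including the lowest set bit;
+     masking with 0x0101...01 then keeps one bit per byte, and the
+     multiplication sums those bits into the top byte, giving the index
+     of the first nonzero byte plus one.  */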
+  u = u ^ (u - 1);
+  u = u & ones;
+  u = u * ones;
+  return (u >> (8 * LSIZE - 8)) - 1;
+# endif
+#endif
+}
diff --git a/string/skeleton.h b/string/skeleton.h
new file mode 100644
index 0000000..76bd08f
--- /dev/null
+++ b/string/skeleton.h
@@ -0,0 +1,152 @@
+/* Skeleton of generic string functions.
+   Copyright (C) 1991-2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <libc-internal.h>
+#include <stdint.h>
+
+#ifndef BOUND
+# define BOUND(x) 0
+#endif
+
+/* On big endian a carry could cause a false positive in the previous
+   byte, so always use the carry-free expression.  */
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+# undef EXPRESSION
+# define EXPRESSION(x, y) EXPRESSION_NOCARRY (x, y)
+#endif
+
+static __always_inline
+int
+found_in_long_bytes (char *s, unsigned long int cmask, char **result)
+{
+  const unsigned long int *lptr = (const unsigned long int *) s;
+  unsigned long int mask = EXPRESSION(*lptr, cmask);
+  if (mask)
+    {
+      *result = s + first_nonzero_byte (mask);
+      return 1;
+    }
+  else
+    return 0;
+}
+
+static __always_inline
+char *
+string_skeleton (const char *s_in, int c_in, char *end)
+{
+  unsigned long int mask;
+  const unsigned long int *lptr;
+  char *s = (char *) s_in;
+  unsigned char c = (unsigned char) c_in;
+  char *r;
+  unsigned long int __attribute__ ((unused)) cmask = c * ones;
+
+#if _STRING_ARCH_unaligned
+  /* We fetch 32 bytes while not crossing a page boundary.  Most strings
+     in practice fit in them and we avoid a loop.  This performs best in
+     practice; the alternative below uses an aligned load but is slower
+     when the string starts just a few bytes before a 32-byte boundary.
+     The tradeoff is that we may occasionally fetch an extra cache line
+     without needing it, but this optimization pays for that.  */
+  if (!CROSS_PAGE (s, 32))
+    {
+      if (found_in_long_bytes (s + 0 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s + 1 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s + 2 * LSIZE, cmask, &r))
+        return r;
+      if (found_in_long_bytes (s + 3 * LSIZE, cmask, &r))
+        return r;
+      if (sizeof (unsigned long int) == 4)
+        {
+          /* With 4-byte words the 32-byte header also covers words 4-7.  */
+          if (found_in_long_bytes (s + 4 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s + 5 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s + 6 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s + 7 * LSIZE, cmask, &r))
+            return r;
+        }
+
+      if (BOUND (s + 32))
+        return NULL;
+    }
+  else
+    {
+#endif
+  /* We need to use aligned loads.  For the first load we read some bytes
+     before the start and discard them by shifting them out of the mask.  */
+ 
+      char *s_aligned = PTR_ALIGN_DOWN (s, LSIZE);
+      lptr = (const unsigned long int *) s_aligned;
+      /* Be careful here, as the bytes before the start could cause a
+         carry; use the carry-free expression.  */
+      mask = SHIFT_BYTES ((EXPRESSION_NOCARRY (*lptr, cmask)), s - s_aligned);
+
+      if (mask)
+        return s + first_nonzero_byte (mask);
+
+      if (BOUND (s_aligned + 1 * LSIZE))
+        return NULL;
+      if (found_in_long_bytes (s_aligned + 1 * LSIZE, cmask, &r))
+        return r;
+      if (BOUND (s_aligned + 2 * LSIZE))
+        return NULL;
+      if (found_in_long_bytes (s_aligned + 2 * LSIZE, cmask, &r))
+        return r;
+      if (BOUND (s_aligned + 3 * LSIZE))
+        return NULL;
+      if (found_in_long_bytes (s_aligned + 3 * LSIZE, cmask, &r))
+        return r;
+#if _STRING_ARCH_unaligned
+    }
+#endif
+  /* Now we have read enough bytes to start the aligned loop.  */
+
+  char *s_loop = PTR_ALIGN_DOWN (s, 4 * LSIZE);
+  while (!BOUND (s_loop + 4 * LSIZE))
+    {
+      s_loop += 4 * LSIZE;
+      lptr = (const unsigned long int *) (s_loop + 0 * LSIZE);
+      mask = EXPRESSION (*lptr, cmask);
+      lptr = (const unsigned long int *) (s_loop + 1 * LSIZE);
+      mask |= EXPRESSION (*lptr, cmask);
+      lptr = (const unsigned long int *) (s_loop + 2 * LSIZE);
+      mask |= EXPRESSION (*lptr, cmask);
+      lptr = (const unsigned long int *) (s_loop + 3 * LSIZE);
+      mask |= EXPRESSION (*lptr, cmask);
+
+      if (mask)
+        {
+          if (found_in_long_bytes (s_loop + 0 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s_loop + 1 * LSIZE, cmask, &r))
+            return r;
+          if (found_in_long_bytes (s_loop + 2 * LSIZE, cmask, &r))
+            return r;
+
+          return s_loop + 3 * LSIZE + first_nonzero_byte (mask);
+        }
+    }
+  return NULL;
+}
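
To make the byte-mask trick concrete, here is a small standalone
demonstration of contains_zero and the little endian fallback of
first_nonzero_byte (assumes a little endian machine with 64-bit
unsigned long int; for illustration only, not part of the patch):

#include <stdio.h>

int
main (void)
{
  unsigned long int ones = ~0UL / 255;       /* 0x0101010101010101  */
  unsigned long int high_bits = 128 * ones;  /* 0x8080808080808080  */
  /* Little endian image of the bytes "ab\0cdefg"; the zero is byte 2.  */
  unsigned long int s = 0x6766656463006261UL;
  unsigned long int mask = (s - ones) & ~s & high_bits;
  printf ("mask = %016lx\n", mask);          /* prints 0000000000800000  */
  /* Index computation from first_nonzero_byte.  */
  unsigned long int u = mask ^ (mask - 1);
  u = u & ones;
  u = u * ones;
  printf ("first zero byte at index %lu\n",
          (u >> (8 * sizeof (unsigned long int) - 8)) - 1);  /* prints 2  */
  return 0;
}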

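For a length-limited function the BOUND hook comes into play. A
memchr-style sketch might look like this (again illustrative and not
part of the patch; a production version would also have to reject a
match that the final word finds past end):

#include <stddef.h>
#include "common.h"

/* BOUND expands inside string_skeleton, where end is in scope.  */
#define BOUND(x) ((char *) (x) >= end)
#define EXPRESSION(x, cmask) contains_zero ((x) ^ (cmask))
#define EXPRESSION_NOCARRY(x, cmask) contains_zero_nocarry ((x) ^ (cmask))

#include "skeleton.h"

void *
my_memchr (const void *s, int c, size_t n)
{
  return string_skeleton (s, c, (char *) s + n);
}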
