This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[RFC PATCH v2] Add strcmp, strncmp, memcmp inline implementation.


Sorry for noise,

I wanted to ask if one needs to surround strlen with
__builtin_constant_p to avoid runtime overhead, and when I read patch I
realized that I by mistake included older partial patch. What I intended
is here, please ignore previous one. I know that it had bug when you
have string literal with zero in middle.

So do I need that to be safe? And other comments?

On Mon, May 25, 2015 at 03:58:26AM +0200, OndÅej BÃlka wrote:
> Hi,
> 
> I just found that on x64 gcc __builtin_strcmp suck a lot. And by lot I
> mean its around three times slower than libcall by using rep cmpsb which
> even intel manual says that shouldn't be used.
> 
> So I decided to write a strcmp inline that works. As reencoding it as
> gcc pass would be extra effort without any benefit I skipped it.
> 
> This adds x64 specific implementation for constant arguments less than
> 16 bytes. These are quite common as programmer often does checks like 
> 
> if (strcmp (x, "foo"))
> 
> Extending this for 32 bytes and more would be straightforward but
> wouldn't help much as there aren't lot of that large words.
> 
> As same trick could be used for strncmp and memcmp with size <= 16 
> with few extra checks as we exploited alignment of string literals.
> 
> It could be optimized more with cooperation from gcc. A page-cross check
> could be omitted in most cases using dataflow that gcc already does in
> fortify_source. A CROSS_PAGE macro could first check for
> __builtin_valid_range_p(x, x+16) which evaluates to true if gcc can
> prove that x is more than 16 bytes large.
> 
> 
> A possible issue would be introducing sse with string.h. How detect gcc
> -no-sse flag?
> 

 	* sysdeps/x86_64/bits/string.h: New file. 

diff --git a/sysdeps/x86_64/bits/string.h b/sysdeps/x86_64/bits/string.h
new file mode 100644
index 0000000..c4d154b
--- /dev/null
+++ b/sysdeps/x86_64/bits/string.h
@@ -0,0 +1,87 @@
+/* This file should provide inline versions of string functions.
+
+   Surround GCC-specific parts with #ifdef __GNUC__, and use `__extern_inline'.
+
+   This file should define __STRING_INLINES if functions are actually defined
+   as inlines.  */
+
+#ifndef _BITS_STRING_H
+#define _BITS_STRING_H	1
+
+/* Define if architecture can access unaligned multi-byte variables.  */
+#define _STRING_ARCH_unaligned   0
+
+#ifdef _USE_GNU
+# if __GNUC_PREREQ (3, 2)
+#  define _HAVE_STRING_ARCH_strcmp
+#  define _HAVE_STRING_ARCH_strncmp
+#  define _HAVE_STRING_ARCH_memcmp
+
+#  include <stdint.h>
+#  include <emmintrin.h>
+#  define __LOAD(x) _mm_load_si128 ((__tp_vector *) (x))
+#  define __LOADU(x) _mm_loadu_si128 ((__tp_vector *) (x))
+#  define get_mask(x) ((uint64_t) _mm_movemask_epi8 (x))
+#  define __EQ  _mm_cmpeq_epi8
+typedef __m128i __tp_vector;
+typedef uint64_t __tp_mask;
+
+#define CROSS_PAGE(p) __builtin_expect (((uintptr_t) s) % 4096		      \
+					> 4096 - sizeof (__tp_vector) , 0)
+
+static inline __attribute__ ((always_inline)) 
+int
+__memcmp_small_a (char *s, char *c, int n)
+{
+  if (CROSS_PAGE (s))
+    return memcmp (s, c, n);
+  __tp_mask m = get_mask (__EQ (__LOADU (s), __LOAD (c))) | 1UL << n;
+  int found = __builtin_ctzl (m);
+  return s[found] - c[found];
+}
+static inline __attribute__ ((always_inline)) 
+int
+__memcmp_small (char *s, char *c, int n)
+{
+  if (CROSS_PAGE (s) || CROSS_PAGE (c))
+    return memcmp (s, c, n);
+  __tp_mask m = get_mask (__EQ (__LOADU (s), __LOADU (c))) | 1UL << n;
+  int found = __builtin_ctzl (m);
+  return s[found] - c[found];
+}
+#define __min(x,y) (x < y ? x : y)
+
+/* Dereferencing a pointer arg to run sizeof on it fails for the void
+   pointer case, so we use this instead.
+   Note that __x is evaluated twice. */
+#define __string2_1bptr_p(__x) __builtin_constant_p (__x) && \
+  ((size_t)(const void *)((__x) + 1) - (size_t)(const void *)(__x) == 1)
+
+
+#  define strcmp(s1, s2) \
+   (__extension__						\
+   (__string2_1bptr_p (s1) && strlen (s1) <= 16			\
+    ?  __memcmp_small_a (s1, s2, strlen (s1))			\
+    : (__string2_1bptr_p (s2) && strlen (s2) <= 16   		\
+       ? - __memcmp_small_a (s2, s1, strlen (s2))		\
+       : strcmp (s1, s2))))
+
+#  define strncmp(s1, s2, n) \
+   (__extension__						\
+   (__string2_1bptr_p (s1) && strlen (s1) <= 16			\
+    ?  __memcmp_small_a (s1, s2, min (n, strlen (s1)))		\
+    : (__string2_1bptr_p (s2) && strlen (s2) <= 16   		\
+       ? - __memcmp_small_a (s2, s1, min (n, strlen (s2)))	\
+       : strncmp (s1, s2, n))))
+
+#  define memcmp(s1, s2, n) \
+   (__extension__						\
+   (__builtin_constant_p (n <= 16) && n <= 16			\
+    ?  n == 0 ? 0 : __memcmp_small (s1, s2, n - 1))		\
+       : memcmp (s1, s2, n))
+
+  
+# undef __string2_1bptr_p
+# endif
+#endif


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]