This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[RFC PATCH v2] Add strcmp, strncmp, memcmp inline implementation.
- From: OndÅej BÃlka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 25 May 2015 10:11:56 +0200
- Subject: [RFC PATCH v2] Add strcmp, strncmp, memcmp inline implementation.
- Authentication-results: sourceware.org; auth=none
- References: <20150525015826 dot GA29445 at domone>
Sorry for noise,
I wanted to ask if one needs to surround strlen with
__builtin_constant_p to avoid runtime overhead, and when I read patch I
realized that I by mistake included older partial patch. What I intended
is here, please ignore previous one. I know that it had bug when you
have string literal with zero in middle.
So do I need that to be safe? And other comments?
On Mon, May 25, 2015 at 03:58:26AM +0200, OndÅej BÃlka wrote:
> Hi,
>
> I just found that on x64 gcc __builtin_strcmp suck a lot. And by lot I
> mean its around three times slower than libcall by using rep cmpsb which
> even intel manual says that shouldn't be used.
>
> So I decided to write a strcmp inline that works. As reencoding it as
> gcc pass would be extra effort without any benefit I skipped it.
>
> This adds x64 specific implementation for constant arguments less than
> 16 bytes. These are quite common as programmer often does checks like
>
> if (strcmp (x, "foo"))
>
> Extending this for 32 bytes and more would be straightforward but
> wouldn't help much as there aren't lot of that large words.
>
> As same trick could be used for strncmp and memcmp with size <= 16
> with few extra checks as we exploited alignment of string literals.
>
> It could be optimized more with cooperation from gcc. A page-cross check
> could be omitted in most cases using dataflow that gcc already does in
> fortify_source. A CROSS_PAGE macro could first check for
> __builtin_valid_range_p(x, x+16) which evaluates to true if gcc can
> prove that x is more than 16 bytes large.
>
>
> A possible issue would be introducing sse with string.h. How detect gcc
> -no-sse flag?
>
* sysdeps/x86_64/bits/string.h: New file.
diff --git a/sysdeps/x86_64/bits/string.h b/sysdeps/x86_64/bits/string.h
new file mode 100644
index 0000000..c4d154b
--- /dev/null
+++ b/sysdeps/x86_64/bits/string.h
@@ -0,0 +1,87 @@
+/* This file should provide inline versions of string functions.
+
+ Surround GCC-specific parts with #ifdef __GNUC__, and use `__extern_inline'.
+
+ This file should define __STRING_INLINES if functions are actually defined
+ as inlines. */
+
+#ifndef _BITS_STRING_H
+#define _BITS_STRING_H 1
+
+/* Define if architecture can access unaligned multi-byte variables. */
+#define _STRING_ARCH_unaligned 0
+
+#ifdef _USE_GNU
+# if __GNUC_PREREQ (3, 2)
+# define _HAVE_STRING_ARCH_strcmp
+# define _HAVE_STRING_ARCH_strncmp
+# define _HAVE_STRING_ARCH_memcmp
+
+# include <stdint.h>
+# include <emmintrin.h>
+# define __LOAD(x) _mm_load_si128 ((__tp_vector *) (x))
+# define __LOADU(x) _mm_loadu_si128 ((__tp_vector *) (x))
+# define get_mask(x) ((uint64_t) _mm_movemask_epi8 (x))
+# define __EQ _mm_cmpeq_epi8
+typedef __m128i __tp_vector;
+typedef uint64_t __tp_mask;
+
+#define CROSS_PAGE(p) __builtin_expect (((uintptr_t) s) % 4096 \
+ > 4096 - sizeof (__tp_vector) , 0)
+
+static inline __attribute__ ((always_inline))
+int
+__memcmp_small_a (char *s, char *c, int n)
+{
+ if (CROSS_PAGE (s))
+ return memcmp (s, c, n);
+ __tp_mask m = get_mask (__EQ (__LOADU (s), __LOAD (c))) | 1UL << n;
+ int found = __builtin_ctzl (m);
+ return s[found] - c[found];
+}
+static inline __attribute__ ((always_inline))
+int
+__memcmp_small (char *s, char *c, int n)
+{
+ if (CROSS_PAGE (s) || CROSS_PAGE (c))
+ return memcmp (s, c, n);
+ __tp_mask m = get_mask (__EQ (__LOADU (s), __LOADU (c))) | 1UL << n;
+ int found = __builtin_ctzl (m);
+ return s[found] - c[found];
+}
+#define __min(x,y) (x < y ? x : y)
+
+/* Dereferencing a pointer arg to run sizeof on it fails for the void
+ pointer case, so we use this instead.
+ Note that __x is evaluated twice. */
+#define __string2_1bptr_p(__x) __builtin_constant_p (__x) && \
+ ((size_t)(const void *)((__x) + 1) - (size_t)(const void *)(__x) == 1)
+
+
+# define strcmp(s1, s2) \
+ (__extension__ \
+ (__string2_1bptr_p (s1) && strlen (s1) <= 16 \
+ ? __memcmp_small_a (s1, s2, strlen (s1)) \
+ : (__string2_1bptr_p (s2) && strlen (s2) <= 16 \
+ ? - __memcmp_small_a (s2, s1, strlen (s2)) \
+ : strcmp (s1, s2))))
+
+# define strncmp(s1, s2, n) \
+ (__extension__ \
+ (__string2_1bptr_p (s1) && strlen (s1) <= 16 \
+ ? __memcmp_small_a (s1, s2, min (n, strlen (s1))) \
+ : (__string2_1bptr_p (s2) && strlen (s2) <= 16 \
+ ? - __memcmp_small_a (s2, s1, min (n, strlen (s2))) \
+ : strncmp (s1, s2, n))))
+
+# define memcmp(s1, s2, n) \
+ (__extension__ \
+ (__builtin_constant_p (n <= 16) && n <= 16 \
+ ? n == 0 ? 0 : __memcmp_small (s1, s2, n - 1)) \
+ : memcmp (s1, s2, n))
+
+
+# undef __string2_1bptr_p
+# endif
+#endif