This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [RFC PATCH v2] Add strcmp, strncmp, memcmp inline implementation.


On Mon, May 25, 2015 at 4:11 PM, OndÅej BÃlka <neleai@seznam.cz> wrote:
> Sorry for noise,
>
> I wanted to ask if one needs to surround strlen with
> __builtin_constant_p to avoid runtime overhead, and when I read patch I
> realized that I by mistake included older partial patch. What I intended
> is here, please ignore previous one. I know that it had bug when you
> have string literal with zero in middle.
>
> So do I need that to be safe? And other comments?


Yes what will it take to get this into GCC instead of adding one more
hack to glibc for this?
Sorry but I feel like this would be better in GCC than doing this in glibc.

Thanks,
Andrew

>
> On Mon, May 25, 2015 at 03:58:26AM +0200, OndÅej BÃlka wrote:
>> Hi,
>>
>> I just found that on x64 gcc __builtin_strcmp suck a lot. And by lot I
>> mean its around three times slower than libcall by using rep cmpsb which
>> even intel manual says that shouldn't be used.
>>
>> So I decided to write a strcmp inline that works. As reencoding it as
>> gcc pass would be extra effort without any benefit I skipped it.
>>
>> This adds x64 specific implementation for constant arguments less than
>> 16 bytes. These are quite common as programmer often does checks like
>>
>> if (strcmp (x, "foo"))
>>
>> Extending this for 32 bytes and more would be straightforward but
>> wouldn't help much as there aren't lot of that large words.
>>
>> As same trick could be used for strncmp and memcmp with size <= 16
>> with few extra checks as we exploited alignment of string literals.
>>
>> It could be optimized more with cooperation from gcc. A page-cross check
>> could be omitted in most cases using dataflow that gcc already does in
>> fortify_source. A CROSS_PAGE macro could first check for
>> __builtin_valid_range_p(x, x+16) which evaluates to true if gcc can
>> prove that x is more than 16 bytes large.
>>
>>
>> A possible issue would be introducing sse with string.h. How detect gcc
>> -no-sse flag?
>>
>
>         * sysdeps/x86_64/bits/string.h: New file.
>
> diff --git a/sysdeps/x86_64/bits/string.h b/sysdeps/x86_64/bits/string.h
> new file mode 100644
> index 0000000..c4d154b
> --- /dev/null
> +++ b/sysdeps/x86_64/bits/string.h
> @@ -0,0 +1,87 @@
> +/* This file should provide inline versions of string functions.
> +
> +   Surround GCC-specific parts with #ifdef __GNUC__, and use `__extern_inline'.
> +
> +   This file should define __STRING_INLINES if functions are actually defined
> +   as inlines.  */
> +
> +#ifndef _BITS_STRING_H
> +#define _BITS_STRING_H 1
> +
> +/* Define if architecture can access unaligned multi-byte variables.  */
> +#define _STRING_ARCH_unaligned   0
> +
> +#ifdef _USE_GNU
> +# if __GNUC_PREREQ (3, 2)
> +#  define _HAVE_STRING_ARCH_strcmp
> +#  define _HAVE_STRING_ARCH_strncmp
> +#  define _HAVE_STRING_ARCH_memcmp
> +
> +#  include <stdint.h>
> +#  include <emmintrin.h>
> +#  define __LOAD(x) _mm_load_si128 ((__tp_vector *) (x))
> +#  define __LOADU(x) _mm_loadu_si128 ((__tp_vector *) (x))
> +#  define get_mask(x) ((uint64_t) _mm_movemask_epi8 (x))
> +#  define __EQ  _mm_cmpeq_epi8
> +typedef __m128i __tp_vector;
> +typedef uint64_t __tp_mask;
> +
> +#define CROSS_PAGE(p) __builtin_expect (((uintptr_t) s) % 4096               \
> +                                       > 4096 - sizeof (__tp_vector) , 0)
> +
> +static inline __attribute__ ((always_inline))
> +int
> +__memcmp_small_a (char *s, char *c, int n)
> +{
> +  if (CROSS_PAGE (s))
> +    return memcmp (s, c, n);
> +  __tp_mask m = get_mask (__EQ (__LOADU (s), __LOAD (c))) | 1UL << n;
> +  int found = __builtin_ctzl (m);
> +  return s[found] - c[found];
> +}
> +static inline __attribute__ ((always_inline))
> +int
> +__memcmp_small (char *s, char *c, int n)
> +{
> +  if (CROSS_PAGE (s) || CROSS_PAGE (c))
> +    return memcmp (s, c, n);
> +  __tp_mask m = get_mask (__EQ (__LOADU (s), __LOADU (c))) | 1UL << n;
> +  int found = __builtin_ctzl (m);
> +  return s[found] - c[found];
> +}
> +#define __min(x,y) (x < y ? x : y)
> +
> +/* Dereferencing a pointer arg to run sizeof on it fails for the void
> +   pointer case, so we use this instead.
> +   Note that __x is evaluated twice. */
> +#define __string2_1bptr_p(__x) __builtin_constant_p (__x) && \
> +  ((size_t)(const void *)((__x) + 1) - (size_t)(const void *)(__x) == 1)
> +
> +
> +#  define strcmp(s1, s2) \
> +   (__extension__                                              \
> +   (__string2_1bptr_p (s1) && strlen (s1) <= 16                        \
> +    ?  __memcmp_small_a (s1, s2, strlen (s1))                  \
> +    : (__string2_1bptr_p (s2) && strlen (s2) <= 16             \
> +       ? - __memcmp_small_a (s2, s1, strlen (s2))              \
> +       : strcmp (s1, s2))))
> +
> +#  define strncmp(s1, s2, n) \
> +   (__extension__                                              \
> +   (__string2_1bptr_p (s1) && strlen (s1) <= 16                        \
> +    ?  __memcmp_small_a (s1, s2, min (n, strlen (s1)))         \
> +    : (__string2_1bptr_p (s2) && strlen (s2) <= 16             \
> +       ? - __memcmp_small_a (s2, s1, min (n, strlen (s2)))     \
> +       : strncmp (s1, s2, n))))
> +
> +#  define memcmp(s1, s2, n) \
> +   (__extension__                                              \
> +   (__builtin_constant_p (n <= 16) && n <= 16                  \
> +    ?  n == 0 ? 0 : __memcmp_small (s1, s2, n - 1))            \
> +       : memcmp (s1, s2, n))
> +
> +
> +# undef __string2_1bptr_p
> +# endif
> +#endif


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]