This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[PING][PATCH] Improve generic strcspn and strpbrk
- From: OndÅej BÃlka <neleai at seznam dot cz>
- To: Carlos O'Donell <carlos at redhat dot com>
- Cc: libc-alpha at sourceware dot org
- Date: Wed, 12 Aug 2015 22:26:04 +0200
- Subject: [PING][PATCH] Improve generic strcspn and strpbrk
- Authentication-results: sourceware.org; auth=none
- References: <20150701153515 dot GA31676 at domone>
Ping
On Wed, Jul 01, 2015 at 05:35:15PM +0200, OndÅej BÃlka wrote:
> Hi,
>
> As I promised previously to improve sse2 strspn I found that I could
> convince gcc to generate reasonable code. Its really needed to use goto,
> otherwise gcc totally messes up a loop which will be 10% slower than
> current. Generated assembly is relatively ok, gcc only increments two
> registers by 4 instead one which doesn't affect performance much.
>
> So here is a generic strcspn/strpbrk implementation and I will send
> patch to replace x64 code later.
>
>
> Carlos as usual this is code where you need to make hard decision based
> on actual profile. This implementation on gcc workload shows 30%
> improvement. That is by exploiting property of workload that match in 4
> bytes is likely. If it isn't a performance will suffer as these checks
> increase cost of subsequent characters.
>
> This is again case where its unavoidable that some patterns will become
> five times slower. If accept has 128 characters and match is in second
> character then we spend ages with byte-by-byte checks of accept. But if
> match is in first character then strchr will find that quickly. Problem
> is that 99.8% of call have accept less than 8 bytes. So adding just a
> check if size exceeds 8 will harm performance. See following two graphs.
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strpbrk_profile/results_gcc/result.html
> http://kam.mff.cuni.cz/~ondra/benchmark_string/core2/strpbrk_profile/results_rand/result.html
>
> I cannot avoid that without my generic string framework where I would
> first to vectorized check of first 8 bytes and its dubious if that would
> help 32bit architectures. For x64 I correct solution is use sse2 prolog,
> hard part is tuning as these break even depending on size. For use in
> sse2 I need to add LATE_CHECK as it when called from sse42 routine
> accept size is greater than 16 so early heuristic doesn't make much
> sense.
>
> Main gain versus current sse2 assembly is that I align s to 4 bytes so I
> could read 4 bytes at start of loop before check. That helps mainly
> older cpu to execute loads earlier, it improves performance by 50% on
> core2 large inputs, but newer are better at speculative loads so
> difference gets smaller until haswell that doesn't show a difference.
>
> Tested on x64 by including them in test-strcspn/strpbrk
>
> OK to commit?
>
> * string/strcspn.c(STRCSPN): Check membership by array lookup.
> * string/strcspn.c: Include string/strpbrk.c
> * string/test-strcspn.c (simple_strcspn): Use string version.
> * string/test-strpbrk.c (simple_strpbrk): Use string version.
>
> ---
> string/strcspn.c | 43 +-----------------
> string/strpbrk.c | 120 +++++++++++++++++++++++++++++++++++++++++++++-----
> string/test-strcspn.c | 15 ++-----
> string/test-strpbrk.c | 14 +-----
> 4 files changed, 116 insertions(+), 76 deletions(-)
>
> diff --git a/string/strcspn.c b/string/strcspn.c
> index 2694d2a..2a82dd2 100644
> --- a/string/strcspn.c
> +++ b/string/strcspn.c
> @@ -1,41 +1,2 @@
> -/* Copyright (C) 1991-2015 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <http://www.gnu.org/licenses/>. */
> -
> -#include <string.h>
> -
> -#undef strcspn
> -
> -#ifndef STRCSPN
> -# define STRCSPN strcspn
> -#endif
> -
> -/* Return the length of the maximum initial segment of S
> - which contains no characters from REJECT. */
> -size_t
> -STRCSPN (const char *s, const char *reject)
> -{
> - size_t count = 0;
> -
> - while (*s != '\0')
> - if (strchr (reject, *s++) == NULL)
> - ++count;
> - else
> - return count;
> -
> - return count;
> -}
> -libc_hidden_builtin_def (strcspn)
> +#define AS_STRCSPN
> +#include "strpbrk.c"
> diff --git a/string/strpbrk.c b/string/strpbrk.c
> index 4f1d9b7..62808da 100644
> --- a/string/strpbrk.c
> +++ b/string/strpbrk.c
> @@ -16,26 +16,124 @@
> <http://www.gnu.org/licenses/>. */
>
> #include <string.h>
> -
> +#include <stdint.h>
> #undef strpbrk
> +#undef strcspn
> +
> +
> +#ifdef AS_STRCSPN
> +# ifndef STRPBRK
> +# define STRPBRK strcspn
> +# endif
> +# define RETURN_TYPE size_t
> +# define RETURN(c) return c
> +#else
> +# define RETURN_TYPE char *
> +# define RETURN(c) return (char *) (s[c] != '\0' ? s + c : NULL)
> +#endif
>
> #ifndef STRPBRK
> #define STRPBRK strpbrk
> #endif
>
> +
> /* Find the first occurrence in S of any character in ACCEPT. */
> -char *
> -STRPBRK (const char *s, const char *accept)
> +RETURN_TYPE
> +STRPBRK (const char *_s, const char *_accept)
> {
> - while (*s != '\0')
> + unsigned char *s = (unsigned char *) _s;
> + unsigned char *a = (unsigned char *) _accept;
> +
> +#ifndef LATE_CHECK
> + /* We need to align s to 4 bytes. We do check now to avoid expensive table
> + construction. */
> + do
> + {
> + if (s[0] == *a)
> + RETURN(0);
> + }
> + while (*a++);
> + a = (unsigned char *) _accept;
> +
> + /* We couldn't do these checks in one loop as gcc
> + messes up register allocation. */
> + do
> + {
> + if (s[1] == *a)
> + RETURN(1);
> + }
> + while (*a++);
> + a = (unsigned char *) _accept;
> +
> + do
> + {
> + if (s[2] == *a)
> + RETURN(2);
> + }
> + while (*a++);
> + a = (unsigned char *) _accept;
> +
> + do
> + {
> + if (s[3] == *a)
> + RETURN(3);
> + }
> + while (*a++);
> + a = (unsigned char *) _accept;
> +
> +#endif
> +
> + unsigned char table[256];
> + memset (table, 0, 256);
> + do
> {
> - const char *a = accept;
> - while (*a != '\0')
> - if (*a++ == *s)
> - return (char *) s;
> - ++s;
> + table[*a] = 1;
> }
> + while (*a++);
> + unsigned char s0, s1, s2, s3;
> + size_t count = 0;
> +#ifdef LATE_CHECK
> + s0 = s[count + 0];
> + s1 = s[count + 1];
> + s2 = s[count + 2];
> + s3 = s[count + 3];
> + if (table[s0])
> + goto ret0;
> + if (table[s1])
> + goto ret1;
> + if (table[s2])
> + goto ret2;
> + if (table[s3])
> + goto ret3;
>
> - return NULL;
> +#endif
> +
> + count = 4 - ((uintptr_t) s) % 4;
> +
> + while (1)
> + {
> + s0 = s[count + 0];
> + s1 = s[count + 1];
> + s2 = s[count + 2];
> + s3 = s[count + 3];
> + if (table[s0])
> + goto ret0;
> + if (table[s1])
> + goto ret1;
> + if (table[s2])
> + goto ret2;
> + if (table[s3])
> + goto ret3;
> + count += 4;
> + }
> + ret3:
> + count++;
> + ret2:
> + count++;
> + ret1:
> + count++;
> + ret0:
> + RETURN(count);
> }
> -libc_hidden_builtin_def (strpbrk)
> +
> +libc_hidden_builtin_def (STRPBRK)
> diff --git a/string/test-strcspn.c b/string/test-strcspn.c
> index b60a048..50a06e4 100644
> --- a/string/test-strcspn.c
> +++ b/string/test-strcspn.c
> @@ -31,18 +31,9 @@ IMPL (stupid_strcspn, 0)
> IMPL (simple_strcspn, 0)
> IMPL (strcspn, 1)
>
> -size_t
> -simple_strcspn (const char *s, const char *rej)
> -{
> - const char *r, *str = s;
> - char c;
> -
> - while ((c = *s++) != '\0')
> - for (r = rej; *r != '\0'; ++r)
> - if (*r == c)
> - return s - str - 1;
> - return s - str - 1;
> -}
> +#define AS_STRCSPN
> +#define STRPBRK simple_strcspn
> +#include "string/strpbrk.c"
>
> size_t
> stupid_strcspn (const char *s, const char *rej)
> diff --git a/string/test-strpbrk.c b/string/test-strpbrk.c
> index b4ac389..f389e9d 100644
> --- a/string/test-strpbrk.c
> +++ b/string/test-strpbrk.c
> @@ -32,18 +32,8 @@ IMPL (stupid_strpbrk, 0)
> IMPL (simple_strpbrk, 0)
> IMPL (strpbrk, 1)
>
> -char *
> -simple_strpbrk (const char *s, const char *rej)
> -{
> - const char *r;
> - char c;
> -
> - while ((c = *s++) != '\0')
> - for (r = rej; *r != '\0'; ++r)
> - if (*r == c)
> - return (char *) s - 1;
> - return NULL;
> -}
> +#define STRPBRK simple_strpbrk
> +#include "string/strpbrk.c"
>
> char *
> stupid_strpbrk (const char *s, const char *rej)
> --
> 1.8.4.rc3
--
hardware stress fractures