[PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX

Devulapalli, Raghuveer raghuveer.devulapalli@intel.com
Tue May 31 19:16:38 GMT 2022



> -----Original Message-----
> From: Noah Goldstein <goldstein.w.n@gmail.com>
> Sent: Thursday, May 26, 2022 2:26 PM
> To: Devulapalli, Raghuveer <raghuveer.devulapalli@intel.com>
> Cc: GNU C Library <libc-alpha@sourceware.org>
> Subject: Re: [PATCH 1/1] x86_64: Add strstr function with 512-bit EVEX
> 
> On Thu, May 26, 2022 at 3:11 PM Raghuveer Devulapalli via Libc-alpha <libc-
> alpha@sourceware.org> wrote:
> >
> > Adding a 512-bit EVEX version of strstr. The algorithm works as follows:
> >
> > (1) We spend a few cycles at the begining to peek into the needle. We
> > locate an edge in the needle (first occurance of 2 consequent distinct
> > characters) and also store the first 64-bytes into a zmm register.
> >
> > (2) We search for the edge in the haystack by looking into one cache
> > line of the haystack at a time. This avoids having to read past a page
> > boundary which can cause a seg fault.
> >
> > (3) If an edge is found in the haystack we first compare the first
> > 64-bytes of the needle (already stored in a zmm register) before we
> > proceed with a full string compare performed byte by byte.
> >
> > Benchmarking data on ICX shows upto 2x speed up when compared to
> > __strstr_sse2_unaligned (including partial benchtests data from
> > bench-strstr.out):
> >
> > |---------------------------------+---------------+-----------------------|
> > |                                 | strstr_avx512 |
> > | strstr_sse2_unaligned |
> > |---------------------------------+---------------+-----------------------|
> > | Length 16384/ 16,  1/11, found: | 1939.75       | 3458.44               |
> > | Length 16384/ 16, 14/ 5, fail : | 1967.75       | 3541.12               |
> > | Length 16384/ 32,  1/11, found: | 1540.38       | 2908.25               |
> > | Length 16384/ 32, 14/ 5, fail : | 1345.94       | 2866.31               |
> > | Length 16384/ 64,  1/11, found: | 1968.81       | 4327.56               |
> > | Length 16384/ 64, 14/ 5, fail : | 1993.75       | 4215.69               |
> > | Length 16384/128,  1/11, found: | 1535.44       | 3780.56               |
> > | Length 16384/128, 14/ 5, fail : | 1414.75       | 3595.25               |
> > | Length 16384/256,  1/11, found: | 2957.75       | 5501.44               |
> > | Length 16384/256, 14/ 5, fail : | 2682.62       | 5099.88               |
> > | Length 32768/ 16,  1/11, found: | 7820.19       | 11262.9               |
> > | Length 32768/ 16, 14/ 5, fail : | 8196.88       | 10871.2               |
> > | Length 32768/ 32,  1/11, found: | 5709.19       | 6611.56               |
> > | Length 32768/ 32, 14/ 5, fail : | 5716.12       | 6647.06               |
> > | Length 32768/ 64,  1/11, found: | 7160.44       | 10143.7               |
> > | Length 32768/ 64, 14/ 5, fail : | 7021.38       | 10150.6               |
> > | Length 32768/128,  1/11, found: | 4935.31       | 6756.56               |
> > | Length 32768/128, 14/ 5, fail : | 4774.38       | 6746.19               |
> > | Length 32768/256,  1/11, found: | 7933.19       | 12563.8               |
> > | Length 32768/256, 14/ 5, fail : | 7975          | 12558.6               |
> > | Length 65536/ 16,  1/11, found: | 9066.69       | 9419.62               |
> > | Length 65536/ 16, 14/ 5, fail : | 8496          | 9384.75               |
> > | Length 65536/ 32,  1/11, found: | 10258.8       | 11192.4               |
> > | Length 65536/ 32, 14/ 5, fail : | 8712.12       | 11172.3               |
> > | Length 65536/ 64,  1/11, found: | 11085.2       | 18162.1               |
> > | Length 65536/ 64, 14/ 5, fail : | 11219.6       | 17921.5               |
> > | Length 65536/128,  1/11, found: | 9753.56       | 18704.6               |
> > | Length 65536/128, 14/ 5, fail : | 9588.81       | 18465.6               |
> > | Length 65536/256,  1/11, found: | 18333.3       | 28505.2               |
> > | Length 65536/256, 14/ 5, fail : | 18018.8       | 27990.8               |
> > |---------------------------------+---------------+-----------------------|
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   2 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   6 +
> >  sysdeps/x86_64/multiarch/strstr-avx512.c   | 208
> +++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strstr.c          |  24 ++-
> >  4 files changed, 236 insertions(+), 4 deletions(-)  create mode
> > 100644 sysdeps/x86_64/multiarch/strstr-avx512.c
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile
> > b/sysdeps/x86_64/multiarch/Makefile
> > index e7b413edad..6dc54a7265 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -126,6 +126,7 @@ sysdep_routines += \
> >    strrchr-sse2 \
> >    strspn-c \
> >    strspn-sse2 \
> > +  strstr-avx512 \
> >    strstr-sse2-unaligned \
> >    varshift \
> >  # sysdep_routines
> > @@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4  CFLAGS-strcspn-c.c
> +=
> > -msse4  CFLAGS-strpbrk-c.c += -msse4  CFLAGS-strspn-c.c += -msse4
> > +CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -
> mavx512bw
> > +-mbmi -mbmi2 -O3
> >  endif
> >
> >  ifeq ($(subdir),wcsmbs)
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a594f4176e..cc9a7eaaa1 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -653,6 +653,12 @@ __libc_ifunc_impl_list (const char *name, struct
> > libc_ifunc_impl *array,
> >
> >    /* Support sysdeps/x86_64/multiarch/strstr.c.  */
> >    IFUNC_IMPL (i, name, strstr,
> > +              IFUNC_IMPL_ADD (array, i, strstr,
> > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > +                               && CPU_FEATURE_USABLE (AVX512DQ)
> > +                               && CPU_FEATURE_USABLE (BMI2)),
> > +                              __strstr_avx512)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
> >               IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
> >
> > diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c
> > b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > new file mode 100644
> > index 0000000000..4082a75a1b
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
> > @@ -0,0 +1,208 @@
> > +/* strstr optimized with 512-bit AVX-512 instructions
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include <immintrin.h>
> > +#include <inttypes.h>
> > +#include <stdbool.h>
> > +#include <string.h>
> > +
> > +#define FULL_MMASK64 0xffffffffffffffff #define ONE_64BIT 0x1ull
> > +#define ZMM_SIZE_IN_BYTES 64
> > +
> > +/*
> > + Returns the index of the first edge within the needle, returns 0 if
> > +no edge  is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
> > + */
> > +static inline size_t
> > +find_edge_in_needle (const char *ned) {
> > +  size_t ind = 0;
> > +  while (ned[ind + 1] != '\0')
> > +    {
> > +      if (ned[ind] != ned[ind + 1])
> > +        return ind;
> > +      else
> > +        ind = ind + 1;
> > +    }
> > +  return 0;
> > +}
> > +
> > +/*
> > + Compare needle with haystack byte by byte at specified location  */
> > +static inline bool verify_string_match (const char *hay, const size_t
> > +hay_index, const char *ned,
> > +                     size_t ind)
> > +{
> > +  while (ned[ind] != '\0')
> > +    {
>       strcmp? (you might be able to use memcmp which will be faster
>       but will need a bit of refactor to keep true nedlen and check for page
>       cross on hay)

Wouldn't strcmp give the wrong answer here? For example, I need this to return true when the haystack is "abcdefg" and the needle is "bcd": the needle ends while the haystack continues, so strcmp would report a mismatch.

> > +      if (ned[ind] != hay[hay_index + ind])
> > +        return false;
> > +      ind = ind + 1;
> > +    }
> > +  return true;
> > +}
> > +
> > +/*
> > + Compare needle with haystack at specified location. The first 64
> > +bytes are  compared using a ZMM register.
> > + */
> > +static inline bool
> > +verify_string_match_avx512 (const char *hay, const size_t hay_index,
> > +                            const char *ned, const __mmask64 ned_mask,
> > +                            const __m512i ned_zmm) {
> > +  /* check first 64 bytes using zmm and then scalar */
> > +  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe
> to
> > +do so
> > +  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask,
> hay_zmm,
> > +ned_zmm);
> > +  if (match != 0x0) // failed the first few chars
> > +    return false;
> > +  else if (ned_mask == FULL_MMASK64)
> > +    return verify_string_match (hay, hay_index, ned,
> > +ZMM_SIZE_IN_BYTES);
> > +  return true;
> > +}
> > +
> > +char *
> > +__strstr_avx512 (const char *haystack, const char *ned) {
> > +  char first = ned[0];
> > +  if (first == '\0')
> > +    return (char *)haystack;
> > +  if (ned[1] == '\0')
> > +    return (char *)strchr (haystack, ned[0]);
> > +
> > +  size_t edge = find_edge_in_needle (ned);
> > +
> > +  /* ensure haystack is as long as the pos of edge in needle */  for
> > + (int ii = 0; ii < edge; ++ii)
> > +    {
>     strnlen

Makes sense. 

> > +      if (haystack[ii] == '\0')
> > +        return NULL;
> > +    }
> > +
> > +  const __m512i null = _mm512_setzero_si512 (); // '\0'
> > +
> > +  /*
> > +   Load 64 bytes of the needle and save it to a zmm register
> > +   Read one cache line at a time to avoid loading across a page boundary
> > +   */
> > +  __mmask64 ned_load_mask
> > +      = _bzhi_u64 (FULL_MMASK64, 64 - ((uintptr_t)ned & 63));
>     FULL_MMASK64 >> (((-(uintptr_t)ned) & 63));

+1

> > +  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask,
> ned);
>     Maybe conditional on highly unlike page cross this is very
>     expensive if causes page walk

Elements on the next cache line are zero-masked; shouldn't that prevent a cross-page load?

> > +  __mmask64 ned_nullmask
> > +      = _mm512_mask_cmpeq_epi8_mask (ned_load_mask, ned_zmm,
> null);
>     _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm,
> ned_zmm)
> 
>     likewise at all other compares with null unless it breaks
>     microfusion more than once.

The compiler was already using vptestnmb, but it doesn't hurt to use it explicitly anyway.

> 
>     If you can replace all then get rid of null
> > +  if (__glibc_unlikely (ned_nullmask == 0x0))
> > +    {
> > +      ned_zmm = _mm512_loadu_si512 (ned);
> > +      ned_nullmask = _mm512_cmpeq_epi8_mask (ned_zmm, null);
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      if (ned_nullmask != 0x0)
> > +        ned_load_mask = ned_load_mask >> 1;
> > +    }
> > +  else
> > +    {
> > +      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
> > +      ned_load_mask = ned_load_mask >> 1;
>       I think you can get away with just ned_load_mask =
>       ned_nullmask - ONE_64BIT because you only use this after
>       checking haystack no null-term

Without the >> 1, we would compare the needle's null terminator against the haystack, which would give the wrong answer.

> > +    }
> > +  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);  const __m512i
> > + ned1 = _mm512_set1_epi8 (ned[edge + 1]);
> > +
> > +  /*
> > +   Read the bytes of haystack in the current cache line
> > +   */
> > +  size_t hay_index = edge;
> > +  __mmask64 loadmask = _bzhi_u64 (
> > +      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
> > +  /* First load is a partial cache line */  __m512i hay0 =
> > + _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
> > +  /* Search for NULL and compare only till null char */
> > +  __mmask64 nullmask = _mm512_mask_cmpeq_epi8_mask (loadmask,
> hay0,
> > + null);
> > +  __mmask64 cmpmask = nullmask ^ (nullmask - ONE_64BIT);  cmpmask
> =
> > + _kand_mask64 (cmpmask, loadmask);
>   nullmask ^ (nullmask - ONE_64BIT); codegen ends up actually
>   using kand_mask here. Since loadmask and nullmask both go through
>   GPR (nullmask for the blsmsk) you can do this explicitly in uint64_t
>   to help GCC out.
> 
> > +  /* Search for the 2 charaters of needle */
> > +  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
> > +  k1 = _kshiftri_mask64 (k1, 1);
> > +  /* k2 masks tell us if both chars from needle match */  uint64_t k2
> > + = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> cmpmask));
> > +  /* For every match, search for the entire needle for a full match
> > + */  while (k2)
> > +    {
> > +      uint64_t bitcount = _tzcnt_u64(k2);
> > +      k2 = _blsr_u64(k2);
> > +      size_t match_pos = hay_index + bitcount - edge;
> > +      if (nullmask == 0)
> > +        {
> > +          if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                          ned_load_mask, ned_zmm))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +      else
> > +        {
> > +          if (verify_string_match (haystack, match_pos, ned, 0))
> > +            return (char *)haystack + match_pos;
> > +        }
> > +    }
> > +  /* We haven't checked for potential match at the last char yet */
> > + hay_index += _mm_popcnt_u64 (loadmask) - 1;
>   hay_index = 0; haystay |= 63; You might want to check codegen and
>   ensure hay_index is being optimized out. AFAICT you just need a
>   pointer.

AFAICT, it looks like the compiler does optimize it out.

> > +
> > +  /*
> > +   Loop over one cache line at a time to prevent reading over page
> > +   boundary
> > +   */
> > +  __m512i hay1;
> > +  while (nullmask == 0)
> > +    {
> > +      hay0 = _mm512_loadu_si512 (haystack + hay_index);
> > +      hay1 = _mm512_load_si512 (haystack + hay_index
> > +                                + 1); // Always 64 byte aligned
>     Is this really faster than using kshiftri?

Yes (assuming you mean using just one load and then mask-shift operations to look for a match). A lot of instructions in this loop are stuck on port 5, and so is kshiftri. Using 2 loads, which execute on ports 2 and 3, relieves that pressure.

> > +      nullmask = _mm512_cmpeq_epi8_mask (hay1, null);
> > +      /* Compare only till null char */
> > +      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
> > +      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
> > +      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
> > +      /* k2 masks tell us if both chars from needle match */
> > +      k2 = _cvtmask64_u64 (_kand_mask64 (_kand_mask64 (k0, k1),
> cmpmask));
> > +      /* For every match, compare full strings for potential match */
> > +      while (k2)
> > +        {
> > +          uint64_t bitcount = _tzcnt_u64(k2);
> > +          k2 = _blsr_u64(k2);
> > +          size_t match_pos = hay_index + bitcount - edge;
> > +          if (nullmask == 0)
> > +            {
> > +              /*
> > +               Since the haystack doesn't terminate at the current cache
> > +               line, we can use zmm register to compare the first 64 bytes
> > +               */
> > +              if (verify_string_match_avx512 (haystack, match_pos, ned,
> > +                                              ned_load_mask, ned_zmm))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +          else
> > +            {
> > +              /* Compare byte by byte */
> > +              if (verify_string_match (haystack, match_pos, ned, 0))
> > +                return (char *)haystack + match_pos;
> > +            }
> > +        }
> > +      hay_index += ZMM_SIZE_IN_BYTES;
> > +    }
> > +  return NULL;
> > +}
> > diff --git a/sysdeps/x86_64/multiarch/strstr.c
> > b/sysdeps/x86_64/multiarch/strstr.c
> > index 95600a9de5..2fb8b169b6 100644
> > --- a/sysdeps/x86_64/multiarch/strstr.c
> > +++ b/sysdeps/x86_64/multiarch/strstr.c
> > @@ -35,16 +35,32 @@
> >
> >  extern __typeof (__redirect_strstr) __strstr_sse2_unaligned
> > attribute_hidden;  extern __typeof (__redirect_strstr) __strstr_sse2
> > attribute_hidden;
> > +extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;
> >
> >  #include "init-arch.h"
> >
> >  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> >     ifunc symbol properly.  */
> >  extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc
> > (__libc_strstr,
> > -           HAS_ARCH_FEATURE (Fast_Unaligned_Load)
> > -           ? __strstr_sse2_unaligned
> > -           : __strstr_sse2)
> >
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features *cpu_features = __get_cpu_features ();
> > +
> > +  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
> > +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +    return __strstr_avx512;
> > +
> > +  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> > +    return __strstr_sse2_unaligned;
> > +
> > +  return __strstr_sse2;
> > +}
> > +
> > +libc_ifunc_redirected (__redirect_strstr, __libc_strstr,
> > +IFUNC_SELECTOR ());
> >  #undef strstr
> >  strong_alias (__libc_strstr, strstr)
> > --
> > 2.36.1
> >


More information about the Libc-alpha mailing list