This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 3/3] aarch64: Hoist ZVA check out of the memset function


On Wed, Nov 8, 2017 at 9:13 PM, Siddhesh Poyarekar
<siddhesh@sourceware.org> wrote:
> The DZP bit in the dczid_el0 register does not change dynamically, so
> it is safe to read once during program startup.  Hoist the zva check
> into an ifunc resolver to optimize for the most common zva size
> i.e. 64 bytes.  I have retained the older memset as __memset_generic
> for internal libc.so use so that the change impact is minimal.  We
> should eventually have a discussion on what is more expensive, reading
> dczid_el0 on every memset invocation or the indirection due to PLT.
>
> The gains due to this are significant for falkor, with run time
> reductions as high as 48% in some cases.  Likewise for mustang,
> although the numbers are slightly lower.  Here's a sample from the
> falkor tests:

I don't like this at all for the increase file size for the not so
significant gain on real platforms.  I think we should declare falkor
micro-arch is broken and move on.

Thanks,
Andrew

>
> Function: memset
> Variant: walk
>                                     simple_memset       __memset_zva_64 __memset_generic
> ======================================================================================================
>                   length=256, char=0:       139.96 (-698.28%)           9.07 ( 48.26%)         17.53
>                   length=257, char=0:       140.50 (-699.03%)           9.53 ( 45.80%)         17.58
>                   length=258, char=0:       140.96 (-703.95%)           9.58 ( 45.36%)         17.53
>                   length=259, char=0:       141.56 (-705.16%)           9.53 ( 45.79%)         17.58
>                   length=260, char=0:       142.15 (-710.76%)           9.57 ( 45.39%)         17.53
>                   length=261, char=0:       142.50 (-710.39%)           9.53 ( 45.78%)         17.58
>                   length=262, char=0:       142.97 (-715.09%)           9.57 ( 45.42%)         17.54
>                   length=263, char=0:       143.51 (-716.18%)           9.53 ( 45.80%)         17.58
>                   length=264, char=0:       143.93 (-720.55%)           9.58 ( 45.39%)         17.54
>                   length=265, char=0:       144.56 (-722.07%)           9.53 ( 45.80%)         17.59
>                   length=266, char=0:       144.98 (-726.42%)           9.58 ( 45.42%)         17.54
>                   length=267, char=0:       145.53 (-727.53%)           9.53 ( 45.80%)         17.59
>                   length=268, char=0:       146.25 (-731.81%)           9.53 ( 45.79%)         17.58
>                   length=269, char=0:       146.52 (-735.39%)           9.53 ( 45.66%)         17.54
>                   length=270, char=0:       146.97 (-735.81%)           9.53 ( 45.80%)         17.58
>                   length=271, char=0:       147.54 (-741.08%)           9.58 ( 45.38%)         17.54
>                   length=512, char=0:       268.26 (-1307.85%)         12.06 ( 36.71%)         19.05
>                   length=513, char=0:       268.73 (-1273.89%)         13.56 ( 30.68%)         19.56
>                   length=514, char=0:       269.31 (-1276.89%)         13.56 ( 30.68%)         19.56
>                   length=515, char=0:       269.73 (-1279.05%)         13.56 ( 30.68%)         19.56
>                   length=516, char=0:       270.34 (-1282.24%)         13.56 ( 30.67%)         19.56
>                   length=517, char=0:       270.83 (-1284.71%)         13.56 ( 30.66%)         19.56
>                   length=518, char=0:       271.20 (-1286.54%)         13.56 ( 30.67%)         19.56
>                   length=519, char=0:       271.67 (-1288.67%)         13.65 ( 30.24%)         19.56
>                   length=520, char=0:       272.14 (-1291.04%)         13.65 ( 30.22%)         19.56
>                   length=521, char=0:       272.66 (-1293.69%)         13.65 ( 30.23%)         19.56
>                   length=522, char=0:       273.14 (-1296.13%)         13.65 ( 30.20%)         19.56
>                   length=523, char=0:       273.64 (-1298.75%)         13.65 ( 30.23%)         19.56
>                   length=524, char=0:       274.34 (-1302.16%)         13.66 ( 30.20%)         19.57
>                   length=525, char=0:       274.64 (-1297.78%)         13.56 ( 30.99%)         19.65
>                   length=526, char=0:       275.20 (-1300.04%)         13.56 ( 31.01%)         19.66
>                   length=527, char=0:       275.66 (-1302.86%)         13.56 ( 30.99%)         19.65
>                  length=1024, char=0:       524.46 (-2169.75%)         20.12 ( 12.92%)         23.11
>                  length=1025, char=0:       525.14 (-2124.63%)         21.62 (  8.40%)         23.61
>                  length=1026, char=0:       525.59 (-2125.36%)         21.88 (  7.37%)         23.62
>                  length=1027, char=0:       525.98 (-2127.14%)         21.62 (  8.46%)         23.62
>                  length=1028, char=0:       526.68 (-2131.10%)         21.62 (  8.42%)         23.61
>                  length=1029, char=0:       527.10 (-2131.70%)         21.79 (  7.73%)         23.62
>                  length=1030, char=0:       527.54 (-2118.51%)         21.62 (  9.10%)         23.78
>                  length=1031, char=0:       527.98 (-2136.37%)         21.62 (  8.43%)         23.61
>                  length=1032, char=0:       528.70 (-2139.38%)         21.62 (  8.43%)         23.61
>                  length=1033, char=0:       529.25 (-2124.37%)         21.62 (  9.11%)         23.79
>                  length=1034, char=0:       529.48 (-2142.95%)         21.62 (  8.43%)         23.61
>                  length=1035, char=0:       530.11 (-2145.13%)         21.62 (  8.44%)         23.61
>                  length=1036, char=0:       530.76 (-2147.10%)         21.79 (  7.73%)         23.62
>                  length=1037, char=0:       531.03 (-2149.45%)         21.62 (  8.42%)         23.61
>                  length=1038, char=0:       531.64 (-2151.87%)         21.62 (  8.42%)         23.61
>                  length=1039, char=0:       531.99 (-2151.63%)         21.80 (  7.75%)         23.63
>
>         * sysdeps/aarch64/memset-reg.h: New file.
>         * sysdeps/aarch64/memset.S: Use it.
>         (__memset): Rename to MEMSET macro.
>         [ZVA_MACRO]: Use zva_macro.
>         * sysdeps/aarch64/multiarch/Makefile (sysdep_routines):
>         Add memset_generic and memset_zva_64.
>         * sysdeps/aarch64/multiarch/ifunc-impl-list.c
>         (__libc_ifunc_impl_list): Add memset ifuncs.
>         * sysdeps/aarch64/multiarch/init-arch.h (INIT_ARCH): New
>         local variable zva_size.
>         * sysdeps/aarch64/multiarch/memset.c: New file.
>         * sysdeps/aarch64/multiarch/memset_generic.S: New file.
>         * sysdeps/aarch64/multiarch/memset_zva_64.S: New file.
>         * sysdeps/aarch64/multiarch/rtld-memset.S: New file.
>         * sysdeps/unix/sysv/linux/aarch64/cpu-features.c
>         (DCZID_DZP_MASK): New macro.
>         (DCZID_BS_MASK): Likewise.
>         (init_cpu_features): Read and set zva_size.
>         * sysdeps/unix/sysv/linux/aarch64/cpu-features.h
>         (struct cpu_features): New member zva_size.
> ---
>  sysdeps/aarch64/memset-reg.h                   | 30 ++++++++++++++++
>  sysdeps/aarch64/memset.S                       | 27 +++++---------
>  sysdeps/aarch64/multiarch/Makefile             |  2 +-
>  sysdeps/aarch64/multiarch/ifunc-impl-list.c    |  3 ++
>  sysdeps/aarch64/multiarch/init-arch.h          |  8 +++--
>  sysdeps/aarch64/multiarch/memset.c             | 41 +++++++++++++++++++++
>  sysdeps/aarch64/multiarch/memset_generic.S     | 27 ++++++++++++++
>  sysdeps/aarch64/multiarch/memset_zva_64.S      | 49 ++++++++++++++++++++++++++
>  sysdeps/aarch64/multiarch/rtld-memset.S        | 23 ++++++++++++
>  sysdeps/unix/sysv/linux/aarch64/cpu-features.c | 10 ++++++
>  sysdeps/unix/sysv/linux/aarch64/cpu-features.h |  1 +
>  11 files changed, 199 insertions(+), 22 deletions(-)
>  create mode 100644 sysdeps/aarch64/memset-reg.h
>  create mode 100644 sysdeps/aarch64/multiarch/memset.c
>  create mode 100644 sysdeps/aarch64/multiarch/memset_generic.S
>  create mode 100644 sysdeps/aarch64/multiarch/memset_zva_64.S
>  create mode 100644 sysdeps/aarch64/multiarch/rtld-memset.S
>
> diff --git a/sysdeps/aarch64/memset-reg.h b/sysdeps/aarch64/memset-reg.h
> new file mode 100644
> index 0000000..518e562
> --- /dev/null
> +++ b/sysdeps/aarch64/memset-reg.h
> @@ -0,0 +1,30 @@
> +/* Register aliases for memset to be used across implementations.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define dstin  x0
> +#define val    x1
> +#define valw   w1
> +#define count  x2
> +#define dst    x3
> +#define dstend x4
> +#define tmp1   x5
> +#define tmp1w  w5
> +#define tmp2   x6
> +#define tmp2w  w6
> +#define zva_len x7
> +#define zva_lenw w7
> diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
> index 110fd22..45fb0a8 100644
> --- a/sysdeps/aarch64/memset.S
> +++ b/sysdeps/aarch64/memset.S
> @@ -17,6 +17,7 @@
>     <http://www.gnu.org/licenses/>.  */
>
>  #include <sysdep.h>
> +#include "memset-reg.h"
>
>  /* Assumptions:
>   *
> @@ -24,20 +25,7 @@
>   *
>   */
>
> -#define dstin  x0
> -#define val    x1
> -#define valw   w1
> -#define count  x2
> -#define dst    x3
> -#define dstend x4
> -#define tmp1   x5
> -#define tmp1w  w5
> -#define tmp2   x6
> -#define tmp2w  w6
> -#define zva_len x7
> -#define zva_lenw w7
> -
> -ENTRY_ALIGN (__memset, 6)
> +ENTRY_ALIGN (MEMSET, 6)
>
>         DELOUSE (0)
>         DELOUSE (2)
> @@ -108,8 +96,11 @@ L(tail64):
>         stp     q0, q0, [dstend, -32]
>         ret
>
> -       .p2align 3
>  L(try_zva):
> +#ifdef ZVA_MACRO
> +       zva_macro
> +#else
> +       .p2align 3
>         mrs     tmp1, dczid_el0
>         tbnz    tmp1w, 4, L(no_zva)
>         and     tmp1w, tmp1w, 15
> @@ -189,7 +180,7 @@ L(zva_other):
>         b.hs    3b
>  4:     add     count, count, zva_len
>         b       L(tail64)
> +#endif
>
> -END (__memset)
> -weak_alias (__memset, memset)
> -libc_hidden_builtin_def (memset)
> +END (MEMSET)
> +libc_hidden_builtin_def (MEMSET)
> diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
> index 9aa1e79..57a6c8b 100644
> --- a/sysdeps/aarch64/multiarch/Makefile
> +++ b/sysdeps/aarch64/multiarch/Makefile
> @@ -1,4 +1,4 @@
>  ifeq ($(subdir),string)
>  sysdep_routines += memcpy_generic memcpy_thunderx memcpy_falkor \
> -                  memmove_falkor
> +                  memmove_falkor memset_generic memset_zva_64
>  endif
> diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> index 2cb74d5..a8c2f6d 100644
> --- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
> @@ -46,6 +46,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_thunderx)
>               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_falkor)
>               IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
> +  IFUNC_IMPL (i, name, memset,
> +             IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva_64)
> +             IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
>
>    return i;
>  }
> diff --git a/sysdeps/aarch64/multiarch/init-arch.h b/sysdeps/aarch64/multiarch/init-arch.h
> index 3af442c..a756dad 100644
> --- a/sysdeps/aarch64/multiarch/init-arch.h
> +++ b/sysdeps/aarch64/multiarch/init-arch.h
> @@ -18,6 +18,8 @@
>
>  #include <ldsodefs.h>
>
> -#define INIT_ARCH()                            \
> -  uint64_t __attribute__((unused)) midr =      \
> -    GLRO(dl_aarch64_cpu_features).midr_el1;
> +#define INIT_ARCH()                                                          \
> +  uint64_t __attribute__((unused)) midr =                                    \
> +    GLRO(dl_aarch64_cpu_features).midr_el1;                                  \
> +  unsigned __attribute__((unused)) zva_size =                                \
> +    GLRO(dl_aarch64_cpu_features).zva_size;
> diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
> new file mode 100644
> index 0000000..fcaa376
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset.c
> @@ -0,0 +1,41 @@
> +/* Multiple versions of memset. AARCH64 version.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +
> +#if IS_IN (libc)
> +/* Redefine memset so that the compiler won't complain about the type
> +   mismatch with the IFUNC selector in strong_alias, below.  */
> +# undef memset
> +# define memset __redirect_memset
> +# include <string.h>
> +# include <init-arch.h>
> +
> +extern __typeof (__redirect_memset) __libc_memset;
> +
> +extern __typeof (__redirect_memset) __memset_nozva attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_64 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_zva_128 attribute_hidden;
> +extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
> +
> +libc_ifunc (__libc_memset, (zva_size == 64 ? __memset_zva_64
> +                           : __memset_generic));
> +
> +# undef memset
> +strong_alias (__libc_memset, memset);
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_generic.S b/sysdeps/aarch64/multiarch/memset_generic.S
> new file mode 100644
> index 0000000..55134ed
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_generic.S
> @@ -0,0 +1,27 @@
> +/* Memset for aarch64, default version for internal use.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +# define MEMSET __memset_generic
> +/* Add a hidden definition for use within libc.so.  */
> +# ifdef SHARED
> +       .globl __GI_memset; __GI_memset = __memset_generic
> +# endif
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/memset_zva_64.S b/sysdeps/aarch64/multiarch/memset_zva_64.S
> new file mode 100644
> index 0000000..bb9f140
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/memset_zva_64.S
> @@ -0,0 +1,49 @@
> +/* Memset for zva size of 64 bytes.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <memset-reg.h>
> +
> +#if IS_IN (libc)
> +.macro zva_macro
> +       .p2align 4
> +       /* Write the first and last 64 byte aligned block using stp rather
> +          than using DC ZVA.  This is faster on some cores.  */
> +       str     q0, [dst, 16]
> +       stp     q0, q0, [dst, 32]
> +       bic     dst, dst, 63
> +       stp     q0, q0, [dst, 64]
> +       stp     q0, q0, [dst, 96]
> +       sub     count, dstend, dst      /* Count is now 128 too large.  */
> +       sub     count, count, 128+64+64 /* Adjust count and bias for loop.  */
> +       add     dst, dst, 128
> +1:     dc      zva, dst
> +       add     dst, dst, 64
> +       subs    count, count, 64
> +       b.hi    1b
> +       stp     q0, q0, [dst, 0]
> +       stp     q0, q0, [dst, 32]
> +       stp     q0, q0, [dstend, -64]
> +       stp     q0, q0, [dstend, -32]
> +       ret
> +.endm
> +
> +# define ZVA_MACRO zva_macro
> +# define MEMSET __memset_zva_64
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/aarch64/multiarch/rtld-memset.S b/sysdeps/aarch64/multiarch/rtld-memset.S
> new file mode 100644
> index 0000000..4437393
> --- /dev/null
> +++ b/sysdeps/aarch64/multiarch/rtld-memset.S
> @@ -0,0 +1,23 @@
> +/* Memset for aarch64, for the dynamic linker.
> +   Copyright (C) 2017 Free Software Foundation, Inc.
> +
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library.  If not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (rtld)
> +# define MEMSET memset
> +# include <sysdeps/aarch64/memset.S>
> +#endif
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> index e769eeb..092ee81 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
> @@ -20,6 +20,9 @@
>  #include <sys/auxv.h>
>  #include <elf/dl-hwcaps.h>
>
> +#define DCZID_DZP_MASK (1 << 4)
> +#define DCZID_BS_MASK (0xf)
> +
>  #if HAVE_TUNABLES
>  struct cpu_list
>  {
> @@ -72,4 +75,11 @@ init_cpu_features (struct cpu_features *cpu_features)
>      }
>
>    cpu_features->midr_el1 = midr;
> +
> +  /* Check if ZVA is enabled.  */
> +  unsigned dczid;
> +  asm volatile ("mrs %0, dczid_el0" : "=r"(dczid));
> +
> +  if ((dczid & DCZID_DZP_MASK) == 0)
> +    cpu_features->zva_size = 4 << (dczid & DCZID_BS_MASK);
>  }
> diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> index 73cb53d..f2b6afd 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> +++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.h
> @@ -47,6 +47,7 @@
>  struct cpu_features
>  {
>    uint64_t midr_el1;
> +  unsigned zva_size;
>  };
>
>  #endif /* _CPU_FEATURES_AARCH64_H  */
> --
> 2.7.5
>


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]