This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: RFC: x86-64: Use fxsave/xsave/xsavec in _dl_runtime_resolve [BZ #21265]
H.J.,
Thank you for all the work on this and for moving it forward so that we
can support more user applications on GNU/Linux.
The patch looks good to me, with one minor nit: please use ALIGN_UP to
make it clear you are doing an alignment operation.
OK with that change.
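For reference, ALIGN_UP comes from <libc-pointer-arith.h>; roughly, the
helpers there look like the sketch below (see the header for the
authoritative definitions), which is why they read better than an
open-coded mask:

  /* Rough sketch of the macros in <libc-pointer-arith.h>.  */
  #define ALIGN_DOWN(base, size)  ((base) & -((__typeof__ (base)) (size)))
  #define ALIGN_UP(base, size)    ALIGN_DOWN ((base) + (size) - 1, (size))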
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
On 10/19/2017 03:36 PM, H.J. Lu wrote:
> [BZ #21265]
> * sysdeps/x86/cpu-features-offsets.sym (XSAVE_STATE_SIZE_OFFSET):
> New.
> * sysdeps/x86/cpu-features.c (get_common_indeces): Set
> xsave_state_size, xsave_state_full_size and
> bit_arch_XSAVEC_Usable if needed.
> (init_cpu_features): Remove bit_arch_Use_dl_runtime_resolve_slow
> and bit_arch_Use_dl_runtime_resolve_opt.
> * sysdeps/x86/cpu-features.h (bit_arch_Use_dl_runtime_resolve_opt):
> Removed.
> (bit_arch_Use_dl_runtime_resolve_slow): Likewise.
> (bit_arch_Prefer_No_AVX512): Updated.
> (bit_arch_MathVec_Prefer_No_AVX512): Likewise.
> (bit_arch_XSAVEC_Usable): New.
> (STATE_SAVE_OFFSET): Likewise.
> (STATE_SAVE_MASK): Likewise.
> [__ASSEMBLER__]: Include <cpu-features-offsets.h>.
> (cpu_features): Add xsave_state_size and xsave_state_full_size.
> (index_arch_Use_dl_runtime_resolve_opt): Removed.
> (index_arch_Use_dl_runtime_resolve_slow): Likewise.
> (index_arch_XSAVEC_Usable): New.
> * sysdeps/x86/cpu-tunables.c (TUNABLE_CALLBACK (set_hwcaps)):
> Support XSAVEC_Usable. Remove Use_dl_runtime_resolve_slow.
> * sysdeps/x86_64/Makefile (tst-x86_64-1-ENV): New if tunables
> is enabled.
> * sysdeps/x86_64/dl-machine.h (elf_machine_runtime_setup):
> Replace _dl_runtime_resolve_sse, _dl_runtime_resolve_avx,
> _dl_runtime_resolve_avx_slow, _dl_runtime_resolve_avx_opt,
> _dl_runtime_resolve_avx512 and _dl_runtime_resolve_avx512_opt
> with _dl_runtime_resolve_fxsave, _dl_runtime_resolve_xsave and
> _dl_runtime_resolve_xsavec.
> * sysdeps/x86_64/dl-trampoline.S (DL_RUNTIME_UNALIGNED_VEC_SIZE):
> Removed.
> (DL_RUNTIME_RESOLVE_REALIGN_STACK): Check STATE_SAVE_ALIGNMENT
> instead of VEC_SIZE.
> (REGISTER_SAVE_BND0): Removed.
> (REGISTER_SAVE_BND1): Likewise.
> (REGISTER_SAVE_BND3): Likewise.
> (REGISTER_SAVE_RAX): Always defined to 0.
> (VMOV): Removed.
> (_dl_runtime_resolve_avx): Likewise.
> (_dl_runtime_resolve_avx_slow): Likewise.
> (_dl_runtime_resolve_avx_opt): Likewise.
> (_dl_runtime_resolve_avx512): Likewise.
> (_dl_runtime_resolve_avx512_opt): Likewise.
> (_dl_runtime_resolve_sse): Likewise.
> (_dl_runtime_resolve_sse_vex): Likewise.
> (USE_FXSAVE): New.
> (_dl_runtime_resolve_fxsave): Likewise.
> (USE_XSAVE): Likewise.
> (_dl_runtime_resolve_xsave): Likewise.
> (USE_XSAVEC): Likewise.
> (_dl_runtime_resolve_xsavec): Likewise.
> * sysdeps/x86_64/dl-trampoline.h (_dl_runtime_resolve_avx512):
> Removed.
> (_dl_runtime_resolve_avx512_opt): Likewise.
> (_dl_runtime_resolve_avx): Likewise.
> (_dl_runtime_resolve_avx_opt): Likewise.
> (_dl_runtime_resolve_sse): Likewise.
> (_dl_runtime_resolve_sse_vex): Likewise.
> (_dl_runtime_resolve_fxsave): New.
> (_dl_runtime_resolve_xsave): Likewise.
> (_dl_runtime_resolve_xsavec): Likewise.
> ---
> sysdeps/x86/cpu-features-offsets.sym | 1 +
> sysdeps/x86/cpu-features.c | 87 +++++++++---
> sysdeps/x86/cpu-features.h | 34 ++++-
> sysdeps/x86/cpu-tunables.c | 17 ++-
> sysdeps/x86_64/Makefile | 4 +
> sysdeps/x86_64/dl-machine.h | 38 ++---
> sysdeps/x86_64/dl-trampoline.S | 87 ++++--------
> sysdeps/x86_64/dl-trampoline.h | 266 ++++++++++-------------------------
> 8 files changed, 228 insertions(+), 306 deletions(-)
>
> diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym
> index f6739fae81..33dd094e37 100644
> --- a/sysdeps/x86/cpu-features-offsets.sym
> +++ b/sysdeps/x86/cpu-features-offsets.sym
> @@ -15,6 +15,7 @@ CPUID_ECX_OFFSET offsetof (struct cpuid_registers, ecx)
> CPUID_EDX_OFFSET offsetof (struct cpuid_registers, edx)
> FAMILY_OFFSET offsetof (struct cpu_features, family)
> MODEL_OFFSET offsetof (struct cpu_features, model)
> +XSAVE_STATE_SIZE_OFFSET offsetof (struct cpu_features, xsave_state_size)
> FEATURE_OFFSET offsetof (struct cpu_features, feature)
> FEATURE_SIZE sizeof (unsigned int)
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 332b0f0d4a..6a5034f3c7 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
(With ALIGN_UP you will also need to #include <libc-pointer-arith.h> here.)
> @@ -103,6 +103,76 @@ get_common_indeces (struct cpu_features *cpu_features,
> }
> }
> }
> +
> + /* For _dl_runtime_resolve, set xsave_state_size to xsave area
> + size + integer register save size and align it to 64 bytes. */
OK.
> + if (cpu_features->max_cpuid >= 0xd)
> + {
> + unsigned int eax, ebx, ecx, edx;
> +
> + __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
> + if (ebx != 0)
> + {
> + unsigned int xsave_state_full_size
> + = (ebx + STATE_SAVE_OFFSET + 63) & -64;
Use ALIGN_UP.
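i.e., as an untested sketch of what I have in mind:

  unsigned int xsave_state_full_size
    = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);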
> +
> + cpu_features->xsave_state_size
> + = xsave_state_full_size;
> + cpu_features->xsave_state_full_size
> + = xsave_state_full_size;
> +
> + __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
> +
> + /* Check if XSAVEC is available. */
> + if ((eax & (1 << 1)) != 0)
> + {
> + unsigned int xstate_comp_offsets[32];
> + unsigned int xstate_comp_sizes[32];
> + unsigned int i;
> +
> + xstate_comp_offsets[0] = 0;
> + xstate_comp_offsets[1] = 160;
> + xstate_comp_offsets[2] = 576;
> + xstate_comp_sizes[0] = 160;
> + xstate_comp_sizes[1] = 256;
> +
> + for (i = 2; i < 32; i++)
> + {
> + if ((STATE_SAVE_MASK & (1 << i)) != 0)
> + {
> + __cpuid_count (0xd, i, eax, ebx, ecx, edx);
> + xstate_comp_sizes[i] = eax;
> + }
> + else
> + {
> + ecx = 0;
> + xstate_comp_sizes[i] = 0;
OK.
> + }
> +
> + if (i > 2)
> + {
> + xstate_comp_offsets[i]
> + = (xstate_comp_offsets[i - 1]
> + + xstate_comp_sizes[i -1]);
> + if ((ecx & (1 << 1)) != 0)
> + xstate_comp_offsets[i]
> + = (xstate_comp_offsets[i] + 63) & -64;
> + }
> + }
> +
> + /* Use XSAVEC. */
> + unsigned int size
> + = xstate_comp_offsets[31] + xstate_comp_sizes[31];
> + if (size)
> + {
> + cpu_features->xsave_state_size
> + = (size + STATE_SAVE_OFFSET + 63) & -64;
Use ALIGN_UP here as well, same pattern as above.
> + cpu_features->feature[index_arch_XSAVEC_Usable]
> + |= bit_arch_XSAVEC_Usable;
OK.
> + }
> + }
> + }
> + }
> }
> }
>
> @@ -242,23 +312,6 @@ init_cpu_features (struct cpu_features *cpu_features)
> else
> cpu_features->feature[index_arch_Prefer_No_AVX512]
> |= bit_arch_Prefer_No_AVX512;
> -
> - /* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
> - If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt.
> - Use _dl_runtime_resolve_opt only with AVX512F since it is
> - slower than _dl_runtime_resolve_slow with AVX. */
> - cpu_features->feature[index_arch_Use_dl_runtime_resolve_slow]
> - |= bit_arch_Use_dl_runtime_resolve_slow;
> - if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
> - && cpu_features->max_cpuid >= 0xd)
> - {
> - unsigned int eax;
> -
> - __cpuid_count (0xd, 1, eax, ebx, ecx, edx);
> - if ((eax & (1 << 2)) != 0)
> - cpu_features->feature[index_arch_Use_dl_runtime_resolve_opt]
> - |= bit_arch_Use_dl_runtime_resolve_opt;
> - }
> }
> /* This spells out "AuthenticAMD". */
> else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
> diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
> index a032a2e168..b7f7898d11 100644
> --- a/sysdeps/x86/cpu-features.h
> +++ b/sysdeps/x86/cpu-features.h
> @@ -37,10 +37,9 @@
> #define bit_arch_Prefer_No_VZEROUPPER (1 << 17)
> #define bit_arch_Fast_Unaligned_Copy (1 << 18)
> #define bit_arch_Prefer_ERMS (1 << 19)
> -#define bit_arch_Use_dl_runtime_resolve_opt (1 << 20)
> -#define bit_arch_Use_dl_runtime_resolve_slow (1 << 21)
> -#define bit_arch_Prefer_No_AVX512 (1 << 22)
> -#define bit_arch_MathVec_Prefer_No_AVX512 (1 << 23)
> +#define bit_arch_Prefer_No_AVX512 (1 << 20)
> +#define bit_arch_MathVec_Prefer_No_AVX512 (1 << 21)
> +#define bit_arch_XSAVEC_Usable (1 << 22)
OK.
>
> /* CPUID Feature flags. */
>
> @@ -91,8 +90,18 @@
> /* The current maximum size of the feature integer bit array. */
> #define FEATURE_INDEX_MAX 1
>
> -#ifndef __ASSEMBLER__
> +/* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need
> + space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. It must be
> + aligned to 16 bytes for fxsave and 64 bytes for xsave. */
> +#define STATE_SAVE_OFFSET (8 * 7 + 8)
>
> +/* Save SSE, AVX, AVX512, mask and bound registers. */
> +#define STATE_SAVE_MASK \
> + ((1 << 1) | (1 << 2) | (1 << 3) | (1 << 5) | (1 << 6) | (1 << 7))
> +
> +#ifdef __ASSEMBLER__
> +# include <cpu-features-offsets.h>
> +#else /* __ASSEMBLER__ */
> enum
> {
> COMMON_CPUID_INDEX_1 = 0,
> @@ -121,6 +130,18 @@ struct cpu_features
> } cpuid[COMMON_CPUID_INDEX_MAX];
> unsigned int family;
> unsigned int model;
> + /* The state size for XSAVEC or XSAVE. The type must be unsigned long
> + int so that we use
> +
> + sub xsave_state_size_offset(%rip) %RSP_LP
> +
> + in _dl_runtime_resolve. */
> + unsigned long int xsave_state_size;
> + /* The full state size for XSAVE when XSAVEC is disabled by
> +
> + GLIBC_TUNABLES=glibc.tune.hwcaps=-XSAVEC_Usable
> + */
> + unsigned int xsave_state_full_size;
> unsigned int feature[FEATURE_INDEX_MAX];
> /* Data cache size for use in memory and string routines, typically
> L1 size. */
> @@ -237,10 +258,9 @@ extern const struct cpu_features *__get_cpu_features (void)
> # define index_arch_Prefer_No_VZEROUPPER FEATURE_INDEX_1
> # define index_arch_Fast_Unaligned_Copy FEATURE_INDEX_1
> # define index_arch_Prefer_ERMS FEATURE_INDEX_1
> -# define index_arch_Use_dl_runtime_resolve_opt FEATURE_INDEX_1
> -# define index_arch_Use_dl_runtime_resolve_slow FEATURE_INDEX_1
> # define index_arch_Prefer_No_AVX512 FEATURE_INDEX_1
> # define index_arch_MathVec_Prefer_No_AVX512 FEATURE_INDEX_1
> +# define index_arch_XSAVEC_Usable FEATURE_INDEX_1
OK.
>
> #endif /* !__ASSEMBLER__ */
>
> diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
> index ec72d86f08..dcd0165f2e 100644
> --- a/sysdeps/x86/cpu-tunables.c
> +++ b/sysdeps/x86/cpu-tunables.c
> @@ -242,6 +242,16 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
> Slow_SSE4_2, SSE4_2,
> disable, 11);
> break;
> + case 13:
> + if (disable)
> + {
> + /* Update xsave_state_size to XSAVE state size. */
> + cpu_features->xsave_state_size
> + = cpu_features->xsave_state_full_size;
> + CHECK_GLIBC_IFUNC_ARCH_OFF (n, cpu_features,
> + XSAVEC_Usable, 13);
> + }
OK.
> + break;
> case 14:
> if (disable)
> {
> @@ -317,13 +327,6 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
> disable, 26);
> }
> break;
> - case 27:
> - {
> - CHECK_GLIBC_IFUNC_ARCH_BOTH (n, cpu_features,
> - Use_dl_runtime_resolve_slow,
> - disable, 27);
> - }
> - break;
> }
> p += len + 1;
> }
> diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
> index 12d4737240..9f1562f1b2 100644
> --- a/sysdeps/x86_64/Makefile
> +++ b/sysdeps/x86_64/Makefile
> @@ -55,6 +55,10 @@ CFLAGS-tst-quad2pie.c = $(PIE-ccflag)
> tests += tst-x86_64-1
> modules-names += x86_64/tst-x86_64mod-1
> LDFLAGS-tst-x86_64mod-1.so = -Wl,-soname,tst-x86_64mod-1.so
> +ifneq (no,$(have-tunables))
> +# Test the state size for XSAVE when XSAVEC is disabled.
> +tst-x86_64-1-ENV = GLIBC_TUNABLES=glibc.tune.hwcaps=-XSAVEC_Usable
> +endif
OK. Thanks for adding a test with the tunable!
>
> $(objpfx)tst-x86_64-1: $(objpfx)x86_64/tst-x86_64mod-1.so
>
> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> index 6a04cbcdc9..905a37a5cc 100644
> --- a/sysdeps/x86_64/dl-machine.h
> +++ b/sysdeps/x86_64/dl-machine.h
> @@ -66,12 +66,9 @@ static inline int __attribute__ ((unused, always_inline))
> elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
> {
> Elf64_Addr *got;
> - extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
> - extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
> - extern void _dl_runtime_resolve_avx_slow (ElfW(Word)) attribute_hidden;
> - extern void _dl_runtime_resolve_avx_opt (ElfW(Word)) attribute_hidden;
> - extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
> - extern void _dl_runtime_resolve_avx512_opt (ElfW(Word)) attribute_hidden;
> + extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> + extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> + extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
OK.
> extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
> extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
> extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> @@ -120,29 +117,14 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
> /* This function will get called to fix up the GOT entry
> indicated by the offset on the stack, and then jump to
> the resolved address. */
> - if (HAS_ARCH_FEATURE (AVX512F_Usable))
> - {
> - if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
> - *(ElfW(Addr) *) (got + 2)
> - = (ElfW(Addr)) &_dl_runtime_resolve_avx512_opt;
> - else
> - *(ElfW(Addr) *) (got + 2)
> - = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
> - }
> - else if (HAS_ARCH_FEATURE (AVX_Usable))
> - {
> - if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_opt))
> - *(ElfW(Addr) *) (got + 2)
> - = (ElfW(Addr)) &_dl_runtime_resolve_avx_opt;
> - else if (HAS_ARCH_FEATURE (Use_dl_runtime_resolve_slow))
> - *(ElfW(Addr) *) (got + 2)
> - = (ElfW(Addr)) &_dl_runtime_resolve_avx_slow;
> - else
> - *(ElfW(Addr) *) (got + 2)
> - = (ElfW(Addr)) &_dl_runtime_resolve_avx;
> - }
> + if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> + *(ElfW(Addr) *) (got + 2)
> + = (HAS_ARCH_FEATURE (XSAVEC_Usable)
> + ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> + : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
OK.
> else
> - *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
> + *(ElfW(Addr) *) (got + 2)
> + = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> }
> }
>
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index c14c61aa58..a645572e44 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -34,41 +34,24 @@
> # define DL_STACK_ALIGNMENT 8
> #endif
>
> -#ifndef DL_RUNTIME_UNALIGNED_VEC_SIZE
> -/* The maximum size in bytes of unaligned vector load and store in the
> - dynamic linker. Since SSE optimized memory/string functions with
> - aligned SSE register load and store are used in the dynamic linker,
> - we must set this to 8 so that _dl_runtime_resolve_sse will align the
> - stack before calling _dl_fixup. */
> -# define DL_RUNTIME_UNALIGNED_VEC_SIZE 8
> -#endif
> -
> -/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes. */
> +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> + stack to 16 bytes before calling _dl_fixup. */
> #define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> - (VEC_SIZE > DL_STACK_ALIGNMENT \
> - && VEC_SIZE > DL_RUNTIME_UNALIGNED_VEC_SIZE)
> -
> -/* Align vector register save area to 16 bytes. */
> -#define REGISTER_SAVE_VEC_OFF 0
> + (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> + || 16 > DL_STACK_ALIGNMENT)
OK.
>
> /* Area on stack to save and restore registers used for parameter
> passing when calling _dl_fixup. */
> #ifdef __ILP32__
> -# define REGISTER_SAVE_RAX (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
> # define PRESERVE_BND_REGS_PREFIX
> #else
> -/* Align bound register save area to 16 bytes. */
> -# define REGISTER_SAVE_BND0 (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
> -# define REGISTER_SAVE_BND1 (REGISTER_SAVE_BND0 + 16)
> -# define REGISTER_SAVE_BND2 (REGISTER_SAVE_BND1 + 16)
> -# define REGISTER_SAVE_BND3 (REGISTER_SAVE_BND2 + 16)
> -# define REGISTER_SAVE_RAX (REGISTER_SAVE_BND3 + 16)
> # ifdef HAVE_MPX_SUPPORT
> # define PRESERVE_BND_REGS_PREFIX bnd
> # else
> # define PRESERVE_BND_REGS_PREFIX .byte 0xf2
> # endif
> #endif
> +#define REGISTER_SAVE_RAX 0
> #define REGISTER_SAVE_RCX (REGISTER_SAVE_RAX + 8)
> #define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8)
> #define REGISTER_SAVE_RSI (REGISTER_SAVE_RDX + 8)
> @@ -80,68 +63,56 @@
>
> #define VEC_SIZE 64
> #define VMOVA vmovdqa64
> -#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
> -# define VMOV vmovdqa64
> -#else
> -# define VMOV vmovdqu64
> -#endif
> #define VEC(i) zmm##i
> -#define _dl_runtime_resolve _dl_runtime_resolve_avx512
> #define _dl_runtime_profile _dl_runtime_profile_avx512
> #include "dl-trampoline.h"
> -#undef _dl_runtime_resolve
> #undef _dl_runtime_profile
> #undef VEC
> -#undef VMOV
> #undef VMOVA
> #undef VEC_SIZE
>
> #define VEC_SIZE 32
> #define VMOVA vmovdqa
> -#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
> -# define VMOV vmovdqa
> -#else
> -# define VMOV vmovdqu
> -#endif
> #define VEC(i) ymm##i
> -#define _dl_runtime_resolve _dl_runtime_resolve_avx
> -#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx_opt
> #define _dl_runtime_profile _dl_runtime_profile_avx
> #include "dl-trampoline.h"
> -#undef _dl_runtime_resolve
> -#undef _dl_runtime_resolve_opt
> #undef _dl_runtime_profile
> #undef VEC
> -#undef VMOV
> #undef VMOVA
> #undef VEC_SIZE
>
> /* movaps/movups is 1-byte shorter. */
> #define VEC_SIZE 16
> #define VMOVA movaps
> -#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
> -# define VMOV movaps
> -#else
> -# define VMOV movups
> -#endif
> #define VEC(i) xmm##i
> -#define _dl_runtime_resolve _dl_runtime_resolve_sse
> #define _dl_runtime_profile _dl_runtime_profile_sse
> #undef RESTORE_AVX
> #include "dl-trampoline.h"
> -#undef _dl_runtime_resolve
> #undef _dl_runtime_profile
> -#undef VMOV
> +#undef VEC
> #undef VMOVA
> +#undef VEC_SIZE
>
> -/* Used by _dl_runtime_resolve_avx_opt/_dl_runtime_resolve_avx512_opt
> - to preserve the full vector registers with zero upper bits. */
> -#define VMOVA vmovdqa
> -#if DL_RUNTIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
> -# define VMOV vmovdqa
> -#else
> -# define VMOV vmovdqu
> -#endif
> -#define _dl_runtime_resolve _dl_runtime_resolve_sse_vex
> -#define _dl_runtime_resolve_opt _dl_runtime_resolve_avx512_opt
> +#define USE_FXSAVE
> +#define STATE_SAVE_ALIGNMENT 16
> +#define _dl_runtime_resolve _dl_runtime_resolve_fxsave
> +#include "dl-trampoline.h"
> +#undef _dl_runtime_resolve
> +#undef USE_FXSAVE
> +#undef STATE_SAVE_ALIGNMENT
> +
> +#define USE_XSAVE
> +#define STATE_SAVE_ALIGNMENT 64
> +#define _dl_runtime_resolve _dl_runtime_resolve_xsave
> +#include "dl-trampoline.h"
> +#undef _dl_runtime_resolve
> +#undef USE_XSAVE
> +#undef STATE_SAVE_ALIGNMENT
> +
> +#define USE_XSAVEC
> +#define STATE_SAVE_ALIGNMENT 64
> +#define _dl_runtime_resolve _dl_runtime_resolve_xsavec
> #include "dl-trampoline.h"
> +#undef _dl_runtime_resolve
> +#undef USE_XSAVEC
> +#undef STATE_SAVE_ALIGNMENT
OK.
> diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> index 8db24c16ac..dfd7e4b803 100644
> --- a/sysdeps/x86_64/dl-trampoline.h
> +++ b/sysdeps/x86_64/dl-trampoline.h
> @@ -16,140 +16,47 @@
> License along with the GNU C Library; if not, see
> <http://www.gnu.org/licenses/>. */
>
> -#undef REGISTER_SAVE_AREA_RAW
> -#ifdef __ILP32__
> -/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
> - VEC7. */
> -# define REGISTER_SAVE_AREA_RAW (8 * 7 + VEC_SIZE * 8)
> -#else
> -/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
> - BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
> -# define REGISTER_SAVE_AREA_RAW (8 * 7 + 16 * 4 + VEC_SIZE * 8)
> -#endif
> + .text
> +#ifdef _dl_runtime_resolve
>
> -#undef REGISTER_SAVE_AREA
> -#undef LOCAL_STORAGE_AREA
> -#undef BASE
> -#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> -# define REGISTER_SAVE_AREA (REGISTER_SAVE_AREA_RAW + 8)
> -/* Local stack area before jumping to function address: RBX. */
> -# define LOCAL_STORAGE_AREA 8
> -# define BASE rbx
> -# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
> -# error REGISTER_SAVE_AREA must be multples of VEC_SIZE
> -# endif
> -#else
> -# define REGISTER_SAVE_AREA REGISTER_SAVE_AREA_RAW
> -/* Local stack area before jumping to function address: All saved
> - registers. */
> -# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA
> -# define BASE rsp
> -# if (REGISTER_SAVE_AREA % 16) != 8
> -# error REGISTER_SAVE_AREA must be odd multples of 8
> +# undef REGISTER_SAVE_AREA
> +# undef LOCAL_STORAGE_AREA
> +# undef BASE
> +
> +# if (STATE_SAVE_ALIGNMENT % 16) != 0
> +# error STATE_SAVE_ALIGNMENT must be multples of 16
> # endif
> -#endif
>
> - .text
> -#ifdef _dl_runtime_resolve_opt
> -/* Use the smallest vector registers to preserve the full YMM/ZMM
> - registers to avoid SSE transition penalty. */
> -
> -# if VEC_SIZE == 32
> -/* Check if the upper 128 bits in %ymm0 - %ymm7 registers are non-zero
> - and preserve %xmm0 - %xmm7 registers with the zero upper bits. Since
> - there is no SSE transition penalty on AVX512 processors which don't
> - support XGETBV with ECX == 1, _dl_runtime_resolve_avx512_slow isn't
> - provided. */
> - .globl _dl_runtime_resolve_avx_slow
> - .hidden _dl_runtime_resolve_avx_slow
> - .type _dl_runtime_resolve_avx_slow, @function
> - .align 16
> -_dl_runtime_resolve_avx_slow:
> - cfi_startproc
> - cfi_adjust_cfa_offset(16) # Incorporate PLT
> - vorpd %ymm0, %ymm1, %ymm8
> - vorpd %ymm2, %ymm3, %ymm9
> - vorpd %ymm4, %ymm5, %ymm10
> - vorpd %ymm6, %ymm7, %ymm11
> - vorpd %ymm8, %ymm9, %ymm9
> - vorpd %ymm10, %ymm11, %ymm10
> - vpcmpeqd %xmm8, %xmm8, %xmm8
> - vorpd %ymm9, %ymm10, %ymm10
> - vptest %ymm10, %ymm8
> - # Preserve %ymm0 - %ymm7 registers if the upper 128 bits of any
> - # %ymm0 - %ymm7 registers aren't zero.
> - PRESERVE_BND_REGS_PREFIX
> - jnc _dl_runtime_resolve_avx
> - # Use vzeroupper to avoid SSE transition penalty.
> - vzeroupper
> - # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits
> - # when the upper 128 bits of %ymm0 - %ymm7 registers are zero.
> - PRESERVE_BND_REGS_PREFIX
> - jmp _dl_runtime_resolve_sse_vex
> - cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
> - cfi_endproc
> - .size _dl_runtime_resolve_avx_slow, .-_dl_runtime_resolve_avx_slow
> +# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> +# error STATE_SAVE_OFFSET must be multples of STATE_SAVE_ALIGNMENT
> # endif
>
> -/* Use XGETBV with ECX == 1 to check which bits in vector registers are
> - non-zero and only preserve the non-zero lower bits with zero upper
> - bits. */
> - .globl _dl_runtime_resolve_opt
> - .hidden _dl_runtime_resolve_opt
> - .type _dl_runtime_resolve_opt, @function
> - .align 16
> -_dl_runtime_resolve_opt:
> - cfi_startproc
> - cfi_adjust_cfa_offset(16) # Incorporate PLT
> - pushq %rax
> - cfi_adjust_cfa_offset(8)
> - cfi_rel_offset(%rax, 0)
> - pushq %rcx
> - cfi_adjust_cfa_offset(8)
> - cfi_rel_offset(%rcx, 0)
> - pushq %rdx
> - cfi_adjust_cfa_offset(8)
> - cfi_rel_offset(%rdx, 0)
> - movl $1, %ecx
> - xgetbv
> - movl %eax, %r11d
> - popq %rdx
> - cfi_adjust_cfa_offset(-8)
> - cfi_restore (%rdx)
> - popq %rcx
> - cfi_adjust_cfa_offset(-8)
> - cfi_restore (%rcx)
> - popq %rax
> - cfi_adjust_cfa_offset(-8)
> - cfi_restore (%rax)
> -# if VEC_SIZE == 32
> - # For YMM registers, check if YMM state is in use.
> - andl $bit_YMM_state, %r11d
> - # Preserve %xmm0 - %xmm7 registers with the zero upper 128 bits if
> - # YMM state isn't in use.
> - PRESERVE_BND_REGS_PREFIX
> - jz _dl_runtime_resolve_sse_vex
> -# elif VEC_SIZE == 16
> - # For ZMM registers, check if YMM state and ZMM state are in
> - # use.
> - andl $(bit_YMM_state | bit_ZMM0_15_state), %r11d
> - cmpl $bit_YMM_state, %r11d
> - # Preserve %zmm0 - %zmm7 registers if ZMM state is in use.
> - PRESERVE_BND_REGS_PREFIX
> - jg _dl_runtime_resolve_avx512
> - # Preserve %ymm0 - %ymm7 registers with the zero upper 256 bits if
> - # ZMM state isn't in use.
> - PRESERVE_BND_REGS_PREFIX
> - je _dl_runtime_resolve_avx
> - # Preserve %xmm0 - %xmm7 registers with the zero upper 384 bits if
> - # neither YMM state nor ZMM state are in use.
> +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +/* Local stack area before jumping to function address: RBX. */
> +# define LOCAL_STORAGE_AREA 8
> +# define BASE rbx
> +# ifdef USE_FXSAVE
> +/* Use fxsave to save XMM registers. */
> +# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET)
> +# if (REGISTER_SAVE_AREA % 16) != 0
> +# error REGISTER_SAVE_AREA must be multples of 16
> +# endif
> +# endif
> # else
> -# error Unsupported VEC_SIZE!
> +# ifndef USE_FXSAVE
> +# error USE_FXSAVE must be defined
> +# endif
> +/* Use fxsave to save XMM registers. */
> +# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET + 8)
> +/* Local stack area before jumping to function address: All saved
> + registers. */
> +# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA
> +# define BASE rsp
> +# if (REGISTER_SAVE_AREA % 16) != 8
> +# error REGISTER_SAVE_AREA must be odd multples of 8
> +# endif
> # endif
> - cfi_adjust_cfa_offset(-16) # Restore PLT adjustment
> - cfi_endproc
> - .size _dl_runtime_resolve_opt, .-_dl_runtime_resolve_opt
> -#endif
> +
> .globl _dl_runtime_resolve
> .hidden _dl_runtime_resolve
> .type _dl_runtime_resolve, @function
> @@ -157,21 +64,29 @@ _dl_runtime_resolve_opt:
> cfi_startproc
> _dl_runtime_resolve:
> cfi_adjust_cfa_offset(16) # Incorporate PLT
> -#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> -# if LOCAL_STORAGE_AREA != 8
> -# error LOCAL_STORAGE_AREA must be 8
> -# endif
> +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +# if LOCAL_STORAGE_AREA != 8
> +# error LOCAL_STORAGE_AREA must be 8
> +# endif
OK.
> pushq %rbx # push subtracts stack by 8.
> cfi_adjust_cfa_offset(8)
> cfi_rel_offset(%rbx, 0)
> mov %RSP_LP, %RBX_LP
> cfi_def_cfa_register(%rbx)
> - and $-VEC_SIZE, %RSP_LP
> -#endif
> + and $-STATE_SAVE_ALIGNMENT, %RSP_LP
> +# endif
> +# ifdef REGISTER_SAVE_AREA
> sub $REGISTER_SAVE_AREA, %RSP_LP
> -#if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> +# if !DL_RUNTIME_RESOLVE_REALIGN_STACK
> cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> -#endif
> +# endif
> +# else
> +# if IS_IN (rtld)
> + sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +# else
> + sub _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +# endif
> +# endif
OK. This allocates stack space of the required size, computed at runtime, for the register save area.
> # Preserve registers otherwise clobbered.
> movq %rax, REGISTER_SAVE_RAX(%rsp)
> movq %rcx, REGISTER_SAVE_RCX(%rsp)
> @@ -180,59 +95,42 @@ _dl_runtime_resolve:
> movq %rdi, REGISTER_SAVE_RDI(%rsp)
> movq %r8, REGISTER_SAVE_R8(%rsp)
> movq %r9, REGISTER_SAVE_R9(%rsp)
> - VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
> - VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
> - VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
> - VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
> - VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
> - VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
> - VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
> - VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
> -#ifndef __ILP32__
> - # We also have to preserve bound registers. These are nops if
> - # Intel MPX isn't available or disabled.
> -# ifdef HAVE_MPX_SUPPORT
> - bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
> - bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
> - bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
> - bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
> +# ifdef USE_FXSAVE
> + fxsave STATE_SAVE_OFFSET(%rsp)
OK.
> # else
> -# if REGISTER_SAVE_BND0 == 0
> - .byte 0x66,0x0f,0x1b,0x04,0x24
> + movl $STATE_SAVE_MASK, %eax
> + xorl %edx, %edx
> + # Clear the XSAVE Header.
> +# ifdef USE_XSAVE
> + movq %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
> +# endif
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
> +# ifdef USE_XSAVE
> + xsave STATE_SAVE_OFFSET(%rsp)
OK.
> # else
> - .byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
> + xsavec STATE_SAVE_OFFSET(%rsp)
OK.
> # endif
> - .byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
> - .byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
> - .byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
> # endif
> -#endif
> # Copy args pushed by PLT in register.
> # %rdi: link_map, %rsi: reloc_index
> mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
> mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
> call _dl_fixup # Call resolver.
> mov %RAX_LP, %R11_LP # Save return value
> -#ifndef __ILP32__
> - # Restore bound registers. These are nops if Intel MPX isn't
> - # avaiable or disabled.
> -# ifdef HAVE_MPX_SUPPORT
> - bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
> - bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
> - bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
> - bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
> + # Get register content back.
> +# ifdef USE_FXSAVE
> + fxrstor STATE_SAVE_OFFSET(%rsp)
> # else
> - .byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
> - .byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
> - .byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
> -# if REGISTER_SAVE_BND0 == 0
> - .byte 0x66,0x0f,0x1a,0x04,0x24
> -# else
> - .byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
> -# endif
> + movl $STATE_SAVE_MASK, %eax
> + xorl %edx, %edx
> + xrstor STATE_SAVE_OFFSET(%rsp)
OK.
> # endif
> -#endif
> - # Get register content back.
> movq REGISTER_SAVE_R9(%rsp), %r9
> movq REGISTER_SAVE_R8(%rsp), %r8
> movq REGISTER_SAVE_RDI(%rsp), %rdi
> @@ -240,20 +138,12 @@ _dl_runtime_resolve:
> movq REGISTER_SAVE_RDX(%rsp), %rdx
> movq REGISTER_SAVE_RCX(%rsp), %rcx
> movq REGISTER_SAVE_RAX(%rsp), %rax
> - VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
> - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
> - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
> - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
> - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
> - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
> - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
> - VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
OK. Don't need these any more.
> -#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> mov %RBX_LP, %RSP_LP
> cfi_def_cfa_register(%rsp)
> movq (%rsp), %rbx
> cfi_restore(%rbx)
> -#endif
> +# endif
> # Adjust stack(PLT did 2 pushes)
> add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
> cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
> @@ -262,11 +152,9 @@ _dl_runtime_resolve:
> jmp *%r11 # Jump to function address.
> cfi_endproc
> .size _dl_runtime_resolve, .-_dl_runtime_resolve
> +#endif
>
>
> -/* To preserve %xmm0 - %xmm7 registers, dl-trampoline.h is included
> - twice, for _dl_runtime_resolve_sse and _dl_runtime_resolve_sse_vex.
> - But we don't need another _dl_runtime_profile for XMM registers. */
OK.
> #if !defined PROF && defined _dl_runtime_profile
> # if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
> # error LR_VECTOR_OFFSET must be multples of VEC_SIZE
> -- 2.13.6
--
Cheers,
Carlos.