This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch release/2.25/master updated. glibc-2.25-14-g06d7980
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 28 Apr 2017 17:23:40 -0000
- Subject: GNU C Library master sources branch release/2.25/master updated. glibc-2.25-14-g06d7980
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, release/2.25/master has been updated
via 06d79808f6faf6025c5a7d4e27d949a8216275cc (commit)
via 4c6f97798fe1854a32b1199c42370eac1620eebf (commit)
from b30b1c97ccfe72e82b0c95bb55274b5660bc539e (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=06d79808f6faf6025c5a7d4e27d949a8216275cc
commit 06d79808f6faf6025c5a7d4e27d949a8216275cc
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Apr 28 10:04:15 2017 -0700
x86: Use AVX2 memcpy/memset on Skylake server [BZ #21396]
On Skylake server, AVX512 load/store instructions in memcpy/memset may
lead to lower CPU turbo frequency in certain situations. Use of AVX2
in memcpy/memset has been observed to have improved overall performance
in many workloads due to the higher frequency.
Since AVX512ER is unique to Xeon Phi, this patch sets Prefer_No_AVX512
if AVX512ER isn't available so that AVX2 versions of memcpy/memset are
used on Skylake server.
[BZ #21396]
* sysdeps/x86/cpu-features.c (init_cpu_features): Set
Prefer_No_AVX512 if AVX512ER isn't available.
* sysdeps/x86/cpu-features.h (bit_arch_Prefer_No_AVX512): New.
(index_arch_Prefer_No_AVX512): Likewise.
* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Don't use
AVX512 version if Prefer_No_AVX512 is set.
* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk):
Likewise.
* sysdeps/x86_64/multiarch/memmove.S (__libc_memmove): Likewise.
* sysdeps/x86_64/multiarch/memmove_chk.S (__memmove_chk):
Likewise.
* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk):
Likewise.
* sysdeps/x86_64/multiarch/memset.S (memset): Likewise.
* sysdeps/x86_64/multiarch/memset_chk.S (__memset_chk):
Likewise.
(cherry picked from commit 4cb334c4d6249686653137ec273d081371b3672d)
diff --git a/ChangeLog b/ChangeLog
index dc49c78..adebc03 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,26 @@
2017-04-28 H.J. Lu <hongjiu.lu@intel.com>
+ [BZ #21396]
+ * sysdeps/x86/cpu-features.c (init_cpu_features): Set
+ Prefer_No_AVX512 if AVX512ER isn't available.
+ * sysdeps/x86/cpu-features.h (bit_arch_Prefer_No_AVX512): New.
+ (index_arch_Prefer_No_AVX512): Likewise.
+ * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Don't use
+ AVX512 version if Prefer_No_AVX512 is set.
+ * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk):
+ Likewise.
+ * sysdeps/x86_64/multiarch/memmove.S (__libc_memmove): Likewise.
+ * sysdeps/x86_64/multiarch/memmove_chk.S (__memmove_chk):
+ Likewise.
+ * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
+ * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk):
+ Likewise.
+ * sysdeps/x86_64/multiarch/memset.S (memset): Likewise.
+ * sysdeps/x86_64/multiarch/memset_chk.S (__memset_chk):
+ Likewise.
+
+2017-04-28 H.J. Lu <hongjiu.lu@intel.com>
+
* sysdeps/x86/cpu-features.c (init_cpu_features): Set
Prefer_No_VZEROUPPER if AVX512ER is available.
* sysdeps/x86/cpu-features.h
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 41d0be2..9afd74c 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -225,10 +225,14 @@ init_cpu_features (struct cpu_features *cpu_features)
|= bit_arch_AVX_Fast_Unaligned_Load;
/* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
- if AVX512ER is available. */
+ if AVX512ER is available. Don't use AVX512 to avoid lower CPU
+ frequency if AVX512ER isn't available. */
if (CPU_FEATURES_CPU_P (cpu_features, AVX512ER))
cpu_features->feature[index_arch_Prefer_No_VZEROUPPER]
|= bit_arch_Prefer_No_VZEROUPPER;
+ else
+ cpu_features->feature[index_arch_Prefer_No_AVX512]
+ |= bit_arch_Prefer_No_AVX512;
/* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt. */
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 2ee8a0a..a409db6 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -39,6 +39,7 @@
#define bit_arch_Prefer_ERMS (1 << 19)
#define bit_arch_Use_dl_runtime_resolve_opt (1 << 20)
#define bit_arch_Use_dl_runtime_resolve_slow (1 << 21)
+#define bit_arch_Prefer_No_AVX512 (1 << 22)
/* CPUID Feature flags. */
@@ -116,6 +117,7 @@
# define index_arch_Prefer_ERMS FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Use_dl_runtime_resolve_opt FEATURE_INDEX_1*FEATURE_SIZE
# define index_arch_Use_dl_runtime_resolve_slow FEATURE_INDEX_1*FEATURE_SIZE
+# define index_arch_Prefer_No_AVX512 FEATURE_INDEX_1*FEATURE_SIZE
# if defined (_LIBC) && !IS_IN (nonlib)
@@ -298,6 +300,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_arch_Prefer_ERMS FEATURE_INDEX_1
# define index_arch_Use_dl_runtime_resolve_opt FEATURE_INDEX_1
# define index_arch_Use_dl_runtime_resolve_slow FEATURE_INDEX_1
+# define index_arch_Prefer_No_AVX512 FEATURE_INDEX_1
#endif /* !__ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 1f83ee3..af27703 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -32,6 +32,8 @@ ENTRY(__new_memcpy)
lea __memcpy_erms(%rip), %RAX_LP
HAS_ARCH_FEATURE (Prefer_ERMS)
jnz 2f
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
lea __memcpy_avx512_no_vzeroupper(%rip), %RAX_LP
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 5492342..8737fb9 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -30,6 +30,8 @@
ENTRY(__memcpy_chk)
.type __memcpy_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
lea __memcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
diff --git a/sysdeps/x86_64/multiarch/memmove.S b/sysdeps/x86_64/multiarch/memmove.S
index 2021bfc..8c534e8 100644
--- a/sysdeps/x86_64/multiarch/memmove.S
+++ b/sysdeps/x86_64/multiarch/memmove.S
@@ -30,6 +30,8 @@ ENTRY(__libc_memmove)
lea __memmove_erms(%rip), %RAX_LP
HAS_ARCH_FEATURE (Prefer_ERMS)
jnz 2f
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
lea __memmove_avx512_no_vzeroupper(%rip), %RAX_LP
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.S b/sysdeps/x86_64/multiarch/memmove_chk.S
index 8a252ad..7870dd0 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.S
+++ b/sysdeps/x86_64/multiarch/memmove_chk.S
@@ -29,6 +29,8 @@
ENTRY(__memmove_chk)
.type __memmove_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
lea __memmove_chk_avx512_no_vzeroupper(%rip), %RAX_LP
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index 79c840d..b8b2b28 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -32,6 +32,8 @@ ENTRY(__mempcpy)
lea __mempcpy_erms(%rip), %RAX_LP
HAS_ARCH_FEATURE (Prefer_ERMS)
jnz 2f
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
lea __mempcpy_avx512_no_vzeroupper(%rip), %RAX_LP
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 6927962..072b22c 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -30,6 +30,8 @@
ENTRY(__mempcpy_chk)
.type __mempcpy_chk, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 1f
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 1f
lea __mempcpy_chk_avx512_no_vzeroupper(%rip), %RAX_LP
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index c958b2f..9d33118 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -41,6 +41,8 @@ ENTRY(memset)
jnz L(AVX512F)
lea __memset_avx2_unaligned(%rip), %RAX_LP
L(AVX512F):
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 2f
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 2f
lea __memset_avx512_no_vzeroupper(%rip), %RAX_LP
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
index 79eaa37..7e08311 100644
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -38,6 +38,8 @@ ENTRY(__memset_chk)
jnz L(AVX512F)
lea __memset_chk_avx2_unaligned(%rip), %RAX_LP
L(AVX512F):
+ HAS_ARCH_FEATURE (Prefer_No_AVX512)
+ jnz 2f
HAS_ARCH_FEATURE (AVX512F_Usable)
jz 2f
lea __memset_chk_avx512_no_vzeroupper(%rip), %RAX_LP
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4c6f97798fe1854a32b1199c42370eac1620eebf
commit 4c6f97798fe1854a32b1199c42370eac1620eebf
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Apr 28 10:03:09 2017 -0700
x86: Set Prefer_No_VZEROUPPER if AVX512ER is available
AVX512ER won't be implemented in any Xeon processors and will be in
all Xeon Phi processors. Don't check CPU model number when setting
Prefer_No_VZEROUPPER for Xeon Phi. Instead, set Prefer_No_VZEROUPPER
if AVX512ER is available. It works with current and future Xeon Phi
and non-Xeon Phi processors.
* sysdeps/x86/cpu-features.c (init_cpu_features): Set
Prefer_No_VZEROUPPER if AVX512ER is available.
* sysdeps/x86/cpu-features.h
(bit_cpu_AVX512PF): New.
(bit_cpu_AVX512ER): Likewise.
(bit_cpu_AVX512CD): Likewise.
(bit_cpu_AVX512BW): Likewise.
(bit_cpu_AVX512VL): Likewise.
(index_cpu_AVX512PF): Likewise.
(index_cpu_AVX512ER): Likewise.
(index_cpu_AVX512CD): Likewise.
(index_cpu_AVX512BW): Likewise.
(index_cpu_AVX512VL): Likewise.
(reg_AVX512PF): Likewise.
(reg_AVX512ER): Likewise.
(reg_AVX512CD): Likewise.
(reg_AVX512BW): Likewise.
(reg_AVX512VL): Likewise.
(cherry picked from commit 1c53cb49de6d82d9469ccbd5aa0c55924502bd8b)
diff --git a/ChangeLog b/ChangeLog
index 4962000..dc49c78 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+2017-04-28 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/x86/cpu-features.c (init_cpu_features): Set
+ Prefer_No_VZEROUPPER if AVX512ER is available.
+ * sysdeps/x86/cpu-features.h
+ (bit_cpu_AVX512PF): New.
+ (bit_cpu_AVX512ER): Likewise.
+ (bit_cpu_AVX512CD): Likewise.
+ (bit_cpu_AVX512BW): Likewise.
+ (bit_cpu_AVX512VL): Likewise.
+ (index_cpu_AVX512PF): Likewise.
+ (index_cpu_AVX512ER): Likewise.
+ (index_cpu_AVX512CD): Likewise.
+ (index_cpu_AVX512BW): Likewise.
+ (index_cpu_AVX512VL): Likewise.
+ (reg_AVX512PF): Likewise.
+ (reg_AVX512ER): Likewise.
+ (reg_AVX512CD): Likewise.
+ (reg_AVX512BW): Likewise.
+ (reg_AVX512VL): Likewise.
+
2017-04-11 Adhemerval Zanella <adhemerval.zanella@linaro.org>
* posix/globtest.sh: Add cleanup routine on trap 0.
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 1c714a4..41d0be2 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -139,8 +139,6 @@ init_cpu_features (struct cpu_features *cpu_features)
case 0x57:
/* Knights Landing. Enable Silvermont optimizations. */
- cpu_features->feature[index_arch_Prefer_No_VZEROUPPER]
- |= bit_arch_Prefer_No_VZEROUPPER;
case 0x5c:
case 0x5f:
@@ -226,6 +224,12 @@ init_cpu_features (struct cpu_features *cpu_features)
cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
|= bit_arch_AVX_Fast_Unaligned_Load;
+ /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
+ if AVX512ER is available. */
+ if (CPU_FEATURES_CPU_P (cpu_features, AVX512ER))
+ cpu_features->feature[index_arch_Prefer_No_VZEROUPPER]
+ |= bit_arch_Prefer_No_VZEROUPPER;
+
/* To avoid SSE transition penalty, use _dl_runtime_resolve_slow.
If XGETBV suports ECX == 1, use _dl_runtime_resolve_opt. */
cpu_features->feature[index_arch_Use_dl_runtime_resolve_slow]
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 95f0fcf..2ee8a0a 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -62,6 +62,11 @@
#define bit_cpu_AVX2 (1 << 5)
#define bit_cpu_AVX512F (1 << 16)
#define bit_cpu_AVX512DQ (1 << 17)
+#define bit_cpu_AVX512PF (1 << 26)
+#define bit_cpu_AVX512ER (1 << 27)
+#define bit_cpu_AVX512CD (1 << 28)
+#define bit_cpu_AVX512BW (1 << 30)
+#define bit_cpu_AVX512VL (1u << 31)
/* XCR0 Feature flags. */
#define bit_XMM_state (1 << 1)
@@ -236,6 +241,11 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_cpu_AVX2 COMMON_CPUID_INDEX_7
# define index_cpu_AVX512F COMMON_CPUID_INDEX_7
# define index_cpu_AVX512DQ COMMON_CPUID_INDEX_7
+# define index_cpu_AVX512PF COMMON_CPUID_INDEX_7
+# define index_cpu_AVX512ER COMMON_CPUID_INDEX_7
+# define index_cpu_AVX512CD COMMON_CPUID_INDEX_7
+# define index_cpu_AVX512BW COMMON_CPUID_INDEX_7
+# define index_cpu_AVX512VL COMMON_CPUID_INDEX_7
# define index_cpu_ERMS COMMON_CPUID_INDEX_7
# define index_cpu_RTM COMMON_CPUID_INDEX_7
# define index_cpu_FMA COMMON_CPUID_INDEX_1
@@ -254,6 +264,11 @@ extern const struct cpu_features *__get_cpu_features (void)
# define reg_AVX2 ebx
# define reg_AVX512F ebx
# define reg_AVX512DQ ebx
+# define reg_AVX512PF ebx
+# define reg_AVX512ER ebx
+# define reg_AVX512CD ebx
+# define reg_AVX512BW ebx
+# define reg_AVX512VL ebx
# define reg_ERMS ebx
# define reg_RTM ebx
# define reg_FMA ecx
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 42 ++++++++++++++++++++++++++++++++
sysdeps/x86/cpu-features.c | 12 +++++++-
sysdeps/x86/cpu-features.h | 18 +++++++++++++
sysdeps/x86_64/multiarch/memcpy.S | 2 +
sysdeps/x86_64/multiarch/memcpy_chk.S | 2 +
sysdeps/x86_64/multiarch/memmove.S | 2 +
sysdeps/x86_64/multiarch/memmove_chk.S | 2 +
sysdeps/x86_64/multiarch/mempcpy.S | 2 +
sysdeps/x86_64/multiarch/mempcpy_chk.S | 2 +
sysdeps/x86_64/multiarch/memset.S | 2 +
sysdeps/x86_64/multiarch/memset_chk.S | 2 +
11 files changed, 86 insertions(+), 2 deletions(-)
hooks/post-receive
--
GNU C Library master sources