This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: memcpy performance regressions 2.19 -> 2.24(5)
Here is the patch that slightly refactors how init_cacheinfo is called.
On Mon, May 22, 2017 at 7:24 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, May 22, 2017 at 6:23 PM, Erich Elsen <eriche@google.com> wrote:
>> I definitely think increasing the size in the case of processors with
>> a large number of cores makes sense. Hopefully with some testing we
>> can confirm it is a net win and/or find a more empirical number.
>>
>> Thanks for that patch with the tunable support. I've just put a
>> similar patch in review for sharing right now. It adds support in the
>> case that HAVE_TUNABLES isn't defined like the similar code in arena.c
and also makes a minor change that turns init_cacheinfo into an
init_cacheinfo_impl (a hidden callable). init_cacheinfo is now a
>> constructor that just calls the impl and passes the cpu_features
>> struct. This is useful in that it makes the code a bit more modular
(something that we'll need in order to test this internally).
>
> This sounds like a good idea. I'd also like to add tunable support in
> init_cpu_features to turn on/off CPU features. non_temporal_threshold
> will be one of them.
>
>
> --
> H.J.
From 87b133a3df55e4e444f893a354f01e10e7557ac6 Mon Sep 17 00:00:00 2001
From: Erich Elsen <eriche@google.com>
Date: Mon, 22 May 2017 18:08:58 -0700
Subject: [PATCH 1/2] add tunable for non temporal store. slightly refactor
cache info code to allow for the possibility of calling the implementation.
---
elf/dl-tunables.list | 7 ++++
sysdeps/x86/cacheinfo.c | 95 +++++++++++++++++++++++++++++++++++++++----------
2 files changed, 84 insertions(+), 18 deletions(-)
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index b9f1488798..d19fb0f175 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -30,6 +30,13 @@
# NONE: Read all the time.
glibc {
+ x86_cache {
+ x86_shared_non_temporal_threshold {
+ type: SIZE_T
+ env_alias: SHARED_NON_TEMPORAL_THRESHOLD
+ security_level: SXID_IGNORE
+ }
+ }
malloc {
check {
type: INT_32
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 1ccbe41b8f..2619c5a83c 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -23,6 +23,15 @@
#include <cpuid.h>
#include <init-arch.h>
+#if HAVE_TUNABLES
+# define TUNABLE_NAMESPACE x86_cache
+#else
+ #include <string.h>
+ extern char **_environ;
+#endif
+#include <elf/dl-tunables.h>
+
+
#define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
#define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
@@ -128,7 +137,7 @@ intel_02_known_compare (const void *p1, const void *p2)
static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
- bool *no_level_2_or_3)
+ bool *no_level_2_or_3, const struct cpu_features* x86_cpu_features)
{
if ((value & 0x80000000) != 0)
/* The register value is reserved. */
@@ -206,8 +215,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
/* Intel reused this value. For family 15, model 6 it
specifies the 3rd level cache. Otherwise the 2nd
level cache. */
- unsigned int family = GLRO(dl_x86_cpu_features).family;
- unsigned int model = GLRO(dl_x86_cpu_features).model;
+ unsigned int family = x86_cpu_features->family;
+ unsigned int model = x86_cpu_features->model;
if (family == 15 && model == 6)
{
@@ -257,7 +266,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
static long int __attribute__ ((noinline))
-handle_intel (int name, unsigned int maxidx)
+handle_intel (int name, unsigned int maxidx,
+ const struct cpu_features* x86_cpu_features)
{
/* Return -1 for older CPUs. */
if (maxidx < 2)
@@ -289,19 +299,23 @@ handle_intel (int name, unsigned int maxidx)
}
/* Process the individual registers' value. */
- result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3);
+ result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3,
+ x86_cpu_features);
if (result != 0)
return result;
- result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3);
+ result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3,
+ x86_cpu_features);
if (result != 0)
return result;
- result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3);
+ result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3,
+ x86_cpu_features);
if (result != 0)
return result;
- result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3);
+ result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3,
+ x86_cpu_features);
if (result != 0)
return result;
}
@@ -437,7 +451,7 @@ attribute_hidden
__cache_sysconf (int name)
{
if (is_intel)
- return handle_intel (name, max_cpuid);
+ return handle_intel (name, max_cpuid, &GLRO(dl_x86_cpu_features));
if (is_amd)
return handle_amd (name);
@@ -475,9 +489,9 @@ int __x86_prefetchw attribute_hidden;
#endif
-static void
-__attribute__((constructor))
-init_cacheinfo (void)
+void
+attribute_hidden
+__init_cacheinfo_impl (const struct cpu_features* x86_cpu_features)
{
/* Find out what brand of processor. */
unsigned int eax;
@@ -492,14 +506,17 @@ init_cacheinfo (void)
if (is_intel)
{
- data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
+ data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid,
+ x86_cpu_features);
- long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
+ long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid,
+ x86_cpu_features);
bool inclusive_cache = true;
/* Try L3 first. */
level = 3;
- shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
+ shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid,
+ x86_cpu_features);
/* Number of logical processors sharing L2 cache. */
int threads_l2;
@@ -529,8 +546,8 @@ init_cacheinfo (void)
highest cache level. */
if (max_cpuid >= 4)
{
- unsigned int family = GLRO(dl_x86_cpu_features).family;
- unsigned int model = GLRO(dl_x86_cpu_features).model;
+ unsigned int family = x86_cpu_features->family;
+ unsigned int model = x86_cpu_features->model;
int i = 0;
@@ -673,7 +690,7 @@ intel_bug_no_cache_info:
level. */
threads
- = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
+ = ((x86_cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx
>> 16) & 0xff);
}
@@ -768,4 +785,46 @@ intel_bug_no_cache_info:
shared cache size is the approximate value above which non-temporal
store becomes faster. */
__x86_shared_non_temporal_threshold = __x86_shared_cache_size * 6;
+
+#if HAVE_TUNABLES
+ TUNABLE_SET_VAL(x86_shared_non_temporal_threshold,
+ &__x86_shared_non_temporal_threshold);
+#else
+ if (__glibc_likely (_environ != NULL))
+ {
+ char **runp = _environ;
+ char *envline;
+
+ while (*runp != NULL)
+ {
+ envline = *runp;
+ runp++;
+ size_t len = strcspn (envline, "=");
+
+ if (envline[len] != '=')
+ continue;
+
+ switch (len)
+ {
+ case 29:
+ if (!__builtin_expect (__libc_enable_secure, 0))
+ {
+ if (memcmp (envline,
+ "SHARED_NON_TEMPORAL_THRESHOLD", 29) == 0)
+ __x86_shared_non_temporal_threshold = atoi (&envline[29]);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+ }
+#endif
+}
+
+static void
+__attribute__((constructor))
+init_cacheinfo (void)
+{
+ __init_cacheinfo_impl (&GLRO(dl_x86_cpu_features));
}
--
2.13.0.219.gdb65acc882-goog