This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: memcpy performance regressions 2.19 -> 2.24(5)


Here is the patch that slightly refactors how init_cacheinfo is called.

On Mon, May 22, 2017 at 7:24 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Mon, May 22, 2017 at 6:23 PM, Erich Elsen <eriche@google.com> wrote:
>> I definitely think increasing the size in the case of processors with
>> a large number of cores makes sense.  Hopefully with some testing we
>> can confirm it is a net win and/or find a more empirical number.
>>
>> Thanks for that patch with the tunable support.  I've just put a
>> similar patch in review for sharing right now.  It adds support for the
>> case where HAVE_TUNABLES isn't defined, like the similar code in arena.c,
>> and also makes a minor change that turns init_cacheinfo into an
>> init_cacheinfo_impl (a hidden callable).  init_cacheinfo is now a
>> constructor that just calls the impl and passes the cpu_features
>> struct.  This is useful in that it makes the code a bit more modular
>> (something that we'll need to be able to test this internally).
>
> This sounds like a good idea.  I'd also like to add tunable support in
> init_cpu_features to turn on/off CPU features.  non_temporal_threshold
> will be one of them.
>
>
> --
> H.J.
From 87b133a3df55e4e444f893a354f01e10e7557ac6 Mon Sep 17 00:00:00 2001
From: Erich Elsen <eriche@google.com>
Date: Mon, 22 May 2017 18:08:58 -0700
Subject: [PATCH 1/2] add tunable for non temporal store. slightly refactor
 cache info code to allow for the possibility of calling the implementation.

---
 elf/dl-tunables.list    |  7 ++++
 sysdeps/x86/cacheinfo.c | 95 +++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 84 insertions(+), 18 deletions(-)

diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index b9f1488798..d19fb0f175 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -30,6 +30,13 @@
 # 	     NONE: Read all the time.
 
 glibc {
+  x86_cache {
+    x86_shared_non_temporal_threshold {
+      type: SIZE_T
+      env_alias: SHARED_NON_TEMPORAL_THRESHOLD
+      security_level: SXID_IGNORE
+    }
+  }
   malloc {
     check {
       type: INT_32
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 1ccbe41b8f..2619c5a83c 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -23,6 +23,15 @@
 #include <cpuid.h>
 #include <init-arch.h>
 
+#if HAVE_TUNABLES
+# define TUNABLE_NAMESPACE x86_cache
+#else
+  #include <string.h>
+  extern char **_environ;
+#endif
+#include <elf/dl-tunables.h>
+
+
 #define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
 #define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
 #define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
@@ -128,7 +137,7 @@ intel_02_known_compare (const void *p1, const void *p2)
 static long int
 __attribute__ ((noinline))
 intel_check_word (int name, unsigned int value, bool *has_level_2,
-		  bool *no_level_2_or_3)
+		  bool *no_level_2_or_3, const struct cpu_features* x86_cpu_features)
 {
   if ((value & 0x80000000) != 0)
     /* The register value is reserved.  */
@@ -206,8 +215,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
 	      /* Intel reused this value.  For family 15, model 6 it
 		 specifies the 3rd level cache.  Otherwise the 2nd
 		 level cache.  */
-	      unsigned int family = GLRO(dl_x86_cpu_features).family;
-	      unsigned int model = GLRO(dl_x86_cpu_features).model;
+	      unsigned int family = x86_cpu_features->family;
+	      unsigned int model = x86_cpu_features->model;
 
 	      if (family == 15 && model == 6)
 		{
@@ -257,7 +266,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
 
 
 static long int __attribute__ ((noinline))
-handle_intel (int name, unsigned int maxidx)
+handle_intel (int name, unsigned int maxidx,
+              const struct cpu_features* x86_cpu_features)
 {
   /* Return -1 for older CPUs.  */
   if (maxidx < 2)
@@ -289,19 +299,23 @@ handle_intel (int name, unsigned int maxidx)
 	}
 
       /* Process the individual registers' value.  */
-      result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3);
+      result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3,
+                                 x86_cpu_features);
       if (result != 0)
 	return result;
 
-      result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3);
+      result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3,
+                                 x86_cpu_features);
       if (result != 0)
 	return result;
 
-      result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3);
+      result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3,
+                                 x86_cpu_features);
       if (result != 0)
 	return result;
 
-      result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3);
+      result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3,
+                                 x86_cpu_features);
       if (result != 0)
 	return result;
     }
@@ -437,7 +451,7 @@ attribute_hidden
 __cache_sysconf (int name)
 {
   if (is_intel)
-    return handle_intel (name, max_cpuid);
+    return handle_intel (name, max_cpuid, &GLRO(dl_x86_cpu_features));
 
   if (is_amd)
     return handle_amd (name);
@@ -475,9 +489,9 @@ int __x86_prefetchw attribute_hidden;
 #endif
 
 
-static void
-__attribute__((constructor))
-init_cacheinfo (void)
+void
+attribute_hidden
+__init_cacheinfo_impl (const struct cpu_features* x86_cpu_features)
 {
   /* Find out what brand of processor.  */
   unsigned int eax;
@@ -492,14 +506,17 @@ init_cacheinfo (void)
 
   if (is_intel)
     {
-      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
+      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid,
+                           x86_cpu_features);
 
-      long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
+      long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid,
+                                    x86_cpu_features);
       bool inclusive_cache = true;
 
       /* Try L3 first.  */
       level  = 3;
-      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);
+      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid,
+                             x86_cpu_features);
 
       /* Number of logical processors sharing L2 cache.  */
       int threads_l2;
@@ -529,8 +546,8 @@ init_cacheinfo (void)
 	     highest cache level.  */
 	  if (max_cpuid >= 4)
 	    {
-	      unsigned int family = GLRO(dl_x86_cpu_features).family;
-	      unsigned int model = GLRO(dl_x86_cpu_features).model;
+				unsigned int family = x86_cpu_features->family;
+				unsigned int model = x86_cpu_features->model;
 
 	      int i = 0;
 
@@ -673,7 +690,7 @@ intel_bug_no_cache_info:
 		 level.  */
 
 	      threads
-		= ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
+		= ((x86_cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx
 		    >> 16) & 0xff);
 	    }
 
@@ -768,4 +785,46 @@ intel_bug_no_cache_info:
      shared cache size is the approximate value above which non-temporal
      store becomes faster.  */
   __x86_shared_non_temporal_threshold = __x86_shared_cache_size * 6;
+
+#if HAVE_TUNABLES
+  TUNABLE_SET_VAL(x86_shared_non_temporal_threshold,
+                  &__x86_shared_non_temporal_threshold);
+#else
+  if (__glibc_likely (_environ != NULL))
+    {
+      char **runp = _environ;
+      char *envline;
+
+      while (*runp != NULL)
+        {
+          envline = *runp;
+          runp++;
+          size_t len = strcspn (envline, "=");
+
+          if (envline[len] != '=')
+            continue;
+
+          switch (len)
+            {
+            case 29:
+              if (!__builtin_expect (__libc_enable_secure, 0))
+                {
+                  if (memcmp (envline,
+                              "SHARED_NON_TEMPORAL_THRESHOLD", 29) == 0)
+                    __x86_shared_non_temporal_threshold = atoi (&envline[29]);
+                }
+              break;
+            default:
+              break;
+            }
+        }
+    }
+#endif
+}
+
+static void
+__attribute__((constructor))
+init_cacheinfo (void)
+{
+  __init_cacheinfo_impl (&GLRO(dl_x86_cpu_features));
 }
-- 
2.13.0.219.gdb65acc882-goog


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]