This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch hjl/cpuid/master created. glibc-2.21-669-g734d442


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch hjl/cpuid/master has been created
        at  734d442158bf9ff7532f80081eb016f9f10718aa (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=734d442158bf9ff7532f80081eb016f9f10718aa

commit 734d442158bf9ff7532f80081eb016f9f10718aa
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jul 31 13:46:05 2015 -0700

    Use LOAD_RTLD_GLOBAL_RO_RDX and HAS_XXX in libmvec

diff --git a/math/Makefile b/math/Makefile
index 6388bae..d3b483d 100644
--- a/math/Makefile
+++ b/math/Makefile
@@ -263,7 +263,7 @@ $(objpfx)libieee.a: $(objpfx)ieee-math.o
 $(addprefix $(objpfx),$(filter-out $(tests-static) $(libm-vec-tests),$(tests))): $(libm)
 $(addprefix $(objpfx),$(tests-static)): $(objpfx)libm.a
 $(addprefix $(objpfx), $(libm-vec-tests)): $(objpfx)%: $(libm) $(libmvec) \
-					   $(objpfx)init-arch.o $(objpfx)%-wrappers.o
+					   $(objpfx)%-wrappers.o
 
 gmp-objs = $(patsubst %,$(common-objpfx)stdlib/%.o,\
 		      add_n sub_n cmp addmul_1 mul_1 mul_n divmod_1 \
diff --git a/sysdeps/x86_64/fpu/math-tests-arch.h b/sysdeps/x86_64/fpu/math-tests-arch.h
index e8833bf..0de4cd8 100644
--- a/sysdeps/x86_64/fpu/math-tests-arch.h
+++ b/sysdeps/x86_64/fpu/math-tests-arch.h
@@ -19,66 +19,36 @@
 #if defined REQUIRE_AVX
 # include <init-arch.h>
 
-/* Set to 1 if AVX supported.  */
-static int avx_usable;
-
-# define INIT_ARCH_EXT                                         \
-  do                                                           \
-    {                                                          \
-      __init_cpu_features ();                                  \
-      avx_usable = __cpu_features.feature[index_AVX_Usable]    \
-                   & bit_AVX_Usable;                           \
-    }                                                          \
-  while (0)
+# define INIT_ARCH_EXT
 
 # define CHECK_ARCH_EXT                                        \
   do                                                           \
     {                                                          \
-      if (!avx_usable) return;                                 \
+      if (!HAS_AVX) return;                                    \
     }                                                          \
   while (0)
 
 #elif defined REQUIRE_AVX2
 # include <init-arch.h>
 
-  /* Set to 1 if AVX2 supported.  */
-  static int avx2_usable;
-
-# define INIT_ARCH_EXT                                         \
-  do                                                           \
-    {                                                          \
-      __init_cpu_features ();                                  \
-      avx2_usable = __cpu_features.feature[index_AVX2_Usable]  \
-                  & bit_AVX2_Usable;                           \
-    }                                                          \
-  while (0)
+# define INIT_ARCH_EXT
 
 # define CHECK_ARCH_EXT                                        \
   do                                                           \
     {                                                          \
-      if (!avx2_usable) return;                                \
+      if (!HAS_AVX2) return;                                   \
     }                                                          \
   while (0)
 
 #elif defined REQUIRE_AVX512F
 # include <init-arch.h>
 
-  /* Set to 1 if supported.  */
-  static int avx512f_usable;
-
-# define INIT_ARCH_EXT                                                \
-  do                                                                  \
-    {                                                                 \
-      __init_cpu_features ();                                         \
-      avx512f_usable = __cpu_features.feature[index_AVX512F_Usable]   \
-		       & bit_AVX512F_Usable;                          \
-    }                                                                 \
-  while (0)
+# define INIT_ARCH_EXT
 
 # define CHECK_ARCH_EXT                                        \
   do                                                           \
     {                                                          \
-      if (!avx512f_usable) return;                             \
+      if (!HAS_AVX512F) return;                                \
     }                                                          \
   while (0)
 
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
index 5f67d83..74305fb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2v_cos)
         .type   _ZGVbN2v_cos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_cos_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_cos_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_cos_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
index 5babb83..5ac3d0e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_cos)
         .type   _ZGVdN4v_cos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_cos_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_cos_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_cos_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
index d0f4f27..7d8f31c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_cos)
         .type   _ZGVeN8v_cos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
+	LOAD_RTLD_GLOBAL_RO_RDX
 1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN8v_cos_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
index ef3dc49..1d625ae 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2v_exp)
         .type   _ZGVbN2v_exp, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_exp_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_exp_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_exp_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
index 7f2ebde..a80702b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_exp)
         .type   _ZGVdN4v_exp, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_exp_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_exp_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_exp_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
index 7b7c07d..3389c89 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_exp)
         .type   _ZGVeN8v_exp, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8v_exp_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8v_exp_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN8v_exp_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
index 38d369f..4f9d990 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
@@ -22,11 +22,9 @@
         .text
 ENTRY (_ZGVbN2v_log)
         .type   _ZGVbN2v_log, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_log_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_log_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_log_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
index ddb6105..594adf6 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_log)
         .type   _ZGVdN4v_log, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_log_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_log_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_log_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
index 76375fd..ca22197 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_log)
         .type   _ZGVeN8v_log, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8v_log_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8v_log_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN8v_log_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN8v_log_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
index f111388..49f1fb9 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2vv_pow)
         .type   _ZGVbN2vv_pow, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2vv_pow_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2vv_pow_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN2vv_pow_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
index 21e3070..dff294f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4vv_pow)
         .type   _ZGVdN4vv_pow, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4vv_pow_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4vv_pow_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN4vv_pow_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
index c1e5e76..197925b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8vv_pow)
         .type   _ZGVeN8vv_pow, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8vv_pow_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8vv_pow_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN8vv_pow_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
index 29bd0a7..80bd858 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2v_sin)
         .type   _ZGVbN2v_sin, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_sin_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_sin_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_sin_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
index c3a453a..861c9b3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_sin)
         .type   _ZGVdN4v_sin, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_sin_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_sin_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_sin_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
index 131f2f4..3482ac5 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_sin)
         .type   _ZGVeN8v_sin, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8v_sin_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8v_sin_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN8v_sin_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
index e8e5771..8ae0903 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2vvv_sincos)
         .type   _ZGVbN2vvv_sincos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2vvv_sincos_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2vvv_sincos_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN2vvv_sincos_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
index 64744ff..671e8fc 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4vvv_sincos)
         .type   _ZGVdN4vvv_sincos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4vvv_sincos_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4vvv_sincos_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN4vvv_sincos_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
index e331090..24922e1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8vvv_sincos)
         .type   _ZGVeN8vvv_sincos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 0654d3c..fdd640c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_cosf)
         .type   _ZGVeN16v_cosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_cosf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_cosf_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN16v_cosf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
index fa2363b..b9b2210 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_cosf)
         .type   _ZGVbN4v_cosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_cosf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_cosf_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_cosf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
index e14bba4..b9589b3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_cosf)
         .type   _ZGVdN8v_cosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8v_cosf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8v_cosf_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_cosf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
index 62858eb..6a1fdbb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_expf)
         .type   _ZGVeN16v_expf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_expf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_expf_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN16v_expf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
index 37d38bc..6ad7841 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_expf)
         .type   _ZGVbN4v_expf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_expf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_expf_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_expf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
index e3dc1b1..a5e1917 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_expf)
         .type   _ZGVdN8v_expf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8v_expf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8v_expf_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_expf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
index 68c57e4..3d2b8b1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_logf)
         .type   _ZGVeN16v_logf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_logf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_logf_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN16v_logf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
index 153ed8e..a8dd898 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_logf)
         .type   _ZGVbN4v_logf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_logf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_logf_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_logf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
index 6f50bf6..f5356d8 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_logf)
         .type   _ZGVdN8v_logf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8v_logf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8v_logf_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_logf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
index 3aa9f95..3d32202 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16vv_powf)
         .type   _ZGVeN16vv_powf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16vv_powf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16vv_powf_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN16vv_powf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
index f88b9ca..94f172c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4vv_powf)
         .type   _ZGVbN4vv_powf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4vv_powf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4vv_powf_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN4vv_powf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
index 4552e57..3618adf 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8vv_powf)
         .type   _ZGVdN8vv_powf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8vv_powf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8vv_powf_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN8vv_powf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
index bdcabab..f20df2f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16vvv_sincosf)
         .type   _ZGVeN16vvv_sincosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
index 610046b..a83c830 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4vvv_sincosf)
         .type   _ZGVbN4vvv_sincosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4vvv_sincosf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4vvv_sincosf_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN4vvv_sincosf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
index 9e5be67..a20772b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8vvv_sincosf)
         .type   _ZGVdN8vvv_sincosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8vvv_sincosf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8vvv_sincosf_avx2(%rip), %rax
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN8vvv_sincosf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
index 3ec78a0..25ec834 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_sinf)
         .type   _ZGVeN16v_sinf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_sinf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_sinf_skx(%rip), %rax
+	HAS_AVX512DQ
         jnz     2f
         leaq    _ZGVeN16v_sinf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_AVX512F
         jnz     2f
         leaq    _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
index cf1e4df..4a71052 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_sinf)
         .type   _ZGVbN4v_sinf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_sinf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_sinf_sse4(%rip), %rax
+	HAS_SSE4_1
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_sinf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
index b28bf3c..e14c5b2 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_sinf)
         .type   _ZGVdN8v_sinf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
+	LOAD_RTLD_GLOBAL_RO_RDX
 1:      leaq    _ZGVdN8v_sinf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	HAS_AVX2
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_sinf_sse_wrapper(%rip), %rax

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=837b2ffe7c264acc384c81125b90f7457ba0efc4

commit 837b2ffe7c264acc384c81125b90f7457ba0efc4
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jul 31 16:52:19 2015 -0700

    Use LOAD_GOT_AND_RTLD_GLOBAL_RO and HAS_XXX in i686/multiarch

diff --git a/sysdeps/i386/i686/multiarch/bcopy.S b/sysdeps/i386/i686/multiarch/bcopy.S
index e767d97..7657082 100644
--- a/sysdeps/i386/i686/multiarch/bcopy.S
+++ b/sysdeps/i386/i686/multiarch/bcopy.S
@@ -23,51 +23,24 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(bcopy)
 	.type	bcopy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bcopy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bcopy_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__bcopy_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__bcopy_sse2_unaligned)
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_SSSE3
 	jz	2f
-	leal	__bcopy_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3)
+	HAS_FAST_REP_STRING
 	jz	2f
-	leal	__bcopy_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(bcopy)
-# else
-	.text
-ENTRY(bcopy)
-	.type	bcopy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bcopy_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__bcopy_ssse3, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__bcopy_ssse3_rep, %eax
+	LOAD_FUNC_GOT_EAX (__bcopy_ssse3_rep)
 2:	ret
 END(bcopy)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/bzero.S b/sysdeps/i386/i686/multiarch/bzero.S
index e8dc85f..ac142bc 100644
--- a/sysdeps/i386/i686/multiarch/bzero.S
+++ b/sysdeps/i386/i686/multiarch/bzero.S
@@ -23,46 +23,19 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
-	.text
-ENTRY(__bzero)
-	.type	__bzero, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bzero_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__bzero_sse2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__bzero_sse2_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__bzero)
-# else
 	.text
 ENTRY(__bzero)
 	.type	__bzero, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__bzero_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__bzero_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__bzero_sse2, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+	LOAD_FUNC_GOT_EAX ( __bzero_sse2)
+	HAS_FAST_REP_STRING
 	jz	2f
-	leal	__bzero_sse2_rep, %eax
+	LOAD_FUNC_GOT_EAX (__bzero_sse2_rep)
 2:	ret
 END(__bzero)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/memchr.S b/sysdeps/i386/i686/multiarch/memchr.S
index 02994d0..e444dd6 100644
--- a/sysdeps/i386/i686/multiarch/memchr.S
+++ b/sysdeps/i386/i686/multiarch/memchr.S
@@ -22,46 +22,22 @@
 #include <init-arch.h>
 
 #if IS_IN (libc)
-# define CFI_POP(REG) \
-	cfi_adjust_cfa_offset (-4); \
-	cfi_restore (REG)
-
-# define CFI_PUSH(REG) \
-	cfi_adjust_cfa_offset (4); \
-	cfi_rel_offset (REG, 0)
-
 	.text
 ENTRY(__memchr)
 	.type	__memchr, @gnu_indirect_function
-	pushl	%ebx
-	CFI_PUSH (%ebx)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_SSE2
 	jz	2f
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	HAS_SLOW_BSF
 	jz	3f
 
-	leal	__memchr_sse2@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+	LOAD_FUNC_GOT_EAX ( __memchr_sse2)
 	ret
 
-	CFI_PUSH (%ebx)
-
-2:	leal	__memchr_ia32@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+2:	LOAD_FUNC_GOT_EAX (__memchr_ia32)
 	ret
 
-	CFI_PUSH (%ebx)
-
-3:	leal	__memchr_sse2_bsf@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+3:	LOAD_FUNC_GOT_EAX (__memchr_sse2_bsf)
 	ret
 END(__memchr)
 
diff --git a/sysdeps/i386/i686/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/memcmp.S
index 6b607eb..aa08900 100644
--- a/sysdeps/i386/i686/multiarch/memcmp.S
+++ b/sysdeps/i386/i686/multiarch/memcmp.S
@@ -23,46 +23,19 @@
 
 /* Define multiple versions only for the definition in libc. */
 #if IS_IN (libc)
-# ifdef SHARED
-	.text
-ENTRY(memcmp)
-	.type	memcmp, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memcmp_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memcmp_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memcmp_sse4_2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(memcmp)
-# else
 	.text
 ENTRY(memcmp)
 	.type	memcmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memcmp_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcmp_ia32)
+	HAS_SSSE3
 	jz	2f
-	leal	__memcmp_ssse3, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
+	LOAD_FUNC_GOT_EAX (__memcmp_ssse3)
+	HAS_SSE4_2
 	jz	2f
-	leal	__memcmp_sse4_2, %eax
+	LOAD_FUNC_GOT_EAX (__memcmp_sse4_2)
 2:	ret
 END(memcmp)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/memcpy.S b/sysdeps/i386/i686/multiarch/memcpy.S
index c6d20bd..d92f691 100644
--- a/sysdeps/i386/i686/multiarch/memcpy.S
+++ b/sysdeps/i386/i686/multiarch/memcpy.S
@@ -28,29 +28,20 @@
 	.text
 ENTRY(memcpy)
 	.type	memcpy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memcpy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcpy_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__memcpy_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memcpy_sse2_unaligned)
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_SSSE3
 	jz	2f
-	leal	__memcpy_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3)
+	HAS_FAST_REP_STRING
 	jz	2f
-	leal	__memcpy_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__memcpy_ssse3_rep)
+2:	ret
 END(memcpy)
 
 # undef ENTRY
diff --git a/sysdeps/i386/i686/multiarch/memcpy_chk.S b/sysdeps/i386/i686/multiarch/memcpy_chk.S
index 9399587..ba99478 100644
--- a/sysdeps/i386/i686/multiarch/memcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/memcpy_chk.S
@@ -29,29 +29,20 @@
 	.text
 ENTRY(__memcpy_chk)
 	.type	__memcpy_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memcpy_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__memcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_sse2_unaligned)
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_SSSE3
 	jz	2f
-	leal	__memcpy_chk_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3)
+	HAS_FAST_REP_STRING
 	jz	2f
-	leal	__memcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__memcpy_chk_ssse3_rep)
+2:	ret
 END(__memcpy_chk)
 # else
 #  include "../memcpy_chk.S"
diff --git a/sysdeps/i386/i686/multiarch/memmove.S b/sysdeps/i386/i686/multiarch/memmove.S
index 7033463..6a4a5de 100644
--- a/sysdeps/i386/i686/multiarch/memmove.S
+++ b/sysdeps/i386/i686/multiarch/memmove.S
@@ -23,37 +23,28 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(memmove)
 	.type	memmove, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memmove_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__memmove_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memmove_sse2_unaligned)
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_SSSE3
 	jz	2f
-	leal	__memmove_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3)
+	HAS_FAST_REP_STRING
 	jz	2f
-	leal	__memmove_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__memmove_ssse3_rep)
+2:	ret
 END(memmove)
 
-# undef ENTRY
-# define ENTRY(name) \
+# ifdef SHARED
+#  undef ENTRY
+#  define ENTRY(name) \
 	.type __memmove_ia32, @function; \
 	.p2align 4; \
 	.globl __memmove_ia32; \
@@ -61,29 +52,8 @@ END(memmove)
 	__memmove_ia32: cfi_startproc; \
 	CALL_MCOUNT
 # else
-	.text
-ENTRY(memmove)
-	.type	memmove, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	__memmove_sse2_unaligned, %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
-	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__memmove_ssse3, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__memmove_ssse3_rep, %eax
-2:	ret
-END(memmove)
-
-# undef ENTRY
-# define ENTRY(name) \
+#  undef ENTRY
+#  define ENTRY(name) \
 	.type __memmove_ia32, @function; \
 	.globl __memmove_ia32; \
 	.p2align 4; \
diff --git a/sysdeps/i386/i686/multiarch/memmove_chk.S b/sysdeps/i386/i686/multiarch/memmove_chk.S
index 2b576d4..83a4402 100644
--- a/sysdeps/i386/i686/multiarch/memmove_chk.S
+++ b/sysdeps/i386/i686/multiarch/memmove_chk.S
@@ -23,56 +23,26 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(__memmove_chk)
 	.type	__memmove_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__memmove_chk_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memmove_chk_sse2_unaligned)
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_SSSE3
 	jz	2f
-	leal	__memmove_chk_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3)
+	HAS_FAST_REP_STRING
 	jz	2f
-	leal	__memmove_chk_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__memmove_chk)
-# else
-	.text
-ENTRY(__memmove_chk)
-	.type	__memmove_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memmove_chk_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	__memmove_chk_sse2_unaligned, %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
-	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__memmove_chk_ssse3, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__memmove_chk_ssse3_rep, %eax
+	LOAD_FUNC_GOT_EAX (__memmove_chk_ssse3_rep)
 2:	ret
 END(__memmove_chk)
 
+# ifndef SHARED
 	.type __memmove_chk_sse2_unaligned, @function
 	.p2align 4;
 __memmove_chk_sse2_unaligned:
diff --git a/sysdeps/i386/i686/multiarch/mempcpy.S b/sysdeps/i386/i686/multiarch/mempcpy.S
index 39c934e..810d4c2 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy.S
@@ -28,29 +28,20 @@
 	.text
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__mempcpy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__mempcpy_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__mempcpy_sse2_unaligned)
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_SSSE3
 	jz	2f
-	leal	__mempcpy_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3)
+	HAS_FAST_REP_STRING
 	jz	2f
-	leal	__mempcpy_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__mempcpy_ssse3_rep)
+2:	ret
 END(__mempcpy)
 
 # undef ENTRY
diff --git a/sysdeps/i386/i686/multiarch/mempcpy_chk.S b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
index b6fa202..a770bc9 100644
--- a/sysdeps/i386/i686/multiarch/mempcpy_chk.S
+++ b/sysdeps/i386/i686/multiarch/mempcpy_chk.S
@@ -29,29 +29,20 @@
 	.text
 ENTRY(__mempcpy_chk)
 	.type	__mempcpy_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__mempcpy_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__mempcpy_chk_sse2_unaligned@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_sse2_unaligned)
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_SSSE3
 	jz	2f
-	leal	__mempcpy_chk_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3)
+	HAS_FAST_REP_STRING
 	jz	2f
-	leal	__mempcpy_chk_ssse3_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__mempcpy_chk_ssse3_rep)
+2:	ret
 END(__mempcpy_chk)
 # else
 #  include "../mempcpy_chk.S"
diff --git a/sysdeps/i386/i686/multiarch/memrchr.S b/sysdeps/i386/i686/multiarch/memrchr.S
index 321e0b7..5121a7c 100644
--- a/sysdeps/i386/i686/multiarch/memrchr.S
+++ b/sysdeps/i386/i686/multiarch/memrchr.S
@@ -22,46 +22,22 @@
 #include <init-arch.h>
 
 #if IS_IN (libc)
-# define CFI_POP(REG) \
-	cfi_adjust_cfa_offset (-4); \
-	cfi_restore (REG)
-
-# define CFI_PUSH(REG) \
-	cfi_adjust_cfa_offset (4); \
-	cfi_rel_offset (REG, 0)
-
 	.text
 ENTRY(__memrchr)
 	.type	__memrchr, @gnu_indirect_function
-	pushl	%ebx
-	CFI_PUSH (%ebx)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_SSE2
 	jz	2f
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	HAS_SLOW_BSF
 	jz	3f
 
-	leal	__memrchr_sse2@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+	LOAD_FUNC_GOT_EAX (__memrchr_sse2)
 	ret
 
-	CFI_PUSH (%ebx)
-
-2:	leal	__memrchr_ia32@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+2:	LOAD_FUNC_GOT_EAX (__memrchr_ia32)
 	ret
 
-	CFI_PUSH (%ebx)
-
-3:	leal	__memrchr_sse2_bsf@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+3:	LOAD_FUNC_GOT_EAX (__memrchr_sse2_bsf)
 	ret
 END(__memrchr)
 
diff --git a/sysdeps/i386/i686/multiarch/memset.S b/sysdeps/i386/i686/multiarch/memset.S
index 6d7d919..1cf40c2 100644
--- a/sysdeps/i386/i686/multiarch/memset.S
+++ b/sysdeps/i386/i686/multiarch/memset.S
@@ -23,46 +23,19 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
-	.text
-ENTRY(memset)
-	.type	memset, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memset_sse2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	__memset_sse2_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(memset)
-# else
 	.text
 ENTRY(memset)
 	.type	memset, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__memset_sse2, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
+	LOAD_FUNC_GOT_EAX (__memset_sse2)
+	HAS_FAST_REP_STRING
 	jz	2f
-	leal	__memset_sse2_rep, %eax
+	LOAD_FUNC_GOT_EAX (__memset_sse2_rep)
 2:	ret
 END(memset)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/memset_chk.S b/sysdeps/i386/i686/multiarch/memset_chk.S
index a770c0d..1418853 100644
--- a/sysdeps/i386/i686/multiarch/memset_chk.S
+++ b/sysdeps/i386/i686/multiarch/memset_chk.S
@@ -23,50 +23,26 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #if IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_chk_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__memset_chk_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__memset_chk_sse2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2)
+	HAS_FAST_REP_STRING
 	jz	2f
-	leal	__memset_chk_sse2_rep@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__memset_chk_sse2_rep)
+2:	ret
 END(__memset_chk)
 
+# ifdef SHARED
 strong_alias (__memset_chk, __memset_zero_constant_len_parameter)
 	.section .gnu.warning.__memset_zero_constant_len_parameter
 	.string "memset used with constant zero length parameter; this could be due to transposed parameters"
 # else
 	.text
-ENTRY(__memset_chk)
-	.type	__memset_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__memset_chk_ia32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	__memset_chk_sse2, %eax
-	testl	$bit_Fast_Rep_String, FEATURE_OFFSET+index_Fast_Rep_String+__cpu_features
-	jz	2f
-	leal	__memset_chk_sse2_rep, %eax
-2:	ret
-END(__memset_chk)
-
 	.type __memset_chk_sse2, @function
 	.p2align 4;
 __memset_chk_sse2:
diff --git a/sysdeps/i386/i686/multiarch/rawmemchr.S b/sysdeps/i386/i686/multiarch/rawmemchr.S
index c2b7ee6..7616460 100644
--- a/sysdeps/i386/i686/multiarch/rawmemchr.S
+++ b/sysdeps/i386/i686/multiarch/rawmemchr.S
@@ -22,46 +22,22 @@
 #include <init-arch.h>
 
 #if IS_IN (libc)
-# define CFI_POP(REG) \
-	cfi_adjust_cfa_offset (-4); \
-	cfi_restore (REG)
-
-# define CFI_PUSH(REG) \
-	cfi_adjust_cfa_offset (4); \
-	cfi_rel_offset (REG, 0)
-
 	.text
 ENTRY(__rawmemchr)
 	.type	__rawmemchr, @gnu_indirect_function
-	pushl	%ebx
-	CFI_PUSH (%ebx)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	HAS_SSE2
 	jz	2f
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	HAS_SLOW_BSF
 	jz	3f
 
-	leal	__rawmemchr_sse2@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2)
 	ret
 
-	CFI_PUSH (%ebx)
-
-2:	leal	__rawmemchr_ia32@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+2:	LOAD_FUNC_GOT_EAX (__rawmemchr_ia32)
 	ret
 
-	CFI_PUSH (%ebx)
-
-3:	leal	__rawmemchr_sse2_bsf@GOTOFF(%ebx), %eax
-	popl	%ebx
-	CFI_POP	(%ebx)
+3:	LOAD_FUNC_GOT_EAX (__rawmemchr_sse2_bsf)
 	ret
 END(__rawmemchr)
 
diff --git a/sysdeps/i386/i686/multiarch/strcasecmp.S b/sysdeps/i386/i686/multiarch/strcasecmp.S
index c30ac3a..7ace685 100644
--- a/sysdeps/i386/i686/multiarch/strcasecmp.S
+++ b/sysdeps/i386/i686/multiarch/strcasecmp.S
@@ -20,49 +20,20 @@
 #include <sysdep.h>
 #include <init-arch.h>
 
-#ifdef SHARED
 	.text
 ENTRY(__strcasecmp)
 	.type	__strcasecmp, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strcasecmp_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ia32)
+	HAS_SSSE3
 	jz	2f
-	leal	__strcasecmp_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__strcasecmp_ssse3)
+	HAS_SSE4_2
 	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	HAS_SLOW_SSE4_2
 	jnz	2f
-	leal	__strcasecmp_sse4_2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__strcasecmp)
-#else
-	.text
-ENTRY(__strcasecmp)
-	.type	__strcasecmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strcasecmp_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__strcasecmp_ssse3, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
-	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features
-	jnz	2f
-	leal	__strcasecmp_sse4_2, %eax
+	LOAD_FUNC_GOT_EAX (__strcasecmp_sse4_2)
 2:	ret
 END(__strcasecmp)
-#endif
 
 weak_alias (__strcasecmp, strcasecmp)
diff --git a/sysdeps/i386/i686/multiarch/strcat.S b/sysdeps/i386/i686/multiarch/strcat.S
index 474f753..e8e8f29 100644
--- a/sysdeps/i386/i686/multiarch/strcat.S
+++ b/sysdeps/i386/i686/multiarch/strcat.S
@@ -45,52 +45,22 @@
    need strncat before the initialization happened.  */
 #if IS_IN (libc)
 
-# ifdef SHARED
 	.text
 ENTRY(STRCAT)
 	.type	STRCAT, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCAT_IA32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	STRCAT_SSE2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
-	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
-	jz	2f
-	leal	STRCAT_SSSE3@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(STRCAT)
-# else
-
-ENTRY(STRCAT)
-	.type	STRCAT, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCAT_IA32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCAT_IA32)
+	HAS_SSE2
 	jz	2f
-	leal	STRCAT_SSE2, %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
+	LOAD_FUNC_GOT_EAX (STRCAT_SSE2)
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
+	HAS_SSSE3
 	jz	2f
-	leal	STRCAT_SSSE3, %eax
+	LOAD_FUNC_GOT_EAX (STRCAT_SSSE3)
 2:	ret
 END(STRCAT)
 
-# endif
-
 # undef ENTRY
 # define ENTRY(name) \
 	.type STRCAT_IA32, @function; \
diff --git a/sysdeps/i386/i686/multiarch/strchr.S b/sysdeps/i386/i686/multiarch/strchr.S
index 45624fd..83d2b84 100644
--- a/sysdeps/i386/i686/multiarch/strchr.S
+++ b/sysdeps/i386/i686/multiarch/strchr.S
@@ -25,24 +25,15 @@
 	.text
 ENTRY(strchr)
 	.type	strchr, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strchr_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strchr_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__strchr_sse2_bsf@GOTOFF(%ebx), %eax
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__strchr_sse2_bsf)
+	HAS_SLOW_BSF
 	jz	2f
-	leal	__strchr_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__strchr_sse2)
+2:	ret
 END(strchr)
 
 # undef ENTRY
diff --git a/sysdeps/i386/i686/multiarch/strcmp.S b/sysdeps/i386/i686/multiarch/strcmp.S
index 9df4008..274c7b3 100644
--- a/sysdeps/i386/i686/multiarch/strcmp.S
+++ b/sysdeps/i386/i686/multiarch/strcmp.S
@@ -51,50 +51,21 @@
    define multiple versions for strncmp in static library since we
    need strncmp before the initialization happened.  */
 #if (defined SHARED || !defined USE_AS_STRNCMP) && IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(STRCMP)
 	.type	STRCMP, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__STRCMP_IA32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__STRCMP_IA32)
+	HAS_SSSE3
 	jz	2f
-	leal	__STRCMP_SSSE3@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSSE3)
+	HAS_SSE4_2
 	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	HAS_SLOW_SSE4_2
 	jnz	2f
-	leal	__STRCMP_SSE4_2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(STRCMP)
-# else
-	.text
-ENTRY(STRCMP)
-	.type	STRCMP, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__STRCMP_IA32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__STRCMP_SSSE3, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
-	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features
-	jnz	2f
-	leal	__STRCMP_SSE4_2, %eax
+	LOAD_FUNC_GOT_EAX (__STRCMP_SSE4_2)
 2:	ret
 END(STRCMP)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/strcpy.S b/sysdeps/i386/i686/multiarch/strcpy.S
index c279d46..c3844a8 100644
--- a/sysdeps/i386/i686/multiarch/strcpy.S
+++ b/sysdeps/i386/i686/multiarch/strcpy.S
@@ -61,52 +61,22 @@
    need strncpy before the initialization happened.  */
 #if IS_IN (libc)
 
-# ifdef SHARED
 	.text
 ENTRY(STRCPY)
 	.type	STRCPY, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCPY_IA32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCPY_IA32)
+	HAS_SSE2
 	jz	2f
-	leal	STRCPY_SSE2@GOTOFF(%ebx), %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (STRCPY_SSE2)
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	HAS_SSSE3
 	jz	2f
-	leal	STRCPY_SSSE3@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(STRCPY)
-# else
-
-ENTRY(STRCPY)
-	.type	STRCPY, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCPY_IA32, %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features
-	jz	2f
-	leal	STRCPY_SSE2, %eax
-	testl	$bit_Fast_Unaligned_Load, FEATURE_OFFSET+index_Fast_Unaligned_Load+__cpu_features
-	jnz	2f
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	STRCPY_SSSE3, %eax
+	LOAD_FUNC_GOT_EAX (STRCPY_SSSE3)
 2:	ret
 END(STRCPY)
 
-# endif
-
 # undef ENTRY
 # define ENTRY(name) \
 	.type STRCPY_IA32, @function; \
diff --git a/sysdeps/i386/i686/multiarch/strcspn.S b/sysdeps/i386/i686/multiarch/strcspn.S
index e6ea454..7e67f78 100644
--- a/sysdeps/i386/i686/multiarch/strcspn.S
+++ b/sysdeps/i386/i686/multiarch/strcspn.S
@@ -42,40 +42,16 @@
    define multiple versions for strpbrk in static library since we
    need strpbrk before the initialization happened.  */
 #if (defined SHARED || !defined USE_AS_STRPBRK) && IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(STRCSPN)
 	.type	STRCSPN, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCSPN_IA32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (STRCSPN_IA32)
+	HAS_SSE4_2
 	jz	2f
-	leal	STRCSPN_SSE42@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
-END(STRCSPN)
-# else
-	.text
-ENTRY(STRCSPN)
-	.type	STRCSPN, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	STRCSPN_IA32, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
-	jz	2f
-	leal	STRCSPN_SSE42, %eax
+	LOAD_FUNC_GOT_EAX (STRCSPN_SSE42)
 2:	ret
 END(STRCSPN)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/strlen.S b/sysdeps/i386/i686/multiarch/strlen.S
index 2e6993b..8a2fbf2 100644
--- a/sysdeps/i386/i686/multiarch/strlen.S
+++ b/sysdeps/i386/i686/multiarch/strlen.S
@@ -28,24 +28,15 @@
 	.text
 ENTRY(strlen)
 	.type	strlen, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strlen_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strlen_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__strlen_sse2_bsf@GOTOFF(%ebx), %eax
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__strlen_sse2_bsf)
+	HAS_SLOW_BSF
 	jz	2f
-	leal	__strlen_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__strlen_sse2)
+2:	ret
 END(strlen)
 
 # undef ENTRY
diff --git a/sysdeps/i386/i686/multiarch/strncase.S b/sysdeps/i386/i686/multiarch/strncase.S
index c2cb03c..5025477 100644
--- a/sysdeps/i386/i686/multiarch/strncase.S
+++ b/sysdeps/i386/i686/multiarch/strncase.S
@@ -20,49 +20,20 @@
 #include <sysdep.h>
 #include <init-arch.h>
 
-#ifdef SHARED
 	.text
 ENTRY(__strncasecmp)
 	.type	__strncasecmp, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strncasecmp_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ia32)
+	HAS_SSSE3
 	jz	2f
-	leal	__strncasecmp_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__strncasecmp_ssse3)
+	HAS_SSE4_2
 	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	HAS_SLOW_SSE4_2
 	jnz	2f
-	leal	__strncasecmp_sse4_2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
-END(__strncasecmp)
-#else
-	.text
-ENTRY(__strncasecmp)
-	.type	__strncasecmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strncasecmp_ia32, %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features
-	jz	2f
-	leal	__strncasecmp_ssse3, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
-	jz	2f
-	testl	$bit_Slow_SSE4_2, FEATURE_OFFSET+index_Slow_SSE4_2+__cpu_features
-	jnz	2f
-	leal	__strncasecmp_sse4_2, %eax
+	LOAD_FUNC_GOT_EAX (__strncasecmp_sse4_2)
 2:	ret
 END(__strncasecmp)
-#endif
 
 weak_alias (__strncasecmp, strncasecmp)
diff --git a/sysdeps/i386/i686/multiarch/strnlen.S b/sysdeps/i386/i686/multiarch/strnlen.S
index 56a5136..166c81e 100644
--- a/sysdeps/i386/i686/multiarch/strnlen.S
+++ b/sysdeps/i386/i686/multiarch/strnlen.S
@@ -25,21 +25,12 @@
 	.text
 ENTRY(__strnlen)
 	.type	__strnlen, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strnlen_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strnlen_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__strnlen_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__strnlen_sse2)
+2:	ret
 END(__strnlen)
 
 weak_alias(__strnlen, strnlen)
diff --git a/sysdeps/i386/i686/multiarch/strrchr.S b/sysdeps/i386/i686/multiarch/strrchr.S
index 91074b4..984694b 100644
--- a/sysdeps/i386/i686/multiarch/strrchr.S
+++ b/sysdeps/i386/i686/multiarch/strrchr.S
@@ -25,24 +25,15 @@
 	.text
 ENTRY(strrchr)
 	.type	strrchr, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strrchr_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strrchr_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__strrchr_sse2_bsf@GOTOFF(%ebx), %eax
-	testl	$bit_Slow_BSF, FEATURE_OFFSET+index_Slow_BSF+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2_bsf)
+	HAS_SLOW_BSF
 	jz	2f
-	leal	__strrchr_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__strrchr_sse2)
+2:	ret
 END(strrchr)
 
 # undef ENTRY
diff --git a/sysdeps/i386/i686/multiarch/strspn.S b/sysdeps/i386/i686/multiarch/strspn.S
index 9d353a2..b9e2a74 100644
--- a/sysdeps/i386/i686/multiarch/strspn.S
+++ b/sysdeps/i386/i686/multiarch/strspn.S
@@ -27,40 +27,16 @@
 
 /* Define multiple versions only for the definition in libc.  */
 #if IS_IN (libc)
-# ifdef SHARED
 	.text
 ENTRY(strspn)
 	.type	strspn, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strspn_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__strspn_ia32)
+	HAS_SSE4_2
 	jz	2f
-	leal	__strspn_sse42@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
-END(strspn)
-# else
-	.text
-ENTRY(strspn)
-	.type	strspn, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__strspn_ia32, %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
-	jz	2f
-	leal	__strspn_sse42, %eax
+	LOAD_FUNC_GOT_EAX (__strspn_sse42)
 2:	ret
 END(strspn)
-# endif
 
 # undef ENTRY
 # define ENTRY(name) \
diff --git a/sysdeps/i386/i686/multiarch/wcschr.S b/sysdeps/i386/i686/multiarch/wcschr.S
index 603d7d7..0c4ad2f 100644
--- a/sysdeps/i386/i686/multiarch/wcschr.S
+++ b/sysdeps/i386/i686/multiarch/wcschr.S
@@ -25,21 +25,12 @@
 	.text
 ENTRY(__wcschr)
 	.type	wcschr, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wcschr_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcschr_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__wcschr_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wcschr_sse2)
+2:	ret
 END(__wcschr)
 weak_alias (__wcschr, wcschr)
 #endif
diff --git a/sysdeps/i386/i686/multiarch/wcscmp.S b/sysdeps/i386/i686/multiarch/wcscmp.S
index 92c2c84..445e034 100644
--- a/sysdeps/i386/i686/multiarch/wcscmp.S
+++ b/sysdeps/i386/i686/multiarch/wcscmp.S
@@ -28,21 +28,12 @@
 	.text
 ENTRY(__wcscmp)
 	.type	__wcscmp, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wcscmp_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcscmp_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__wcscmp_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wcscmp_sse2)
+2:	ret
 END(__wcscmp)
 weak_alias (__wcscmp, wcscmp)
 #endif
diff --git a/sysdeps/i386/i686/multiarch/wcscpy.S b/sysdeps/i386/i686/multiarch/wcscpy.S
index f7253c7..5f9f9f4 100644
--- a/sysdeps/i386/i686/multiarch/wcscpy.S
+++ b/sysdeps/i386/i686/multiarch/wcscpy.S
@@ -26,20 +26,11 @@
 	.text
 ENTRY(wcscpy)
 	.type	wcscpy, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wcscpy_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcscpy_ia32)
+	HAS_SSSE3
 	jz	2f
-	leal	__wcscpy_ssse3@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wcscpy_ssse3)
+2:	ret
 END(wcscpy)
 #endif
diff --git a/sysdeps/i386/i686/multiarch/wcslen.S b/sysdeps/i386/i686/multiarch/wcslen.S
index 3926a50..aabacda 100644
--- a/sysdeps/i386/i686/multiarch/wcslen.S
+++ b/sysdeps/i386/i686/multiarch/wcslen.S
@@ -25,21 +25,12 @@
 	.text
 ENTRY(__wcslen)
 	.type	__wcslen, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wcslen_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcslen_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__wcslen_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wcslen_sse2)
+2:	ret
 END(__wcslen)
 
 weak_alias(__wcslen, wcslen)
diff --git a/sysdeps/i386/i686/multiarch/wcsrchr.S b/sysdeps/i386/i686/multiarch/wcsrchr.S
index 5c96129..24f8313 100644
--- a/sysdeps/i386/i686/multiarch/wcsrchr.S
+++ b/sysdeps/i386/i686/multiarch/wcsrchr.S
@@ -25,20 +25,11 @@
 	.text
 ENTRY(wcsrchr)
 	.type	wcsrchr, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wcsrchr_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSE2, CPUID_OFFSET+index_SSE2+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wcsrchr_ia32)
+	HAS_SSE2
 	jz	2f
-	leal	__wcsrchr_sse2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4);
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wcsrchr_sse2)
+2:	ret
 END(wcsrchr)
 #endif
diff --git a/sysdeps/i386/i686/multiarch/wmemcmp.S b/sysdeps/i386/i686/multiarch/wmemcmp.S
index 6ca6053..dcf0fc0 100644
--- a/sysdeps/i386/i686/multiarch/wmemcmp.S
+++ b/sysdeps/i386/i686/multiarch/wmemcmp.S
@@ -27,23 +27,14 @@
 	.text
 ENTRY(wmemcmp)
 	.type	wmemcmp, @gnu_indirect_function
-	pushl	%ebx
-	cfi_adjust_cfa_offset (4)
-	cfi_rel_offset (ebx, 0)
-	LOAD_PIC_REG(bx)
-	cmpl	$0, KIND_OFFSET+__cpu_features@GOTOFF(%ebx)
-	jne	1f
-	call	__init_cpu_features
-1:	leal	__wmemcmp_ia32@GOTOFF(%ebx), %eax
-	testl	$bit_SSSE3, CPUID_OFFSET+index_SSSE3+__cpu_features@GOTOFF(%ebx)
+	LOAD_GOT_AND_RTLD_GLOBAL_RO
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ia32)
+	HAS_SSSE3
 	jz	2f
-	leal	__wmemcmp_ssse3@GOTOFF(%ebx), %eax
-	testl	$bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
+	LOAD_FUNC_GOT_EAX (__wmemcmp_ssse3)
+	HAS_SSE4_2
 	jz	2f
-	leal	__wmemcmp_sse4_2@GOTOFF(%ebx), %eax
-2:	popl	%ebx
-	cfi_adjust_cfa_offset (-4)
-	cfi_restore (ebx)
-	ret
+	LOAD_FUNC_GOT_EAX (__wmemcmp_sse4_2)
+2:	ret
 END(wmemcmp)
 #endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=dcdc26382ba3019931774043cff7335a612e0642

commit dcdc26382ba3019931774043cff7335a612e0642
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jul 31 13:41:04 2015 -0700

    Use LOAD_RTLD_GLOBAL_RO_RDX and HAS_XXX in x86_64/multiarch

diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index f8b4636..8f0e274 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -26,16 +26,13 @@
 	.text
 ENTRY(memcmp)
 	.type	memcmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_SSSE3
 	jnz	2f
 	leaq	__memcmp_sse2(%rip), %rax
 	ret
 
-2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+2:	HAS_SSE4_1
 	jz	3f
 	leaq	__memcmp_sse4_1(%rip), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 4e18cd3..780c1ad 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -29,19 +29,17 @@
 	.text
 ENTRY(__new_memcpy)
 	.type	__new_memcpy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__memcpy_avx_unaligned(%rip), %rax
-	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memcpy_avx_unaligned(%rip), %rax
+	HAS_AVX_FAST_UNALIGNED_LOAD
 	jz 1f
 	ret
 1:	leaq	__memcpy_sse2(%rip), %rax
-	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+	HAS_SLOW_BSF
 	jnz	2f
 	leaq	__memcpy_sse2_unaligned(%rip), %rax
 	ret
-2:	testl   $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+2:	HAS_SSSE3
 	jz 3f
 	leaq    __memcpy_ssse3(%rip), %rax
 3:	ret
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 1e756ea..b9b157b 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -29,17 +29,15 @@
 	.text
 ENTRY(__memcpy_chk)
 	.type	__memcpy_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__memcpy_chk_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memcpy_chk_sse2(%rip), %rax
+	HAS_SSSE3
 	jz	2f
 	leaq	__memcpy_chk_ssse3(%rip), %rax
-	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	HAS_FAST_COPY_BACKWARD
 	jz	2f
 	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	HAS_AVX_FAST_UNALIGNED_LOAD
 	jz  2f
 	leaq    __memcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index 2eaacdf..f346696 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -27,17 +27,15 @@
 #if defined SHARED && IS_IN (libc)
 ENTRY(__mempcpy)
 	.type	__mempcpy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__mempcpy_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__mempcpy_sse2(%rip), %rax
+	HAS_SSSE3
 	jz	2f
 	leaq	__mempcpy_ssse3(%rip), %rax
-	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	HAS_FAST_COPY_BACKWARD
 	jz	2f
 	leaq	__mempcpy_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	HAS_AVX_FAST_UNALIGNED_LOAD
 	jz	2f
 	leaq	__mempcpy_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index 17b8470..a31c3b1 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -29,17 +29,15 @@
 	.text
 ENTRY(__mempcpy_chk)
 	.type	__mempcpy_chk, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__mempcpy_chk_sse2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__mempcpy_chk_sse2(%rip), %rax
+	HAS_SSSE3
 	jz	2f
 	leaq	__mempcpy_chk_ssse3(%rip), %rax
-	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
+	HAS_FAST_COPY_BACKWARD
 	jz	2f
 	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+	HAS_AVX_FAST_UNALIGNED_LOAD
 	jz	2f
 	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index c5f1fb3..e542548 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -26,11 +26,9 @@
 # if IS_IN (libc)
 ENTRY(memset)
 	.type	memset, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__memset_sse2(%rip), %rax
-	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memset_sse2(%rip), %rax
+	HAS_AVX2
 	jz	2f
 	leaq	__memset_avx2(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
index 64fed31..63bcc89 100644
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -25,11 +25,9 @@
 # if defined SHARED && defined HAVE_AVX2_SUPPORT
 ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__memset_chk_sse2(%rip), %rax
-	testl	$bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__memset_chk_sse2(%rip), %rax
+	HAS_AVX2
 	jz	2f
 	leaq	__memset_chk_avx2(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/strcat.S b/sysdeps/x86_64/multiarch/strcat.S
index 44993fa..986b13f 100644
--- a/sysdeps/x86_64/multiarch/strcat.S
+++ b/sysdeps/x86_64/multiarch/strcat.S
@@ -47,14 +47,12 @@
 	.text
 ENTRY(STRCAT)
 	.type	STRCAT, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	STRCAT_SSE2_UNALIGNED(%rip), %rax
-	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	STRCAT_SSE2_UNALIGNED(%rip), %rax
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
 	leaq	STRCAT_SSE2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_SSSE3
 	jz	2f
 	leaq	STRCAT_SSSE3(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index af55fac..373fb87 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -25,11 +25,9 @@
 	.text
 ENTRY(strchr)
 	.type	strchr, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strchr_sse2(%rip), %rax
-2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__strchr_sse2(%rip), %rax
+2:	HAS_SLOW_BSF
 	jz	3f
 	leaq    __strchr_sse2_no_bsf(%rip), %rax
 3:	ret
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index f50f26c..b219319 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -84,24 +84,20 @@
 	.text
 ENTRY(STRCMP)
 	.type	STRCMP, @gnu_indirect_function
-	/* Manually inlined call to __get_cpu_features.  */
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:
+	LOAD_RTLD_GLOBAL_RO_RDX
 #ifdef USE_AS_STRCMP
 	leaq	__strcmp_sse2_unaligned(%rip), %rax
-	testl   $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	HAS_FAST_UNALIGNED_LOAD
 	jnz     3f
 #else
-	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
+	HAS_SLOW_SSE4_2
 	jnz	2f
 	leaq	STRCMP_SSE42(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	HAS_SSE4_2
 	jnz	3f
 #endif
 2:	leaq	STRCMP_SSSE3(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_SSSE3
 	jnz	3f
 	leaq	STRCMP_SSE2(%rip), %rax
 3:	ret
@@ -110,23 +106,19 @@ END(STRCMP)
 # ifdef USE_AS_STRCASECMP_L
 ENTRY(__strcasecmp)
 	.type	__strcasecmp, @gnu_indirect_function
-	/* Manually inlined call to __get_cpu_features.  */
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:
+	LOAD_RTLD_GLOBAL_RO_RDX
 #  ifdef HAVE_AVX_SUPPORT
 	leaq	__strcasecmp_avx(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	HAS_AVX
 	jnz	3f
 #  endif
-	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
+	HAS_SLOW_SSE4_2
 	jnz	2f
 	leaq	__strcasecmp_sse42(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	HAS_SSE4_2
 	jnz	3f
 2:	leaq	__strcasecmp_ssse3(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_SSSE3
 	jnz	3f
 	leaq	__strcasecmp_sse2(%rip), %rax
 3:	ret
@@ -136,23 +128,19 @@ weak_alias (__strcasecmp, strcasecmp)
 # ifdef USE_AS_STRNCASECMP_L
 ENTRY(__strncasecmp)
 	.type	__strncasecmp, @gnu_indirect_function
-	/* Manually inlined call to __get_cpu_features.  */
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:
+	LOAD_RTLD_GLOBAL_RO_RDX
 #  ifdef HAVE_AVX_SUPPORT
 	leaq	__strncasecmp_avx(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	HAS_AVX
 	jnz	3f
 #  endif
-	testl	$bit_Slow_SSE4_2, __cpu_features+FEATURE_OFFSET+index_Slow_SSE4_2(%rip)
+	HAS_SLOW_SSE4_2
 	jnz	2f
 	leaq	__strncasecmp_sse42(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	HAS_SSE4_2
 	jnz	3f
 2:	leaq	__strncasecmp_ssse3(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_SSSE3
 	jnz	3f
 	leaq	__strncasecmp_sse2(%rip), %rax
 3:	ret
diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S
index 9464ee8..5c040ae 100644
--- a/sysdeps/x86_64/multiarch/strcpy.S
+++ b/sysdeps/x86_64/multiarch/strcpy.S
@@ -61,14 +61,12 @@
 	.text
 ENTRY(STRCPY)
 	.type	STRCPY, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	STRCPY_SSE2_UNALIGNED(%rip), %rax
-	testl	$bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	STRCPY_SSE2_UNALIGNED(%rip), %rax
+	HAS_FAST_UNALIGNED_LOAD
 	jnz	2f
 	leaq	STRCPY_SSE2(%rip), %rax
-	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	HAS_SSSE3
 	jz	2f
 	leaq	STRCPY_SSSE3(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/strcspn.S b/sysdeps/x86_64/multiarch/strcspn.S
index 95e882c..340cab6 100644
--- a/sysdeps/x86_64/multiarch/strcspn.S
+++ b/sysdeps/x86_64/multiarch/strcspn.S
@@ -45,11 +45,9 @@
 	.text
 ENTRY(STRCSPN)
 	.type	STRCSPN, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	STRCSPN_SSE2(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	STRCSPN_SSE2(%rip), %rax
+	HAS_SSE4_2
 	jz	2f
 	leaq	STRCSPN_SSE42(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/strspn.S b/sysdeps/x86_64/multiarch/strspn.S
index b734c17..c0afcf3 100644
--- a/sysdeps/x86_64/multiarch/strspn.S
+++ b/sysdeps/x86_64/multiarch/strspn.S
@@ -30,11 +30,9 @@
 	.text
 ENTRY(strspn)
 	.type	strspn, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strspn_sse2(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	leaq	__strspn_sse2(%rip), %rax
+	HAS_SSE4_2
 	jz	2f
 	leaq	__strspn_sse42(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/wcscpy.S b/sysdeps/x86_64/multiarch/wcscpy.S
index ff2f5a7..40c1fc4 100644
--- a/sysdeps/x86_64/multiarch/wcscpy.S
+++ b/sysdeps/x86_64/multiarch/wcscpy.S
@@ -27,11 +27,8 @@
 	.text
 ENTRY(wcscpy)
 	.type	wcscpy, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_SSSE3
 	jnz	2f
 	leaq	__wcscpy_sse2(%rip), %rax
 	ret
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index 109e245..c6b73aa 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -26,16 +26,13 @@
 	.text
 ENTRY(wmemcmp)
 	.type	wmemcmp, @gnu_indirect_function
-	cmpl	$0, KIND_OFFSET+__cpu_features(%rip)
-	jne	1f
-	call	__init_cpu_features
-
-1:	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+	HAS_SSSE3
 	jnz	2f
 	leaq	__wmemcmp_sse2(%rip), %rax
 	ret
 
-2:	testl	$bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+2:	HAS_SSE4_1
 	jz	3f
 	leaq	__wmemcmp_sse4_1(%rip), %rax
 	ret

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ff889a9c8c894dd91459b98cabbc483ee831fc7a

commit ff889a9c8c894dd91459b98cabbc483ee831fc7a
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jul 31 07:30:04 2015 -0700

    Add _dl_x86_cpu_features to rtld_global for x86

diff --git a/sysdeps/i386/dl-machine.h b/sysdeps/i386/dl-machine.h
index 04f9247..4a28eb3 100644
--- a/sysdeps/i386/dl-machine.h
+++ b/sysdeps/i386/dl-machine.h
@@ -25,6 +25,7 @@
 #include <sysdep.h>
 #include <tls.h>
 #include <dl-tlsdesc.h>
+#include <cpu-features.c>
 
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
@@ -235,6 +236,8 @@ dl_platform_init (void)
   if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
     /* Avoid an empty string which would disturb us.  */
     GLRO(dl_platform) = NULL;
+
+  init_cpu_features (&GLRO(dl_x86_cpu_features));
 }
 
 static inline Elf32_Addr
diff --git a/sysdeps/i386/dl-procinfo.c b/sysdeps/i386/dl-procinfo.c
index b673b3c..e95f335 100644
--- a/sysdeps/i386/dl-procinfo.c
+++ b/sysdeps/i386/dl-procinfo.c
@@ -43,6 +43,22 @@
 # define PROCINFO_CLASS
 #endif
 
+#if !IS_IN (ldconfig)
+# if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_cpu_features
+# else
+PROCINFO_CLASS struct cpu_features _dl_x86_cpu_features
+# endif
+# ifndef PROCINFO_DECL
+= { }
+# endif
+# if !defined SHARED || defined PROCINFO_DECL
+;
+# else
+,
+# endif
+#endif
+
 #if !defined PROCINFO_DECL && defined SHARED
   ._dl_x86_cap_flags
 #else
diff --git a/sysdeps/i386/i686/cacheinfo.c b/sysdeps/i386/i686/cacheinfo.c
index 0f869df..0b50c6d 100644
--- a/sysdeps/i386/i686/cacheinfo.c
+++ b/sysdeps/i386/i686/cacheinfo.c
@@ -1,4 +1,3 @@
 #define DISABLE_PREFETCHW
-#define DISABLE_PREFERRED_MEMORY_INSTRUCTION
 
 #include <sysdeps/x86_64/cacheinfo.c>
diff --git a/sysdeps/i386/ldsodefs.h b/sysdeps/i386/ldsodefs.h
index d80cf01..dae2d04 100644
--- a/sysdeps/i386/ldsodefs.h
+++ b/sysdeps/i386/ldsodefs.h
@@ -20,6 +20,7 @@
 #define	_I386_LDSODEFS_H	1
 
 #include <elf.h>
+#include <cpu-features.h>
 
 struct La_i86_regs;
 struct La_i86_retval;
diff --git a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
index 8ac351e..a3c0c19 100644
--- a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
+++ b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
@@ -1,5 +1,5 @@
 #if IS_IN (ldconfig)
 # include <sysdeps/i386/dl-procinfo.c>
 #else
-# include <sysdeps/generic/dl-procinfo.c>
+# include <sysdeps/x86_64/dl-procinfo.c>
 #endif
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 19f5eca..61dfff3 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -8,3 +8,7 @@ $(objpfx)tst-ld-sse-use.out: ../sysdeps/x86/tst-ld-sse-use.sh $(objpfx)ld.so
 	$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
 	$(evaluate-test)
 endif
+
+ifeq ($(subdir),csu)
+gen-as-const-headers += cpu-features-offsets.sym rtld-global-offsets.sym
+endif
diff --git a/sysdeps/x86/cpu-features-offsets.sym b/sysdeps/x86/cpu-features-offsets.sym
new file mode 100644
index 0000000..a9d53d1
--- /dev/null
+++ b/sysdeps/x86/cpu-features-offsets.sym
@@ -0,0 +1,7 @@
+#define SHARED 1
+
+#include <ldsodefs.h>
+
+#define rtld_global_ro_offsetof(mem) offsetof (struct rtld_global_ro, mem)
+
+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET rtld_global_ro_offsetof (_dl_x86_cpu_features)
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86/cpu-features.c
similarity index 65%
copy from sysdeps/x86_64/multiarch/init-arch.c
copy to sysdeps/x86/cpu-features.c
index aaad5fa..cbdf4af 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86/cpu-features.c
@@ -1,7 +1,5 @@
-/* Initialize CPU feature data.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Copyright (C) 2008-2015 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -17,48 +15,40 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <atomic.h>
 #include <cpuid.h>
-#include "init-arch.h"
+#include <cpu-features.h>
 
-
-struct cpu_features __cpu_features attribute_hidden;
-
-
-static void
-get_common_indeces (unsigned int *family, unsigned int *model)
+static inline void
+get_common_indeces (struct cpu_features *cpu_features,
+		    unsigned int *family, unsigned int *model)
 {
-  __cpuid (1, __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx);
-
-  unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
+  unsigned int eax;
+  __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx,
+	   cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx,
+	   cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx);
+  cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax = eax;
   *family = (eax >> 8) & 0x0f;
   *model = (eax >> 4) & 0x0f;
 }
 
-
-void
-__init_cpu_features (void)
+static inline void
+init_cpu_features (struct cpu_features *cpu_features)
 {
-  unsigned int ebx;
-  unsigned int ecx;
-  unsigned int edx;
+  unsigned int ebx, ecx, edx;
   unsigned int family = 0;
   unsigned int model = 0;
   enum cpu_features_kind kind;
 
-  __cpuid (0, __cpu_features.max_cpuid, ebx, ecx, edx);
+  __cpuid (0, cpu_features->max_cpuid, ebx, ecx, edx);
 
   /* This spells out "GenuineIntel".  */
   if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
     {
       kind = arch_kind_intel;
 
-      get_common_indeces (&family, &model);
+      get_common_indeces (cpu_features, &family, &model);
 
-      unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
+      unsigned int eax = cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax;
       unsigned int extended_family = (eax >> 20) & 0xff;
       unsigned int extended_model = (eax >> 12) & 0xf0;
       if (family == 0x0f)
@@ -68,14 +58,14 @@ __init_cpu_features (void)
 	}
       else if (family == 0x06)
 	{
-	  ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
+	  ecx = cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx;
 	  model += extended_model;
 	  switch (model)
 	    {
 	    case 0x1c:
 	    case 0x26:
 	      /* BSF is slow on Atom.  */
-	      __cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF;
+	      cpu_features->feature[index_Slow_BSF] |= bit_Slow_BSF;
 	      break;
 
 	    case 0x37:
@@ -91,7 +81,7 @@ __init_cpu_features (void)
 #if index_Fast_Unaligned_Load != index_Slow_SSE4_2
 # error index_Fast_Unaligned_Load != index_Slow_SSE4_2
 #endif
-	      __cpu_features.feature[index_Fast_Unaligned_Load]
+	      cpu_features->feature[index_Fast_Unaligned_Load]
 		|= (bit_Fast_Unaligned_Load
 		    | bit_Prefer_PMINUB_for_stringop
 		    | bit_Slow_SSE4_2);
@@ -121,7 +111,7 @@ __init_cpu_features (void)
 #if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
 # error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
 #endif
-	      __cpu_features.feature[index_Fast_Rep_String]
+	      cpu_features->feature[index_Fast_Rep_String]
 		|= (bit_Fast_Rep_String
 		    | bit_Fast_Copy_Backward
 		    | bit_Fast_Unaligned_Load
@@ -135,28 +125,28 @@ __init_cpu_features (void)
     {
       kind = arch_kind_amd;
 
-      get_common_indeces (&family, &model);
+      get_common_indeces (cpu_features, &family, &model);
 
-      ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
+      ecx = cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx;
 
       unsigned int eax;
       __cpuid (0x80000000, eax, ebx, ecx, edx);
       if (eax >= 0x80000001)
 	__cpuid (0x80000001,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].eax,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ebx,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ecx,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].edx);
+		 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].eax,
+		 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ebx,
+		 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ecx,
+		 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].edx);
     }
   else
     kind = arch_kind_other;
 
-  if (__cpu_features.max_cpuid >= 7)
+  if (cpu_features->max_cpuid >= 7)
     __cpuid_count (7, 0,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].eax,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ebx,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ecx,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].edx);
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
+		   cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);
 
   /* Can we call xgetbv?  */
   if (CPUID_OSXSAVE)
@@ -170,14 +160,14 @@ __init_cpu_features (void)
 	{
 	  /* Determine if AVX is usable.  */
 	  if (CPUID_AVX)
-	    __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
+	    cpu_features->feature[index_AVX_Usable] |= bit_AVX_Usable;
 #if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
 # error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
 #endif
 	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
 	     AVX registers are faster on processors with AVX2.  */
 	  if (CPUID_AVX2)
-	    __cpu_features.feature[index_AVX2_Usable]
+	    cpu_features->feature[index_AVX2_Usable]
 	      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
 	  /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
 	     ZMM16-ZMM31 state are enabled.  */
@@ -188,36 +178,24 @@ __init_cpu_features (void)
 	      /* Determine if AVX512F is usable.  */
 	      if (CPUID_AVX512F)
 		{
-		  __cpu_features.feature[index_AVX512F_Usable]
+		  cpu_features->feature[index_AVX512F_Usable]
 		    |= bit_AVX512F_Usable;
 		  /* Determine if AVX512DQ is usable.  */
 		  if (CPUID_AVX512DQ)
-		    __cpu_features.feature[index_AVX512DQ_Usable]
+		    cpu_features->feature[index_AVX512DQ_Usable]
 		      |= bit_AVX512DQ_Usable;
 		}
 	    }
 	  /* Determine if FMA is usable.  */
 	  if (CPUID_FMA)
-	    __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
+	    cpu_features->feature[index_FMA_Usable] |= bit_FMA_Usable;
 	  /* Determine if FMA4 is usable.  */
 	  if (CPUID_FMA4)
-	    __cpu_features.feature[index_FMA4_Usable] |= bit_FMA4_Usable;
+	    cpu_features->feature[index_FMA4_Usable] |= bit_FMA4_Usable;
 	}
     }
 
-  __cpu_features.family = family;
-  __cpu_features.model = model;
-  atomic_write_barrier ();
-  __cpu_features.kind = kind;
-}
-
-#undef __get_cpu_features
-
-const struct cpu_features *
-__get_cpu_features (void)
-{
-  if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
-
-  return &__cpu_features;
+  cpu_features->family = family;
+  cpu_features->model = model;
+  cpu_features->kind = kind;
 }
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86/cpu-features.h
similarity index 66%
copy from sysdeps/x86_64/multiarch/init-arch.h
copy to sysdeps/x86/cpu-features.h
index cfc6e70..c8ff30e 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86/cpu-features.h
@@ -15,6 +15,9 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifndef cpu_features_h
+#define cpu_features_h
+
 #define bit_Fast_Rep_String		(1 << 0)
 #define bit_Fast_Copy_Backward		(1 << 1)
 #define bit_Slow_BSF			(1 << 2)
@@ -56,14 +59,15 @@
 #define bit_ZMM16_31_state	(1 << 7)
 
 /* The integer bit array index for the first set of internal feature bits.  */
-# define FEATURE_INDEX_1 0
+#define FEATURE_INDEX_1 0
 
 /* The current maximum size of the feature integer bit array.  */
-# define FEATURE_INDEX_MAX 1
+#define FEATURE_INDEX_MAX 1
 
 #ifdef	__ASSEMBLER__
 
 # include <ifunc-defines.h>
+# include <rtld-global-offsets.h>
 
 # define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
 # define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
@@ -86,9 +90,62 @@
 # define index_AVX512F_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_AVX512DQ_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 
-#else	/* __ASSEMBLER__ */
+/* HAS_* evaluates to true if we may use the feature at runtime.  */
+# ifdef __x86_64__
+#  ifdef SHARED
+#   if IS_IN (rtld)
+#    define LOAD_RTLD_GLOBAL_RO_RDX
+#    define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), _rtld_local_ro+offset+(index_##name)(%rip)
+#   else
+#    define LOAD_RTLD_GLOBAL_RO_RDX \
+  mov _rtld_global_ro@GOTPCREL(%rip), %RDX_LP
+#    define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), \
+	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##name)(%rdx)
+#   endif
+#  else /* SHARED */
+#   define LOAD_RTLD_GLOBAL_RO_RDX
+#   define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), _dl_x86_cpu_features+offset+(index_##name)(%rip)
+#  endif /* !SHARED */
+# else  /* __x86_64__ */
+#  ifdef SHARED
+#   define LOAD_FUNC_GOT_EAX(func) \
+  leal func@GOTOFF(%edx), %eax
+#   if IS_IN (rtld)
+#    define LOAD_GOT_AND_RTLD_GLOBAL_RO \
+  LOAD_PIC_REG(dx)
+#    define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), offset+(index_##name)+_rtld_local_ro@GOTOFF(%edx)
+#   else
+#    define LOAD_GOT_AND_RTLD_GLOBAL_RO \
+  LOAD_PIC_REG(dx); \
+  mov _rtld_global_ro@GOT(%edx), %ecx
+#    define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), \
+	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##name)(%ecx)
+#   endif
+#  else  /* SHARED */
+#   define LOAD_FUNC_GOT_EAX(func) \
+  leal func, %eax
+#   define LOAD_GOT_AND_RTLD_GLOBAL_RO
+#   define HAS_FEATURE(offset, name) \
+  testl $(bit_##name), _dl_x86_cpu_features+offset+(index_##name)
+#  endif /* !SHARED */
+# endif /* !__x86_64__ */
 
-# include <sys/param.h>
+# define HAS_CPU_FEATURE(name)	 HAS_FEATURE (CPUID_OFFSET, name)
+# define HAS_ARCH_FEATURE(name) HAS_FEATURE (FEATURE_OFFSET, name)
+
+# define HAS_SSE2	HAS_CPU_FEATURE (SSE2)
+# define HAS_POPCOUNT	HAS_CPU_FEATURE (POPCOUNT)
+# define HAS_SSSE3	HAS_CPU_FEATURE (SSSE3)
+# define HAS_SSE4_1	HAS_CPU_FEATURE (SSE4_1)
+# define HAS_SSE4_2	HAS_CPU_FEATURE (SSE4_2)
+# define HAS_RTM	HAS_CPU_FEATURE (RTM)
+
+#else	/* __ASSEMBLER__ */
 
 enum
   {
@@ -99,7 +156,7 @@ enum
     COMMON_CPUID_INDEX_MAX
   };
 
-extern struct cpu_features
+struct cpu_features
 {
   enum cpu_features_kind
     {
@@ -119,22 +176,18 @@ extern struct cpu_features
   unsigned int family;
   unsigned int model;
   unsigned int feature[FEATURE_INDEX_MAX];
-} __cpu_features attribute_hidden;
+};
 
+/* Unused for x86.  */
+# define INIT_ARCH()
 
-extern void __init_cpu_features (void) attribute_hidden;
-# define INIT_ARCH() \
-  do							\
-    if (__cpu_features.kind == arch_kind_unknown)	\
-      __init_cpu_features ();				\
-  while (0)
-
-/* Used from outside libc.so to get access to the CPU features structure.  */
+/* Used from outside of glibc to get access to the CPU features
+   structure.  */
 extern const struct cpu_features *__get_cpu_features (void)
      __attribute__ ((const));
 
-# if IS_IN (libc)
-#  define __get_cpu_features()	(&__cpu_features)
+# if defined (_LIBC) && !IS_IN (nonlib)
+#  define __get_cpu_features()	(&GLRO(dl_x86_cpu_features))
 # endif
 
 # define HAS_CPU_FEATURE(idx, reg, bit) \
@@ -142,12 +195,8 @@ extern const struct cpu_features *__get_cpu_features (void)
 
 /* Following are the feature tests used throughout libc.  */
 
-/* CPUID_* evaluates to true if the feature flag is enabled.
-   We always use &__cpu_features because the HAS_CPUID_* macros
-   are called only within __init_cpu_features, where we can't
-   call __get_cpu_features without infinite recursion.  */
 # define HAS_CPUID_FLAG(idx, reg, bit) \
-  (((&__cpu_features)->cpuid[idx].reg & (bit)) != 0)
+  ((__get_cpu_features ()->cpuid[idx].reg & (bit)) != 0)
 
 # define CPUID_OSXSAVE \
   HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_OSXSAVE)
@@ -191,16 +240,19 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define HAS_ARCH_FEATURE(name) \
   ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
 
-# define HAS_FAST_REP_STRING		HAS_ARCH_FEATURE (Fast_Rep_String)
-# define HAS_FAST_COPY_BACKWARD		HAS_ARCH_FEATURE (Fast_Copy_Backward)
-# define HAS_SLOW_BSF			HAS_ARCH_FEATURE (Slow_BSF)
-# define HAS_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-# define HAS_AVX			HAS_ARCH_FEATURE (AVX_Usable)
-# define HAS_AVX2			HAS_ARCH_FEATURE (AVX2_Usable)
-# define HAS_AVX512F			HAS_ARCH_FEATURE (AVX512F_Usable)
-# define HAS_AVX512DQ			HAS_ARCH_FEATURE (AVX512DQ_Usable)
-# define HAS_FMA			HAS_ARCH_FEATURE (FMA_Usable)
-# define HAS_FMA4			HAS_ARCH_FEATURE (FMA4_Usable)
-# define HAS_AVX_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-
 #endif	/* __ASSEMBLER__ */
+
+#define HAS_FAST_REP_STRING		HAS_ARCH_FEATURE (Fast_Rep_String)
+#define HAS_FAST_COPY_BACKWARD		HAS_ARCH_FEATURE (Fast_Copy_Backward)
+#define HAS_SLOW_BSF			HAS_ARCH_FEATURE (Slow_BSF)
+#define HAS_FAST_UNALIGNED_LOAD		HAS_ARCH_FEATURE (Fast_Unaligned_Load)
+#define HAS_SLOW_SSE4_2			HAS_ARCH_FEATURE (Slow_SSE4_2)
+#define HAS_AVX				HAS_ARCH_FEATURE (AVX_Usable)
+#define HAS_AVX2			HAS_ARCH_FEATURE (AVX2_Usable)
+#define HAS_AVX512F			HAS_ARCH_FEATURE (AVX512F_Usable)
+#define HAS_AVX512DQ			HAS_ARCH_FEATURE (AVX512DQ_Usable)
+#define HAS_FMA				HAS_ARCH_FEATURE (FMA_Usable)
+#define HAS_FMA4			HAS_ARCH_FEATURE (FMA4_Usable)
+#define HAS_AVX_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+
+#endif  /* cpu_features_h */
diff --git a/sysdeps/i386/ldsodefs.h b/sysdeps/x86/libc-start.c
similarity index 50%
copy from sysdeps/i386/ldsodefs.h
copy to sysdeps/x86/libc-start.c
index d80cf01..9f0c045 100644
--- a/sysdeps/i386/ldsodefs.h
+++ b/sysdeps/x86/libc-start.c
@@ -1,5 +1,4 @@
-/* Run-time dynamic linker data structures for loaded ELF shared objects.
-   Copyright (C) 1995-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,25 +15,27 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef	_I386_LDSODEFS_H
-#define	_I386_LDSODEFS_H	1
-
-#include <elf.h>
-
-struct La_i86_regs;
-struct La_i86_retval;
-
-#define ARCH_PLTENTER_MEMBERS						\
-    Elf32_Addr (*i86_gnu_pltenter) (Elf32_Sym *, unsigned int, uintptr_t *, \
-				    uintptr_t *, struct La_i86_regs *,	\
-				    unsigned int *, const char *name,	\
-				    long int *framesizep)
-
-#define ARCH_PLTEXIT_MEMBERS						\
-    unsigned int (*i86_gnu_pltexit) (Elf32_Sym *, unsigned int, uintptr_t *, \
-				     uintptr_t *, const struct La_i86_regs *, \
-				     struct La_i86_retval *, const char *)
-
-#include_next <ldsodefs.h>
-
+#ifdef SHARED
+# include <csu/libc-start.c>
+#else
+/* The main work is done in the generic function.  */
+# define LIBC_START_DISABLE_INLINE
+# define LIBC_START_MAIN generic_start_main
+# include <csu/libc-start.c>
+# include <cpu-features.h>
+# include <cpu-features.c>
+
+extern struct cpu_features _dl_x86_cpu_features;
+
+int
+__libc_start_main (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL),
+		   int argc, char **argv,
+		   __typeof (main) init,
+		   void (*fini) (void),
+		   void (*rtld_fini) (void), void *stack_end)
+{
+  init_cpu_features (&_dl_x86_cpu_features);
+  return generic_start_main (main, argc, argv, init, fini, rtld_fini,
+			     stack_end);
+}
 #endif
diff --git a/sysdeps/x86/rtld-global-offsets.sym b/sysdeps/x86/rtld-global-offsets.sym
new file mode 100644
index 0000000..a9d53d1
--- /dev/null
+++ b/sysdeps/x86/rtld-global-offsets.sym
@@ -0,0 +1,7 @@
+#define SHARED 1
+
+#include <ldsodefs.h>
+
+#define rtld_global_ro_offsetof(mem) offsetof (struct rtld_global_ro, mem)
+
+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET rtld_global_ro_offsetof (_dl_x86_cpu_features)
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index b99fb9a..0ff5309 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -21,40 +21,11 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <cpuid.h>
+#include "multiarch/init-arch.h"
 
-#ifndef __cpuid_count
-/* FIXME: Provide __cpuid_count if it isn't defined.  Copied from gcc
-   4.4.0.  Remove this if gcc 4.4 is the minimum requirement.  */
-# if defined(__i386__) && defined(__PIC__)
-/* %ebx may be the PIC register.  */
-#  define __cpuid_count(level, count, a, b, c, d)		\
-  __asm__ ("xchg{l}\t{%%}ebx, %1\n\t"			\
-	   "cpuid\n\t"					\
-	   "xchg{l}\t{%%}ebx, %1\n\t"			\
-	   : "=a" (a), "=r" (b), "=c" (c), "=d" (d)	\
-	   : "0" (level), "2" (count))
-# else
-#  define __cpuid_count(level, count, a, b, c, d)		\
-  __asm__ ("cpuid\n\t"					\
-	   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)	\
-	   : "0" (level), "2" (count))
-# endif
-#endif
-
-#ifdef USE_MULTIARCH
-# include "multiarch/init-arch.h"
-
-# define is_intel __cpu_features.kind == arch_kind_intel
-# define is_amd __cpu_features.kind == arch_kind_amd
-# define max_cpuid __cpu_features.max_cpuid
-#else
-  /* This spells out "GenuineIntel".  */
-# define is_intel \
-  ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69
-  /* This spells out "AuthenticAMD".  */
-# define is_amd \
-  ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65
-#endif
+#define is_intel (GLRO(dl_x86_cpu_features).kind == arch_kind_intel)
+#define is_amd (GLRO(dl_x86_cpu_features).kind == arch_kind_amd)
+#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
 
 static const struct intel_02_cache_info
 {
@@ -235,21 +206,8 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
 	      /* Intel reused this value.  For family 15, model 6 it
 		 specifies the 3rd level cache.  Otherwise the 2nd
 		 level cache.  */
-	      unsigned int family;
-	      unsigned int model;
-#ifdef USE_MULTIARCH
-	      family = __cpu_features.family;
-	      model = __cpu_features.model;
-#else
-	      unsigned int eax;
-	      unsigned int ebx;
-	      unsigned int ecx;
-	      unsigned int edx;
-	      __cpuid (1, eax, ebx, ecx, edx);
-
-	      family = ((eax >> 20) & 0xff) + ((eax >> 8) & 0xf);
-	      model = (((eax >>16) & 0xf) << 4) + ((eax >> 4) & 0xf);
-#endif
+	      unsigned int family = GLRO(dl_x86_cpu_features).family;
+	      unsigned int model = GLRO(dl_x86_cpu_features).model;
 
 	      if (family == 15 && model == 6)
 		{
@@ -476,18 +434,6 @@ long int
 attribute_hidden
 __cache_sysconf (int name)
 {
-#ifdef USE_MULTIARCH
-  if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
-#else
-  /* Find out what brand of processor.  */
-  unsigned int max_cpuid;
-  unsigned int ebx;
-  unsigned int ecx;
-  unsigned int edx;
-  __cpuid (0, max_cpuid, ebx, ecx, edx);
-#endif
-
   if (is_intel)
     return handle_intel (name, max_cpuid);
 
@@ -523,18 +469,6 @@ long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
 int __x86_prefetchw attribute_hidden;
 #endif
 
-#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
-/* Instructions preferred for memory and string routines.
-
-  0: Regular instructions
-  1: MMX instructions
-  2: SSE2 instructions
-  3: SSSE3 instructions
-
-  */
-int __x86_preferred_memory_instruction attribute_hidden;
-#endif
-
 
 static void
 __attribute__((constructor))
@@ -551,14 +485,6 @@ init_cacheinfo (void)
   unsigned int level;
   unsigned int threads = 0;
 
-#ifdef USE_MULTIARCH
-  if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
-#else
-  int max_cpuid;
-  __cpuid (0, max_cpuid, ebx, ecx, edx);
-#endif
-
   if (is_intel)
     {
       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);
@@ -574,34 +500,13 @@ init_cacheinfo (void)
 	  shared = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
 	}
 
-      unsigned int ebx_1;
-
-#ifdef USE_MULTIARCH
-      eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-      ebx_1 = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
-      ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
-      edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx;
-#else
-      __cpuid (1, eax, ebx_1, ecx, edx);
-#endif
-
-      unsigned int family = (eax >> 8) & 0x0f;
-      unsigned int model = (eax >> 4) & 0x0f;
-      unsigned int extended_model = (eax >> 12) & 0xf0;
-
-#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
-      /* Intel prefers SSSE3 instructions for memory/string routines
-	 if they are available.  */
-      if ((ecx & 0x200))
-	__x86_preferred_memory_instruction = 3;
-      else
-	__x86_preferred_memory_instruction = 2;
-#endif
-
       /* Figure out the number of logical threads that share the
 	 highest cache level.  */
       if (max_cpuid >= 4)
 	{
+	  unsigned int family = GLRO(dl_x86_cpu_features).family;
+	  unsigned int model = GLRO(dl_x86_cpu_features).model;
+
 	  int i = 0;
 
 	  /* Query until desired cache level is enumerated.  */
@@ -653,7 +558,6 @@ init_cacheinfo (void)
 	  threads += 1;
 	  if (threads > 2 && level == 2 && family == 6)
 	    {
-	      model += extended_model;
 	      switch (model)
 		{
 		case 0x57:
@@ -676,7 +580,9 @@ init_cacheinfo (void)
 	intel_bug_no_cache_info:
 	  /* Assume that all logical threads share the highest cache level.  */
 
-	  threads = (ebx_1 >> 16) & 0xff;
+	  threads
+	    = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
+		>> 16) & 0xff);
 	}
 
       /* Cap usage of highest cache level to the number of supported
@@ -691,25 +597,6 @@ init_cacheinfo (void)
       long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
 
-#ifndef DISABLE_PREFERRED_MEMORY_INSTRUCTION
-# ifdef USE_MULTIARCH
-      eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-      ebx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx;
-      ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
-      edx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx;
-# else
-      __cpuid (1, eax, ebx, ecx, edx);
-# endif
-
-      /* AMD prefers SSSE3 instructions for memory/string routines
-	 if they are avaiable, otherwise it prefers integer
-	 instructions.  */
-      if ((ecx & 0x200))
-	__x86_preferred_memory_instruction = 3;
-      else
-	__x86_preferred_memory_instruction = 0;
-#endif
-
       /* Get maximum extended function. */
       __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
 
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index cae6db3..d22359d 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -26,6 +26,7 @@
 #include <sysdep.h>
 #include <tls.h>
 #include <dl-tlsdesc.h>
+#include <cpu-features.c>
 
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
@@ -205,6 +206,8 @@ dl_platform_init (void)
   if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
     /* Avoid an empty string which would disturb us.  */
     GLRO(dl_platform) = NULL;
+
+  init_cpu_features (&GLRO(dl_x86_cpu_features));
 }
 
 static inline ElfW(Addr)
diff --git a/sysdeps/i386/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
similarity index 61%
copy from sysdeps/i386/dl-procinfo.c
copy to sysdeps/x86_64/dl-procinfo.c
index b673b3c..851681a 100644
--- a/sysdeps/i386/dl-procinfo.c
+++ b/sysdeps/x86_64/dl-procinfo.c
@@ -1,7 +1,6 @@
-/* Data for i386 version of processor capability information.
-   Copyright (C) 2001-2015 Free Software Foundation, Inc.
+/* Data for x86-64 version of processor capability information.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Ulrich Drepper <drepper@redhat.com>, 2001.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -17,10 +16,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-/* This information must be kept in sync with the _DL_HWCAP_COUNT and
-   _DL_PLATFORM_COUNT definitions in procinfo.h.
-
-   If anything should be added here check whether the size of each string
+/* If anything should be added here check whether the size of each string
    is still ok with the given array size.
 
    All the #ifdefs in the definitions are quite irritating but
@@ -44,33 +40,12 @@
 #endif
 
 #if !defined PROCINFO_DECL && defined SHARED
-  ._dl_x86_cap_flags
-#else
-PROCINFO_CLASS const char _dl_x86_cap_flags[32][8]
-#endif
-#ifndef PROCINFO_DECL
-= {
-    "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
-    "cx8", "apic", "10", "sep", "mtrr", "pge", "mca", "cmov",
-    "pat", "pse36", "pn", "clflush", "20", "dts", "acpi", "mmx",
-    "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe"
-  }
-#endif
-#if !defined SHARED || defined PROCINFO_DECL
-;
-#else
-,
-#endif
-
-#if !defined PROCINFO_DECL && defined SHARED
-  ._dl_x86_platforms
+  ._dl_x86_cpu_features
 #else
-PROCINFO_CLASS const char _dl_x86_platforms[4][5]
+PROCINFO_CLASS struct cpu_features _dl_x86_cpu_features
 #endif
 #ifndef PROCINFO_DECL
-= {
-    "i386", "i486", "i586", "i686"
-  }
+= { }
 #endif
 #if !defined SHARED || defined PROCINFO_DECL
 ;
diff --git a/sysdeps/x86_64/ldsodefs.h b/sysdeps/x86_64/ldsodefs.h
index 84d36e8..e3f2da2 100644
--- a/sysdeps/x86_64/ldsodefs.h
+++ b/sysdeps/x86_64/ldsodefs.h
@@ -20,6 +20,7 @@
 #define	_X86_64_LDSODEFS_H	1
 
 #include <elf.h>
+#include <cpu-features.h>
 
 struct La_x86_64_regs;
 struct La_x86_64_retval;
diff --git a/sysdeps/x86_64/multiarch/cacheinfo.c b/sysdeps/x86_64/multiarch/cacheinfo.c
deleted file mode 100644
index f87b8dc..0000000
--- a/sysdeps/x86_64/multiarch/cacheinfo.c
+++ /dev/null
@@ -1,2 +0,0 @@
-#define DISABLE_PREFERRED_MEMORY_INSTRUCTION
-#include "../cacheinfo.c"
diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
index a410d88..7ac7acf 100644
--- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -1,10 +1,9 @@
-#include "init-arch.h"
+#include "cpu-features.h"
 #include <stddef.h>
 
 --
 
 CPU_FEATURES_SIZE	sizeof (struct cpu_features)
-KIND_OFFSET		offsetof (struct cpu_features, kind)
 CPUID_OFFSET		offsetof (struct cpu_features, cpuid)
 CPUID_SIZE		sizeof (struct cpuid_registers)
 CPUID_EAX_OFFSET	offsetof (struct cpuid_registers, eax)
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index aaad5fa..01a379c 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -17,207 +17,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <atomic.h>
-#include <cpuid.h>
-#include "init-arch.h"
 
-
-struct cpu_features __cpu_features attribute_hidden;
-
-
-static void
-get_common_indeces (unsigned int *family, unsigned int *model)
-{
-  __cpuid (1, __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ebx,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx,
-	   __cpu_features.cpuid[COMMON_CPUID_INDEX_1].edx);
-
-  unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-  *family = (eax >> 8) & 0x0f;
-  *model = (eax >> 4) & 0x0f;
-}
-
-
-void
-__init_cpu_features (void)
-{
-  unsigned int ebx;
-  unsigned int ecx;
-  unsigned int edx;
-  unsigned int family = 0;
-  unsigned int model = 0;
-  enum cpu_features_kind kind;
-
-  __cpuid (0, __cpu_features.max_cpuid, ebx, ecx, edx);
-
-  /* This spells out "GenuineIntel".  */
-  if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
-    {
-      kind = arch_kind_intel;
-
-      get_common_indeces (&family, &model);
-
-      unsigned int eax = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].eax;
-      unsigned int extended_family = (eax >> 20) & 0xff;
-      unsigned int extended_model = (eax >> 12) & 0xf0;
-      if (family == 0x0f)
-	{
-	  family += extended_family;
-	  model += extended_model;
-	}
-      else if (family == 0x06)
-	{
-	  ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
-	  model += extended_model;
-	  switch (model)
-	    {
-	    case 0x1c:
-	    case 0x26:
-	      /* BSF is slow on Atom.  */
-	      __cpu_features.feature[index_Slow_BSF] |= bit_Slow_BSF;
-	      break;
-
-	    case 0x37:
-	    case 0x4a:
-	    case 0x4d:
-	    case 0x5a:
-	    case 0x5d:
-	      /* Unaligned load versions are faster than SSSE3
-		 on Silvermont.  */
-#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
-# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
-#endif
-#if index_Fast_Unaligned_Load != index_Slow_SSE4_2
-# error index_Fast_Unaligned_Load != index_Slow_SSE4_2
-#endif
-	      __cpu_features.feature[index_Fast_Unaligned_Load]
-		|= (bit_Fast_Unaligned_Load
-		    | bit_Prefer_PMINUB_for_stringop
-		    | bit_Slow_SSE4_2);
-	      break;
-
-	    default:
-	      /* Unknown family 0x06 processors.  Assuming this is one
-		 of Core i3/i5/i7 processors if AVX is available.  */
-	      if ((ecx & bit_AVX) == 0)
-		break;
-
-	    case 0x1a:
-	    case 0x1e:
-	    case 0x1f:
-	    case 0x25:
-	    case 0x2c:
-	    case 0x2e:
-	    case 0x2f:
-	      /* Rep string instructions, copy backward, unaligned loads
-		 and pminub are fast on Intel Core i3, i5 and i7.  */
-#if index_Fast_Rep_String != index_Fast_Copy_Backward
-# error index_Fast_Rep_String != index_Fast_Copy_Backward
-#endif
-#if index_Fast_Rep_String != index_Fast_Unaligned_Load
-# error index_Fast_Rep_String != index_Fast_Unaligned_Load
-#endif
-#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
-# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
-#endif
-	      __cpu_features.feature[index_Fast_Rep_String]
-		|= (bit_Fast_Rep_String
-		    | bit_Fast_Copy_Backward
-		    | bit_Fast_Unaligned_Load
-		    | bit_Prefer_PMINUB_for_stringop);
-	      break;
-	    }
-	}
-    }
-  /* This spells out "AuthenticAMD".  */
-  else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
-    {
-      kind = arch_kind_amd;
-
-      get_common_indeces (&family, &model);
-
-      ecx = __cpu_features.cpuid[COMMON_CPUID_INDEX_1].ecx;
-
-      unsigned int eax;
-      __cpuid (0x80000000, eax, ebx, ecx, edx);
-      if (eax >= 0x80000001)
-	__cpuid (0x80000001,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].eax,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ebx,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].ecx,
-		 __cpu_features.cpuid[COMMON_CPUID_INDEX_80000001].edx);
-    }
-  else
-    kind = arch_kind_other;
-
-  if (__cpu_features.max_cpuid >= 7)
-    __cpuid_count (7, 0,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].eax,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ebx,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ecx,
-		   __cpu_features.cpuid[COMMON_CPUID_INDEX_7].edx);
-
-  /* Can we call xgetbv?  */
-  if (CPUID_OSXSAVE)
-    {
-      unsigned int xcrlow;
-      unsigned int xcrhigh;
-      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
-      /* Is YMM and XMM state usable?  */
-      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
-	  (bit_YMM_state | bit_XMM_state))
-	{
-	  /* Determine if AVX is usable.  */
-	  if (CPUID_AVX)
-	    __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
-#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
-# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
-#endif
-	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
-	     AVX registers are faster on processors with AVX2.  */
-	  if (CPUID_AVX2)
-	    __cpu_features.feature[index_AVX2_Usable]
-	      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
-	  /* Check if OPMASK state, upper 256-bit of ZMM0-ZMM15 and
-	     ZMM16-ZMM31 state are enabled.  */
-	  if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
-			 | bit_ZMM16_31_state)) ==
-	      (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
-	    {
-	      /* Determine if AVX512F is usable.  */
-	      if (CPUID_AVX512F)
-		{
-		  __cpu_features.feature[index_AVX512F_Usable]
-		    |= bit_AVX512F_Usable;
-		  /* Determine if AVX512DQ is usable.  */
-		  if (CPUID_AVX512DQ)
-		    __cpu_features.feature[index_AVX512DQ_Usable]
-		      |= bit_AVX512DQ_Usable;
-		}
-	    }
-	  /* Determine if FMA is usable.  */
-	  if (CPUID_FMA)
-	    __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
-	  /* Determine if FMA4 is usable.  */
-	  if (CPUID_FMA4)
-	    __cpu_features.feature[index_FMA4_Usable] |= bit_FMA4_Usable;
-	}
-    }
-
-  __cpu_features.family = family;
-  __cpu_features.model = model;
-  atomic_write_barrier ();
-  __cpu_features.kind = kind;
-}
+#include <ldsodefs.h>
 
 #undef __get_cpu_features
 
 const struct cpu_features *
 __get_cpu_features (void)
 {
-  if (__cpu_features.kind == arch_kind_unknown)
-    __init_cpu_features ();
-
-  return &__cpu_features;
+  return &GLRO(dl_x86_cpu_features);
 }
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index cfc6e70..2b9988e 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -15,192 +15,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#define bit_Fast_Rep_String		(1 << 0)
-#define bit_Fast_Copy_Backward		(1 << 1)
-#define bit_Slow_BSF			(1 << 2)
-#define bit_Fast_Unaligned_Load		(1 << 4)
-#define bit_Prefer_PMINUB_for_stringop	(1 << 5)
-#define bit_AVX_Usable			(1 << 6)
-#define bit_FMA_Usable			(1 << 7)
-#define bit_FMA4_Usable			(1 << 8)
-#define bit_Slow_SSE4_2			(1 << 9)
-#define bit_AVX2_Usable			(1 << 10)
-#define bit_AVX_Fast_Unaligned_Load	(1 << 11)
-#define bit_AVX512F_Usable		(1 << 12)
-#define bit_AVX512DQ_Usable		(1 << 13)
-
-/* CPUID Feature flags.  */
-
-/* COMMON_CPUID_INDEX_1.  */
-#define bit_SSE2	(1 << 26)
-#define bit_SSSE3	(1 << 9)
-#define bit_SSE4_1	(1 << 19)
-#define bit_SSE4_2	(1 << 20)
-#define bit_OSXSAVE	(1 << 27)
-#define bit_AVX		(1 << 28)
-#define bit_POPCOUNT	(1 << 23)
-#define bit_FMA		(1 << 12)
-#define bit_FMA4	(1 << 16)
-
-/* COMMON_CPUID_INDEX_7.  */
-#define bit_RTM		(1 << 11)
-#define bit_AVX2	(1 << 5)
-#define bit_AVX512F	(1 << 16)
-#define bit_AVX512DQ	(1 << 17)
-
-/* XCR0 Feature flags.  */
-#define bit_XMM_state  (1 << 1)
-#define bit_YMM_state  (2 << 1)
-#define bit_Opmask_state	(1 << 5)
-#define bit_ZMM0_15_state	(1 << 6)
-#define bit_ZMM16_31_state	(1 << 7)
-
-/* The integer bit array index for the first set of internal feature bits.  */
-# define FEATURE_INDEX_1 0
-
-/* The current maximum size of the feature integer bit array.  */
-# define FEATURE_INDEX_MAX 1
-
-#ifdef	__ASSEMBLER__
-
-# include <ifunc-defines.h>
-
-# define index_SSE2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_EDX_OFFSET
-# define index_SSSE3	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_SSE4_1	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_SSE4_2	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_AVX	COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-# define index_AVX2	COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
-
-# define index_Fast_Rep_String		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Fast_Copy_Backward	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Slow_BSF			FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_FMA_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_FMA4_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX2_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX512F_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-# define index_AVX512DQ_Usable		FEATURE_INDEX_1*FEATURE_SIZE
-
-#else	/* __ASSEMBLER__ */
-
-# include <sys/param.h>
-
-enum
-  {
-    COMMON_CPUID_INDEX_1 = 0,
-    COMMON_CPUID_INDEX_7,
-    COMMON_CPUID_INDEX_80000001,	/* for AMD */
-    /* Keep the following line at the end.  */
-    COMMON_CPUID_INDEX_MAX
-  };
-
-extern struct cpu_features
-{
-  enum cpu_features_kind
-    {
-      arch_kind_unknown = 0,
-      arch_kind_intel,
-      arch_kind_amd,
-      arch_kind_other
-    } kind;
-  int max_cpuid;
-  struct cpuid_registers
-  {
-    unsigned int eax;
-    unsigned int ebx;
-    unsigned int ecx;
-    unsigned int edx;
-  } cpuid[COMMON_CPUID_INDEX_MAX];
-  unsigned int family;
-  unsigned int model;
-  unsigned int feature[FEATURE_INDEX_MAX];
-} __cpu_features attribute_hidden;
-
-
-extern void __init_cpu_features (void) attribute_hidden;
-# define INIT_ARCH() \
-  do							\
-    if (__cpu_features.kind == arch_kind_unknown)	\
-      __init_cpu_features ();				\
-  while (0)
-
-/* Used from outside libc.so to get access to the CPU features structure.  */
-extern const struct cpu_features *__get_cpu_features (void)
-     __attribute__ ((const));
-
-# if IS_IN (libc)
-#  define __get_cpu_features()	(&__cpu_features)
-# endif
-
-# define HAS_CPU_FEATURE(idx, reg, bit) \
-  ((__get_cpu_features ()->cpuid[idx].reg & (bit)) != 0)
-
-/* Following are the feature tests used throughout libc.  */
-
-/* CPUID_* evaluates to true if the feature flag is enabled.
-   We always use &__cpu_features because the HAS_CPUID_* macros
-   are called only within __init_cpu_features, where we can't
-   call __get_cpu_features without infinite recursion.  */
-# define HAS_CPUID_FLAG(idx, reg, bit) \
-  (((&__cpu_features)->cpuid[idx].reg & (bit)) != 0)
-
-# define CPUID_OSXSAVE \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_OSXSAVE)
-# define CPUID_AVX \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_AVX)
-# define CPUID_FMA \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_1, ecx, bit_FMA)
-# define CPUID_FMA4 \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_80000001, ecx, bit_FMA4)
-# define CPUID_RTM \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
-# define CPUID_AVX2 \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX2)
-# define CPUID_AVX512F \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX512F)
-# define CPUID_AVX512DQ \
-  HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX512DQ)
-
-/* HAS_* evaluates to true if we may use the feature at runtime.  */
-# define HAS_SSE2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, bit_SSE2)
-# define HAS_POPCOUNT	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_POPCOUNT)
-# define HAS_SSSE3	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSSE3)
-# define HAS_SSE4_1	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_1)
-# define HAS_SSE4_2	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_2)
-# define HAS_RTM	HAS_CPU_FEATURE (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
-
-# define index_Fast_Rep_String		FEATURE_INDEX_1
-# define index_Fast_Copy_Backward	FEATURE_INDEX_1
-# define index_Slow_BSF			FEATURE_INDEX_1
-# define index_Fast_Unaligned_Load	FEATURE_INDEX_1
-# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
-# define index_AVX_Usable		FEATURE_INDEX_1
-# define index_FMA_Usable		FEATURE_INDEX_1
-# define index_FMA4_Usable		FEATURE_INDEX_1
-# define index_Slow_SSE4_2		FEATURE_INDEX_1
-# define index_AVX2_Usable		FEATURE_INDEX_1
-# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1
-# define index_AVX512F_Usable		FEATURE_INDEX_1
-# define index_AVX512DQ_Usable		FEATURE_INDEX_1
-
-# define HAS_ARCH_FEATURE(name) \
-  ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
-
-# define HAS_FAST_REP_STRING		HAS_ARCH_FEATURE (Fast_Rep_String)
-# define HAS_FAST_COPY_BACKWARD		HAS_ARCH_FEATURE (Fast_Copy_Backward)
-# define HAS_SLOW_BSF			HAS_ARCH_FEATURE (Slow_BSF)
-# define HAS_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-# define HAS_AVX			HAS_ARCH_FEATURE (AVX_Usable)
-# define HAS_AVX2			HAS_ARCH_FEATURE (AVX2_Usable)
-# define HAS_AVX512F			HAS_ARCH_FEATURE (AVX512F_Usable)
-# define HAS_AVX512DQ			HAS_ARCH_FEATURE (AVX512DQ_Usable)
-# define HAS_FMA			HAS_ARCH_FEATURE (FMA_Usable)
-# define HAS_FMA4			HAS_ARCH_FEATURE (FMA4_Usable)
-# define HAS_AVX_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
-
-#endif	/* __ASSEMBLER__ */
+#ifdef  __ASSEMBLER__
+# include <cpu-features.h>
+#else
+# include <ldsodefs.h>
+#endif

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]