This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.



GNU C Library master sources branch hjl/plt/master created. glibc-2.21-684-g5b5763b


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/plt/master has been created
        at  5b5763b34b5c12c1c41315f0e823660d8bd5c698 (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5b5763b34b5c12c1c41315f0e823660d8bd5c698

commit 5b5763b34b5c12c1c41315f0e823660d8bd5c698
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Aug 3 08:00:40 2015 -0700

    Rename init-arch.c to dl-get-cpu-features.c
    
    Add testcases for __get_cpu_features

diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 11ce4ba..31bfd39 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -1,5 +1,4 @@
 ifeq ($(subdir),csu)
-aux += init-arch
 tests += test-multiarch
 gen-as-const-headers += ifunc-defines.sym
 endif
diff --git a/sysdeps/i386/i686/multiarch/Versions b/sysdeps/i386/i686/multiarch/Versions
deleted file mode 100644
index 59b185a..0000000
--- a/sysdeps/i386/i686/multiarch/Versions
+++ /dev/null
@@ -1,5 +0,0 @@
-libc {
-  GLIBC_PRIVATE {
-    __get_cpu_features;
-  }
-}
diff --git a/sysdeps/unix/sysv/linux/x86/Makefile b/sysdeps/unix/sysv/linux/x86/Makefile
index d6be472..9e6ec44 100644
--- a/sysdeps/unix/sysv/linux/x86/Makefile
+++ b/sysdeps/unix/sysv/linux/x86/Makefile
@@ -15,7 +15,6 @@ sysdep_headers += sys/elf.h sys/perm.h sys/reg.h sys/vm86.h sys/debugreg.h sys/i
 endif
 
 ifeq ($(subdir),nptl)
-libpthread-sysdep_routines += init-arch
 libpthread-sysdep_routines += elision-lock elision-unlock elision-timed \
 			      elision-trylock
 endif
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index f499308..0de4f42 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -1,3 +1,10 @@
 ifeq ($(subdir),csu)
 gen-as-const-headers += cpu-features-offsets.sym rtld-global-offsets.sym
 endif
+
+ifeq ($(subdir),elf)
+sysdep-dl-routines += dl-get-cpu-features
+
+tests += tst-get-cpu-features
+tests-static += tst-get-cpu-features-static
+endif
diff --git a/sysdeps/x86_64/multiarch/Versions b/sysdeps/x86/Versions
similarity index 87%
rename from sysdeps/x86_64/multiarch/Versions
rename to sysdeps/x86/Versions
index 59b185a..e029237 100644
--- a/sysdeps/x86_64/multiarch/Versions
+++ b/sysdeps/x86/Versions
@@ -1,4 +1,4 @@
-libc {
+ld {
   GLIBC_PRIVATE {
     __get_cpu_features;
   }
diff --git a/sysdeps/x86/cpu-features.h b/sysdeps/x86/cpu-features.h
index 9478e48..22e5abb 100644
--- a/sysdeps/x86/cpu-features.h
+++ b/sysdeps/x86/cpu-features.h
@@ -90,49 +90,53 @@
 # define index_AVX512F_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_AVX512DQ_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 
-# ifdef __x86_64__
-#  ifdef SHARED
-#   if IS_IN (rtld)
-#    define LOAD_RTLD_GLOBAL_RO_RDX
-#    define HAS_FEATURE(offset, name) \
+# if defined (_LIBC) && !IS_IN (nonlib)
+#  ifdef __x86_64__
+#   ifdef SHARED
+#    if IS_IN (rtld)
+#     define LOAD_RTLD_GLOBAL_RO_RDX
+#     define HAS_FEATURE(offset, name) \
   testl $(bit_##name), _rtld_local_ro+offset+(index_##name)(%rip)
-#   else
-#     define LOAD_RTLD_GLOBAL_RO_RDX \
+#    else
+#      define LOAD_RTLD_GLOBAL_RO_RDX \
   mov _rtld_global_ro@GOTPCREL(%rip), %RDX_LP
-#    define HAS_FEATURE(offset, name) \
+#     define HAS_FEATURE(offset, name) \
   testl $(bit_##name), \
 	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##name)(%rdx)
-#   endif
-#  else /* SHARED */
-#   define LOAD_RTLD_GLOBAL_RO_RDX
-#   define HAS_FEATURE(offset, name) \
+#    endif
+#   else /* SHARED */
+#    define LOAD_RTLD_GLOBAL_RO_RDX
+#    define HAS_FEATURE(offset, name) \
   testl $(bit_##name), _dl_x86_cpu_features+offset+(index_##name)(%rip)
-#  endif /* !SHARED */
-# else  /* __x86_64__ */
-#  ifdef SHARED
-#   define LOAD_FUNC_GOT_EAX(func) \
+#   endif /* !SHARED */
+#  else  /* __x86_64__ */
+#   ifdef SHARED
+#    define LOAD_FUNC_GOT_EAX(func) \
   leal func@GOTOFF(%edx), %eax
-#   if IS_IN (rtld)
+#    if IS_IN (rtld)
 #    define LOAD_GOT_AND_RTLD_GLOBAL_RO \
   LOAD_PIC_REG(dx)
-#    define HAS_FEATURE(offset, name) \
+#     define HAS_FEATURE(offset, name) \
   testl $(bit_##name), offset+(index_##name)+_rtld_local_ro@GOTOFF(%edx)
-#   else
-#    define LOAD_GOT_AND_RTLD_GLOBAL_RO \
+#    else
+#     define LOAD_GOT_AND_RTLD_GLOBAL_RO \
   LOAD_PIC_REG(dx); \
   mov _rtld_global_ro@GOT(%edx), %ecx
-#    define HAS_FEATURE(offset, name) \
+#     define HAS_FEATURE(offset, name) \
   testl $(bit_##name), \
 	RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+offset+(index_##name)(%ecx)
-#   endif
-#  else  /* SHARED */
-#   define LOAD_FUNC_GOT_EAX(func) \
+#    endif
+#   else  /* SHARED */
+#    define LOAD_FUNC_GOT_EAX(func) \
   leal func, %eax
-#   define LOAD_GOT_AND_RTLD_GLOBAL_RO
-#   define HAS_FEATURE(offset, name) \
+#    define LOAD_GOT_AND_RTLD_GLOBAL_RO
+#    define HAS_FEATURE(offset, name) \
   testl $(bit_##name), _dl_x86_cpu_features+offset+(index_##name)
-#  endif /* !SHARED */
-# endif /* !__x86_64__ */
+#   endif /* !SHARED */
+#  endif /* !__x86_64__ */
+# else /* _LIBC && !nonlib */
+#  error "Sorry, <cpu-features.h> is unimplemented for assembler"
+# endif /* !_LIBC || nonlib */
 
 /* HAS_* evaluates to true if we may use the feature at runtime.  */
 # define HAS_CPU_FEATURE(name)	HAS_FEATURE (CPUID_OFFSET, name)
@@ -171,15 +175,14 @@ struct cpu_features
   unsigned int feature[FEATURE_INDEX_MAX];
 };
 
-/* Unused for x86.  */
-# define INIT_ARCH()
-
 /* Used from outside of glibc to get access to the CPU features
    structure.  */
 extern const struct cpu_features *__get_cpu_features (void)
      __attribute__ ((const));
 
 # if defined (_LIBC) && !IS_IN (nonlib)
+/* Unused for x86.  */
+#  define INIT_ARCH()
 #  define __get_cpu_features()	(&GLRO(dl_x86_cpu_features))
 # endif
 
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86/dl-get-cpu-features.c
similarity index 81%
copy from sysdeps/x86_64/multiarch/init-arch.c
copy to sysdeps/x86/dl-get-cpu-features.c
index 01a379c..080e5e8 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86/dl-get-cpu-features.c
@@ -1,7 +1,5 @@
-/* Initialize CPU feature data.
-   This file is part of the GNU C Library.
-   Copyright (C) 2008-2015 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
+/* This file is part of the GNU C Library.
+   Copyright (C) 2015 Free Software Foundation, Inc.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
diff --git a/sysdeps/x86/tst-get-cpu-features-static.c b/sysdeps/x86/tst-get-cpu-features-static.c
new file mode 100644
index 0000000..03f5906
--- /dev/null
+++ b/sysdeps/x86/tst-get-cpu-features-static.c
@@ -0,0 +1 @@
+#include "tst-get-cpu-features.c"
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86/tst-get-cpu-features.c
similarity index 69%
rename from sysdeps/x86_64/multiarch/init-arch.c
rename to sysdeps/x86/tst-get-cpu-features.c
index 01a379c..c17060f 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86/tst-get-cpu-features.c
@@ -1,7 +1,6 @@
-/* Initialize CPU feature data.
+/* Test case for x86 __get_cpu_features interface
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Copyright (C) 2008-2015 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
 
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
@@ -17,13 +16,16 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#include <stdlib.h>
+#include <cpu-features.h>
 
-#include <ldsodefs.h>
-
-#undef __get_cpu_features
-
-const struct cpu_features *
-__get_cpu_features (void)
+static int
+do_test (void)
 {
-  return &GLRO(dl_x86_cpu_features);
+  if (__get_cpu_features ()->kind == arch_kind_unknown)
+    abort ();
+  return 0;
 }
+
+#define TEST_FUNCTION do_test ()
+#include "../../test-skeleton.c"
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d7002a9..d10b4d4 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,5 +1,4 @@
 ifeq ($(subdir),csu)
-aux += init-arch
 tests += test-multiarch
 gen-as-const-headers += ifunc-defines.sym
 endif
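
For reference, the new tst-get-cpu-features test added above is essentially a
smoke test of the __get_cpu_features interface. A standalone sketch of the same
kind of check (illustrative only; the header is glibc-internal, and the field
and constant names are taken from the cpu-features.h hunk above) could look
like this:

#include <stdlib.h>
#include <cpu-features.h>

static int
check_cpu_features (void)
{
  /* The dynamic linker fills in the cpu_features structure during startup,
     so the CPU kind should no longer be arch_kind_unknown by the time
     ordinary code runs.  */
  const struct cpu_features *cpu = __get_cpu_features ();

  if (cpu->kind == arch_kind_unknown)
    abort ();

  /* The raw CPUID-derived words are also available as
     cpu->feature[0] .. cpu->feature[FEATURE_INDEX_MAX - 1].  */
  return 0;
}

Note that the GLIBC_PRIVATE version of __get_cpu_features now lives in ld
rather than libc (see the Versions change above), so a check like this only
makes sense from within glibc itself or its test suite.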

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=83cf75b556f9b8dcbd95541a276ee9dfc5761c4f

commit 83cf75b556f9b8dcbd95541a276ee9dfc5761c4f
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Aug 2 22:27:47 2015 -0700

    Don't run tst-getpid2 with LD_BIND_NOW=1
    
    Since _dl_x86_64_save_sse and _dl_x86_64_restore_sse are removed now,
    we don't need to run tst-getpid2 with LD_BIND_NOW=1.
    
    	* sysdeps/unix/sysv/linux/Makefile (tst-getpid2-ENV): Removed.

diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile
index bfbabd4..2c67a66 100644
--- a/sysdeps/unix/sysv/linux/Makefile
+++ b/sysdeps/unix/sysv/linux/Makefile
@@ -193,9 +193,4 @@ endif
 
 ifeq ($(subdir),nptl)
 tests += tst-setgetname tst-align-clone tst-getpid1 tst-getpid2
-
-# In this test, we create a CLONE_VM "thread" that shares TLS storage
-# with the original thread. Both threads then race in ld.so with lazy PLT
-# resolution. Avoid this race by disabling lazy binding. BZ #11214.
-tst-getpid2-ENV = LD_BIND_NOW=1
 endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=708f2aac905ea757bcb6ff23532b6ccfc406236f

commit 708f2aac905ea757bcb6ff23532b6ccfc406236f
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 04:49:38 2015 -0700

    Use SSE optimized strcmp in x86-64 ld.so
    
    Since ld.so preserves vector registers now, we can use the SSE optimized
    strcmp in x86-64 ld.so.
    
    	* sysdeps/x86_64/strcmp.S: Remove "#if !IS_IN (libc)".

diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 1329649..1624b5d 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -29,13 +29,6 @@
 #endif
 
 #ifdef USE_AS_STRNCMP
-/* The simplified code below is not set up to handle strncmp() so far.
-   Should this become necessary it has to be implemented.  For now
-   just report the problem.  */
-# if !IS_IN (libc)
-#  error "strncmp not implemented so far"
-# endif
-
 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
    if the new counter > the old one or is 0.  */
 # define UPDATE_STRNCMP_COUNTER				\
@@ -50,20 +43,10 @@
 #elif defined USE_AS_STRCASECMP_L
 # include "locale-defines.h"
 
-/* No support for strcasecmp outside libc so far since it is not needed.  */
-# if !IS_IN (libc)
-#  error "strcasecmp_l not implemented so far"
-# endif
-
 # define UPDATE_STRNCMP_COUNTER
 #elif defined USE_AS_STRNCASECMP_L
 # include "locale-defines.h"
 
-/* No support for strncasecmp outside libc so far since it is not needed.  */
-# if !IS_IN (libc)
-#  error "strncasecmp_l not implemented so far"
-# endif
-
 # define UPDATE_STRNCMP_COUNTER				\
 	/* calculate left number to compare */		\
 	lea	-16(%rcx, %r11), %r9;			\
@@ -126,63 +109,44 @@ libc_hidden_def (__strncasecmp)
 #endif
 
 ENTRY (STRCMP)
-#if !IS_IN (libc)
-/* Simple version since we can't use SSE registers in ld.so.  */
-L(oop):	movb	(%rdi), %al
-	cmpb	(%rsi), %al
-	jne	L(neq)
-	incq	%rdi
-	incq	%rsi
-	testb	%al, %al
-	jnz	L(oop)
-
-	xorl	%eax, %eax
-	ret
-
-L(neq):	movl	$1, %eax
-	movl	$-1, %ecx
-	cmovbl	%ecx, %eax
-	ret
-END (STRCMP)
-#else	/* !IS_IN (libc) */
-# ifdef USE_AS_STRCASECMP_L
+#ifdef USE_AS_STRCASECMP_L
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
-#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
 	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
-#  else
+# else
 	mov	(%rdx), %RAX_LP
-#  endif
+# endif
 	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
 	jne	__strcasecmp_l_nonascii
-# elif defined USE_AS_STRNCASECMP_L
+#elif defined USE_AS_STRNCASECMP_L
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
-#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
 	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
-#  else
+# else
 	mov	(%rcx), %RAX_LP
-#  endif
+# endif
 	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
 	jne	__strncasecmp_l_nonascii
-# endif
+#endif
 
 /*
  * This implementation uses SSE to compare up to 16 bytes at a time.
  */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	test	%rdx, %rdx
 	je	LABEL(strcmp_exitz)
 	cmp	$1, %rdx
 	je	LABEL(Byte0)
 	mov	%rdx, %r11
-# endif
+#endif
 	mov	%esi, %ecx
 	mov	%edi, %eax
 /* Use 64bit AND here to avoid long NOP padding.  */
 	and	$0x3f, %rcx		/* rsi alignment in cache line */
 	and	$0x3f, %rax		/* rdi alignment in cache line */
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
 .Lbelowupper:
@@ -196,12 +160,12 @@ END (STRCMP)
 	.quad	0x2020202020202020
 	.previous
 	movdqa	.Lbelowupper(%rip), %xmm5
-#  define UCLOW_reg %xmm5
+# define UCLOW_reg %xmm5
 	movdqa	.Ltopupper(%rip), %xmm6
-#  define UCHIGH_reg %xmm6
+# define UCHIGH_reg %xmm6
 	movdqa	.Ltouppermask(%rip), %xmm7
-#  define LCQWORD_reg %xmm7
-# endif
+# define LCQWORD_reg %xmm7
+#endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
 	cmp	$0x30, %eax
@@ -210,8 +174,8 @@ END (STRCMP)
 	movlpd	(%rsi), %xmm2
 	movhpd	8(%rdi), %xmm1
 	movhpd	8(%rsi), %xmm2
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-#  define TOLOWER(reg1, reg2) \
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
 	movdqa	reg1, %xmm8;					\
 	movdqa	UCHIGH_reg, %xmm9;				\
 	movdqa	reg2, %xmm10;					\
@@ -227,9 +191,9 @@ END (STRCMP)
 	por	%xmm8, reg1;					\
 	por	%xmm10, reg2
 	TOLOWER (%xmm1, %xmm2)
-# else
-#  define TOLOWER(reg1, reg2)
-# endif
+#else
+# define TOLOWER(reg1, reg2)
+#endif
 	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
@@ -237,10 +201,10 @@ END (STRCMP)
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
 	jnz	LABEL(less16bytes)	/* If not, find different value or null char */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)	/* finish comparision */
-# endif
+#endif
 	add	$16, %rsi		/* prepare to search next 16 bytes */
 	add	$16, %rdi		/* prepare to search next 16 bytes */
 
@@ -282,13 +246,13 @@ LABEL(ashr_0):
 	movdqa	(%rsi), %xmm1
 	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
 	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
-# else
+#else
 	movdqa	(%rdi), %xmm2
 	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm2, %xmm1			/* compare 16 bytes for equality */
-# endif
+#endif
 	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
 	pmovmskb %xmm1, %r9d
 	shr	%cl, %edx			/* adjust 0xffff for offset */
@@ -321,10 +285,10 @@ LABEL(loop_ashr_0):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)		/* mismatch or null char seen */
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	(%rsi, %rcx), %xmm1
 	movdqa	(%rdi, %rcx), %xmm2
@@ -336,10 +300,10 @@ LABEL(loop_ashr_0):
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	jmp	LABEL(loop_ashr_0)
 
@@ -388,13 +352,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		 /* store for next cycle */
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -404,10 +368,10 @@ LABEL(gobble_ashr_1):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
 
@@ -418,13 +382,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		/* store for next cycle */
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -434,10 +398,10 @@ LABEL(gobble_ashr_1):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
 	jmp	LABEL(loop_ashr_1)
@@ -453,10 +417,10 @@ LABEL(nibble_ashr_1):
 	test	$0xfffe, %edx
 	jnz	LABEL(ashr_1_exittail)	/* find null char*/
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$15, %r11
 	jbe	LABEL(ashr_1_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10		/* substract 4K from %r10 */
@@ -518,13 +482,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -534,10 +498,10 @@ LABEL(gobble_ashr_2):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -549,13 +513,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -565,10 +529,10 @@ LABEL(gobble_ashr_2):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -581,10 +545,10 @@ LABEL(nibble_ashr_2):
 	test	$0xfffc, %edx
 	jnz	LABEL(ashr_2_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$14, %r11
 	jbe	LABEL(ashr_2_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -643,13 +607,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -659,10 +623,10 @@ LABEL(gobble_ashr_3):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -674,13 +638,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -690,10 +654,10 @@ LABEL(gobble_ashr_3):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -706,10 +670,10 @@ LABEL(nibble_ashr_3):
 	test	$0xfff8, %edx
 	jnz	LABEL(ashr_3_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$13, %r11
 	jbe	LABEL(ashr_3_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -768,13 +732,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -784,10 +748,10 @@ LABEL(gobble_ashr_4):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -799,13 +763,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -815,10 +779,10 @@ LABEL(gobble_ashr_4):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -831,10 +795,10 @@ LABEL(nibble_ashr_4):
 	test	$0xfff0, %edx
 	jnz	LABEL(ashr_4_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$12, %r11
 	jbe	LABEL(ashr_4_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -893,13 +857,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -909,10 +873,10 @@ LABEL(gobble_ashr_5):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -924,13 +888,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -940,10 +904,10 @@ LABEL(gobble_ashr_5):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -956,10 +920,10 @@ LABEL(nibble_ashr_5):
 	test	$0xffe0, %edx
 	jnz	LABEL(ashr_5_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$11, %r11
 	jbe	LABEL(ashr_5_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1018,13 +982,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1034,10 +998,10 @@ LABEL(gobble_ashr_6):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1049,13 +1013,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1065,10 +1029,10 @@ LABEL(gobble_ashr_6):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1081,10 +1045,10 @@ LABEL(nibble_ashr_6):
 	test	$0xffc0, %edx
 	jnz	LABEL(ashr_6_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$10, %r11
 	jbe	LABEL(ashr_6_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1143,13 +1107,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1159,10 +1123,10 @@ LABEL(gobble_ashr_7):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1174,13 +1138,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1190,10 +1154,10 @@ LABEL(gobble_ashr_7):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1206,10 +1170,10 @@ LABEL(nibble_ashr_7):
 	test	$0xff80, %edx
 	jnz	LABEL(ashr_7_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$9, %r11
 	jbe	LABEL(ashr_7_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1268,13 +1232,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1284,10 +1248,10 @@ LABEL(gobble_ashr_8):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1299,13 +1263,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1315,10 +1279,10 @@ LABEL(gobble_ashr_8):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1331,10 +1295,10 @@ LABEL(nibble_ashr_8):
 	test	$0xff00, %edx
 	jnz	LABEL(ashr_8_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$8, %r11
 	jbe	LABEL(ashr_8_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1393,13 +1357,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1409,10 +1373,10 @@ LABEL(gobble_ashr_9):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1424,13 +1388,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1440,10 +1404,10 @@ LABEL(gobble_ashr_9):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3		/* store for next cycle */
@@ -1456,10 +1420,10 @@ LABEL(nibble_ashr_9):
 	test	$0xfe00, %edx
 	jnz	LABEL(ashr_9_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$7, %r11
 	jbe	LABEL(ashr_9_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1518,13 +1482,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1534,10 +1498,10 @@ LABEL(gobble_ashr_10):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1549,13 +1513,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1565,10 +1529,10 @@ LABEL(gobble_ashr_10):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1581,10 +1545,10 @@ LABEL(nibble_ashr_10):
 	test	$0xfc00, %edx
 	jnz	LABEL(ashr_10_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$6, %r11
 	jbe	LABEL(ashr_10_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1643,13 +1607,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1659,10 +1623,10 @@ LABEL(gobble_ashr_11):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1674,13 +1638,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1690,10 +1654,10 @@ LABEL(gobble_ashr_11):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1706,10 +1670,10 @@ LABEL(nibble_ashr_11):
 	test	$0xf800, %edx
 	jnz	LABEL(ashr_11_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$5, %r11
 	jbe	LABEL(ashr_11_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1768,13 +1732,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1784,10 +1748,10 @@ LABEL(gobble_ashr_12):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1799,13 +1763,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1815,10 +1779,10 @@ LABEL(gobble_ashr_12):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1831,10 +1795,10 @@ LABEL(nibble_ashr_12):
 	test	$0xf000, %edx
 	jnz	LABEL(ashr_12_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$4, %r11
 	jbe	LABEL(ashr_12_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1893,13 +1857,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1909,10 +1873,10 @@ LABEL(gobble_ashr_13):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1924,13 +1888,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1940,10 +1904,10 @@ LABEL(gobble_ashr_13):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1956,10 +1920,10 @@ LABEL(nibble_ashr_13):
 	test	$0xe000, %edx
 	jnz	LABEL(ashr_13_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$3, %r11
 	jbe	LABEL(ashr_13_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2018,13 +1982,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2034,10 +1998,10 @@ LABEL(gobble_ashr_14):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2049,13 +2013,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2065,10 +2029,10 @@ LABEL(gobble_ashr_14):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2081,10 +2045,10 @@ LABEL(nibble_ashr_14):
 	test	$0xc000, %edx
 	jnz	LABEL(ashr_14_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$2, %r11
 	jbe	LABEL(ashr_14_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2145,13 +2109,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2161,10 +2125,10 @@ LABEL(gobble_ashr_15):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2176,13 +2140,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2192,10 +2156,10 @@ LABEL(gobble_ashr_15):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2208,10 +2172,10 @@ LABEL(nibble_ashr_15):
 	test	$0x8000, %edx
 	jnz	LABEL(ashr_15_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmpq	$1, %r11
 	jbe	LABEL(ashr_15_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2246,18 +2210,18 @@ LABEL(ret):
 LABEL(less16bytes):
 	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	%rdx, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	movzbl	(%rsi, %rdx), %ecx
 	movzbl	(%rdi, %rdx), %eax
 
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 	movl	(%rdx,%rcx,4), %ecx
 	movl	(%rdx,%rax,4), %eax
-# endif
+#endif
 
 	sub	%ecx, %eax
 	ret
@@ -2271,11 +2235,11 @@ LABEL(Byte0):
 	movzx	(%rsi), %ecx
 	movzx	(%rdi), %eax
 
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 	movl	(%rdx,%rcx,4), %ecx
 	movl	(%rdx,%rax,4), %eax
-# endif
+#endif
 
 	sub	%ecx, %eax
 	ret
@@ -2300,5 +2264,4 @@ LABEL(unaligned_table):
 	.int	LABEL(ashr_14) - LABEL(unaligned_table)
 	.int	LABEL(ashr_15) - LABEL(unaligned_table)
 	.int	LABEL(ashr_0) - LABEL(unaligned_table)
-#endif /* !IS_IN (libc) */
 libc_hidden_builtin_def (STRCMP)
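
The SSE path kept above compares 16 bytes per iteration with pcmpeqb/pmovmskb
and folds the NUL-byte check into the same mask with psubb. A rough C sketch of
that inner step using SSE2 intrinsics (illustrative only, not glibc code) is:

#include <emmintrin.h>

/* Compare one 16-byte block of s1 and s2 the way the assembly above does.
   Returns -1 if the block is identical and contains no NUL byte in s1;
   otherwise the index of the first mismatch or NUL.  */
static int
compare_block16 (const char *s1, const char *s2)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);

  __m128i nul  = _mm_cmpeq_epi8 (a, _mm_setzero_si128 ()); /* pcmpeqb vs 0 */
  __m128i eq   = _mm_cmpeq_epi8 (a, b);                     /* pcmpeqb a, b */
  __m128i both = _mm_sub_epi8 (eq, nul);                    /* psubb        */

  /* After the subtraction a lane keeps its high bit only if the bytes are
     equal and the byte is not NUL, so an all-ones mask (0xffff) means
     "keep going", exactly like the `sub $0xffff, %edx' test above.  */
  unsigned int mask = _mm_movemask_epi8 (both);
  if (mask == 0xffff)
    return -1;
  return __builtin_ctz (~mask & 0xffff);
}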

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=26d4813c6a5bfda63b578f83670f236cbf1c13fe

commit 26d4813c6a5bfda63b578f83670f236cbf1c13fe
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:56:14 2015 -0700

    Remove x86-64 rtld-xxx.c and rtld-xxx.S
    
    Since ld.so preserves vector registers now, we can use the regular,
    non-ifunc string and memory functions in ld.so.
    
    	* sysdeps/x86_64/rtld-memcmp.c: Removed.
    	* sysdeps/x86_64/rtld-memset.S: Likewise.
    	* sysdeps/x86_64/rtld-strchr.S: Likewise.
    	* sysdeps/x86_64/rtld-strlen.S: Likewise.
    	* sysdeps/x86_64/multiarch/rtld-memcmp.c: Likewise.
    	* sysdeps/x86_64/multiarch/rtld-memset.S: Likewise.

diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c
deleted file mode 100644
index 0f27135..0000000
--- a/sysdeps/x86_64/multiarch/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memcmp.c"
diff --git a/sysdeps/x86_64/multiarch/rtld-memset.S b/sysdeps/x86_64/multiarch/rtld-memset.S
deleted file mode 100644
index 8092aa0..0000000
--- a/sysdeps/x86_64/multiarch/rtld-memset.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memset.S"
diff --git a/sysdeps/x86_64/rtld-memcmp.c b/sysdeps/x86_64/rtld-memcmp.c
deleted file mode 100644
index 2ee4032..0000000
--- a/sysdeps/x86_64/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <string/memcmp.c>
diff --git a/sysdeps/x86_64/rtld-memset.S b/sysdeps/x86_64/rtld-memset.S
deleted file mode 100644
index f8df333..0000000
--- a/sysdeps/x86_64/rtld-memset.S
+++ /dev/null
@@ -1,37 +0,0 @@
-/* memset implementation for the dynamic linker.  This is separate from the
-   libc implementation to avoid writing to SSE registers.
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-/* void *memset (void *dest, char c, size_t count)
-   dest	 => %rdi
-   c	 => %rsi
-   count => %rdx  */
-ENTRY (memset)
-	mov	%rdx, %rcx
-	movzbl	%sil, %eax
-	mov	%rdi, %rdx
-	rep	stosb
-	mov	%rdx, %rax
-	ret
-END (memset)
-libc_hidden_builtin_def (memset)
diff --git a/sysdeps/x86_64/rtld-strchr.S b/sysdeps/x86_64/rtld-strchr.S
deleted file mode 100644
index cc694d7..0000000
--- a/sysdeps/x86_64/rtld-strchr.S
+++ /dev/null
@@ -1,288 +0,0 @@
-/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
-   For AMD x86-64.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-ENTRY (strchr)
-
-	/* Before we start with the main loop we process single bytes
-	   until the source pointer is aligned.  This has two reasons:
-	   1. aligned 64-bit memory access is faster
-	   and (more important)
-	   2. we process in the main loop 64 bit in one step although
-	      we don't know the end of the string.  But accessing at
-	      8-byte alignment guarantees that we never access illegal
-	      memory if this would not also be done by the trivial
-	      implementation (this is because all processor inherent
-	      boundaries are multiples of 8).  */
-
-	movq	%rdi, %rdx
-	andl	$7, %edx	/* Mask alignment bits  */
-	movq	%rdi, %rax	/* duplicate destination.  */
-	jz	1f		/* aligned => start loop */
-	neg	%edx
-	addl	$8, %edx	/* Align to 8 bytes.  */
-
-	/* Search the first bytes directly.  */
-0:	movb	(%rax), %cl	/* load byte  */
-	cmpb	%cl,%sil	/* compare byte.  */
-	je	6f		/* target found */
-	testb	%cl,%cl		/* is byte NUL? */
-	je	7f		/* yes => return NULL */
-	incq	%rax		/* increment pointer */
-	decl	%edx
-	jnz	0b
-
-
-1:
-	/* At the moment %rsi contains C.  What we need for the
-	   algorithm is C in all bytes of the register.  Avoid
-	   operations on 16 bit words because these require an
-	   prefix byte (and one more cycle).  */
-	/* Populate 8 bit data to full 64-bit.  */
-	movabs	$0x0101010101010101,%r9
-	movzbl	%sil,%edx
-	imul	%rdx,%r9
-
-	movq $0xfefefefefefefeff, %r8 /* Save magic.  */
-
-      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
-	 change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of QUARDWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.	If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24 tec..  If one of bits 54-63 is set, there will be a carry
-	 into bit 64 (=carry flag), so all of the hole bits will
-	 be changed.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-	.p2align 4
-4:
-	/* Main Loop is unrolled 4 times.  */
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => restart loop */
-
-
-7:	/* Return NULL.  */
-	xorl %eax, %eax
-	retq
-
-
-	/* We now scan for the byte in which the character was matched.
-	   But we have to take care of the case that a NUL char is
-	   found before this in the dword.  Note that we XORed %rcx
-	   with the byte we're looking for, therefore the tests below look
-	   reversed.  */
-
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
-	subq	$8,%rax		/* correct pointer increment.  */
-	testb %cl, %cl		/* is first byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is first byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is second byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is third byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is third byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is fourth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is fourth byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is fifth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is fifth byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is sixth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is sixth byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is seventh byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is seventh byte NUL? */
-	je 7b			/* yes => return NULL */
-
-	/* It must be in the eigth byte and it cannot be NUL.  */
-	incq %rax
-
-6:
-	nop
-	retq
-END (strchr)
-
-weak_alias (strchr, index)
-libc_hidden_builtin_def (strchr)
diff --git a/sysdeps/x86_64/rtld-strlen.S b/sysdeps/x86_64/rtld-strlen.S
deleted file mode 100644
index 1328652..0000000
--- a/sysdeps/x86_64/rtld-strlen.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/* strlen(str) -- determine the length of the string STR.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   Based on i486 version contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-ENTRY (strlen)
-	movq %rdi, %rcx		/* Duplicate source pointer. */
-	andl $7, %ecx		/* mask alignment bits */
-	movq %rdi, %rax		/* duplicate destination.  */
-	jz 1f			/* aligned => start loop */
-
-	neg %ecx		/* We need to align to 8 bytes.  */
-	addl $8,%ecx
-	/* Search the first bytes directly.  */
-0:	cmpb $0x0,(%rax)	/* is byte NUL? */
-	je 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-	decl %ecx
-	jnz 0b
-
-1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
-
-	.p2align 4		/* Align loop.  */
-4:	/* Main Loop is unrolled 4 times.  */
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => continue loop */
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0x00ff0000, %ecx /* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	testl $0xff000000, %ecx /* is fourth byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	shrq $32, %rcx		/* look at other half.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0xff0000, %ecx	/* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-2:
-	subq %rdi, %rax		/* compute difference to string start */
-	ret
-END (strlen)
-libc_hidden_builtin_def (strlen)
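
For readers following the deleted assembly, the per-word NUL test that the
rtld-strlen.S/rtld-strchr.S loops perform can be sketched in C roughly as
follows (an illustration of the same carry trick, not code from this patch;
the function name is made up):

    #include <stdint.h>

    /* Adding the magic constant 0xfefefefefefefeff to a word makes every
       non-zero byte generate a carry into the next byte, so a missing
       carry reveals a NUL byte somewhere in the word.  */
    static int
    word_has_nul (uint64_t word)
    {
      const uint64_t magic = 0xfefefefefefefeffULL;
      uint64_t sum = word + magic;

      if (sum > word)     /* No carry out of the top byte (the "jnc" path):
                             the word contains a NUL.  */
        return 1;

      /* (word+magic)^word, set all non-carry bits, add 1: non-zero iff
         at least one byte failed to produce a carry, i.e. a NUL byte.  */
      return ((sum ^ word) | magic) + 1 != 0;
    }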

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3f2511ec570058d78256d4a9523285a4089b7ed9

commit 3f2511ec570058d78256d4a9523285a4089b7ed9
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:47:54 2015 -0700

    Replace %xmm8 with %xmm0
    
    Since ld.so preserves vector registers now, we can use %xmm0 to avoid
    the REX prefix.
    
    	* sysdeps/x86_64/memset.S: Replace %xmm8 with %xmm0.
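
A rough illustration of the size saving (not part of the patch; the helper
below is made up, byte encodings noted for the legacy SSE form of pxor):

    /* %xmm8-%xmm15 need a REX prefix with legacy SSE encodings, so the
       %xmm0 form of the same instruction is one byte shorter.  */
    static inline void
    zero_xmm_example (void)
    {
      __asm__ volatile ("pxor %%xmm0, %%xmm0" ::: "xmm0"); /* 66 0f ef c0 */
      __asm__ volatile ("pxor %%xmm8, %%xmm8" ::: "xmm8"); /* 66 45 0f ef c0 */
    }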

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index e496254..3855cc8 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -24,7 +24,7 @@
 ENTRY(__bzero)
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	jmp	L(entry_from_bzero)
 END(__bzero)
 weak_alias (__bzero, bzero)
@@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
 ENTRY(__memset_tail)
 	movq	%rcx, %rax /* Set return value.  */
 
-	movd	%esi, %xmm8
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	movd	%esi, %xmm0
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 
 	jmp	L(entry_from_bzero)
 END(__memset_tail)
@@ -50,57 +50,57 @@ END_CHK (__memset_chk)
 #endif
 
 ENTRY (memset)
-	movd	%esi, %xmm8
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 L(entry_from_bzero):
 	cmpq	$64, %rdx
 	ja	L(loop_start)
 	cmpq	$16, %rdx
 	jbe	L(less_16_bytes)
 	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	%xmm8, -16(%rdi,%rdx)
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, -16(%rdi,%rdx)
 	ja	L(between_32_64_bytes)
 L(return):
 	rep
 	ret
 	.p2align 4
 L(between_32_64_bytes):
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi,%rdx)
 	ret
 	.p2align 4
 L(loop_start):
 	leaq	64(%rdi), %rcx
-	movdqu	%xmm8, (%rdi)
+	movdqu	%xmm0, (%rdi)
 	andq	$-64, %rcx
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	%xmm8, -64(%rdi,%rdx)
+	movdqu	%xmm0, -16(%rdi,%rdx)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi,%rdx)
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm0, -48(%rdi,%rdx)
+	movdqu	%xmm0, 48(%rdi)
+	movdqu	%xmm0, -64(%rdi,%rdx)
 	addq	%rdi, %rdx
 	andq	$-64, %rdx
 	cmpq	%rdx, %rcx
 	je	L(return)
 	.p2align 4
 L(loop):
-	movdqa	%xmm8, (%rcx)
-	movdqa	%xmm8, 16(%rcx)
-	movdqa	%xmm8, 32(%rcx)
-	movdqa	%xmm8, 48(%rcx)
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	movdqa	%xmm0, 32(%rcx)
+	movdqa	%xmm0, 48(%rcx)
 	addq	$64, %rcx
 	cmpq	%rcx, %rdx
 	jne	L(loop)
 	rep
 	ret
 L(less_16_bytes):
-	movq %xmm8, %rcx
+	movq %xmm0, %rcx
 	testb	$24, %dl
 	jne	L(between8_16bytes)
 	testb	$4, %dl

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a83593af1006873abd1e26ea944f6cf09f072e28

commit a83593af1006873abd1e26ea944f6cf09f072e28
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:44:39 2015 -0700

    Replace %xmm[8-12] with %xmm[0-4]
    
    Since ld.so preserves vector registers now, we can use %xmm[0-4] to
    avoid the REX prefix.
    
    	* sysdeps/x86_64/strlen.S: Replace %xmm[8-12] with %xmm[0-4].

diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index c382c8d..0725333 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -20,7 +20,7 @@
 
 /* Long lived register in strlen(s), strnlen(s, n) are:
 
-	%xmm11 - zero
+	%xmm3 - zero
 	%rdi   - s
 	%r10  (s+n) & (~(64-1))
 	%r11   s+n
@@ -32,14 +32,14 @@ ENTRY(strlen)
 
 /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
 #define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm8;	\
-	pcmpeqb	16(%rax), %xmm9;	\
-	pcmpeqb	32(%rax), %xmm10;	\
-	pcmpeqb	48(%rax), %xmm11;	\
-	pmovmskb	%xmm8, %esi;	\
-	pmovmskb	%xmm9, %edx;	\
-	pmovmskb	%xmm10, %r8d;	\
-	pmovmskb	%xmm11, %ecx;	\
+	pcmpeqb	(%rax), %xmm0;	\
+	pcmpeqb	16(%rax), %xmm1;	\
+	pcmpeqb	32(%rax), %xmm2;	\
+	pcmpeqb	48(%rax), %xmm3;	\
+	pmovmskb	%xmm0, %esi;	\
+	pmovmskb	%xmm1, %edx;	\
+	pmovmskb	%xmm2, %r8d;	\
+	pmovmskb	%xmm3, %ecx;	\
 	salq	$16, %rdx;	\
 	salq	$16, %rcx;	\
 	orq	%rsi, %rdx;	\
@@ -63,10 +63,10 @@ L(n_nonzero):
 	mov	%rsi, %r11
 #endif
 
-	pxor	%xmm8, %xmm8
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 	movq	%rdi, %rax
 	movq	%rdi, %rcx
 	andq	$4095, %rcx
@@ -103,9 +103,9 @@ L(n_nonzero):
 	FIND_ZERO
 #else
 	/* Test first 16 bytes unaligned.  */
-	movdqu	(%rax), %xmm12
-	pcmpeqb	%xmm8, %xmm12
-	pmovmskb	%xmm12, %edx
+	movdqu	(%rax), %xmm4
+	pcmpeqb	%xmm0, %xmm4
+	pmovmskb	%xmm4, %edx
 	test	%edx, %edx
 	je 	L(next48_bytes)
 	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
@@ -114,12 +114,12 @@ L(n_nonzero):
 L(next48_bytes):
 /* Same as FIND_ZERO except we do not check first 16 bytes.  */
 	andq	$-16, %rax
-	pcmpeqb 16(%rax), %xmm9
-	pcmpeqb 32(%rax), %xmm10
-	pcmpeqb 48(%rax), %xmm11
-	pmovmskb	%xmm9, %edx
-	pmovmskb	%xmm10, %r8d
-	pmovmskb	%xmm11, %ecx
+	pcmpeqb 16(%rax), %xmm1
+	pcmpeqb 32(%rax), %xmm2
+	pcmpeqb 48(%rax), %xmm3
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r8d
+	pmovmskb	%xmm3, %ecx
 	salq	$16, %rdx
 	salq	$16, %rcx
 	orq	%r8, %rcx
@@ -127,7 +127,7 @@ L(next48_bytes):
 	orq	%rcx, %rdx
 #endif
 
-	/* When no zero byte is found xmm9-11 are zero so we do not have to
+	/* When no zero byte is found xmm1-3 are zero so we do not have to
 	   zero them.  */
 	PROLOG(loop)
 
@@ -149,9 +149,9 @@ L(strnlen_ret):
 #endif
 	.p2align 4
 L(loop_init):
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 #ifdef AS_STRNLEN
 	.p2align 4
 L(loop):
@@ -160,12 +160,12 @@ L(loop):
 	cmpq	%rax, %r10
 	je	L(exit_end)
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	(%rax), %xmm0
+	pminub	16(%rax), %xmm0
+	pminub	32(%rax), %xmm0
+	pminub	48(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit)
 	jmp	L(loop)
@@ -174,7 +174,7 @@ L(loop):
 L(exit_end):
 	cmp	%rax, %r11
 	je	L(first) /* Do not read when end is at page boundary.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 L(first):
@@ -186,7 +186,7 @@ L(first):
 
 	.p2align 4
 L(exit):
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 	bsfq	%rdx, %rdx
@@ -200,23 +200,23 @@ L(exit):
 	.p2align 4
 L(loop):
 
-	movdqa	64(%rax), %xmm8
-	pminub	80(%rax), %xmm8
-	pminub	96(%rax), %xmm8
-	pminub	112(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	64(%rax), %xmm0
+	pminub	80(%rax), %xmm0
+	pminub	96(%rax), %xmm0
+	pminub	112(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit64)
 
 	subq	$-128, %rax
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	(%rax), %xmm0
+	pminub	16(%rax), %xmm0
+	pminub	32(%rax), %xmm0
+	pminub	48(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit0)
 	jmp	L(loop)
@@ -225,7 +225,7 @@ L(loop):
 L(exit64):
 	addq	$64, %rax
 L(exit0):
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 	bsfq	%rdx, %rdx

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b16fbdf77f3d815e19673478f32556ac8baf7d86

commit b16fbdf77f3d815e19673478f32556ac8baf7d86
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Jul 28 18:56:18 2015 -0700

    Don't disable SSE in x86-64 ld.so
    
    Since ld.so preserves vector registers now, we can use SSE in ld.so.
    
    	* sysdeps/i386/Makefile [$(subdir) == elf] (CFLAGS-.os): Add
    	-mno-sse -mno-mmx for $(all-rtld-routines).
    	[$(subdir) == elf] (tests-special): Add
    	$(objpfx)tst-ld-sse-use.out.
    	[$(subdir) == elf] ($(objpfx)tst-ld-sse-use.out): New rule.
    	* sysdeps/x86/Makefile [$(subdir) == elf] (CFLAGS-.os): Removed.
    	[$(subdir) == elf] (tests-special): Likewise.
    	[$(subdir) == elf] ($(objpfx)tst-ld-sse-use.out): Likewise.
    	* sysdeps/x86_64/Makefile [$(subdir) == elf] (CFLAGS-.os): Add
    	-mno-mmx for $(all-rtld-routines).

diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile
index 11f425d..2c08907 100644
--- a/sysdeps/i386/Makefile
+++ b/sysdeps/i386/Makefile
@@ -79,3 +79,14 @@ endif
 ifeq ($(subdir),csu)
 gen-as-const-headers += tlsdesc.sym
 endif
+
+ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+		   -mno-sse -mno-mmx)
+
+tests-special += $(objpfx)tst-ld-sse-use.out
+$(objpfx)tst-ld-sse-use.out: ../sysdeps/x86/tst-ld-sse-use.sh $(objpfx)ld.so
+	@echo "Checking ld.so for SSE register use.  This will take a few seconds..."
+	$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
+	$(evaluate-test)
+endif
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 61dfff3..f499308 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -1,14 +1,3 @@
-ifeq ($(subdir),elf)
-CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
-		   -mno-sse -mno-mmx)
-
-tests-special += $(objpfx)tst-ld-sse-use.out
-$(objpfx)tst-ld-sse-use.out: ../sysdeps/x86/tst-ld-sse-use.sh $(objpfx)ld.so
-	@echo "Checking ld.so for SSE register use.  This will take a few seconds..."
-	$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
-	$(evaluate-test)
-endif
-
 ifeq ($(subdir),csu)
 gen-as-const-headers += cpu-features-offsets.sym rtld-global-offsets.sym
 endif
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index f6ef064..e155691 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -19,6 +19,9 @@ gen-as-const-headers += locale-defines.sym
 endif
 
 ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+		   -mno-mmx)
+
 sysdep-dl-routines += tlsdesc dl-tlsdesc
 
 tests += ifuncmain8

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=442f3d779f4b711045eb48a8fc78016d57a9d760

commit 442f3d779f4b711045eb48a8fc78016d57a9d760
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat Jul 11 13:25:25 2015 -0700

    Save and restore vector registers in x86-64 ld.so
    
    This patch initializes GLRO(dl_x86_xstate) in dl_platform_init to
    indicate whether the processor supports SSE, AVX or AVX512.  It uses
    this information to properly save and restore vector registers in
    ld.so.  Now we can use SSE in ld.so and delete FOREIGN_CALL macros.
    
    	[BZ #15128]
    	* sysdeps/x86_64/Makefile [$(subdir) == elf] (tests): Add
    	ifuncmain8.
    	(modules-names): Add ifuncmod8.
    	($(objpfx)ifuncmain8): New rule.
    	* sysdeps/x86_64/dl-machine.h: Include <dl-procinfo.h> and
    	<cpuid.h>.
    	(elf_machine_runtime_setup): Use _dl_runtime_resolve_sse,
    	_dl_runtime_resolve_avx, or _dl_runtime_resolve_avx512,
    	_dl_runtime_profile_sse, _dl_runtime_profile_avx, or
    	_dl_runtime_profile_avx512, based on HAS_ARCH_FEATURE.
    	* sysdeps/x86_64/dl-trampoline.S: Rewrite.
    	* sysdeps/x86_64/dl-trampoline.h: Likewise.
    	* sysdeps/x86_64/ifuncmain8.c: New file.
    	* sysdeps/x86_64/ifuncmod8.c: Likewise.
    	* sysdeps/x86_64/nptl/tcb-offsets.sym (RTLD_SAVESPACE_SSE):
    	Removed.
    	* sysdeps/x86_64/nptl/tls.h (__128bits): Removed.
    	(tcbhead_t): Change rtld_must_xmm_save to __glibc_unused1.
    	Change rtld_savespace_sse to __glibc_unused2.
    	(RTLD_CHECK_FOREIGN_CALL): Removed.
    	(RTLD_ENABLE_FOREIGN_CALL): Likewise.
    	(RTLD_PREPARE_FOREIGN_CALL): Likewise.
    	(RTLD_FINALIZE_FOREIGN_CALL): Likewise.

diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index ef70a50..f6ef064 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -21,6 +21,11 @@ endif
 ifeq ($(subdir),elf)
 sysdep-dl-routines += tlsdesc dl-tlsdesc
 
+tests += ifuncmain8
+modules-names += ifuncmod8
+
+$(objpfx)ifuncmain8: $(objpfx)ifuncmod8.so
+
 tests += tst-quad1 tst-quad2
 modules-names += tst-quadmod1 tst-quadmod2
 
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index d22359d..32d25da 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -66,8 +66,12 @@ static inline int __attribute__ ((unused, always_inline))
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
   Elf64_Addr *got;
-  extern void _dl_runtime_resolve (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_profile (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
 
   if (l->l_info[DT_JMPREL] && lazy)
     {
@@ -95,7 +99,12 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
-	  *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile;
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -104,9 +113,17 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	    GL(dl_profile_map) = l;
 	}
       else
-	/* This function will get called to fix up the GOT entry indicated by
-	   the offset on the stack, and then jump to the resolved address.  */
-	*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve;
+	{
+	  /* This function will get called to fix up the GOT entry
+	     indicated by the offset on the stack, and then jump to
+	     the resolved address.  */
+	  if (HAS_ARCH_FEATURE (AVX512F_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	  else if (HAS_ARCH_FEATURE (AVX_Usable))
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
+	}
     }
 
   if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 678c57f..8475d26 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -20,23 +20,40 @@
 #include <sysdep.h>
 #include <link-defines.h>
 
-#if (RTLD_SAVESPACE_SSE % 32) != 0
-# error RTLD_SAVESPACE_SSE must be aligned to 32 bytes
+#ifndef DL_STACK_ALIGNMENT
+/* Due to GCC bug:
+
+   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+
+   __tls_get_addr may be called with 8-byte stack alignment.  Although
+   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
+   that stack will be always aligned at 16 bytes.  We use unaligned
+   16-byte move to load and store SSE registers, which has no penalty
+   on modern processors if stack is 16-byte aligned.  */
+# define DL_STACK_ALIGNMENT 8
+#endif
+
+#ifndef DL_RUNIME_UNALIGNED_VEC_SIZE
+/* The maximum size of unaligned vector load and store.  */
+# define DL_RUNIME_UNALIGNED_VEC_SIZE 16
 #endif
 
+/* True if _dl_runtime_resolve should align stack to VEC_SIZE bytes.  */
+#define DL_RUNIME_RESOLVE_REALIGN_STACK \
+  (VEC_SIZE > DL_STACK_ALIGNMENT \
+   && VEC_SIZE > DL_RUNIME_UNALIGNED_VEC_SIZE)
+
+/* Align vector register save area to 16 bytes.  */
+#define REGISTER_SAVE_VEC_OFF	0
+
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
-# define REGISTER_SAVE_AREA	(8 * 7)
-# define REGISTER_SAVE_RAX	0
+# define REGISTER_SAVE_RAX	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
 # define PRESERVE_BND_REGS_PREFIX
 #else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
-   BND1, BND2, BND3.  */
-# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4)
 /* Align bound register save area to 16 bytes.  */
-# define REGISTER_SAVE_BND0	0
+# define REGISTER_SAVE_BND0	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
 # define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
@@ -54,386 +71,53 @@
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
 
-	.text
-	.globl _dl_runtime_resolve
-	.type _dl_runtime_resolve, @function
-	.align 16
-	cfi_startproc
-_dl_runtime_resolve:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	subq $REGISTER_SAVE_AREA,%rsp
-	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
-	# Preserve registers otherwise clobbered.
-	movq %rax, REGISTER_SAVE_RAX(%rsp)
-	movq %rcx, REGISTER_SAVE_RCX(%rsp)
-	movq %rdx, REGISTER_SAVE_RDX(%rsp)
-	movq %rsi, REGISTER_SAVE_RSI(%rsp)
-	movq %rdi, REGISTER_SAVE_RDI(%rsp)
-	movq %r8, REGISTER_SAVE_R8(%rsp)
-	movq %r9, REGISTER_SAVE_R9(%rsp)
-#ifndef __ILP32__
-	# We also have to preserve bound registers.  These are nops if
-	# Intel MPX isn't available or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
-	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
-	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
-	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
-# else
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1b,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
-	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
-	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
-# endif
-#endif
-	# Copy args pushed by PLT in register.
-	# %rdi: link_map, %rsi: reloc_index
-	movq (REGISTER_SAVE_AREA + 8)(%rsp), %rsi
-	movq REGISTER_SAVE_AREA(%rsp), %rdi
-	call _dl_fixup		# Call resolver.
-	movq %rax, %r11		# Save return value
-#ifndef __ILP32__
-	# Restore bound registers.  These are nops if Intel MPX isn't
-	# avaiable or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
-	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
-	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
-	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
-# else
-	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
-	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1a,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
-# endif
+#define VEC_SIZE		64
+#define VMOVA			vmovdqa64
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			vmovdqa64
+#else
+# define VMOV			vmovdqu64
 #endif
-	# Get register content back.
-	movq REGISTER_SAVE_R9(%rsp), %r9
-	movq REGISTER_SAVE_R8(%rsp), %r8
-	movq REGISTER_SAVE_RDI(%rsp), %rdi
-	movq REGISTER_SAVE_RSI(%rsp), %rsi
-	movq REGISTER_SAVE_RDX(%rsp), %rdx
-	movq REGISTER_SAVE_RCX(%rsp), %rcx
-	movq REGISTER_SAVE_RAX(%rsp), %rax
-	# Adjust stack(PLT did 2 pushes)
-	addq $(REGISTER_SAVE_AREA + 16), %rsp
-	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
-	# Preserve bound registers.
-	PRESERVE_BND_REGS_PREFIX
-	jmp *%r11		# Jump to function address.
-	cfi_endproc
-	.size _dl_runtime_resolve, .-_dl_runtime_resolve
-
-
-#ifndef PROF
-	.globl _dl_runtime_profile
-	.type _dl_runtime_profile, @function
-	.align 16
-	cfi_startproc
-
-_dl_runtime_profile:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	/* The La_x86_64_regs data structure pointed to by the
-	   fourth paramater must be 16-byte aligned.  This must
-	   be explicitly enforced.  We have the set up a dynamically
-	   sized stack frame.  %rbx points to the top half which
-	   has a fixed size and preserves the original stack pointer.  */
-
-	subq $32, %rsp		# Allocate the local storage.
-	cfi_adjust_cfa_offset(32)
-	movq %rbx, (%rsp)
-	cfi_rel_offset(%rbx, 0)
-
-	/* On the stack:
-		56(%rbx)	parameter #1
-		48(%rbx)	return address
-
-		40(%rbx)	reloc index
-		32(%rbx)	link_map
-
-		24(%rbx)	La_x86_64_regs pointer
-		16(%rbx)	framesize
-		 8(%rbx)	rax
-		  (%rbx)	rbx
-	*/
-
-	movq %rax, 8(%rsp)
-	movq %rsp, %rbx
-	cfi_def_cfa_register(%rbx)
-
-	/* Actively align the La_x86_64_regs structure.  */
-	andq $0xfffffffffffffff0, %rsp
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
-	   to detect if any xmm0-xmm7 registers are changed by audit
-	   module.  */
-	subq $(LR_SIZE + XMM_SIZE*8), %rsp
-# else
-	subq $LR_SIZE, %rsp		# sizeof(La_x86_64_regs)
-# endif
-	movq %rsp, 24(%rbx)
-
-	/* Fill the La_x86_64_regs structure.  */
-	movq %rdx, LR_RDX_OFFSET(%rsp)
-	movq %r8,  LR_R8_OFFSET(%rsp)
-	movq %r9,  LR_R9_OFFSET(%rsp)
-	movq %rcx, LR_RCX_OFFSET(%rsp)
-	movq %rsi, LR_RSI_OFFSET(%rsp)
-	movq %rdi, LR_RDI_OFFSET(%rsp)
-	movq %rbp, LR_RBP_OFFSET(%rsp)
-
-	leaq 48(%rbx), %rax
-	movq %rax, LR_RSP_OFFSET(%rsp)
-
-	/* We always store the XMM registers even if AVX is available.
-	   This is to provide backward binary compatibility for existing
-	   audit modules.  */
-	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
-	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
-	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
-	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
-	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
-	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
-	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
-	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
-
-# ifndef __ILP32__
-#  ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
-	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
-	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
-	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
-#  else
-	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-#  endif
-# endif
-
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	.data
-L(have_avx):
-	.zero 4
-	.size L(have_avx), 4
-	.previous
-
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	10f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in processor?
-	movq	%rbx, %r11		# Save rbx
-	xorl	%ecx, %ecx
-	mov	$0x7, %eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	20f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	cmpl	$0xe6, %eax
-	jne	20f
-	movl	%eax, L(have_avx)(%rip)
-L(avx512):
-#   define RESTORE_AVX
-#   define VMOV    vmovdqu64
-#   define VEC(i)  zmm##i
-#   define MORE_CODE
-#   include "dl-trampoline.h"
-#   undef VMOV
-#   undef VEC
-#   undef RESTORE_AVX
-#  endif
-20:	andl	$0x6, %eax
-10:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined):
-	js	L(no_avx)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512)
-#  endif
-
-#  define RESTORE_AVX
-#  define VMOV    vmovdqu
-#  define VEC(i)  ymm##i
-#  define MORE_CODE
-#  include "dl-trampoline.h"
-
-	.align 16
-L(no_avx):
-# endif
-
-# undef RESTORE_AVX
-# include "dl-trampoline.h"
-
-	cfi_endproc
-	.size _dl_runtime_profile, .-_dl_runtime_profile
+#define VEC(i)			zmm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_avx512
+#define _dl_runtime_profile	_dl_runtime_profile_avx512
+#define RESTORE_AVX
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VEC
+#undef VMOV
+#undef VMOVA
+#undef VEC_SIZE
+
+#define VEC_SIZE		32
+#define VMOVA			vmovdqa
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			vmovdqa
+#else
+# define VMOV			vmovdqu
 #endif
-
-
-#ifdef SHARED
-	.globl _dl_x86_64_save_sse
-	.type _dl_x86_64_save_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_save_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined_5)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	1f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in a processor?
-	movq	%rbx, %r11              # Save rbx
-	xorl	%ecx,%ecx
-	mov	$0x7,%eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	2f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0xe6, %eax
-	je	L(avx512_5)
-#  endif
-
-2:	andl	$0x6, %eax
-1:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined_5):
-	js	L(no_avx5)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_5)
-#  endif
-
-	vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
-	vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
-	vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
-	vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE
-	vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE
-	vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE
-	vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
-	vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_5):
-	vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
-	vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
-	vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
-	vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
-	vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
-	vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
-	vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
-	vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
-	ret
-#  endif
-L(no_avx5):
-# endif
-	movdqa	%xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
-	movdqa	%xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE
-	movdqa	%xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE
-	movdqa	%xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE
-	movdqa	%xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE
-	movdqa	%xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE
-	movdqa	%xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE
-	movdqa	%xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE
-	ret
-	cfi_endproc
-	.size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse
-
-
-	.globl _dl_x86_64_restore_sse
-	.type _dl_x86_64_restore_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_restore_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx6)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_6)
-#  endif
-
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_6):
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
-	ret
-#  endif
-L(no_avx6):
-# endif
-	movdqa	%fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
-	movdqa	%fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1
-	movdqa	%fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2
-	movdqa	%fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3
-	movdqa	%fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4
-	movdqa	%fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5
-	movdqa	%fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6
-	movdqa	%fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7
-	ret
-	cfi_endproc
-	.size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse
+#define VEC(i)			ymm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_avx
+#define _dl_runtime_profile	_dl_runtime_profile_avx
+#include "dl-trampoline.h"
+#undef _dl_runtime_resolve
+#undef _dl_runtime_profile
+#undef VEC
+#undef VMOV
+#undef VMOVA
+#undef VEC_SIZE
+
+/* movaps/movups is 1-byte shorter.  */
+#define VEC_SIZE		16
+#define VMOVA			movaps
+#if DL_RUNIME_RESOLVE_REALIGN_STACK || VEC_SIZE <= DL_STACK_ALIGNMENT
+# define VMOV			movaps
+#else
+# define VMOV			movups
 #endif
+#define VEC(i)			xmm##i
+#define _dl_runtime_resolve	_dl_runtime_resolve_sse
+#define _dl_runtime_profile	_dl_runtime_profile_sse
+#undef RESTORE_AVX
+#include "dl-trampoline.h"
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index d542428..dd6d7c7 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -1,5 +1,4 @@
-/* Partial PLT profile trampoline to save and restore x86-64 vector
-   registers.
+/* PLT trampolines.  x86-64 version.
    Copyright (C) 2009-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,16 +16,252 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef RESTORE_AVX
+#undef REGISTER_SAVE_AREA_RAW
+#ifdef __ILP32__
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
+   VEC7.  */
+# define REGISTER_SAVE_AREA_RAW	(8 * 7 + VEC_SIZE * 8)
+#else
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
+# define REGISTER_SAVE_AREA_RAW	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
+#endif
+
+#undef REGISTER_SAVE_AREA
+#undef LOCAL_STORAGE_AREA
+#undef BASE
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+# define REGISTER_SAVE_AREA	(REGISTER_SAVE_AREA_RAW + 8)
+/* Local stack area before jumping to function address: RBX.  */
+# define LOCAL_STORAGE_AREA	8
+# define BASE			rbx
+# if (REGISTER_SAVE_AREA % VEC_SIZE) != 0
+#  error REGISTER_SAVE_AREA must be multples of VEC_SIZE
+# endif
+#else
+# define REGISTER_SAVE_AREA	REGISTER_SAVE_AREA_RAW
+/* Local stack area before jumping to function address:  All saved
+   registers.  */
+# define LOCAL_STORAGE_AREA	REGISTER_SAVE_AREA
+# define BASE			rsp
+# if (REGISTER_SAVE_AREA % 16) != 8
+#  error REGISTER_SAVE_AREA must be odd multples of 8
+# endif
+#endif
+
+	.text
+	.globl _dl_runtime_resolve
+	.hidden _dl_runtime_resolve
+	.type _dl_runtime_resolve, @function
+	.align 16
+	cfi_startproc
+_dl_runtime_resolve:
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+# if LOCAL_STORAGE_AREA != 8
+#  error LOCAL_STORAGE_AREA must be 8
+# endif
+	pushq %rbx			# push subtracts stack by 8.
+	cfi_adjust_cfa_offset(8)
+	cfi_rel_offset(%rbx, 0)
+	mov %RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+	and $-VEC_SIZE, %RSP_LP
+#endif
+	sub $REGISTER_SAVE_AREA, %RSP_LP
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+	# Preserve registers otherwise clobbered.
+	movq %rax, REGISTER_SAVE_RAX(%rsp)
+	movq %rcx, REGISTER_SAVE_RCX(%rsp)
+	movq %rdx, REGISTER_SAVE_RDX(%rsp)
+	movq %rsi, REGISTER_SAVE_RSI(%rsp)
+	movq %rdi, REGISTER_SAVE_RDI(%rsp)
+	movq %r8, REGISTER_SAVE_R8(%rsp)
+	movq %r9, REGISTER_SAVE_R9(%rsp)
+	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
+	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
+	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
+	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
+	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
+	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
+	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
+	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
+#ifndef __ILP32__
+	# We also have to preserve bound registers.  These are nops if
+	# Intel MPX isn't available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
+	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
+	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
+	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
+# else
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
+	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
+# endif
+#endif
+	# Copy args pushed by PLT in register.
+	# %rdi: link_map, %rsi: reloc_index
+	mov (LOCAL_STORAGE_AREA + 8)(%BASE), %RSI_LP
+	mov LOCAL_STORAGE_AREA(%BASE), %RDI_LP
+	call _dl_fixup		# Call resolver.
+	mov %RAX_LP, %R11_LP	# Save return value
+#ifndef __ILP32__
+	# Restore bound registers.  These are nops if Intel MPX isn't
+	# avaiable or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
+	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
+	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
+	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
+# else
+	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
+	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+# endif
+#endif
+	# Get register content back.
+	movq REGISTER_SAVE_R9(%rsp), %r9
+	movq REGISTER_SAVE_R8(%rsp), %r8
+	movq REGISTER_SAVE_RDI(%rsp), %rdi
+	movq REGISTER_SAVE_RSI(%rsp), %rsi
+	movq REGISTER_SAVE_RDX(%rsp), %rdx
+	movq REGISTER_SAVE_RCX(%rsp), %rcx
+	movq REGISTER_SAVE_RAX(%rsp), %rax
+	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
+#if DL_RUNIME_RESOLVE_REALIGN_STACK
+	mov %RBX_LP, %RSP_LP
+	cfi_def_cfa_register(%rsp)
+	movq (%rsp), %rbx
+	cfi_restore(%rbx)
+#endif
+	# Adjust stack(PLT did 2 pushes)
+	add $(LOCAL_STORAGE_AREA + 16), %RSP_LP
+	cfi_adjust_cfa_offset(-(LOCAL_STORAGE_AREA + 16))
+	# Preserve bound registers.
+	PRESERVE_BND_REGS_PREFIX
+	jmp *%r11		# Jump to function address.
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+# if (LR_VECTOR_OFFSET % VEC_SIZE) != 0
+#  error LR_VECTOR_OFFSET must be multples of VEC_SIZE
+# endif
+
+	.globl _dl_runtime_profile
+	.hidden _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	.align 16
+_dl_runtime_profile:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	/* The La_x86_64_regs data structure pointed to by the
+	   fourth paramater must be VEC_SIZE-byte aligned.  This must
+	   be explicitly enforced.  We have the set up a dynamically
+	   sized stack frame.  %rbx points to the top half which
+	   has a fixed size and preserves the original stack pointer.  */
+
+	sub $32, %RSP_LP	# Allocate the local storage.
+	cfi_adjust_cfa_offset(32)
+	movq %rbx, (%rsp)
+	cfi_rel_offset(%rbx, 0)
+
+	/* On the stack:
+		56(%rbx)	parameter #1
+		48(%rbx)	return address
+
+		40(%rbx)	reloc index
+		32(%rbx)	link_map
+
+		24(%rbx)	La_x86_64_regs pointer
+		16(%rbx)	framesize
+		 8(%rbx)	rax
+		  (%rbx)	rbx
+	*/
+
+	movq %rax, 8(%rsp)
+	mov %RSP_LP, %RBX_LP
+	cfi_def_cfa_register(%rbx)
+
+	/* Actively align the La_x86_64_regs structure.  */
+	and $-VEC_SIZE, %RSP_LP
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
+	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
+	   to detect if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	sub $(LR_SIZE + XMM_SIZE*8), %RSP_LP
+# else
+	sub $LR_SIZE, %RSP_LP		# sizeof(La_x86_64_regs)
+# endif
+	movq %rsp, 24(%rbx)
+
+	/* Fill the La_x86_64_regs structure.  */
+	movq %rdx, LR_RDX_OFFSET(%rsp)
+	movq %r8,  LR_R8_OFFSET(%rsp)
+	movq %r9,  LR_R9_OFFSET(%rsp)
+	movq %rcx, LR_RCX_OFFSET(%rsp)
+	movq %rsi, LR_RSI_OFFSET(%rsp)
+	movq %rdi, LR_RDI_OFFSET(%rsp)
+	movq %rbp, LR_RBP_OFFSET(%rsp)
+
+	lea 48(%rbx), %RAX_LP
+	movq %rax, LR_RSP_OFFSET(%rsp)
+
+	/* We always store the XMM registers even if AVX is available.
+	   This is to provide backward binary compatibility for existing
+	   audit modules.  */
+	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
+	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
+	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
+	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
+	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
+	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
+#  else
+	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
+# endif
+
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
-	VMOV %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
-	VMOV %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
-	VMOV %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
-	VMOV %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
-	VMOV %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
-	VMOV %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
-	VMOV %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
-	VMOV %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
+	VMOVA %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
+	VMOVA %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
+	VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
+	VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
+	VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
+	VMOVA %VEC(5), (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
+	VMOVA %VEC(6), (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
+	VMOVA %VEC(7), (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
 
 	/* Save xmm0-xmm7 registers to detect if any of them are
 	   changed by audit module.  */
@@ -38,7 +273,7 @@
 	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
 	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
 	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-#endif
+# endif
 
 	mov %RSP_LP, %RCX_LP	# La_x86_64_regs pointer to %rcx.
 	mov 48(%rbx), %RDX_LP	# Load return address if needed.
@@ -63,7 +298,7 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
 	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
@@ -72,7 +307,7 @@
 	je 2f
 	vmovdqa	%xmm0, (LR_VECTOR_OFFSET)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
+2:	VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
 	vmovdqa	%xmm0, (LR_XMM_OFFSET)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
@@ -81,7 +316,7 @@
 	je 2f
 	vmovdqa	%xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
 	vmovdqa	%xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
@@ -90,7 +325,7 @@
 	je 2f
 	vmovdqa	%xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
 	vmovdqa	%xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
@@ -99,7 +334,7 @@
 	je 2f
 	vmovdqa	%xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
 	vmovdqa	%xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
@@ -108,7 +343,7 @@
 	je 2f
 	vmovdqa	%xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
 	vmovdqa	%xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
@@ -117,7 +352,7 @@
 	je 2f
 	vmovdqa	%xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
 	vmovdqa	%xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
@@ -126,7 +361,7 @@
 	je 2f
 	vmovdqa	%xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
 	vmovdqa	%xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
 
 1:	vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
@@ -135,25 +370,25 @@
 	je 2f
 	vmovdqa	%xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
 	jmp 1f
-2:	VMOV (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
+2:	VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp), %VEC(7)
 	vmovdqa	%xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
 	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
 	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
 	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
+#  else
 	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
 	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
 	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
 	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
 # endif
-#endif
 
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
@@ -168,12 +403,12 @@
 	movq LR_RSI_OFFSET(%rsp), %rsi
 	movq LR_RDI_OFFSET(%rsp), %rdi
 
-	movq %rbx, %rsp
+	mov %RBX_LP, %RSP_LP
 	movq (%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore(%rbx)
 	cfi_def_cfa_register(%rsp)
 
-	addq $48, %rsp		# Adjust the stack to the return value
+	add $48, %RSP_LP	# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
 	PRESERVE_BND_REGS_PREFIX
@@ -189,13 +424,13 @@
 	   temporary buffer of the size specified by the 'framesize'
 	   returned from _dl_profile_fixup */
 
-	leaq LR_RSP_OFFSET(%rbx), %rsi	# stack
-	addq $8, %r10
-	andq $0xfffffffffffffff0, %r10
-	movq %r10, %rcx
-	subq %r10, %rsp
-	movq %rsp, %rdi
-	shrq $3, %rcx
+	lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack
+	add $8, %R10_LP
+	and $-16, %R10_LP
+	mov %R10_LP, %RCX_LP
+	sub %R10_LP, %RSP_LP
+	mov %RSP_LP, %RDI_LP
+	shr $3, %RCX_LP
 	rep
 	movsq
 
@@ -206,21 +441,21 @@
 	PRESERVE_BND_REGS_PREFIX
 	call *%r11
 
-	mov 24(%rbx), %rsp	# Drop the copied stack content
+	mov 24(%rbx), %RSP_LP	# Drop the copied stack content
 
 	/* Now we have to prepare the La_x86_64_retval structure for the
 	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
 	   so we just need to allocate the sizeof(La_x86_64_retval) space on
 	   the stack, since the alignment has already been taken care of. */
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
 	   registers to detect if xmm0/xmm1 registers are changed
 	   by audit module.  */
-	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-#else
-	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
-#endif
-	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
+	sub $(LRV_SIZE + XMM_SIZE*2), %RSP_LP
+# else
+	sub $LRV_SIZE, %RSP_LP	# sizeof(La_x86_64_retval)
+# endif
+	mov %RSP_LP, %RCX_LP	# La_x86_64_retval argument to %rcx.
 
 	/* Fill in the La_x86_64_retval structure.  */
 	movq %rax, LRV_RAX_OFFSET(%rcx)
@@ -229,26 +464,26 @@
 	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
 	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
-	VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
-	VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
+	VMOVA %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
+	VMOVA %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
 
 	/* Save xmm0/xmm1 registers to detect if they are changed
 	   by audit module.  */
 	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
 	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov %bnd0, LRV_BND0_OFFSET(%rcx)  # Preserve returned bounds.
 	bndmov %bnd1, LRV_BND1_OFFSET(%rcx)
-# else
+#  else
 	.byte  0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fstpt LRV_ST0_OFFSET(%rcx)
 	fstpt LRV_ST1_OFFSET(%rcx)
@@ -265,50 +500,47 @@
 	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
 	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if xmm0/xmm1 registers are changed by audit module.  */
 	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	jne 1f
-	VMOV LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
+	VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
 
 1:	vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
 	vpmovmskb %xmm2, %esi
 	cmpl $0xffff, %esi
 	jne 1f
-	VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
+	VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
 	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
-# else
+#  else
 	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fldt LRV_ST1_OFFSET(%rsp)
 	fldt LRV_ST0_OFFSET(%rsp)
 
-	movq %rbx, %rsp
+	mov %RBX_LP, %RSP_LP
 	movq (%rsp), %rbx
-	cfi_restore(rbx)
+	cfi_restore(%rbx)
 	cfi_def_cfa_register(%rsp)
 
-	addq $48, %rsp		# Adjust the stack to the return value
+	add $48, %RSP_LP	# Adjust the stack to the return value
 				# (eats the reloc index and link_map)
 	cfi_adjust_cfa_offset(-48)
 	PRESERVE_BND_REGS_PREFIX
 	retq
 
-#ifdef MORE_CODE
-	cfi_adjust_cfa_offset(48)
-	cfi_rel_offset(%rbx, 0)
-	cfi_def_cfa_register(%rbx)
-# undef MORE_CODE
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
diff --git a/sysdeps/x86_64/ifuncmain8.c b/sysdeps/x86_64/ifuncmain8.c
new file mode 100644
index 0000000..f75771d
--- /dev/null
+++ b/sysdeps/x86_64/ifuncmain8.c
@@ -0,0 +1,32 @@
+/* Test IFUNC selector with floating-point parameters.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+
+extern float foo (float);
+
+static int
+do_test (void)
+{
+  if (foo (2) != 3)
+    abort ();
+  return 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/sysdeps/x86_64/ifuncmod8.c b/sysdeps/x86_64/ifuncmod8.c
new file mode 100644
index 0000000..741aa13
--- /dev/null
+++ b/sysdeps/x86_64/ifuncmod8.c
@@ -0,0 +1,36 @@
+/* Test IFUNC selector with floating-point parameters.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <emmintrin.h>
+
+void * foo_ifunc (void) __asm__ ("foo");
+__asm__(".type foo, %gnu_indirect_function");
+
+static float
+foo_impl (float x)
+{
+  return x + 1;
+}
+
+void *
+foo_ifunc (void)
+{
+  __m128i xmm = _mm_set1_epi32 (-1);
+  asm volatile ("movdqa %0, %%xmm0" : : "x" (xmm) : "xmm0" );
+  return foo_impl;
+}
diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
index 729d1da..aeb7526 100644
--- a/sysdeps/x86_64/nptl/tcb-offsets.sym
+++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
@@ -16,7 +16,6 @@ VGETCPU_CACHE_OFFSET	offsetof (tcbhead_t, vgetcpu_cache)
 #ifndef __ASSUME_PRIVATE_FUTEX
 PRIVATE_FUTEX		offsetof (tcbhead_t, private_futex)
 #endif
-RTLD_SAVESPACE_SSE	offsetof (tcbhead_t, rtld_savespace_sse)
 
 -- Not strictly offsets, but these values are also used in the TCB.
 TCB_CANCELSTATE_BITMASK	 CANCELSTATE_BITMASK
diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index d7543c6..b73e7ed 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -67,14 +67,15 @@ typedef struct
 # else
   int __glibc_reserved1;
 # endif
-  int rtld_must_xmm_save;
+  int __glibc_unused1;
   /* Reservation of some values for the TM ABI.  */
   void *__private_tm[4];
   /* GCC split stack support.  */
   void *__private_ss;
   long int __glibc_reserved2;
-  /* Have space for the post-AVX register size.  */
-  __128bits rtld_savespace_sse[8][4] __attribute__ ((aligned (32)));
+  /* Must be kept even if it is no longer used by glibc since programs,
+     like AddressSanitizer, depend on the size of tcbhead_t.  */
+  __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32)));
 
   void *__padding[8];
 } tcbhead_t;
@@ -384,41 +385,6 @@ typedef struct
 # define THREAD_GSCOPE_WAIT() \
   GL(dl_wait_lookup_done) ()
 
-
-# ifdef SHARED
-/* Defined in dl-trampoline.S.  */
-extern void _dl_x86_64_save_sse (void);
-extern void _dl_x86_64_restore_sse (void);
-
-# define RTLD_CHECK_FOREIGN_CALL \
-  (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
-
-/* NB: Don't use the xchg operation because that would imply a lock
-   prefix which is expensive and unnecessary.  The cache line is also
-   not contested at all.  */
-#  define RTLD_ENABLE_FOREIGN_CALL \
-  int old_rtld_must_xmm_save = THREAD_GETMEM (THREAD_SELF,		      \
-					      header.rtld_must_xmm_save);     \
-  THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)
-
-#  define RTLD_PREPARE_FOREIGN_CALL \
-  do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save))	      \
-    {									      \
-      _dl_x86_64_save_sse ();						      \
-      THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0);	      \
-    }									      \
-  while (0)
-
-#  define RTLD_FINALIZE_FOREIGN_CALL \
-  do {									      \
-    if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0)	      \
-      _dl_x86_64_restore_sse ();					      \
-    THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save,		      \
-		   old_rtld_must_xmm_save);				      \
-  } while (0)
-# endif
-
-
 #endif /* __ASSEMBLER__ */
 
 #endif	/* tls.h */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1ca5bb347ba585e3393c654cebac0c015a08d47b

commit 1ca5bb347ba585e3393c654cebac0c015a08d47b
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:41:20 2015 -0700

    Align stack when calling __errno_location
    
    We should align the stack to 16 bytes when calling __errno_location.
    
    	[BZ #18661]
    	* sysdeps/x86_64/fpu/s_cosf.S (__cosf): Align stack to 16 bytes
    	when calling __errno_location.
    	* sysdeps/x86_64/fpu/s_sincosf.S (__sincosf): Likewise.
    	* sysdeps/x86_64/fpu/s_sinf.S (__sinf): Likewise.
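
A quick note on the arithmetic behind this fix (a sketch, assuming no
callee-saved registers are still pushed at this point, which the
surrounding code suggests): the x86-64 psABI keeps %rsp 16-byte aligned
at every call instruction, so on entry to __cosf the pushed return
address leaves %rsp at 16n+8.  Subtracting another 8 bytes therefore
brings it back to a 16-byte boundary before calling __errno_location,
and the matching addq restores the original offset afterwards.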

diff --git a/sysdeps/x86_64/fpu/s_cosf.S b/sysdeps/x86_64/fpu/s_cosf.S
index b7868ce..bea10ef 100644
--- a/sysdeps/x86_64/fpu/s_cosf.S
+++ b/sysdeps/x86_64/fpu/s_cosf.S
@@ -310,8 +310,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_sincosf.S b/sysdeps/x86_64/fpu/s_sincosf.S
index 21db70a..a2f3133 100644
--- a/sysdeps/x86_64/fpu/s_sincosf.S
+++ b/sysdeps/x86_64/fpu/s_sincosf.S
@@ -354,8 +354,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_sinf.S b/sysdeps/x86_64/fpu/s_sinf.S
index dc92164..90afbe8 100644
--- a/sysdeps/x86_64/fpu/s_sinf.S
+++ b/sysdeps/x86_64/fpu/s_sinf.S
@@ -336,8 +336,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9fe5cccab58b5cbaf3bcec3e9154413f3330c849

commit 9fe5cccab58b5cbaf3bcec3e9154413f3330c849
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:40:25 2015 -0700

    Align stack to 16 bytes when calling __gettimeofday
    
    Subtract 24 bytes from the stack pointer instead of 16 so that the
    stack is aligned to 16 bytes when calling __gettimeofday.
    
    	[BZ #18661]
    	* sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
    	(__lll_timedwait_tid): Align stack to 16 bytes when calling
    	__gettimeofday.
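
For reference, the alignment arithmetic (a sketch, assuming %r12 and
%r13 are the only callee-saved registers pushed on entry, as the
epilogue in the second hunk suggests): %rsp sits at 16n+8 on entry and
the two 8-byte pushes bring it back to 16n+8, so the old subq $16 left
the stack misaligned at the call to __gettimeofday.  Reserving 24 bytes
lands on 16n instead, and the struct timeval passed through
%rdi = %rsp still fits in the first 16 bytes of the reservation.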

diff --git a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
index 0935db5..8e1a39d 100644
--- a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
+++ b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
@@ -394,8 +394,9 @@ __lll_timedwait_tid:
 	movq	%rdi, %r12
 	movq	%rsi, %r13
 
-	subq	$16, %rsp
-	cfi_adjust_cfa_offset(16)
+	/* Align stack to 16 bytes when calling __gettimeofday.  */
+	subq	$24, %rsp
+	cfi_adjust_cfa_offset(24)
 
 	/* Get current time.  */
 2:	movq	%rsp, %rdi
@@ -441,8 +442,8 @@ __lll_timedwait_tid:
 	jne	1f
 4:	xorl	%eax, %eax
 
-8:	addq	$16, %rsp
-	cfi_adjust_cfa_offset(-16)
+8:	addq	$24, %rsp
+	cfi_adjust_cfa_offset(-24)
 	popq	%r13
 	cfi_adjust_cfa_offset(-8)
 	cfi_restore(%r13)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e5180480fda8fb740d5a02eb6ecbe25ff82214d4

commit e5180480fda8fb740d5a02eb6ecbe25ff82214d4
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:38:58 2015 -0700

    Align stack to 16 bytes when calling __setcontext
    
    Don't use pop to restore %rdi so that the stack is aligned to 16 bytes
    when calling __setcontext.
    
    	[BZ #18661]
    	* sysdeps/unix/sysv/linux/x86_64/__start_context.S
    	(__start_context): Don't use pop to restore %rdi so that stack
    	is aligned to 16 bytes when calling __setcontext.
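
The reasoning here (a sketch): popq %rdi is equivalent to
movq (%rsp), %rdi followed by addq $8, %rsp, and it is that extra
8-byte adjustment that broke the 16-byte alignment expected at the call
to __setcontext.  Loading the next-context pointer with a plain movq
keeps %rsp where the context setup left it, so the alignment is
preserved without a compensating sub/add pair.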

diff --git a/sysdeps/unix/sysv/linux/x86_64/__start_context.S b/sysdeps/unix/sysv/linux/x86_64/__start_context.S
index 52a5afa..96366e0 100644
--- a/sysdeps/unix/sysv/linux/x86_64/__start_context.S
+++ b/sysdeps/unix/sysv/linux/x86_64/__start_context.S
@@ -31,8 +31,8 @@ ENTRY(__start_context)
 	   on the stack pointer for the next context.  */
 	movq	%rbx, %rsp
 
-	popq	%rdi			/* This is the next context.  */
-	cfi_adjust_cfa_offset(-8)
+	/* Don't use pop here so that stack is aligned to 16 bytes.  */
+	movq	(%rsp), %rdi		/* This is the next context.  */
 	testq	%rdi, %rdi
 	je	2f			/* If it is zero exit.  */
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a691de50e047732d8ed7c010f8a456d67d071789

commit a691de50e047732d8ed7c010f8a456d67d071789
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:41:58 2015 -0700

    Compile {memcpy,strcmp}-sse2-unaligned.S only for libc
    
    {memcpy,strcmp}-sse2-unaligned.S aren't needed in ld.so.
    
    	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Compile
    	only for libc.
    	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Likewise.
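
A note on the mechanism (hedged, since the build wiring is not shown in
this patch): IS_IN (libc) is true only when a file is being compiled
for the libc module proper, so wrapping the whole source in
#if IS_IN (libc) ... #endif makes the ld.so build of the same file
produce an empty object instead of pulling an unused SSE2
implementation into the dynamic linker.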

diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index c5450af..5693ba7 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#if IS_IN (libc)
+
 #include <sysdep.h>
 
 #include "asm-syntax.h"
@@ -169,3 +171,5 @@ L(between_5_8):
 	movl	%eax, -4(%rdi,%rdx)
 	jmp	L(return)
 END(__memcpy_sse2_unaligned)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 20b65fa..c6606b4 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#if IS_IN (libc)
+
 #include "sysdep.h"
 
 ENTRY ( __strcmp_sse2_unaligned)
@@ -207,3 +209,5 @@ L(different):
 	subl	%ecx, %eax
 	ret
 END (__strcmp_sse2_unaligned)
+
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=686d22be135392def2b1d2018195cac67bd19208

commit 686d22be135392def2b1d2018195cac67bd19208
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat Aug 1 07:47:16 2015 -0700

    Update x86 elision-conf.c for <cpu-features.h>
    
    This patch updates x86 elision-conf.c to use the newly defined
    HAS_CPU_FEATURE from <cpu-features.h>.
    
    	* sysdeps/unix/sysv/linux/x86/elision-conf.c (elision_init):
    	Replace HAS_RTM with HAS_CPU_FEATURE (RTM).
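
For context, a minimal sketch of the new pattern in C.  This is not
part of the patch; rtm_available is a hypothetical helper, and the
header that exposes the macro to C code here may be <init-arch.h>
rather than <cpu-features.h> directly:

  #include <init-arch.h>

  static int
  rtm_available (void)
  {
    /* HAS_CPU_FEATURE tests a raw CPUID bit recorded at startup;
       HAS_ARCH_FEATURE (not used in this patch) tests a derived
       *_Usable bit.  */
    return HAS_CPU_FEATURE (RTM) != 0;
  }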

diff --git a/sysdeps/unix/sysv/linux/x86/elision-conf.c b/sysdeps/unix/sysv/linux/x86/elision-conf.c
index 84902ac..4a73382 100644
--- a/sysdeps/unix/sysv/linux/x86/elision-conf.c
+++ b/sysdeps/unix/sysv/linux/x86/elision-conf.c
@@ -62,11 +62,11 @@ elision_init (int argc __attribute__ ((unused)),
 	      char **argv  __attribute__ ((unused)),
 	      char **environ)
 {
-  __elision_available = HAS_RTM;
+  __elision_available = HAS_CPU_FEATURE (RTM);
 #ifdef ENABLE_LOCK_ELISION
   __pthread_force_elision = __libc_enable_secure ? 0 : __elision_available;
 #endif
-  if (!HAS_RTM)
+  if (!HAS_CPU_FEATURE (RTM))
     __elision_aconf.retry_try_xbegin = 0; /* Disable elision on rwlocks */
 }
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3d98f487d18379acd39c4a1db38bd192b4a9beab

commit 3d98f487d18379acd39c4a1db38bd192b4a9beab
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri Jul 31 13:46:05 2015 -0700

    Update libmvec multiarch functions for <cpu-features.h>
    
    This patch updates libmvec multiarch functions to use the newly defined
    HAS_CPU_FEATURE, HAS_ARCH_FEATURE and LOAD_RTLD_GLOBAL_RO_RDX from
    <cpu-features.h>.
    
    	* math/Makefile ($(addprefix $(objpfx), $(libm-vec-tests))):
    	Remove $(objpfx)init-arch.o.
    	* sysdeps/x86_64/fpu/Makefile (libmvec-support): Remove
    	init-arch.
    	* sysdeps/x86_64/fpu/math-tests-arch.h (avx_usable): Removed.
    	(INIT_ARCH_EXT): Defined as empty.
    	(CHECK_ARCH_EXT): Replace HAS_XXX with HAS_ARCH_FEATURE (XXX).
    	* sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S: Remove
    	__init_cpu_features call.  Replace HAS_XXX with
    	HAS_CPU_FEATURE/HAS_ARCH_FEATURE (XXX).
    	* sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S: Likewise.
    	* sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S: Likewise.
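
For readers less used to the assembly IFUNC selectors changed below, a
rough C analogue of the new dispatch pattern.  This is a sketch only:
the my_vector_func_* names are hypothetical, the real selectors are the
assembly stubs in the diff (which also load the feature table via
LOAD_RTLD_GLOBAL_RO_RDX), and it assumes <init-arch.h> makes
HAS_CPU_FEATURE/HAS_ARCH_FEATURE available to C code as in the
elision-conf.c change above:

  #include <init-arch.h>

  typedef void (*vec_fn) (void);

  extern void my_vector_func_avx2 (void);
  extern void my_vector_func_sse4 (void);
  extern void my_vector_func_sse2 (void);

  /* IFUNC resolver: pick the best implementation once, at relocation
     time, based on the recorded CPU features.  */
  static vec_fn
  my_vector_func_resolver (void)
  {
    if (HAS_ARCH_FEATURE (AVX2_Usable))
      return my_vector_func_avx2;
    if (HAS_CPU_FEATURE (SSE4_1))
      return my_vector_func_sse4;
    return my_vector_func_sse2;
  }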

diff --git a/math/Makefile b/math/Makefile
index 6388bae..d3b483d 100644
--- a/math/Makefile
+++ b/math/Makefile
@@ -263,7 +263,7 @@ $(objpfx)libieee.a: $(objpfx)ieee-math.o
 $(addprefix $(objpfx),$(filter-out $(tests-static) $(libm-vec-tests),$(tests))): $(libm)
 $(addprefix $(objpfx),$(tests-static)): $(objpfx)libm.a
 $(addprefix $(objpfx), $(libm-vec-tests)): $(objpfx)%: $(libm) $(libmvec) \
-					   $(objpfx)init-arch.o $(objpfx)%-wrappers.o
+					   $(objpfx)%-wrappers.o
 
 gmp-objs = $(patsubst %,$(common-objpfx)stdlib/%.o,\
 		      add_n sub_n cmp addmul_1 mul_1 mul_n divmod_1 \
diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile
index 1ebe511..f98f6cf 100644
--- a/sysdeps/x86_64/fpu/Makefile
+++ b/sysdeps/x86_64/fpu/Makefile
@@ -20,7 +20,7 @@ libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \
 		   svml_d_pow_data svml_s_powf4_core svml_s_powf8_core_avx \
 		   svml_s_powf8_core svml_s_powf16_core svml_s_powf_data \
 		   svml_s_sincosf4_core svml_s_sincosf8_core_avx \
-		   svml_s_sincosf8_core svml_s_sincosf16_core init-arch
+		   svml_s_sincosf8_core svml_s_sincosf16_core
 endif
 
 # Variables for libmvec tests.
diff --git a/sysdeps/x86_64/fpu/math-tests-arch.h b/sysdeps/x86_64/fpu/math-tests-arch.h
index e8833bf..fb8251b 100644
--- a/sysdeps/x86_64/fpu/math-tests-arch.h
+++ b/sysdeps/x86_64/fpu/math-tests-arch.h
@@ -19,66 +19,36 @@
 #if defined REQUIRE_AVX
 # include <init-arch.h>
 
-/* Set to 1 if AVX supported.  */
-static int avx_usable;
-
-# define INIT_ARCH_EXT                                         \
-  do                                                           \
-    {                                                          \
-      __init_cpu_features ();                                  \
-      avx_usable = __cpu_features.feature[index_AVX_Usable]    \
-                   & bit_AVX_Usable;                           \
-    }                                                          \
-  while (0)
+# define INIT_ARCH_EXT
 
 # define CHECK_ARCH_EXT                                        \
   do                                                           \
     {                                                          \
-      if (!avx_usable) return;                                 \
+      if (!HAS_ARCH_FEATURE (AVX_Usable)) return;              \
     }                                                          \
   while (0)
 
 #elif defined REQUIRE_AVX2
 # include <init-arch.h>
 
-  /* Set to 1 if AVX2 supported.  */
-  static int avx2_usable;
-
-# define INIT_ARCH_EXT                                         \
-  do                                                           \
-    {                                                          \
-      __init_cpu_features ();                                  \
-      avx2_usable = __cpu_features.feature[index_AVX2_Usable]  \
-                  & bit_AVX2_Usable;                           \
-    }                                                          \
-  while (0)
+# define INIT_ARCH_EXT
 
 # define CHECK_ARCH_EXT                                        \
   do                                                           \
     {                                                          \
-      if (!avx2_usable) return;                                \
+      if (!HAS_ARCH_FEATURE (AVX2_Usable)) return;             \
     }                                                          \
   while (0)
 
 #elif defined REQUIRE_AVX512F
 # include <init-arch.h>
 
-  /* Set to 1 if supported.  */
-  static int avx512f_usable;
-
-# define INIT_ARCH_EXT                                                \
-  do                                                                  \
-    {                                                                 \
-      __init_cpu_features ();                                         \
-      avx512f_usable = __cpu_features.feature[index_AVX512F_Usable]   \
-		       & bit_AVX512F_Usable;                          \
-    }                                                                 \
-  while (0)
+# define INIT_ARCH_EXT
 
 # define CHECK_ARCH_EXT                                        \
   do                                                           \
     {                                                          \
-      if (!avx512f_usable) return;                             \
+      if (!HAS_ARCH_FEATURE (AVX512F_Usable)) return;          \
     }                                                          \
   while (0)
 
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
index 5f67d83..c64485e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos2_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2v_cos)
         .type   _ZGVbN2v_cos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_cos_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_cos_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_cos_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
index 5babb83..6460690 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_cos)
         .type   _ZGVdN4v_cos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_cos_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_cos_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_cos_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
index d0f4f27..add99a1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_cos)
         .type   _ZGVeN8v_cos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
+	LOAD_RTLD_GLOBAL_RO_RDX
 1:      leaq    _ZGVeN8v_cos_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8v_cos_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8v_cos_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
index ef3dc49..538e991 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp2_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2v_exp)
         .type   _ZGVbN2v_exp, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_exp_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_exp_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_exp_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
index 7f2ebde..c68ca93 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_exp)
         .type   _ZGVdN4v_exp, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_exp_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_exp_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_exp_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
index 7b7c07d..d3985dc 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_exp8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_exp)
         .type   _ZGVeN8v_exp, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8v_exp_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8v_exp_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8v_exp_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8v_exp_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
index 38d369f..adcb34e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log2_core.S
@@ -22,11 +22,9 @@
         .text
 ENTRY (_ZGVbN2v_log)
         .type   _ZGVbN2v_log, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_log_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_log_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_log_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
index ddb6105..9c9f84a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_log)
         .type   _ZGVdN4v_log, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_log_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_log_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_log_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
index 76375fd..0ceb9eb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_log)
         .type   _ZGVeN8v_log, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8v_log_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8v_log_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8v_log_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8v_log_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
index f111388..0fbdb43 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2vv_pow)
         .type   _ZGVbN2vv_pow, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2vv_pow_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2vv_pow_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2vv_pow_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
index 21e3070..0cf5c9b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4vv_pow)
         .type   _ZGVdN4vv_pow, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4vv_pow_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4vv_pow_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4vv_pow_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
index c1e5e76..9afdf67 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8vv_pow)
         .type   _ZGVeN8vv_pow, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8vv_pow_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8vv_pow_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8vv_pow_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8vv_pow_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
index 29bd0a7..eec486b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin2_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2v_sin)
         .type   _ZGVbN2v_sin, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2v_sin_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2v_sin_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2v_sin_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
index c3a453a..17cb5c1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4v_sin)
         .type   _ZGVdN4v_sin, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4v_sin_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4v_sin_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4v_sin_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
index 131f2f4..61ee0c0 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8v_sin)
         .type   _ZGVeN8v_sin, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8v_sin_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8v_sin_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8v_sin_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8v_sin_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
index e8e5771..3d03c53 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos2_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN2vvv_sincos)
         .type   _ZGVbN2vvv_sincos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN2vvv_sincos_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN2vvv_sincos_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN2vvv_sincos_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
index 64744ff..1cc2b69 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN4vvv_sincos)
         .type   _ZGVdN4vvv_sincos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN4vvv_sincos_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN4vvv_sincos_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN4vvv_sincos_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
index e331090..850f221 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN8vvv_sincos)
         .type   _ZGVeN8vvv_sincos, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN8vvv_sincos_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN8vvv_sincos_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN8vvv_sincos_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
index 0654d3c..227f46e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_cosf)
         .type   _ZGVeN16v_cosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_cosf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_cosf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16v_cosf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16v_cosf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
index fa2363b..2e98938 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_cosf)
         .type   _ZGVbN4v_cosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_cosf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_cosf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_cosf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
index e14bba4..830b10f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_cosf)
         .type   _ZGVdN8v_cosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8v_cosf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8v_cosf_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_cosf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
index 62858eb..79ac304 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_expf)
         .type   _ZGVeN16v_expf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_expf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_expf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16v_expf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16v_expf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
index 37d38bc..e9781f3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_expf)
         .type   _ZGVbN4v_expf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_expf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_expf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_expf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
index e3dc1b1..41e59ef 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_expf)
         .type   _ZGVdN8v_expf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8v_expf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8v_expf_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_expf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
index 68c57e4..fa01161 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_logf)
         .type   _ZGVeN16v_logf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_logf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_logf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16v_logf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16v_logf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
index 153ed8e..0f1ca73 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_logf)
         .type   _ZGVbN4v_logf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_logf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_logf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_logf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
index 6f50bf6..65d1f7f 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_logf)
         .type   _ZGVdN8v_logf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8v_logf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8v_logf_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_logf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
index 3aa9f95..e33e83e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16vv_powf)
         .type   _ZGVeN16vv_powf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16vv_powf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16vv_powf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16vv_powf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16vv_powf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
index f88b9ca..28abeec 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4vv_powf)
         .type   _ZGVbN4vv_powf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4vv_powf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4vv_powf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4vv_powf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
index 4552e57..0cbbe5d 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8vv_powf)
         .type   _ZGVdN8vv_powf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8vv_powf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8vv_powf_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8vv_powf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
index bdcabab..a32b66e 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16vvv_sincosf)
         .type   _ZGVeN16vvv_sincosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16vvv_sincosf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16vvv_sincosf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16vvv_sincosf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
index 610046b..e64fbfb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4vvv_sincosf)
         .type   _ZGVbN4vvv_sincosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4vvv_sincosf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4vvv_sincosf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4vvv_sincosf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
index 9e5be67..b3f31c6 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8vvv_sincosf)
         .type   _ZGVdN8vvv_sincosf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVdN8vvv_sincosf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVdN8vvv_sincosf_avx2(%rip), %rax
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8vvv_sincosf_sse_wrapper(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
index 3ec78a0..c7a0adb 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core.S
@@ -22,14 +22,12 @@
 	.text
 ENTRY (_ZGVeN16v_sinf)
         .type   _ZGVeN16v_sinf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVeN16v_sinf_skx(%rip), %rax
-        testl   $bit_AVX512DQ_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512DQ_Usable(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVeN16v_sinf_skx(%rip), %rax
+	HAS_ARCH_FEATURE (AVX512DQ_Usable)
         jnz     2f
         leaq    _ZGVeN16v_sinf_knl(%rip), %rax
-        testl   $bit_AVX512F_Usable, __cpu_features+FEATURE_OFFSET+index_AVX512F_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX512F_Usable)
         jnz     2f
         leaq    _ZGVeN16v_sinf_avx2_wrapper(%rip), %rax
 2:      ret
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
index cf1e4df..58bd177 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf4_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVbN4v_sinf)
         .type   _ZGVbN4v_sinf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
-1:      leaq    _ZGVbN4v_sinf_sse4(%rip), %rax
-        testl   $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+	LOAD_RTLD_GLOBAL_RO_RDX
+        leaq    _ZGVbN4v_sinf_sse4(%rip), %rax
+	HAS_CPU_FEATURE (SSE4_1)
         jz      2f
         ret
 2:      leaq    _ZGVbN4v_sinf_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
index b28bf3c..debec59 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf8_core.S
@@ -22,11 +22,9 @@
 	.text
 ENTRY (_ZGVdN8v_sinf)
         .type   _ZGVdN8v_sinf, @gnu_indirect_function
-        cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
-        jne     1f
-        call    __init_cpu_features
+	LOAD_RTLD_GLOBAL_RO_RDX
 1:      leaq    _ZGVdN8v_sinf_avx2(%rip), %rax
-        testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
+	HAS_ARCH_FEATURE (AVX2_Usable)
         jz      2f
         ret
 2:      leaq    _ZGVdN8v_sinf_sse_wrapper(%rip), %rax

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources

