This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch hjl/pr18661 created. glibc-2.21-660-g8c9c409


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/pr18661 has been created
        at  8c9c40924712d4ebde9bc7717da683613c785cd5 (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8c9c40924712d4ebde9bc7717da683613c785cd5

commit 8c9c40924712d4ebde9bc7717da683613c785cd5
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Jul 28 17:06:01 2015 -0700

    Delete FOREIGN_CALL

diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 3bb0696..4316578 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -223,13 +223,6 @@ _dl_start_user:\n\
 static inline void __attribute__ ((unused))
 dl_platform_init (void)
 {
-  extern void _dl_x86_64_save_sse (void) attribute_hidden;
-  extern void _dl_x86_64_save_avx (void) attribute_hidden;
-  extern void _dl_x86_64_save_avx512 (void) attribute_hidden;
-  extern void _dl_x86_64_restore_sse (void) attribute_hidden;
-  extern void _dl_x86_64_restore_avx (void) attribute_hidden;
-  extern void _dl_x86_64_restore_avx512 (void) attribute_hidden;
-
   if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
     /* Avoid an empty string which would disturb us.  */
     GLRO(dl_platform) = NULL;
@@ -264,24 +257,6 @@ dl_platform_init (void)
 	    }
 	}
 
-#ifdef SHARED
-      if ((x86_xstate & bit_X86_XSTATE_AVX512) != 0)
-	{
-	  GLRO(dl_x86_64_save_vector) = _dl_x86_64_save_avx512;
-	  GLRO(dl_x86_64_restore_vector) = _dl_x86_64_restore_avx512;
-	}
-      else if ((x86_xstate & bit_X86_XSTATE_AVX) != 0)
-	{
-	  GLRO(dl_x86_64_save_vector) = _dl_x86_64_save_avx;
-	  GLRO(dl_x86_64_restore_vector) = _dl_x86_64_restore_avx;
-	}
-      else
-	{
-	  GLRO(dl_x86_64_save_vector) = _dl_x86_64_save_sse;
-	  GLRO(dl_x86_64_restore_vector) = _dl_x86_64_restore_sse;
-	}
-#endif
-
       GLRO(dl_x86_xstate) = x86_xstate;
     }
 }
diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
index 4ad33c2..417c835 100644
--- a/sysdeps/x86_64/dl-procinfo.c
+++ b/sysdeps/x86_64/dl-procinfo.c
@@ -53,33 +53,5 @@ PROCINFO_CLASS unsigned int _dl_x86_xstate
 ,
 #endif
 
-#if !defined PROCINFO_DECL && defined SHARED
-  ._dl_x86_64_save_vector
-#else
-PROCINFO_CLASS void (*_dl_x86_64_save_vector) (void)
-#endif
-#ifndef PROCINFO_DECL
-= 0
-#endif
-#if !defined SHARED || defined PROCINFO_DECL
-;
-#else
-,
-#endif
-
-#if !defined PROCINFO_DECL && defined SHARED
-  ._dl_x86_64_restore_vector
-#else
-PROCINFO_CLASS void (*_dl_x86_64_restore_vector) (void)
-#endif
-#ifndef PROCINFO_DECL
-= 0
-#endif
-#if !defined SHARED || defined PROCINFO_DECL
-;
-#else
-,
-#endif
-
 #undef PROCINFO_DECL
 #undef PROCINFO_CLASS
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 46641e4..f5442b7 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -55,14 +55,10 @@
 # define VEC(i)			zmm##i
 # define _dl_runtime_resolve	_dl_runtime_resolve_avx512
 # define _dl_runtime_profile	_dl_runtime_profile_avx512
-# define _dl_x86_64_save	_dl_x86_64_save_avx512
-# define _dl_x86_64_restore	_dl_x86_64_restore_avx512
 # define RESTORE_AVX
 # include "dl-trampoline.h"
 # undef _dl_runtime_resolve
 # undef _dl_runtime_profile
-# undef _dl_x86_64_save
-# undef _dl_x86_64_restore
 # undef VMOV
 # undef VEC_SIZE
 # undef VEC
@@ -73,13 +69,9 @@
 # define VEC(i)			ymm##i
 # define _dl_runtime_resolve	_dl_runtime_resolve_avx
 # define _dl_runtime_profile	_dl_runtime_profile_avx
-# define _dl_x86_64_save	_dl_x86_64_save_avx
-# define _dl_x86_64_restore	_dl_x86_64_restore_avx
 # include "dl-trampoline.h"
 # undef _dl_runtime_resolve
 # undef _dl_runtime_profile
-# undef _dl_x86_64_save
-# undef _dl_x86_64_restore
 # undef VMOV
 # undef VEC_SIZE
 # undef VEC
@@ -91,7 +83,5 @@
 # define VEC(i)			xmm##i
 # define _dl_runtime_resolve	_dl_runtime_resolve_sse
 # define _dl_runtime_profile	_dl_runtime_profile_sse
-# define _dl_x86_64_save	_dl_x86_64_save_sse
-# define _dl_x86_64_restore	_dl_x86_64_restore_sse
 # undef RESTORE_AVX
 # include "dl-trampoline.h"
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index b909ff8..be0a9a0 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -504,46 +504,3 @@ _dl_runtime_profile:
 	cfi_endproc
 	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
-
-
-#ifdef SHARED
-# if (RTLD_SAVESPACE_SSE % 32) != 0
-#  error RTLD_SAVESPACE_SSE must be aligned to 32 bytes
-# endif
-
-	.globl _dl_x86_64_save
-	.hidden _dl_x86_64_save
-	.type _dl_x86_64_save, @function
-	.align 16
-_dl_x86_64_save:
-	cfi_startproc
-	VMOVA	%VEC(0), %fs:RTLD_SAVESPACE_SSE+0*VEC_SIZE
-	VMOVA	%VEC(1), %fs:RTLD_SAVESPACE_SSE+1*VEC_SIZE
-	VMOVA	%VEC(2), %fs:RTLD_SAVESPACE_SSE+2*VEC_SIZE
-	VMOVA	%VEC(3), %fs:RTLD_SAVESPACE_SSE+3*VEC_SIZE
-	VMOVA	%VEC(4), %fs:RTLD_SAVESPACE_SSE+4*VEC_SIZE
-	VMOVA	%VEC(5), %fs:RTLD_SAVESPACE_SSE+5*VEC_SIZE
-	VMOVA	%VEC(6), %fs:RTLD_SAVESPACE_SSE+6*VEC_SIZE
-	VMOVA	%VEC(7), %fs:RTLD_SAVESPACE_SSE+7*VEC_SIZE
-	ret
-	cfi_endproc
-	.size _dl_x86_64_save, .-_dl_x86_64_save
-
-	.globl _dl_x86_64_restore
-	.hidden _dl_x86_64_restore
-	.type _dl_x86_64_restore, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_restore:
-	VMOVA	%fs:RTLD_SAVESPACE_SSE+0*VEC_SIZE, %VEC(0)
-	VMOVA	%fs:RTLD_SAVESPACE_SSE+1*VEC_SIZE, %VEC(1)
-	VMOVA	%fs:RTLD_SAVESPACE_SSE+2*VEC_SIZE, %VEC(2)
-	VMOVA	%fs:RTLD_SAVESPACE_SSE+3*VEC_SIZE, %VEC(3)
-	VMOVA	%fs:RTLD_SAVESPACE_SSE+4*VEC_SIZE, %VEC(4)
-	VMOVA	%fs:RTLD_SAVESPACE_SSE+5*VEC_SIZE, %VEC(5)
-	VMOVA	%fs:RTLD_SAVESPACE_SSE+6*VEC_SIZE, %VEC(6)
-	VMOVA	%fs:RTLD_SAVESPACE_SSE+7*VEC_SIZE, %VEC(7)
-	ret
-	cfi_endproc
-	.size _dl_x86_64_restore, .-_dl_x86_64_restore
-#endif
diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index b485b53..2466af7 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -384,37 +384,6 @@ typedef struct
 # define THREAD_GSCOPE_WAIT() \
   GL(dl_wait_lookup_done) ()
 
-
-# ifdef SHARED
-# define RTLD_CHECK_FOREIGN_CALL \
-  (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
-
-/* NB: Don't use the xchg operation because that would imply a lock
-   prefix which is expensive and unnecessary.  The cache line is also
-   not contested at all.  */
-#  define RTLD_ENABLE_FOREIGN_CALL \
-  int old_rtld_must_xmm_save = THREAD_GETMEM (THREAD_SELF,		      \
-					      header.rtld_must_xmm_save);     \
-  THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)
-
-#  define RTLD_PREPARE_FOREIGN_CALL \
-  do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save))	      \
-    {									      \
-      GLRO(dl_x86_64_save_vector) ();					      \
-      THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0);	      \
-    }									      \
-  while (0)
-
-#  define RTLD_FINALIZE_FOREIGN_CALL \
-  do {									      \
-    if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0)	      \
-      GLRO(dl_x86_64_restore_vector) ();				      \
-    THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save,		      \
-		   old_rtld_must_xmm_save);				      \
-  } while (0)
-# endif
-
-
 #endif /* __ASSEMBLER__ */
 
 #endif	/* tls.h */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8d8367c7e9fab003222c109a5d4018d93bfb1097

commit 8d8367c7e9fab003222c109a5d4018d93bfb1097
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat Jul 11 13:25:25 2015 -0700

    Save and restore vector registers in ld.so
    
    1. Initiaize dl_x86_xstate in dl_platform_init
    2. Add _dl_x86_64_save_vector/_dl_x86_64_restore_vector
    
    We should check if an indirect branch is faster than a direct branch
    with 2 sets of test and branch.

diff --git a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
index 8ac351e..a3c0c19 100644
--- a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
+++ b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
@@ -1,5 +1,5 @@
 #if IS_IN (ldconfig)
 # include <sysdeps/i386/dl-procinfo.c>
 #else
-# include <sysdeps/generic/dl-procinfo.c>
+# include <sysdeps/x86_64/dl-procinfo.c>
 #endif
diff --git a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.h b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.h
index 7829e1c..dba5cc9 100644
--- a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.h
+++ b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.h
@@ -1,5 +1,5 @@
 #if IS_IN (ldconfig)
 # include <sysdeps/unix/sysv/linux/i386/dl-procinfo.h>
 #else
-# include <sysdeps/generic/dl-procinfo.h>
+# include <sysdeps/x86_64/dl-procinfo.h>
 #endif
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index cae6db3..3bb0696 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -26,6 +26,8 @@
 #include <sysdep.h>
 #include <tls.h>
 #include <dl-tlsdesc.h>
+#include <dl-procinfo.h>
+#include <cpuid.h>
 
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
@@ -65,8 +67,12 @@ static inline int __attribute__ ((unused, always_inline))
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
   Elf64_Addr *got;
-  extern void _dl_runtime_resolve (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_profile (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
 
   if (l->l_info[DT_JMPREL] && lazy)
     {
@@ -86,6 +92,8 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
       /* Identify this shared object.  */
       *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
 
+      unsigned int x86_xstate = GLRO(dl_x86_xstate);
+
       /* The got[2] entry contains the address of a function which gets
 	 called to get the address of a so far unresolved function and
 	 jump to it.  The profiling extension of the dynamic linker allows
@@ -94,7 +102,12 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
-	  *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile;
+	  if ((x86_xstate & bit_X86_XSTATE_AVX512) != 0)
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
+	  else if ((x86_xstate & bit_X86_XSTATE_AVX) != 0)
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -103,9 +116,17 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	    GL(dl_profile_map) = l;
 	}
       else
-	/* This function will get called to fix up the GOT entry indicated by
-	   the offset on the stack, and then jump to the resolved address.  */
-	*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve;
+	{
+	  /* This function will get called to fix up the GOT entry
+	     indicated by the offset on the stack, and then jump to
+	     the resolved address.  */
+	  if ((x86_xstate & bit_X86_XSTATE_AVX512) != 0)
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	  else if ((x86_xstate & bit_X86_XSTATE_AVX) != 0)
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
+	}
     }
 
   if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
@@ -202,9 +223,67 @@ _dl_start_user:\n\
 static inline void __attribute__ ((unused))
 dl_platform_init (void)
 {
+  extern void _dl_x86_64_save_sse (void) attribute_hidden;
+  extern void _dl_x86_64_save_avx (void) attribute_hidden;
+  extern void _dl_x86_64_save_avx512 (void) attribute_hidden;
+  extern void _dl_x86_64_restore_sse (void) attribute_hidden;
+  extern void _dl_x86_64_restore_avx (void) attribute_hidden;
+  extern void _dl_x86_64_restore_avx512 (void) attribute_hidden;
+
   if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
     /* Avoid an empty string which would disturb us.  */
     GLRO(dl_platform) = NULL;
+
+  unsigned int eax, ebx, ecx, edx;
+  __cpuid (1, eax, ebx, ecx, edx);
+  if ((ecx & (bit_AVX | bit_OSXSAVE)) == (bit_AVX | bit_OSXSAVE))
+    {
+      unsigned int x86_xstate;
+
+      __cpuid_count (7, 0, eax, ebx, ecx, edx);
+
+      /* Verify that ZMM, YMM and XMM states are enabled.  */
+      asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
+      if ((eax & (1 << 0)) != 0)
+	x86_xstate = bit_X86_XSTATE_X87;
+      else
+	x86_xstate = 0;
+      if ((eax & ((1 << 3) | (1 << 4))) == ((1 << 3) | (1 << 4)))
+	x86_xstate |= bit_X86_XSTATE_MPX;
+      if ((eax & (1 << 5)) != 0)
+	x86_xstate |= bit_X86_XSTATE_K;
+      if ((eax & (1 << 1)) != 0)
+	{
+	  x86_xstate = bit_X86_XSTATE_SSE;
+	  if ((eax & (1 << 2)) != 0)
+	    {
+	      x86_xstate |= bit_X86_XSTATE_AVX;
+	      if ((eax & ((1 << 5) | (1 << 6) | (1 << 7)))
+		  == ((1 << 5) | (1 << 6) | (1 << 7)))
+		x86_xstate |= bit_X86_XSTATE_AVX512;
+	    }
+	}
+
+#ifdef SHARED
+      if ((x86_xstate & bit_X86_XSTATE_AVX512) != 0)
+	{
+	  GLRO(dl_x86_64_save_vector) = _dl_x86_64_save_avx512;
+	  GLRO(dl_x86_64_restore_vector) = _dl_x86_64_restore_avx512;
+	}
+      else if ((x86_xstate & bit_X86_XSTATE_AVX) != 0)
+	{
+	  GLRO(dl_x86_64_save_vector) = _dl_x86_64_save_avx;
+	  GLRO(dl_x86_64_restore_vector) = _dl_x86_64_restore_avx;
+	}
+      else
+	{
+	  GLRO(dl_x86_64_save_vector) = _dl_x86_64_save_sse;
+	  GLRO(dl_x86_64_restore_vector) = _dl_x86_64_restore_sse;
+	}
+#endif
+
+      GLRO(dl_x86_xstate) = x86_xstate;
+    }
 }
 
 static inline ElfW(Addr)
diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
new file mode 100644
index 0000000..4ad33c2
--- /dev/null
+++ b/sysdeps/x86_64/dl-procinfo.c
@@ -0,0 +1,85 @@
+/* Data for x86-64 version of processor capability information.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* If anything should be added here check whether the size of each string
+   is still ok with the given array size.
+
+   All the #ifdefs in the definitions are quite irritating but
+   necessary if we want to avoid duplicating the information.  There
+   are three different modes:
+
+   - PROCINFO_DECL is defined.  This means we are only interested in
+     declarations.
+
+   - PROCINFO_DECL is not defined:
+
+     + if SHARED is defined the file is included in an array
+       initializer.  The .element = { ... } syntax is needed.
+
+     + if SHARED is not defined a normal array initialization is
+       needed.
+  */
+
+#ifndef PROCINFO_CLASS
+# define PROCINFO_CLASS
+#endif
+
+#if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_xstate
+#else
+PROCINFO_CLASS unsigned int _dl_x86_xstate
+#endif
+#ifndef PROCINFO_DECL
+= 0
+#endif
+#if !defined SHARED || defined PROCINFO_DECL
+;
+#else
+,
+#endif
+
+#if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_64_save_vector
+#else
+PROCINFO_CLASS void (*_dl_x86_64_save_vector) (void)
+#endif
+#ifndef PROCINFO_DECL
+= 0
+#endif
+#if !defined SHARED || defined PROCINFO_DECL
+;
+#else
+,
+#endif
+
+#if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_64_restore_vector
+#else
+PROCINFO_CLASS void (*_dl_x86_64_restore_vector) (void)
+#endif
+#ifndef PROCINFO_DECL
+= 0
+#endif
+#if !defined SHARED || defined PROCINFO_DECL
+;
+#else
+,
+#endif
+
+#undef PROCINFO_DECL
+#undef PROCINFO_CLASS
diff --git a/sysdeps/x86_64/dl-procinfo.h b/sysdeps/x86_64/dl-procinfo.h
new file mode 100644
index 0000000..c06fce6
--- /dev/null
+++ b/sysdeps/x86_64/dl-procinfo.h
@@ -0,0 +1,33 @@
+/* x86-64 version of processor capability information handling macros.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_X86_64_PROCINFO_H
+#define _DL_X86_64_PROCINFO_H	1
+
+#ifndef	__ASSEMBLER__
+# include <sysdeps/generic/dl-procinfo.h>
+#endif
+
+#define	bit_X86_XSTATE_X87	(1 << 0)
+#define bit_X86_XSTATE_SSE	(1 << 1)
+#define bit_X86_XSTATE_AVX	(1 << 2)
+#define bit_X86_XSTATE_AVX512	(1 << 3)
+#define bit_X86_XSTATE_MPX	(1 << 4)
+#define bit_X86_XSTATE_K	(1 << 5)
+
+#endif /* dl-procinfo.h */
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 678c57f..46641e4 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -20,33 +20,26 @@
 #include <sysdep.h>
 #include <link-defines.h>
 
-#if (RTLD_SAVESPACE_SSE % 32) != 0
-# error RTLD_SAVESPACE_SSE must be aligned to 32 bytes
-#endif
-
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
-# define REGISTER_SAVE_AREA	(8 * 7)
-# define REGISTER_SAVE_RAX	0
+/* Align vector register save area to 16 bytes.  */
+# define REGISTER_SAVE_VEC_OFF	0
 # define PRESERVE_BND_REGS_PREFIX
 #else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
-   BND1, BND2, BND3.  */
-# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4)
 /* Align bound register save area to 16 bytes.  */
 # define REGISTER_SAVE_BND0	0
 # define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
+# define REGISTER_SAVE_VEC_OFF	(REGISTER_SAVE_BND3 + 16)
 # ifdef HAVE_MPX_SUPPORT
 #  define PRESERVE_BND_REGS_PREFIX bnd
 # else
 #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
 # endif
 #endif
+#define REGISTER_SAVE_RAX	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
 #define REGISTER_SAVE_RSI	(REGISTER_SAVE_RDX + 8)
@@ -54,386 +47,51 @@
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
 
-	.text
-	.globl _dl_runtime_resolve
-	.type _dl_runtime_resolve, @function
-	.align 16
-	cfi_startproc
-_dl_runtime_resolve:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	subq $REGISTER_SAVE_AREA,%rsp
-	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
-	# Preserve registers otherwise clobbered.
-	movq %rax, REGISTER_SAVE_RAX(%rsp)
-	movq %rcx, REGISTER_SAVE_RCX(%rsp)
-	movq %rdx, REGISTER_SAVE_RDX(%rsp)
-	movq %rsi, REGISTER_SAVE_RSI(%rsp)
-	movq %rdi, REGISTER_SAVE_RDI(%rsp)
-	movq %r8, REGISTER_SAVE_R8(%rsp)
-	movq %r9, REGISTER_SAVE_R9(%rsp)
-#ifndef __ILP32__
-	# We also have to preserve bound registers.  These are nops if
-	# Intel MPX isn't available or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
-	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
-	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
-	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
-# else
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1b,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
-	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
-	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
-# endif
-#endif
-	# Copy args pushed by PLT in register.
-	# %rdi: link_map, %rsi: reloc_index
-	movq (REGISTER_SAVE_AREA + 8)(%rsp), %rsi
-	movq REGISTER_SAVE_AREA(%rsp), %rdi
-	call _dl_fixup		# Call resolver.
-	movq %rax, %r11		# Save return value
-#ifndef __ILP32__
-	# Restore bound registers.  These are nops if Intel MPX isn't
-	# avaiable or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
-	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
-	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
-	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
-# else
-	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
-	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1a,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
-# endif
-#endif
-	# Get register content back.
-	movq REGISTER_SAVE_R9(%rsp), %r9
-	movq REGISTER_SAVE_R8(%rsp), %r8
-	movq REGISTER_SAVE_RDI(%rsp), %rdi
-	movq REGISTER_SAVE_RSI(%rsp), %rsi
-	movq REGISTER_SAVE_RDX(%rsp), %rdx
-	movq REGISTER_SAVE_RCX(%rsp), %rcx
-	movq REGISTER_SAVE_RAX(%rsp), %rax
-	# Adjust stack(PLT did 2 pushes)
-	addq $(REGISTER_SAVE_AREA + 16), %rsp
-	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
-	# Preserve bound registers.
-	PRESERVE_BND_REGS_PREFIX
-	jmp *%r11		# Jump to function address.
-	cfi_endproc
-	.size _dl_runtime_resolve, .-_dl_runtime_resolve
-
-
-#ifndef PROF
-	.globl _dl_runtime_profile
-	.type _dl_runtime_profile, @function
-	.align 16
-	cfi_startproc
-
-_dl_runtime_profile:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	/* The La_x86_64_regs data structure pointed to by the
-	   fourth paramater must be 16-byte aligned.  This must
-	   be explicitly enforced.  We have the set up a dynamically
-	   sized stack frame.  %rbx points to the top half which
-	   has a fixed size and preserves the original stack pointer.  */
-
-	subq $32, %rsp		# Allocate the local storage.
-	cfi_adjust_cfa_offset(32)
-	movq %rbx, (%rsp)
-	cfi_rel_offset(%rbx, 0)
-
-	/* On the stack:
-		56(%rbx)	parameter #1
-		48(%rbx)	return address
-
-		40(%rbx)	reloc index
-		32(%rbx)	link_map
-
-		24(%rbx)	La_x86_64_regs pointer
-		16(%rbx)	framesize
-		 8(%rbx)	rax
-		  (%rbx)	rbx
-	*/
-
-	movq %rax, 8(%rsp)
-	movq %rsp, %rbx
-	cfi_def_cfa_register(%rbx)
-
-	/* Actively align the La_x86_64_regs structure.  */
-	andq $0xfffffffffffffff0, %rsp
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
-	   to detect if any xmm0-xmm7 registers are changed by audit
-	   module.  */
-	subq $(LR_SIZE + XMM_SIZE*8), %rsp
-# else
-	subq $LR_SIZE, %rsp		# sizeof(La_x86_64_regs)
-# endif
-	movq %rsp, 24(%rbx)
-
-	/* Fill the La_x86_64_regs structure.  */
-	movq %rdx, LR_RDX_OFFSET(%rsp)
-	movq %r8,  LR_R8_OFFSET(%rsp)
-	movq %r9,  LR_R9_OFFSET(%rsp)
-	movq %rcx, LR_RCX_OFFSET(%rsp)
-	movq %rsi, LR_RSI_OFFSET(%rsp)
-	movq %rdi, LR_RDI_OFFSET(%rsp)
-	movq %rbp, LR_RBP_OFFSET(%rsp)
-
-	leaq 48(%rbx), %rax
-	movq %rax, LR_RSP_OFFSET(%rsp)
-
-	/* We always store the XMM registers even if AVX is available.
-	   This is to provide backward binary compatibility for existing
-	   audit modules.  */
-	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
-	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
-	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
-	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
-	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
-	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
-	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
-	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
-
-# ifndef __ILP32__
-#  ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
-	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
-	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
-	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
-#  else
-	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-#  endif
-# endif
-
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	.data
-L(have_avx):
-	.zero 4
-	.size L(have_avx), 4
-	.previous
-
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	10f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in processor?
-	movq	%rbx, %r11		# Save rbx
-	xorl	%ecx, %ecx
-	mov	$0x7, %eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	20f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	cmpl	$0xe6, %eax
-	jne	20f
-	movl	%eax, L(have_avx)(%rip)
-L(avx512):
-#   define RESTORE_AVX
-#   define VMOV    vmovdqu64
-#   define VEC(i)  zmm##i
-#   define MORE_CODE
-#   include "dl-trampoline.h"
-#   undef VMOV
-#   undef VEC
-#   undef RESTORE_AVX
-#  endif
-20:	andl	$0x6, %eax
-10:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined):
-	js	L(no_avx)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512)
-#  endif
-
-#  define RESTORE_AVX
-#  define VMOV    vmovdqu
-#  define VEC(i)  ymm##i
-#  define MORE_CODE
-#  include "dl-trampoline.h"
-
-	.align 16
-L(no_avx):
-# endif
-
+/* VMOV is used on 16-byte aligned memory and VMOVA is used on 32-byte
+   aligned memory.  */
+# define VMOV			vmovdqu64
+# define VMOVA			vmovdqu64
+# define VEC_SIZE		64
+# define VEC(i)			zmm##i
+# define _dl_runtime_resolve	_dl_runtime_resolve_avx512
+# define _dl_runtime_profile	_dl_runtime_profile_avx512
+# define _dl_x86_64_save	_dl_x86_64_save_avx512
+# define _dl_x86_64_restore	_dl_x86_64_restore_avx512
+# define RESTORE_AVX
+# include "dl-trampoline.h"
+# undef _dl_runtime_resolve
+# undef _dl_runtime_profile
+# undef _dl_x86_64_save
+# undef _dl_x86_64_restore
+# undef VMOV
+# undef VEC_SIZE
+# undef VEC
+
+# define VMOV			vmovdqu
+# define VMOVA			vmovdqa
+# define VEC_SIZE		32
+# define VEC(i)			ymm##i
+# define _dl_runtime_resolve	_dl_runtime_resolve_avx
+# define _dl_runtime_profile	_dl_runtime_profile_avx
+# define _dl_x86_64_save	_dl_x86_64_save_avx
+# define _dl_x86_64_restore	_dl_x86_64_restore_avx
+# include "dl-trampoline.h"
+# undef _dl_runtime_resolve
+# undef _dl_runtime_profile
+# undef _dl_x86_64_save
+# undef _dl_x86_64_restore
+# undef VMOV
+# undef VEC_SIZE
+# undef VEC
+
+/* movaps is 1-byte shorter.  */
+# define VMOV			movaps
+# define VMOVA			movaps
+# define VEC_SIZE		16
+# define VEC(i)			xmm##i
+# define _dl_runtime_resolve	_dl_runtime_resolve_sse
+# define _dl_runtime_profile	_dl_runtime_profile_sse
+# define _dl_x86_64_save	_dl_x86_64_save_sse
+# define _dl_x86_64_restore	_dl_x86_64_restore_sse
 # undef RESTORE_AVX
 # include "dl-trampoline.h"
-
-	cfi_endproc
-	.size _dl_runtime_profile, .-_dl_runtime_profile
-#endif
-
-
-#ifdef SHARED
-	.globl _dl_x86_64_save_sse
-	.type _dl_x86_64_save_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_save_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined_5)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	1f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in a processor?
-	movq	%rbx, %r11              # Save rbx
-	xorl	%ecx,%ecx
-	mov	$0x7,%eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	2f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0xe6, %eax
-	je	L(avx512_5)
-#  endif
-
-2:	andl	$0x6, %eax
-1:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined_5):
-	js	L(no_avx5)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_5)
-#  endif
-
-	vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
-	vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
-	vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
-	vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE
-	vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE
-	vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE
-	vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
-	vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_5):
-	vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
-	vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
-	vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
-	vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
-	vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
-	vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
-	vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
-	vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
-	ret
-#  endif
-L(no_avx5):
-# endif
-	movdqa	%xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
-	movdqa	%xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE
-	movdqa	%xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE
-	movdqa	%xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE
-	movdqa	%xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE
-	movdqa	%xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE
-	movdqa	%xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE
-	movdqa	%xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE
-	ret
-	cfi_endproc
-	.size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse
-
-
-	.globl _dl_x86_64_restore_sse
-	.type _dl_x86_64_restore_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_restore_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx6)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_6)
-#  endif
-
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_6):
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
-	ret
-#  endif
-L(no_avx6):
-# endif
-	movdqa	%fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
-	movdqa	%fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1
-	movdqa	%fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2
-	movdqa	%fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3
-	movdqa	%fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4
-	movdqa	%fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5
-	movdqa	%fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6
-	movdqa	%fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7
-	ret
-	cfi_endproc
-	.size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse
-#endif
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index d542428..b909ff8 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -1,5 +1,4 @@
-/* Partial PLT profile trampoline to save and restore x86-64 vector
-   registers.
+/* PLT trampolines.  x86-64 version.
    Copyright (C) 2009-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,7 +16,203 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef RESTORE_AVX
+#ifdef __ILP32__
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
+   VEC7.  */
+# define REGISTER_SAVE_AREA	(8 * 7 + VEC_SIZE * 8)
+#else
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
+# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
+#endif
+
+#if (REGISTER_SAVE_AREA % 16) != 8
+# error REGISTER_SAVE_AREA must be odd multples of 8
+#endif
+
+	.text
+	.globl _dl_runtime_resolve
+	.hidden _dl_runtime_resolve
+	.type _dl_runtime_resolve, @function
+	.align 16
+	cfi_startproc
+_dl_runtime_resolve:
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	subq $REGISTER_SAVE_AREA, %rsp
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+	# Preserve registers otherwise clobbered.
+	movq %rax, REGISTER_SAVE_RAX(%rsp)
+	movq %rcx, REGISTER_SAVE_RCX(%rsp)
+	movq %rdx, REGISTER_SAVE_RDX(%rsp)
+	movq %rsi, REGISTER_SAVE_RSI(%rsp)
+	movq %rdi, REGISTER_SAVE_RDI(%rsp)
+	movq %r8, REGISTER_SAVE_R8(%rsp)
+	movq %r9, REGISTER_SAVE_R9(%rsp)
+	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
+	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
+	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
+	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
+	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
+	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
+	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
+	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
+#ifndef __ILP32__
+	# We also have to preserve bound registers.  These are nops if
+	# Intel MPX isn't available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
+	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
+	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
+	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
+# else
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
+	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
+# endif
+#endif
+	# Copy args pushed by PLT in register.
+	# %rdi: link_map, %rsi: reloc_index
+	movq (REGISTER_SAVE_AREA + 8)(%rsp), %rsi
+	movq REGISTER_SAVE_AREA(%rsp), %rdi
+	call _dl_fixup		# Call resolver.
+	movq %rax, %r11		# Save return value
+#ifndef __ILP32__
+	# Restore bound registers.  These are nops if Intel MPX isn't
+	# avaiable or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
+	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
+	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
+	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
+# else
+	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
+	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+# endif
+#endif
+	# Get register content back.
+	movq REGISTER_SAVE_R9(%rsp), %r9
+	movq REGISTER_SAVE_R8(%rsp), %r8
+	movq REGISTER_SAVE_RDI(%rsp), %rdi
+	movq REGISTER_SAVE_RSI(%rsp), %rsi
+	movq REGISTER_SAVE_RDX(%rsp), %rdx
+	movq REGISTER_SAVE_RCX(%rsp), %rcx
+	movq REGISTER_SAVE_RAX(%rsp), %rax
+	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
+	# Adjust stack(PLT did 2 pushes)
+	addq $(REGISTER_SAVE_AREA + 16), %rsp
+	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
+	# Preserve bound registers.
+	PRESERVE_BND_REGS_PREFIX
+	jmp *%r11		# Jump to function address.
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+	.globl _dl_runtime_profile
+	.hidden _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	.align 16
+_dl_runtime_profile:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	/* The La_x86_64_regs data structure pointed to by the
+	   fourth paramater must be 16-byte aligned.  This must
+	   be explicitly enforced.  We have the set up a dynamically
+	   sized stack frame.  %rbx points to the top half which
+	   has a fixed size and preserves the original stack pointer.  */
+
+	subq $32, %rsp		# Allocate the local storage.
+	cfi_adjust_cfa_offset(32)
+	movq %rbx, (%rsp)
+	cfi_rel_offset(%rbx, 0)
+
+	/* On the stack:
+		56(%rbx)	parameter #1
+		48(%rbx)	return address
+
+		40(%rbx)	reloc index
+		32(%rbx)	link_map
+
+		24(%rbx)	La_x86_64_regs pointer
+		16(%rbx)	framesize
+		 8(%rbx)	rax
+		  (%rbx)	rbx
+	*/
+
+	movq %rax, 8(%rsp)
+	movq %rsp, %rbx
+	cfi_def_cfa_register(%rbx)
+
+	/* Actively align the La_x86_64_regs structure.  */
+	andq $0xfffffffffffffff0, %rsp
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
+	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
+	   to detect if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	subq $(LR_SIZE + XMM_SIZE*8), %rsp
+# else
+	subq $LR_SIZE, %rsp		# sizeof(La_x86_64_regs)
+# endif
+	movq %rsp, 24(%rbx)
+
+	/* Fill the La_x86_64_regs structure.  */
+	movq %rdx, LR_RDX_OFFSET(%rsp)
+	movq %r8,  LR_R8_OFFSET(%rsp)
+	movq %r9,  LR_R9_OFFSET(%rsp)
+	movq %rcx, LR_RCX_OFFSET(%rsp)
+	movq %rsi, LR_RSI_OFFSET(%rsp)
+	movq %rdi, LR_RDI_OFFSET(%rsp)
+	movq %rbp, LR_RBP_OFFSET(%rsp)
+
+	leaq 48(%rbx), %rax
+	movq %rax, LR_RSP_OFFSET(%rsp)
+
+	/* We always store the XMM registers even if AVX is available.
+	   This is to provide backward binary compatibility for existing
+	   audit modules.  */
+	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
+	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
+	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
+	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
+	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
+	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
+#  else
+	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
+# endif
+
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
 	VMOV %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
 	VMOV %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
@@ -38,7 +233,7 @@
 	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
 	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
 	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-#endif
+# endif
 
 	mov %RSP_LP, %RCX_LP	# La_x86_64_regs pointer to %rcx.
 	mov 48(%rbx), %RDX_LP	# Load return address if needed.
@@ -63,7 +258,7 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
 	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
@@ -139,21 +334,21 @@
 	vmovdqa	%xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
 	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
 	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
 	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
+#  else
 	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
 	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
 	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
 	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
 # endif
-#endif
 
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
@@ -212,14 +407,14 @@
 	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
 	   so we just need to allocate the sizeof(La_x86_64_retval) space on
 	   the stack, since the alignment has already been taken care of. */
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
 	   registers to detect if xmm0/xmm1 registers are changed
 	   by audit module.  */
 	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-#else
+# else
 	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
-#endif
+# endif
 	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
 
 	/* Fill in the La_x86_64_retval structure.  */
@@ -229,7 +424,7 @@
 	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
 	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
 	VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
 	VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
@@ -238,17 +433,17 @@
 	   by audit module.  */
 	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
 	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov %bnd0, LRV_BND0_OFFSET(%rcx)  # Preserve returned bounds.
 	bndmov %bnd1, LRV_BND1_OFFSET(%rcx)
-# else
+#  else
 	.byte  0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fstpt LRV_ST0_OFFSET(%rcx)
 	fstpt LRV_ST1_OFFSET(%rcx)
@@ -265,7 +460,7 @@
 	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
 	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if xmm0/xmm1 registers are changed by audit module.  */
 	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
 	vpmovmskb %xmm2, %esi
@@ -280,17 +475,17 @@
 	VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
 	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
-# else
+#  else
 	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fldt LRV_ST1_OFFSET(%rsp)
 	fldt LRV_ST0_OFFSET(%rsp)
@@ -306,9 +501,49 @@
 	PRESERVE_BND_REGS_PREFIX
 	retq
 
-#ifdef MORE_CODE
-	cfi_adjust_cfa_offset(48)
-	cfi_rel_offset(%rbx, 0)
-	cfi_def_cfa_register(%rbx)
-# undef MORE_CODE
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
+#endif
+
+
+#ifdef SHARED
+# if (RTLD_SAVESPACE_SSE % 32) != 0
+#  error RTLD_SAVESPACE_SSE must be aligned to 32 bytes
+# endif
+
+	.globl _dl_x86_64_save
+	.hidden _dl_x86_64_save
+	.type _dl_x86_64_save, @function
+	.align 16
+_dl_x86_64_save:
+	cfi_startproc
+	VMOVA	%VEC(0), %fs:RTLD_SAVESPACE_SSE+0*VEC_SIZE
+	VMOVA	%VEC(1), %fs:RTLD_SAVESPACE_SSE+1*VEC_SIZE
+	VMOVA	%VEC(2), %fs:RTLD_SAVESPACE_SSE+2*VEC_SIZE
+	VMOVA	%VEC(3), %fs:RTLD_SAVESPACE_SSE+3*VEC_SIZE
+	VMOVA	%VEC(4), %fs:RTLD_SAVESPACE_SSE+4*VEC_SIZE
+	VMOVA	%VEC(5), %fs:RTLD_SAVESPACE_SSE+5*VEC_SIZE
+	VMOVA	%VEC(6), %fs:RTLD_SAVESPACE_SSE+6*VEC_SIZE
+	VMOVA	%VEC(7), %fs:RTLD_SAVESPACE_SSE+7*VEC_SIZE
+	ret
+	cfi_endproc
+	.size _dl_x86_64_save, .-_dl_x86_64_save
+
+	.globl _dl_x86_64_restore
+	.hidden _dl_x86_64_restore
+	.type _dl_x86_64_restore, @function
+	.align 16
+	cfi_startproc
+_dl_x86_64_restore:
+	VMOVA	%fs:RTLD_SAVESPACE_SSE+0*VEC_SIZE, %VEC(0)
+	VMOVA	%fs:RTLD_SAVESPACE_SSE+1*VEC_SIZE, %VEC(1)
+	VMOVA	%fs:RTLD_SAVESPACE_SSE+2*VEC_SIZE, %VEC(2)
+	VMOVA	%fs:RTLD_SAVESPACE_SSE+3*VEC_SIZE, %VEC(3)
+	VMOVA	%fs:RTLD_SAVESPACE_SSE+4*VEC_SIZE, %VEC(4)
+	VMOVA	%fs:RTLD_SAVESPACE_SSE+5*VEC_SIZE, %VEC(5)
+	VMOVA	%fs:RTLD_SAVESPACE_SSE+6*VEC_SIZE, %VEC(6)
+	VMOVA	%fs:RTLD_SAVESPACE_SSE+7*VEC_SIZE, %VEC(7)
+	ret
+	cfi_endproc
+	.size _dl_x86_64_restore, .-_dl_x86_64_restore
 #endif
diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index d7543c6..b485b53 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -386,10 +386,6 @@ typedef struct
 
 
 # ifdef SHARED
-/* Defined in dl-trampoline.S.  */
-extern void _dl_x86_64_save_sse (void);
-extern void _dl_x86_64_restore_sse (void);
-
 # define RTLD_CHECK_FOREIGN_CALL \
   (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
 
@@ -404,7 +400,7 @@ extern void _dl_x86_64_restore_sse (void);
 #  define RTLD_PREPARE_FOREIGN_CALL \
   do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save))	      \
     {									      \
-      _dl_x86_64_save_sse ();						      \
+      GLRO(dl_x86_64_save_vector) ();					      \
       THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0);	      \
     }									      \
   while (0)
@@ -412,7 +408,7 @@ extern void _dl_x86_64_restore_sse (void);
 #  define RTLD_FINALIZE_FOREIGN_CALL \
   do {									      \
     if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0)	      \
-      _dl_x86_64_restore_sse ();					      \
+      GLRO(dl_x86_64_restore_vector) ();				      \
     THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save,		      \
 		   old_rtld_must_xmm_save);				      \
   } while (0)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=568bd0a08f7383197b338f3f40a57b7b9cdc02ab

commit 568bd0a08f7383197b338f3f40a57b7b9cdc02ab
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:41:20 2015 -0700

    Align the stack before calling __errno_location

diff --git a/sysdeps/x86_64/fpu/s_cosf.S b/sysdeps/x86_64/fpu/s_cosf.S
index b7868ce..bea10ef 100644
--- a/sysdeps/x86_64/fpu/s_cosf.S
+++ b/sysdeps/x86_64/fpu/s_cosf.S
@@ -310,8 +310,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_sincosf.S b/sysdeps/x86_64/fpu/s_sincosf.S
index 21db70a..a2f3133 100644
--- a/sysdeps/x86_64/fpu/s_sincosf.S
+++ b/sysdeps/x86_64/fpu/s_sincosf.S
@@ -354,8 +354,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_sinf.S b/sysdeps/x86_64/fpu/s_sinf.S
index dc92164..90afbe8 100644
--- a/sysdeps/x86_64/fpu/s_sinf.S
+++ b/sysdeps/x86_64/fpu/s_sinf.S
@@ -336,8 +336,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a1cb3d630d9a28458220464f3935aef1157f69a2

commit a1cb3d630d9a28458220464f3935aef1157f69a2
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:40:25 2015 -0700

    Align the stack before calling __gettimeofday

diff --git a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
index 0935db5..23f3def 100644
--- a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
+++ b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
@@ -394,8 +394,8 @@ __lll_timedwait_tid:
 	movq	%rdi, %r12
 	movq	%rsi, %r13
 
-	subq	$16, %rsp
-	cfi_adjust_cfa_offset(16)
+	subq	$24, %rsp
+	cfi_adjust_cfa_offset(24)
 
 	/* Get current time.  */
 2:	movq	%rsp, %rdi
@@ -441,8 +441,8 @@ __lll_timedwait_tid:
 	jne	1f
 4:	xorl	%eax, %eax
 
-8:	addq	$16, %rsp
-	cfi_adjust_cfa_offset(-16)
+8:	addq	$24, %rsp
+	cfi_adjust_cfa_offset(-24)
 	popq	%r13
 	cfi_adjust_cfa_offset(-8)
 	cfi_restore(%r13)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1fc73512dcc30036ffa9ef7363a9eb3442972e27

commit 1fc73512dcc30036ffa9ef7363a9eb3442972e27
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:38:58 2015 -0700

    Align stack before calling __setcontext

diff --git a/sysdeps/unix/sysv/linux/x86_64/__start_context.S b/sysdeps/unix/sysv/linux/x86_64/__start_context.S
index 52a5afa..57ee2b9 100644
--- a/sysdeps/unix/sysv/linux/x86_64/__start_context.S
+++ b/sysdeps/unix/sysv/linux/x86_64/__start_context.S
@@ -33,6 +33,11 @@ ENTRY(__start_context)
 
 	popq	%rdi			/* This is the next context.  */
 	cfi_adjust_cfa_offset(-8)
+
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
+
 	testq	%rdi, %rdi
 	je	2f			/* If it is zero exit.  */
 

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]