This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


GNU C Library master sources branch hjl/pr18661 created. glibc-2.21-666-g3eb4ed7


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, hjl/pr18661 has been created
        at  3eb4ed7acf86c94ede2f4f041c02cb815ca5056f (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3eb4ed7acf86c94ede2f4f041c02cb815ca5056f

commit 3eb4ed7acf86c94ede2f4f041c02cb815ca5056f
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 02:50:10 2015 -0700

    Remove rtld_savespace_sse and __padding
    
    rtld_savespace_sse and __padding are internal to glibc.  It should be
    safe to remove them now that they are no longer needed by glibc.
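
Both removed fields sit at the end of tcbhead_t, so deleting them leaves the
offset of every surviving member unchanged, which is what makes the removal
safe for purely internal consumers.  A minimal sketch of that layout
invariant (the structs are illustrative stand-ins, not the real tcbhead_t):

    #include <stddef.h>

    /* Illustrative stand-ins, not the real tcbhead_t: removing members
       from the tail shrinks the struct without moving anything else.  */
    struct tcb_old { void *tcb; void *self; long int reserved; void *padding[8]; };
    struct tcb_new { void *tcb; void *self; long int reserved; };

    _Static_assert (offsetof (struct tcb_old, reserved)
                    == offsetof (struct tcb_new, reserved),
                    "earlier members keep their offsets");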

diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index 5515c54..76b6898 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -29,12 +29,6 @@
 # include <libc-internal.h>
 # include <kernel-features.h>
 
-typedef struct
-{
-  int i[4];
-} __128bits;
-
-
 /* Type for the dtv.  */
 typedef union dtv
 {
@@ -70,9 +64,6 @@ typedef struct
   /* GCC split stack support.  */
   void *__private_ss;
   long int __glibc_reserved2;
-  __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32)));
-
-  void *__padding[8];
 } tcbhead_t;
 
 #else /* __ASSEMBLER__ */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ae2de31341e82878c4a141c2454876d167329de0

commit ae2de31341e82878c4a141c2454876d167329de0
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 04:49:38 2015 -0700

    Use SSE optimized strcmp in x86-64 ld.so
    
    Since ld.so preserves vector registers now, we can use the SSE
    optimized strcmp in x86-64 ld.so.
    
    	* sysdeps/x86_64/strcmp.S: Remove "#if !IS_IN (libc)".
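
For reference, the ld.so fallback loop deleted below is equivalent to roughly
the following C (a sketch; the assembly computes the -1/+1 result with cmovb
rather than a branch):

    /* Byte-at-a-time comparison, as the removed !IS_IN (libc) path did:
       walk both strings until a mismatch or the terminating NUL.  */
    static int
    byte_strcmp (const char *p1, const char *p2)
    {
      const unsigned char *s1 = (const unsigned char *) p1;
      const unsigned char *s2 = (const unsigned char *) p2;
      unsigned char c1, c2;

      do
        {
          c1 = *s1++;
          c2 = *s2++;
          if (c1 != c2)
            return c1 < c2 ? -1 : 1;
        }
      while (c1 != '\0');

      return 0;
    }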

diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 1329649..1624b5d 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -29,13 +29,6 @@
 #endif
 
 #ifdef USE_AS_STRNCMP
-/* The simplified code below is not set up to handle strncmp() so far.
-   Should this become necessary it has to be implemented.  For now
-   just report the problem.  */
-# if !IS_IN (libc)
-#  error "strncmp not implemented so far"
-# endif
-
 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
    if the new counter > the old one or is 0.  */
 # define UPDATE_STRNCMP_COUNTER				\
@@ -50,20 +43,10 @@
 #elif defined USE_AS_STRCASECMP_L
 # include "locale-defines.h"
 
-/* No support for strcasecmp outside libc so far since it is not needed.  */
-# if !IS_IN (libc)
-#  error "strcasecmp_l not implemented so far"
-# endif
-
 # define UPDATE_STRNCMP_COUNTER
 #elif defined USE_AS_STRNCASECMP_L
 # include "locale-defines.h"
 
-/* No support for strncasecmp outside libc so far since it is not needed.  */
-# if !IS_IN (libc)
-#  error "strncasecmp_l not implemented so far"
-# endif
-
 # define UPDATE_STRNCMP_COUNTER				\
 	/* calculate left number to compare */		\
 	lea	-16(%rcx, %r11), %r9;			\
@@ -126,63 +109,44 @@ libc_hidden_def (__strncasecmp)
 #endif
 
 ENTRY (STRCMP)
-#if !IS_IN (libc)
-/* Simple version since we can't use SSE registers in ld.so.  */
-L(oop):	movb	(%rdi), %al
-	cmpb	(%rsi), %al
-	jne	L(neq)
-	incq	%rdi
-	incq	%rsi
-	testb	%al, %al
-	jnz	L(oop)
-
-	xorl	%eax, %eax
-	ret
-
-L(neq):	movl	$1, %eax
-	movl	$-1, %ecx
-	cmovbl	%ecx, %eax
-	ret
-END (STRCMP)
-#else	/* !IS_IN (libc) */
-# ifdef USE_AS_STRCASECMP_L
+#ifdef USE_AS_STRCASECMP_L
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
-#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
 	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
-#  else
+# else
 	mov	(%rdx), %RAX_LP
-#  endif
+# endif
 	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
 	jne	__strcasecmp_l_nonascii
-# elif defined USE_AS_STRNCASECMP_L
+#elif defined USE_AS_STRNCASECMP_L
 	/* We have to fall back on the C implementation for locales
 	   with encodings not matching ASCII for single bytes.  */
-#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
 	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
-#  else
+# else
 	mov	(%rcx), %RAX_LP
-#  endif
+# endif
 	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
 	jne	__strncasecmp_l_nonascii
-# endif
+#endif
 
 /*
  * This implementation uses SSE to compare up to 16 bytes at a time.
  */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	test	%rdx, %rdx
 	je	LABEL(strcmp_exitz)
 	cmp	$1, %rdx
 	je	LABEL(Byte0)
 	mov	%rdx, %r11
-# endif
+#endif
 	mov	%esi, %ecx
 	mov	%edi, %eax
 /* Use 64bit AND here to avoid long NOP padding.  */
 	and	$0x3f, %rcx		/* rsi alignment in cache line */
 	and	$0x3f, %rax		/* rdi alignment in cache line */
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
 .Lbelowupper:
@@ -196,12 +160,12 @@ END (STRCMP)
 	.quad	0x2020202020202020
 	.previous
 	movdqa	.Lbelowupper(%rip), %xmm5
-#  define UCLOW_reg %xmm5
+# define UCLOW_reg %xmm5
 	movdqa	.Ltopupper(%rip), %xmm6
-#  define UCHIGH_reg %xmm6
+# define UCHIGH_reg %xmm6
 	movdqa	.Ltouppermask(%rip), %xmm7
-#  define LCQWORD_reg %xmm7
-# endif
+# define LCQWORD_reg %xmm7
+#endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
 	cmp	$0x30, %eax
@@ -210,8 +174,8 @@ END (STRCMP)
 	movlpd	(%rsi), %xmm2
 	movhpd	8(%rdi), %xmm1
 	movhpd	8(%rsi), %xmm2
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-#  define TOLOWER(reg1, reg2) \
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
 	movdqa	reg1, %xmm8;					\
 	movdqa	UCHIGH_reg, %xmm9;				\
 	movdqa	reg2, %xmm10;					\
@@ -227,9 +191,9 @@ END (STRCMP)
 	por	%xmm8, reg1;					\
 	por	%xmm10, reg2
 	TOLOWER (%xmm1, %xmm2)
-# else
-#  define TOLOWER(reg1, reg2)
-# endif
+#else
+# define TOLOWER(reg1, reg2)
+#endif
 	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
 	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
 	pcmpeqb	%xmm2, %xmm1		/* compare first 16 bytes for equality */
@@ -237,10 +201,10 @@ END (STRCMP)
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx		/* if first 16 bytes are same, edx == 0xffff */
 	jnz	LABEL(less16bytes)	/* If not, find different value or null char */
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)	/* finish comparision */
-# endif
+#endif
 	add	$16, %rsi		/* prepare to search next 16 bytes */
 	add	$16, %rdi		/* prepare to search next 16 bytes */
 
@@ -282,13 +246,13 @@ LABEL(ashr_0):
 	movdqa	(%rsi), %xmm1
 	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char check */
 	pcmpeqb	%xmm1, %xmm0			/* Any null chars? */
-# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 	pcmpeqb	(%rdi), %xmm1			/* compare 16 bytes for equality */
-# else
+#else
 	movdqa	(%rdi), %xmm2
 	TOLOWER (%xmm1, %xmm2)
 	pcmpeqb	%xmm2, %xmm1			/* compare 16 bytes for equality */
-# endif
+#endif
 	psubb	%xmm0, %xmm1			/* packed sub of comparison results*/
 	pmovmskb %xmm1, %r9d
 	shr	%cl, %edx			/* adjust 0xffff for offset */
@@ -321,10 +285,10 @@ LABEL(loop_ashr_0):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)		/* mismatch or null char seen */
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	(%rsi, %rcx), %xmm1
 	movdqa	(%rdi, %rcx), %xmm2
@@ -336,10 +300,10 @@ LABEL(loop_ashr_0):
 	pmovmskb %xmm1, %edx
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	jmp	LABEL(loop_ashr_0)
 
@@ -388,13 +352,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		 /* store for next cycle */
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -404,10 +368,10 @@ LABEL(gobble_ashr_1):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
 
@@ -418,13 +382,13 @@ LABEL(gobble_ashr_1):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4		/* store for next cycle */
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$1, %xmm3
 	pslldq	$15, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$1, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -434,10 +398,10 @@ LABEL(gobble_ashr_1):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
 	jmp	LABEL(loop_ashr_1)
@@ -453,10 +417,10 @@ LABEL(nibble_ashr_1):
 	test	$0xfffe, %edx
 	jnz	LABEL(ashr_1_exittail)	/* find null char*/
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$15, %r11
 	jbe	LABEL(ashr_1_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10		/* substract 4K from %r10 */
@@ -518,13 +482,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -534,10 +498,10 @@ LABEL(gobble_ashr_2):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -549,13 +513,13 @@ LABEL(gobble_ashr_2):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$2, %xmm3
 	pslldq	$14, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$2, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -565,10 +529,10 @@ LABEL(gobble_ashr_2):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -581,10 +545,10 @@ LABEL(nibble_ashr_2):
 	test	$0xfffc, %edx
 	jnz	LABEL(ashr_2_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$14, %r11
 	jbe	LABEL(ashr_2_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -643,13 +607,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -659,10 +623,10 @@ LABEL(gobble_ashr_3):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -674,13 +638,13 @@ LABEL(gobble_ashr_3):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$3, %xmm3
 	pslldq	$13, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$3, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -690,10 +654,10 @@ LABEL(gobble_ashr_3):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -706,10 +670,10 @@ LABEL(nibble_ashr_3):
 	test	$0xfff8, %edx
 	jnz	LABEL(ashr_3_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$13, %r11
 	jbe	LABEL(ashr_3_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -768,13 +732,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -784,10 +748,10 @@ LABEL(gobble_ashr_4):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -799,13 +763,13 @@ LABEL(gobble_ashr_4):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$4, %xmm3
 	pslldq	$12, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$4, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -815,10 +779,10 @@ LABEL(gobble_ashr_4):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -831,10 +795,10 @@ LABEL(nibble_ashr_4):
 	test	$0xfff0, %edx
 	jnz	LABEL(ashr_4_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$12, %r11
 	jbe	LABEL(ashr_4_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -893,13 +857,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -909,10 +873,10 @@ LABEL(gobble_ashr_5):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -924,13 +888,13 @@ LABEL(gobble_ashr_5):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$5, %xmm3
 	pslldq	$11, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$5, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -940,10 +904,10 @@ LABEL(gobble_ashr_5):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -956,10 +920,10 @@ LABEL(nibble_ashr_5):
 	test	$0xffe0, %edx
 	jnz	LABEL(ashr_5_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$11, %r11
 	jbe	LABEL(ashr_5_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1018,13 +982,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1034,10 +998,10 @@ LABEL(gobble_ashr_6):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1049,13 +1013,13 @@ LABEL(gobble_ashr_6):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$6, %xmm3
 	pslldq	$10, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$6, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1065,10 +1029,10 @@ LABEL(gobble_ashr_6):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1081,10 +1045,10 @@ LABEL(nibble_ashr_6):
 	test	$0xffc0, %edx
 	jnz	LABEL(ashr_6_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$10, %r11
 	jbe	LABEL(ashr_6_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1143,13 +1107,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1159,10 +1123,10 @@ LABEL(gobble_ashr_7):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1174,13 +1138,13 @@ LABEL(gobble_ashr_7):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$7, %xmm3
 	pslldq	$9, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$7, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1190,10 +1154,10 @@ LABEL(gobble_ashr_7):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1206,10 +1170,10 @@ LABEL(nibble_ashr_7):
 	test	$0xff80, %edx
 	jnz	LABEL(ashr_7_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$9, %r11
 	jbe	LABEL(ashr_7_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1268,13 +1232,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1284,10 +1248,10 @@ LABEL(gobble_ashr_8):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1299,13 +1263,13 @@ LABEL(gobble_ashr_8):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$8, %xmm3
 	pslldq	$8, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$8, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1315,10 +1279,10 @@ LABEL(gobble_ashr_8):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1331,10 +1295,10 @@ LABEL(nibble_ashr_8):
 	test	$0xff00, %edx
 	jnz	LABEL(ashr_8_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$8, %r11
 	jbe	LABEL(ashr_8_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1393,13 +1357,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1409,10 +1373,10 @@ LABEL(gobble_ashr_9):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1424,13 +1388,13 @@ LABEL(gobble_ashr_9):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$9, %xmm3
 	pslldq	$7, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$9, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1440,10 +1404,10 @@ LABEL(gobble_ashr_9):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3		/* store for next cycle */
@@ -1456,10 +1420,10 @@ LABEL(nibble_ashr_9):
 	test	$0xfe00, %edx
 	jnz	LABEL(ashr_9_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$7, %r11
 	jbe	LABEL(ashr_9_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1518,13 +1482,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1534,10 +1498,10 @@ LABEL(gobble_ashr_10):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1549,13 +1513,13 @@ LABEL(gobble_ashr_10):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$10, %xmm3
 	pslldq	$6, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$10, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1565,10 +1529,10 @@ LABEL(gobble_ashr_10):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1581,10 +1545,10 @@ LABEL(nibble_ashr_10):
 	test	$0xfc00, %edx
 	jnz	LABEL(ashr_10_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$6, %r11
 	jbe	LABEL(ashr_10_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1643,13 +1607,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1659,10 +1623,10 @@ LABEL(gobble_ashr_11):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1674,13 +1638,13 @@ LABEL(gobble_ashr_11):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$11, %xmm3
 	pslldq	$5, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$11, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1690,10 +1654,10 @@ LABEL(gobble_ashr_11):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1706,10 +1670,10 @@ LABEL(nibble_ashr_11):
 	test	$0xf800, %edx
 	jnz	LABEL(ashr_11_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$5, %r11
 	jbe	LABEL(ashr_11_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1768,13 +1732,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1784,10 +1748,10 @@ LABEL(gobble_ashr_12):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1799,13 +1763,13 @@ LABEL(gobble_ashr_12):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$12, %xmm3
 	pslldq	$4, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$12, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1815,10 +1779,10 @@ LABEL(gobble_ashr_12):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1831,10 +1795,10 @@ LABEL(nibble_ashr_12):
 	test	$0xf000, %edx
 	jnz	LABEL(ashr_12_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$4, %r11
 	jbe	LABEL(ashr_12_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -1893,13 +1857,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1909,10 +1873,10 @@ LABEL(gobble_ashr_13):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1924,13 +1888,13 @@ LABEL(gobble_ashr_13):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$13, %xmm3
 	pslldq	$3, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$13, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -1940,10 +1904,10 @@ LABEL(gobble_ashr_13):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -1956,10 +1920,10 @@ LABEL(nibble_ashr_13):
 	test	$0xe000, %edx
 	jnz	LABEL(ashr_13_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$3, %r11
 	jbe	LABEL(ashr_13_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2018,13 +1982,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2034,10 +1998,10 @@ LABEL(gobble_ashr_14):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2049,13 +2013,13 @@ LABEL(gobble_ashr_14):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$14, %xmm3
 	pslldq	$2, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$14, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2065,10 +2029,10 @@ LABEL(gobble_ashr_14):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2081,10 +2045,10 @@ LABEL(nibble_ashr_14):
 	test	$0xc000, %edx
 	jnz	LABEL(ashr_14_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmp	$2, %r11
 	jbe	LABEL(ashr_14_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2145,13 +2109,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2161,10 +2125,10 @@ LABEL(gobble_ashr_15):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2176,13 +2140,13 @@ LABEL(gobble_ashr_15):
 	movdqa	(%rdi, %rcx), %xmm2
 	movdqa	%xmm2, %xmm4
 
-# ifndef USE_SSSE3
+#ifndef USE_SSSE3
 	psrldq	$15, %xmm3
 	pslldq	$1, %xmm2
 	por	%xmm3, %xmm2		/* merge into one 16byte value */
-# else
+#else
 	palignr	$15, %xmm3, %xmm2	/* merge into one 16byte value */
-# endif
+#endif
 	TOLOWER (%xmm1, %xmm2)
 
 	pcmpeqb	%xmm1, %xmm0
@@ -2192,10 +2156,10 @@ LABEL(gobble_ashr_15):
 	sub	$0xffff, %edx
 	jnz	LABEL(exit)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	$16, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 
 	add	$16, %rcx
 	movdqa	%xmm4, %xmm3
@@ -2208,10 +2172,10 @@ LABEL(nibble_ashr_15):
 	test	$0x8000, %edx
 	jnz	LABEL(ashr_15_exittail)
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	cmpq	$1, %r11
 	jbe	LABEL(ashr_15_exittail)
-# endif
+#endif
 
 	pxor	%xmm0, %xmm0
 	sub	$0x1000, %r10
@@ -2246,18 +2210,18 @@ LABEL(ret):
 LABEL(less16bytes):
 	bsf	%rdx, %rdx		/* find and store bit index in %rdx */
 
-# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 	sub	%rdx, %r11
 	jbe	LABEL(strcmp_exitz)
-# endif
+#endif
 	movzbl	(%rsi, %rdx), %ecx
 	movzbl	(%rdi, %rdx), %eax
 
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 	movl	(%rdx,%rcx,4), %ecx
 	movl	(%rdx,%rax,4), %eax
-# endif
+#endif
 
 	sub	%ecx, %eax
 	ret
@@ -2271,11 +2235,11 @@ LABEL(Byte0):
 	movzx	(%rsi), %ecx
 	movzx	(%rdi), %eax
 
-# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 	movl	(%rdx,%rcx,4), %ecx
 	movl	(%rdx,%rax,4), %eax
-# endif
+#endif
 
 	sub	%ecx, %eax
 	ret
@@ -2300,5 +2264,4 @@ LABEL(unaligned_table):
 	.int	LABEL(ashr_14) - LABEL(unaligned_table)
 	.int	LABEL(ashr_15) - LABEL(unaligned_table)
 	.int	LABEL(ashr_0) - LABEL(unaligned_table)
-#endif /* !IS_IN (libc) */
 libc_hidden_builtin_def (STRCMP)
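
The TOLOWER macro rewritten in the hunks above case-folds 16 bytes at once.
In SSE2 intrinsics the computation is roughly the following (a sketch; the
0x40/0x5b/0x20 constants are assumed from the usual .Lbelowupper/.Ltopupper/
.Ltouppermask values, which the excerpt above only partly shows):

    #include <emmintrin.h>

    /* Set bit 0x20 in every byte that lies in 'A'..'Z', folding it to
       lowercase; all other bytes pass through.  The signed compares also
       reject bytes >= 0x80, which compare as negative.  */
    static __m128i
    sse2_tolower (__m128i v)
    {
      const __m128i below_upper = _mm_set1_epi8 (0x40);  /* 'A' - 1 */
      const __m128i top_upper   = _mm_set1_epi8 (0x5b);  /* 'Z' + 1 */
      const __m128i case_bit    = _mm_set1_epi8 (0x20);

      __m128i gt_low  = _mm_cmpgt_epi8 (v, below_upper);
      __m128i lt_high = _mm_cmpgt_epi8 (top_upper, v);
      __m128i mask    = _mm_and_si128 (gt_low, lt_high);

      return _mm_or_si128 (v, _mm_and_si128 (mask, case_bit));
    }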

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c7a93fbbeeba2b71ede5c865428cf9bf9cce6d4a

commit c7a93fbbeeba2b71ede5c865428cf9bf9cce6d4a
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:56:14 2015 -0700

    Remove x86-64 rtld-xxx.c and rtld-xxx.S
    
    Since ld.so preserves vector registers now, we can use the regular,
    non-ifunc string and memory functions in ld.so.
    
    	* sysdeps/x86_64/rtld-memcmp.c: Removed.
    	* sysdeps/x86_64/rtld-memset.S: Likewise.
    	* sysdeps/x86_64/rtld-strchr.S: Likewise.
    	* sysdeps/x86_64/rtld-strlen.S: Likewise.
    	* sysdeps/x86_64/multiarch/rtld-memcmp.c: Likewise.
    	* sysdeps/x86_64/multiarch/rtld-memset.S: Likewise.

diff --git a/sysdeps/x86_64/multiarch/rtld-memcmp.c b/sysdeps/x86_64/multiarch/rtld-memcmp.c
deleted file mode 100644
index 0f27135..0000000
--- a/sysdeps/x86_64/multiarch/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memcmp.c"
diff --git a/sysdeps/x86_64/multiarch/rtld-memset.S b/sysdeps/x86_64/multiarch/rtld-memset.S
deleted file mode 100644
index 8092aa0..0000000
--- a/sysdeps/x86_64/multiarch/rtld-memset.S
+++ /dev/null
@@ -1 +0,0 @@
-#include "../rtld-memset.S"
diff --git a/sysdeps/x86_64/rtld-memcmp.c b/sysdeps/x86_64/rtld-memcmp.c
deleted file mode 100644
index 2ee4032..0000000
--- a/sysdeps/x86_64/rtld-memcmp.c
+++ /dev/null
@@ -1 +0,0 @@
-#include <string/memcmp.c>
diff --git a/sysdeps/x86_64/rtld-memset.S b/sysdeps/x86_64/rtld-memset.S
deleted file mode 100644
index f8df333..0000000
--- a/sysdeps/x86_64/rtld-memset.S
+++ /dev/null
@@ -1,37 +0,0 @@
-/* memset implementation for the dynamic linker.  This is separate from the
-   libc implementation to avoid writing to SSE registers.
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-/* void *memset (void *dest, char c, size_t count)
-   dest	 => %rdi
-   c	 => %rsi
-   count => %rdx  */
-ENTRY (memset)
-	mov	%rdx, %rcx
-	movzbl	%sil, %eax
-	mov	%rdi, %rdx
-	rep	stosb
-	mov	%rdx, %rax
-	ret
-END (memset)
-libc_hidden_builtin_def (memset)
diff --git a/sysdeps/x86_64/rtld-strchr.S b/sysdeps/x86_64/rtld-strchr.S
deleted file mode 100644
index cc694d7..0000000
--- a/sysdeps/x86_64/rtld-strchr.S
+++ /dev/null
@@ -1,288 +0,0 @@
-/* strchr (str, ch) -- Return pointer to first occurrence of CH in STR.
-   For AMD x86-64.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-ENTRY (strchr)
-
-	/* Before we start with the main loop we process single bytes
-	   until the source pointer is aligned.  This has two reasons:
-	   1. aligned 64-bit memory access is faster
-	   and (more important)
-	   2. we process in the main loop 64 bit in one step although
-	      we don't know the end of the string.  But accessing at
-	      8-byte alignment guarantees that we never access illegal
-	      memory if this would not also be done by the trivial
-	      implementation (this is because all processor inherent
-	      boundaries are multiples of 8).  */
-
-	movq	%rdi, %rdx
-	andl	$7, %edx	/* Mask alignment bits  */
-	movq	%rdi, %rax	/* duplicate destination.  */
-	jz	1f		/* aligned => start loop */
-	neg	%edx
-	addl	$8, %edx	/* Align to 8 bytes.  */
-
-	/* Search the first bytes directly.  */
-0:	movb	(%rax), %cl	/* load byte  */
-	cmpb	%cl,%sil	/* compare byte.  */
-	je	6f		/* target found */
-	testb	%cl,%cl		/* is byte NUL? */
-	je	7f		/* yes => return NULL */
-	incq	%rax		/* increment pointer */
-	decl	%edx
-	jnz	0b
-
-
-1:
-	/* At the moment %rsi contains C.  What we need for the
-	   algorithm is C in all bytes of the register.  Avoid
-	   operations on 16 bit words because these require an
-	   prefix byte (and one more cycle).  */
-	/* Populate 8 bit data to full 64-bit.  */
-	movabs	$0x0101010101010101,%r9
-	movzbl	%sil,%edx
-	imul	%rdx,%r9
-
-	movq $0xfefefefefefefeff, %r8 /* Save magic.  */
-
-      /* We exit the loop if adding MAGIC_BITS to LONGWORD fails to
-	 change any of the hole bits of LONGWORD.
-
-	 1) Is this safe?  Will it catch all the zero bytes?
-	 Suppose there is a byte with all zeros.  Any carry bits
-	 propagating from its left will fall into the hole at its
-	 least significant bit and stop.  Since there will be no
-	 carry from its most significant bit, the LSB of the
-	 byte to the left will be unchanged, and the zero will be
-	 detected.
-
-	 2) Is this worthwhile?  Will it ignore everything except
-	 zero bytes?  Suppose every byte of QUARDWORD has a bit set
-	 somewhere.  There will be a carry into bit 8.	If bit 8
-	 is set, this will carry into bit 16.  If bit 8 is clear,
-	 one of bits 9-15 must be set, so there will be a carry
-	 into bit 16.  Similarly, there will be a carry into bit
-	 24 tec..  If one of bits 54-63 is set, there will be a carry
-	 into bit 64 (=carry flag), so all of the hole bits will
-	 be changed.
-
-	 3) But wait!  Aren't we looking for C, not zero?
-	 Good point.  So what we do is XOR LONGWORD with a longword,
-	 each of whose bytes is C.  This turns each byte that is C
-	 into a zero.  */
-
-	.p2align 4
-4:
-	/* Main Loop is unrolled 4 times.  */
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 7f			/* found NUL => return NULL */
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	xorq %r9, %rcx		/* XOR with qword c|...|c => bytes of str == c
-				   are now 0 */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found c => return pointer */
-
-	/* The quadword we looked at does not contain the value we're looking
-	   for.  Let's search now whether we have reached the end of the
-	   string.  */
-	xorq %r9, %rcx		/* restore original dword without reload */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 7f			/* highest byte is NUL => return NULL */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => restart loop */
-
-
-7:	/* Return NULL.  */
-	xorl %eax, %eax
-	retq
-
-
-	/* We now scan for the byte in which the character was matched.
-	   But we have to take care of the case that a NUL char is
-	   found before this in the dword.  Note that we XORed %rcx
-	   with the byte we're looking for, therefore the tests below look
-	   reversed.  */
-
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	movq	%r9,%rdx	/* move to %rdx so that we can access bytes */
-	subq	$8,%rax		/* correct pointer increment.  */
-	testb %cl, %cl		/* is first byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is first byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is second byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is third byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is third byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is fourth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is fourth byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is fifth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is fifth byte NUL? */
-	je 7b			/* yes => return NULL */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is sixth byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %ch		/* is sixth byte NUL? */
-	je 7b			/* yes => return NULL? */
-	incq %rax		/* increment pointer */
-
-	shrq $16, %rcx		/* make upper bytes accessible */
-	testb %cl, %cl		/* is seventh byte C? */
-	jz 6f			/* yes => return pointer */
-	cmpb %dl, %cl		/* is seventh byte NUL? */
-	je 7b			/* yes => return NULL */
-
-	/* It must be in the eigth byte and it cannot be NUL.  */
-	incq %rax
-
-6:
-	nop
-	retq
-END (strchr)
-
-weak_alias (strchr, index)
-libc_hidden_builtin_def (strchr)
diff --git a/sysdeps/x86_64/rtld-strlen.S b/sysdeps/x86_64/rtld-strlen.S
deleted file mode 100644
index 1328652..0000000
--- a/sysdeps/x86_64/rtld-strlen.S
+++ /dev/null
@@ -1,136 +0,0 @@
-/* strlen(str) -- determine the length of the string STR.
-   Copyright (C) 2002-2015 Free Software Foundation, Inc.
-   Based on i486 version contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-
-	.text
-ENTRY (strlen)
-	movq %rdi, %rcx		/* Duplicate source pointer. */
-	andl $7, %ecx		/* mask alignment bits */
-	movq %rdi, %rax		/* duplicate destination.  */
-	jz 1f			/* aligned => start loop */
-
-	neg %ecx		/* We need to align to 8 bytes.  */
-	addl $8,%ecx
-	/* Search the first bytes directly.  */
-0:	cmpb $0x0,(%rax)	/* is byte NUL? */
-	je 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-	decl %ecx
-	jnz 0b
-
-1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
-
-	.p2align 4		/* Align loop.  */
-4:	/* Main Loop is unrolled 4 times.  */
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => continue loop */
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0x00ff0000, %ecx /* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	testl $0xff000000, %ecx /* is fourth byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	shrq $32, %rcx		/* look at other half.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0xff0000, %ecx	/* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-2:
-	subq %rdi, %rax		/* compute difference to string start */
-	ret
-END (strlen)
-libc_hidden_builtin_def (strlen)
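
Both deleted files build on the carry-propagation trick spelled out in their
comments.  The textbook C form of the same zero-byte test, given here as a
sketch rather than a transcription of the deleted assembly, is:

    #include <stdbool.h>
    #include <stdint.h>

    /* Nonzero iff some byte of WORD is 0x00: subtracting 1 from every
       byte borrows through exactly the bytes that were zero, and the
       ~word mask filters out bytes that already had their top bit set.  */
    static bool
    has_zero_byte (uint64_t word)
    {
      return ((word - 0x0101010101010101ULL)
              & ~word & 0x8080808080808080ULL) != 0;
    }

On a little-endian machine the position of the first zero byte is then the
count of trailing zero bits divided by 8.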

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d1966ef11829d0f7191dc5efb1d630cea0eda7ad

commit d1966ef11829d0f7191dc5efb1d630cea0eda7ad
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:47:54 2015 -0700

    Replace %xmm8 with %xmm0
    
    Since ld.so preserves vector registers now, we can use %xmm0 to avoid
    the REX prefix.
    
    	* sysdeps/x86_64/memset.S: Replace %xmm8 with %xmm0.
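
For context on the REX remark: %xmm8-%xmm15 are only encodable with a REX
prefix, so moving to %xmm0 saves a byte per instruction.  A sketch (the byte
sequences below are the standard SSE2 encodings, stated as an assumption
rather than taken from the patch):

    #include <stdio.h>

    /* pxor %xmm0, %xmm0 needs no REX prefix; pxor %xmm8, %xmm8 needs
       REX.RB (0x45) to reach the high registers, costing one byte.  */
    static const unsigned char pxor_xmm0_xmm0[] = { 0x66, 0x0f, 0xef, 0xc0 };
    static const unsigned char pxor_xmm8_xmm8[] = { 0x66, 0x45, 0x0f, 0xef, 0xc0 };

    int
    main (void)
    {
      printf ("pxor %%xmm0,%%xmm0: %zu bytes\n", sizeof pxor_xmm0_xmm0);
      printf ("pxor %%xmm8,%%xmm8: %zu bytes\n", sizeof pxor_xmm8_xmm8);
      return 0;
    }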

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index e496254..3855cc8 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -24,7 +24,7 @@
 ENTRY(__bzero)
 	movq	%rdi, %rax /* Set return value.  */
 	movq	%rsi, %rdx /* Set n.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	jmp	L(entry_from_bzero)
 END(__bzero)
 weak_alias (__bzero, bzero)
@@ -33,10 +33,10 @@ weak_alias (__bzero, bzero)
 ENTRY(__memset_tail)
 	movq	%rcx, %rax /* Set return value.  */
 
-	movd	%esi, %xmm8
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	movd	%esi, %xmm0
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 
 	jmp	L(entry_from_bzero)
 END(__memset_tail)
@@ -50,57 +50,57 @@ END_CHK (__memset_chk)
 #endif
 
 ENTRY (memset)
-	movd	%esi, %xmm8
+	movd	%esi, %xmm0
 	movq	%rdi, %rax
-	punpcklbw	%xmm8, %xmm8
-	punpcklwd	%xmm8, %xmm8
-	pshufd	$0, %xmm8, %xmm8
+	punpcklbw	%xmm0, %xmm0
+	punpcklwd	%xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
 L(entry_from_bzero):
 	cmpq	$64, %rdx
 	ja	L(loop_start)
 	cmpq	$16, %rdx
 	jbe	L(less_16_bytes)
 	cmpq	$32, %rdx
-	movdqu	%xmm8, (%rdi)
-	movdqu	%xmm8, -16(%rdi,%rdx)
+	movdqu	%xmm0, (%rdi)
+	movdqu	%xmm0, -16(%rdi,%rdx)
 	ja	L(between_32_64_bytes)
 L(return):
 	rep
 	ret
 	.p2align 4
 L(between_32_64_bytes):
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi,%rdx)
 	ret
 	.p2align 4
 L(loop_start):
 	leaq	64(%rdi), %rcx
-	movdqu	%xmm8, (%rdi)
+	movdqu	%xmm0, (%rdi)
 	andq	$-64, %rcx
-	movdqu	%xmm8, -16(%rdi,%rdx)
-	movdqu	%xmm8, 16(%rdi)
-	movdqu	%xmm8, -32(%rdi,%rdx)
-	movdqu	%xmm8, 32(%rdi)
-	movdqu	%xmm8, -48(%rdi,%rdx)
-	movdqu	%xmm8, 48(%rdi)
-	movdqu	%xmm8, -64(%rdi,%rdx)
+	movdqu	%xmm0, -16(%rdi,%rdx)
+	movdqu	%xmm0, 16(%rdi)
+	movdqu	%xmm0, -32(%rdi,%rdx)
+	movdqu	%xmm0, 32(%rdi)
+	movdqu	%xmm0, -48(%rdi,%rdx)
+	movdqu	%xmm0, 48(%rdi)
+	movdqu	%xmm0, -64(%rdi,%rdx)
 	addq	%rdi, %rdx
 	andq	$-64, %rdx
 	cmpq	%rdx, %rcx
 	je	L(return)
 	.p2align 4
 L(loop):
-	movdqa	%xmm8, (%rcx)
-	movdqa	%xmm8, 16(%rcx)
-	movdqa	%xmm8, 32(%rcx)
-	movdqa	%xmm8, 48(%rcx)
+	movdqa	%xmm0, (%rcx)
+	movdqa	%xmm0, 16(%rcx)
+	movdqa	%xmm0, 32(%rcx)
+	movdqa	%xmm0, 48(%rcx)
 	addq	$64, %rcx
 	cmpq	%rcx, %rdx
 	jne	L(loop)
 	rep
 	ret
 L(less_16_bytes):
-	movq %xmm8, %rcx
+	movq %xmm0, %rcx
 	testb	$24, %dl
 	jne	L(between8_16bytes)
 	testb	$4, %dl
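
    The movd/punpcklbw/punpcklwd/pshufd sequence above broadcasts the
    fill byte into all 16 lanes of the vector register.  A sketch of
    the same idiom in C intrinsics (the function name is ours, for
    illustration only):

	#include <emmintrin.h>

	static __m128i
	broadcast_byte (int c)
	{
	  __m128i v = _mm_cvtsi32_si128 (c);	/* movd */
	  v = _mm_unpacklo_epi8 (v, v);		/* punpcklbw */
	  v = _mm_unpacklo_epi16 (v, v);	/* punpcklwd */
	  return _mm_shuffle_epi32 (v, 0);	/* pshufd $0 */
	}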

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=114abebe2b249f7f9e8627d283c88353e9f2edf9

commit 114abebe2b249f7f9e8627d283c88353e9f2edf9
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:44:39 2015 -0700

    Replace %xmm[8-12] with %xmm[0-4]
    
    Since ld.so preserves vector registers now, we can use %xmm[0-4] to
    avoid the REX prefix.
    
    	* sysdeps/x86_64/strlen.S: Replace %xmm[8-12] with %xmm[0-4].
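
    For context, FIND_ZERO below is the classic pcmpeqb/pmovmskb NUL
    scan.  One 16-byte step of it, sketched with C intrinsics (the
    function is ours, for illustration; the glibc loop performs four
    such loads per iteration):

	#include <emmintrin.h>

	/* Index of the first NUL in the 16 bytes at 16-byte-aligned P,
	   or -1 if there is none.  */
	static int
	first_nul_16 (const char *p)
	{
	  __m128i v = _mm_load_si128 ((const __m128i *) p);
	  __m128i eq = _mm_cmpeq_epi8 (v, _mm_setzero_si128 ());
	  int mask = _mm_movemask_epi8 (eq);
	  return mask != 0 ? __builtin_ctz (mask) : -1;
	}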

diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index c382c8d..0725333 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -20,7 +20,7 @@
 
 /* Long lived register in strlen(s), strnlen(s, n) are:
 
-	%xmm11 - zero
+	%xmm3 - zero
 	%rdi   - s
 	%r10  (s+n) & (~(64-1))
 	%r11   s+n
@@ -32,14 +32,14 @@ ENTRY(strlen)
 
 /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
 #define FIND_ZERO	\
-	pcmpeqb	(%rax), %xmm8;	\
-	pcmpeqb	16(%rax), %xmm9;	\
-	pcmpeqb	32(%rax), %xmm10;	\
-	pcmpeqb	48(%rax), %xmm11;	\
-	pmovmskb	%xmm8, %esi;	\
-	pmovmskb	%xmm9, %edx;	\
-	pmovmskb	%xmm10, %r8d;	\
-	pmovmskb	%xmm11, %ecx;	\
+	pcmpeqb	(%rax), %xmm0;	\
+	pcmpeqb	16(%rax), %xmm1;	\
+	pcmpeqb	32(%rax), %xmm2;	\
+	pcmpeqb	48(%rax), %xmm3;	\
+	pmovmskb	%xmm0, %esi;	\
+	pmovmskb	%xmm1, %edx;	\
+	pmovmskb	%xmm2, %r8d;	\
+	pmovmskb	%xmm3, %ecx;	\
 	salq	$16, %rdx;	\
 	salq	$16, %rcx;	\
 	orq	%rsi, %rdx;	\
@@ -63,10 +63,10 @@ L(n_nonzero):
 	mov	%rsi, %r11
 #endif
 
-	pxor	%xmm8, %xmm8
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 	movq	%rdi, %rax
 	movq	%rdi, %rcx
 	andq	$4095, %rcx
@@ -103,9 +103,9 @@ L(n_nonzero):
 	FIND_ZERO
 #else
 	/* Test first 16 bytes unaligned.  */
-	movdqu	(%rax), %xmm12
-	pcmpeqb	%xmm8, %xmm12
-	pmovmskb	%xmm12, %edx
+	movdqu	(%rax), %xmm4
+	pcmpeqb	%xmm0, %xmm4
+	pmovmskb	%xmm4, %edx
 	test	%edx, %edx
 	je 	L(next48_bytes)
 	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
@@ -114,12 +114,12 @@ L(n_nonzero):
 L(next48_bytes):
 /* Same as FIND_ZERO except we do not check first 16 bytes.  */
 	andq	$-16, %rax
-	pcmpeqb 16(%rax), %xmm9
-	pcmpeqb 32(%rax), %xmm10
-	pcmpeqb 48(%rax), %xmm11
-	pmovmskb	%xmm9, %edx
-	pmovmskb	%xmm10, %r8d
-	pmovmskb	%xmm11, %ecx
+	pcmpeqb 16(%rax), %xmm1
+	pcmpeqb 32(%rax), %xmm2
+	pcmpeqb 48(%rax), %xmm3
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r8d
+	pmovmskb	%xmm3, %ecx
 	salq	$16, %rdx
 	salq	$16, %rcx
 	orq	%r8, %rcx
@@ -127,7 +127,7 @@ L(next48_bytes):
 	orq	%rcx, %rdx
 #endif
 
-	/* When no zero byte is found xmm9-11 are zero so we do not have to
+	/* When no zero byte is found xmm1-3 are zero so we do not have to
 	   zero them.  */
 	PROLOG(loop)
 
@@ -149,9 +149,9 @@ L(strnlen_ret):
 #endif
 	.p2align 4
 L(loop_init):
-	pxor	%xmm9, %xmm9
-	pxor	%xmm10, %xmm10
-	pxor	%xmm11, %xmm11
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
 #ifdef AS_STRNLEN
 	.p2align 4
 L(loop):
@@ -160,12 +160,12 @@ L(loop):
 	cmpq	%rax, %r10
 	je	L(exit_end)
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	(%rax), %xmm0
+	pminub	16(%rax), %xmm0
+	pminub	32(%rax), %xmm0
+	pminub	48(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit)
 	jmp	L(loop)
@@ -174,7 +174,7 @@ L(loop):
 L(exit_end):
 	cmp	%rax, %r11
 	je	L(first) /* Do not read when end is at page boundary.  */
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 L(first):
@@ -186,7 +186,7 @@ L(first):
 
 	.p2align 4
 L(exit):
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 	bsfq	%rdx, %rdx
@@ -200,23 +200,23 @@ L(exit):
 	.p2align 4
 L(loop):
 
-	movdqa	64(%rax), %xmm8
-	pminub	80(%rax), %xmm8
-	pminub	96(%rax), %xmm8
-	pminub	112(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	64(%rax), %xmm0
+	pminub	80(%rax), %xmm0
+	pminub	96(%rax), %xmm0
+	pminub	112(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit64)
 
 	subq	$-128, %rax
 
-	movdqa	(%rax), %xmm8
-	pminub	16(%rax), %xmm8
-	pminub	32(%rax), %xmm8
-	pminub	48(%rax), %xmm8
-	pcmpeqb	%xmm11, %xmm8
-	pmovmskb	%xmm8, %edx
+	movdqa	(%rax), %xmm0
+	pminub	16(%rax), %xmm0
+	pminub	32(%rax), %xmm0
+	pminub	48(%rax), %xmm0
+	pcmpeqb	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
 	testl	%edx, %edx
 	jne	L(exit0)
 	jmp	L(loop)
@@ -225,7 +225,7 @@ L(loop):
 L(exit64):
 	addq	$64, %rax
 L(exit0):
-	pxor	%xmm8, %xmm8
+	pxor	%xmm0, %xmm0
 	FIND_ZERO
 
 	bsfq	%rdx, %rdx

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=489b012e1648bb7d185e48369aeb591bfa53f7da

commit 489b012e1648bb7d185e48369aeb591bfa53f7da
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Tue Jul 28 18:56:18 2015 -0700

    Don't disable SSE in x86-64 ld.so

diff --git a/sysdeps/i386/Makefile b/sysdeps/i386/Makefile
index 11f425d..2c08907 100644
--- a/sysdeps/i386/Makefile
+++ b/sysdeps/i386/Makefile
@@ -79,3 +79,14 @@ endif
 ifeq ($(subdir),csu)
 gen-as-const-headers += tlsdesc.sym
 endif
+
+ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+		   -mno-sse -mno-mmx)
+
+tests-special += $(objpfx)tst-ld-sse-use.out
+$(objpfx)tst-ld-sse-use.out: ../sysdeps/x86/tst-ld-sse-use.sh $(objpfx)ld.so
+	@echo "Checking ld.so for SSE register use.  This will take a few seconds..."
+	$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
+	$(evaluate-test)
+endif
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
deleted file mode 100644
index 19f5eca..0000000
--- a/sysdeps/x86/Makefile
+++ /dev/null
@@ -1,10 +0,0 @@
-ifeq ($(subdir),elf)
-CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
-		   -mno-sse -mno-mmx)
-
-tests-special += $(objpfx)tst-ld-sse-use.out
-$(objpfx)tst-ld-sse-use.out: ../sysdeps/x86/tst-ld-sse-use.sh $(objpfx)ld.so
-	@echo "Checking ld.so for SSE register use.  This will take a few seconds..."
-	$(BASH) $< $(objpfx) '$(NM)' '$(OBJDUMP)' '$(READELF)' > $@; \
-	$(evaluate-test)
-endif
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index ef70a50..de906f2 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -19,6 +19,9 @@ gen-as-const-headers += locale-defines.sym
 endif
 
 ifeq ($(subdir),elf)
+CFLAGS-.os += $(if $(filter $(@F),$(patsubst %,%.os,$(all-rtld-routines))),\
+		   -mno-mmx)
+
 sysdep-dl-routines += tlsdesc dl-tlsdesc
 
 tests += tst-quad1 tst-quad2

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=298bedf71fcc558c2db9a964acf152d0140952db

commit 298bedf71fcc558c2db9a964acf152d0140952db
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sat Jul 11 13:25:25 2015 -0700

    Save and restore vector registers in ld.so
    
    1. Initialize dl_x86_xstate in dl_platform_init.
    2. Delete the RTLD_*_FOREIGN_CALL macros.
    3. Change rtld_must_xmm_save and rtld_savespace_sse in the x86-64
       tcbhead_t to unused fields.
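
    The constants in the dl_platform_init change below decode XCR0,
    read with xgetbv and %ecx = 0: bit 0 is x87 state, bit 1 SSE (XMM),
    bit 2 AVX (YMM), bits 3-4 MPX (BNDREGS/BNDCSR) and bits 5-7 the
    three AVX-512 state components.  A condensed C sketch of the probe,
    using the bit_X86_XSTATE_* flags the new dl-procinfo.h introduces
    (the function name is ours; the committed code also records the
    x87, MPX and opmask bits):

	#include <cpuid.h>

	static unsigned int
	probe_x86_xstate (void)
	{
	  unsigned int eax, ebx, ecx, edx, xstate = 0;
	  __cpuid (1, eax, ebx, ecx, edx);
	  if ((ecx & (bit_AVX | bit_OSXSAVE)) != (bit_AVX | bit_OSXSAVE))
	    return 0;
	  /* xgetbv with %ecx = 0 reads XCR0.  */
	  asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
	  if ((eax & (1 << 1)) != 0)
	    {
	      xstate |= bit_X86_XSTATE_SSE;
	      if ((eax & (1 << 2)) != 0)
		{
		  xstate |= bit_X86_XSTATE_AVX;
		  /* Opmask, ZMM_Hi256 and Hi16_ZMM must all be on.  */
		  if ((eax & (7 << 5)) == (7 << 5))
		    xstate |= bit_X86_XSTATE_AVX512;
		}
	    }
	  return xstate;
	}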

diff --git a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
index 8ac351e..a3c0c19 100644
--- a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
+++ b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c
@@ -1,5 +1,5 @@
 #if IS_IN (ldconfig)
 # include <sysdeps/i386/dl-procinfo.c>
 #else
-# include <sysdeps/generic/dl-procinfo.c>
+# include <sysdeps/x86_64/dl-procinfo.c>
 #endif
diff --git a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.h b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.h
index 7829e1c..dba5cc9 100644
--- a/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.h
+++ b/sysdeps/unix/sysv/linux/x86_64/dl-procinfo.h
@@ -1,5 +1,5 @@
 #if IS_IN (ldconfig)
 # include <sysdeps/unix/sysv/linux/i386/dl-procinfo.h>
 #else
-# include <sysdeps/generic/dl-procinfo.h>
+# include <sysdeps/x86_64/dl-procinfo.h>
 #endif
diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index cae6db3..4316578 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -26,6 +26,8 @@
 #include <sysdep.h>
 #include <tls.h>
 #include <dl-tlsdesc.h>
+#include <dl-procinfo.h>
+#include <cpuid.h>
 
 /* Return nonzero iff ELF header is compatible with the running host.  */
 static inline int __attribute__ ((unused))
@@ -65,8 +67,12 @@ static inline int __attribute__ ((unused, always_inline))
 elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 {
   Elf64_Addr *got;
-  extern void _dl_runtime_resolve (ElfW(Word)) attribute_hidden;
-  extern void _dl_runtime_profile (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_resolve_avx512 (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
+  extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
 
   if (l->l_info[DT_JMPREL] && lazy)
     {
@@ -86,6 +92,8 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
       /* Identify this shared object.  */
       *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
 
+      unsigned int x86_xstate = GLRO(dl_x86_xstate);
+
       /* The got[2] entry contains the address of a function which gets
 	 called to get the address of a so far unresolved function and
 	 jump to it.  The profiling extension of the dynamic linker allows
@@ -94,7 +102,12 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	 end in this function.  */
       if (__glibc_unlikely (profile))
 	{
-	  *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile;
+	  if ((x86_xstate & bit_X86_XSTATE_AVX512) != 0)
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
+	  else if ((x86_xstate & bit_X86_XSTATE_AVX) != 0)
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_sse;
 
 	  if (GLRO(dl_profile) != NULL
 	      && _dl_name_match_p (GLRO(dl_profile), l))
@@ -103,9 +116,17 @@ elf_machine_runtime_setup (struct link_map *l, int lazy, int profile)
 	    GL(dl_profile_map) = l;
 	}
       else
-	/* This function will get called to fix up the GOT entry indicated by
-	   the offset on the stack, and then jump to the resolved address.  */
-	*(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve;
+	{
+	  /* This function will get called to fix up the GOT entry
+	     indicated by the offset on the stack, and then jump to
+	     the resolved address.  */
+	  if ((x86_xstate & bit_X86_XSTATE_AVX512) != 0)
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx512;
+	  else if ((x86_xstate & bit_X86_XSTATE_AVX) != 0)
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_avx;
+	  else
+	    *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_resolve_sse;
+	}
     }
 
   if (l->l_info[ADDRIDX (DT_TLSDESC_GOT)] && lazy)
@@ -205,6 +226,39 @@ dl_platform_init (void)
   if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
     /* Avoid an empty string which would disturb us.  */
     GLRO(dl_platform) = NULL;
+
+  unsigned int eax, ebx, ecx, edx;
+  __cpuid (1, eax, ebx, ecx, edx);
+  if ((ecx & (bit_AVX | bit_OSXSAVE)) == (bit_AVX | bit_OSXSAVE))
+    {
+      unsigned int x86_xstate;
+
+      __cpuid_count (7, 0, eax, ebx, ecx, edx);
+
+      /* Verify that ZMM, YMM and XMM states are enabled.  */
+      asm ("xgetbv" : "=a" (eax), "=d" (edx) : "c" (0));
+      if ((eax & (1 << 0)) != 0)
+	x86_xstate = bit_X86_XSTATE_X87;
+      else
+	x86_xstate = 0;
+      if ((eax & ((1 << 3) | (1 << 4))) == ((1 << 3) | (1 << 4)))
+	x86_xstate |= bit_X86_XSTATE_MPX;
+      if ((eax & (1 << 5)) != 0)
+	x86_xstate |= bit_X86_XSTATE_K;
+      if ((eax & (1 << 1)) != 0)
+	{
+	  x86_xstate |= bit_X86_XSTATE_SSE;
+	  if ((eax & (1 << 2)) != 0)
+	    {
+	      x86_xstate |= bit_X86_XSTATE_AVX;
+	      if ((eax & ((1 << 5) | (1 << 6) | (1 << 7)))
+		  == ((1 << 5) | (1 << 6) | (1 << 7)))
+		x86_xstate |= bit_X86_XSTATE_AVX512;
+	    }
+	}
+
+      GLRO(dl_x86_xstate) = x86_xstate;
+    }
 }
 
 static inline ElfW(Addr)
diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
new file mode 100644
index 0000000..417c835
--- /dev/null
+++ b/sysdeps/x86_64/dl-procinfo.c
@@ -0,0 +1,57 @@
+/* Data for x86-64 version of processor capability information.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* If anything should be added here check whether the size of each string
+   is still ok with the given array size.
+
+   All the #ifdefs in the definitions are quite irritating but
+   necessary if we want to avoid duplicating the information.  There
+   are three different modes:
+
+   - PROCINFO_DECL is defined.  This means we are only interested in
+     declarations.
+
+   - PROCINFO_DECL is not defined:
+
+     + if SHARED is defined the file is included in an array
+       initializer.  The .element = { ... } syntax is needed.
+
+     + if SHARED is not defined a normal array initialization is
+       needed.
+  */
+
+#ifndef PROCINFO_CLASS
+# define PROCINFO_CLASS
+#endif
+
+#if !defined PROCINFO_DECL && defined SHARED
+  ._dl_x86_xstate
+#else
+PROCINFO_CLASS unsigned int _dl_x86_xstate
+#endif
+#ifndef PROCINFO_DECL
+= 0
+#endif
+#if !defined SHARED || defined PROCINFO_DECL
+;
+#else
+,
+#endif
+
+#undef PROCINFO_DECL
+#undef PROCINFO_CLASS
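
    The procinfo fragment above is included several times by its users.
    Roughly, with PROCINFO_DECL defined it expands to a declaration,
    and in a shared build without PROCINFO_DECL to an entry in a struct
    initializer.  Sketched expansions, assuming the includer sets
    PROCINFO_CLASS to EXTERN (as ldsodefs.h does in glibc proper):

	EXTERN unsigned int _dl_x86_xstate;	/* PROCINFO_DECL defined */
	._dl_x86_xstate = 0,			/* SHARED initializer */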
diff --git a/sysdeps/x86_64/dl-procinfo.h b/sysdeps/x86_64/dl-procinfo.h
new file mode 100644
index 0000000..dd51e9a
--- /dev/null
+++ b/sysdeps/x86_64/dl-procinfo.h
@@ -0,0 +1,33 @@
+/* x86-64 version of processor capability information handling macros.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _DL_X86_64_PROCINFO_H
+#define _DL_X86_64_PROCINFO_H	1
+
+#ifndef __ASSEMBLER__
+# include <sysdeps/generic/dl-procinfo.h>
+#endif
+
+#define bit_X86_XSTATE_X87	(1 << 0)
+#define bit_X86_XSTATE_SSE	(1 << 1)
+#define bit_X86_XSTATE_AVX	(1 << 2)
+#define bit_X86_XSTATE_AVX512	(1 << 3)
+#define bit_X86_XSTATE_MPX	(1 << 4)
+#define bit_X86_XSTATE_K	(1 << 5)
+
+#endif /* dl-procinfo.h */
diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
index 678c57f..f5442b7 100644
--- a/sysdeps/x86_64/dl-trampoline.S
+++ b/sysdeps/x86_64/dl-trampoline.S
@@ -20,33 +20,26 @@
 #include <sysdep.h>
 #include <link-defines.h>
 
-#if (RTLD_SAVESPACE_SSE % 32) != 0
-# error RTLD_SAVESPACE_SSE must be aligned to 32 bytes
-#endif
-
 /* Area on stack to save and restore registers used for parameter
    passing when calling _dl_fixup.  */
 #ifdef __ILP32__
-/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX.  */
-# define REGISTER_SAVE_AREA	(8 * 7)
-# define REGISTER_SAVE_RAX	0
+/* Align vector register save area to 16 bytes.  */
+# define REGISTER_SAVE_VEC_OFF	0
 # define PRESERVE_BND_REGS_PREFIX
 #else
-/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as BND0,
-   BND1, BND2, BND3.  */
-# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4)
 /* Align bound register save area to 16 bytes.  */
 # define REGISTER_SAVE_BND0	0
 # define REGISTER_SAVE_BND1	(REGISTER_SAVE_BND0 + 16)
 # define REGISTER_SAVE_BND2	(REGISTER_SAVE_BND1 + 16)
 # define REGISTER_SAVE_BND3	(REGISTER_SAVE_BND2 + 16)
-# define REGISTER_SAVE_RAX	(REGISTER_SAVE_BND3 + 16)
+# define REGISTER_SAVE_VEC_OFF	(REGISTER_SAVE_BND3 + 16)
 # ifdef HAVE_MPX_SUPPORT
 #  define PRESERVE_BND_REGS_PREFIX bnd
 # else
 #  define PRESERVE_BND_REGS_PREFIX .byte 0xf2
 # endif
 #endif
+#define REGISTER_SAVE_RAX	(REGISTER_SAVE_VEC_OFF + VEC_SIZE * 8)
 #define REGISTER_SAVE_RCX	(REGISTER_SAVE_RAX + 8)
 #define REGISTER_SAVE_RDX	(REGISTER_SAVE_RCX + 8)
 #define REGISTER_SAVE_RSI	(REGISTER_SAVE_RDX + 8)
@@ -54,386 +47,41 @@
 #define REGISTER_SAVE_R8	(REGISTER_SAVE_RDI + 8)
 #define REGISTER_SAVE_R9	(REGISTER_SAVE_R8 + 8)
 
-	.text
-	.globl _dl_runtime_resolve
-	.type _dl_runtime_resolve, @function
-	.align 16
-	cfi_startproc
-_dl_runtime_resolve:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	subq $REGISTER_SAVE_AREA,%rsp
-	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
-	# Preserve registers otherwise clobbered.
-	movq %rax, REGISTER_SAVE_RAX(%rsp)
-	movq %rcx, REGISTER_SAVE_RCX(%rsp)
-	movq %rdx, REGISTER_SAVE_RDX(%rsp)
-	movq %rsi, REGISTER_SAVE_RSI(%rsp)
-	movq %rdi, REGISTER_SAVE_RDI(%rsp)
-	movq %r8, REGISTER_SAVE_R8(%rsp)
-	movq %r9, REGISTER_SAVE_R9(%rsp)
-#ifndef __ILP32__
-	# We also have to preserve bound registers.  These are nops if
-	# Intel MPX isn't available or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
-	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
-	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
-	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
-# else
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1b,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
-	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
-	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
-# endif
-#endif
-	# Copy args pushed by PLT in register.
-	# %rdi: link_map, %rsi: reloc_index
-	movq (REGISTER_SAVE_AREA + 8)(%rsp), %rsi
-	movq REGISTER_SAVE_AREA(%rsp), %rdi
-	call _dl_fixup		# Call resolver.
-	movq %rax, %r11		# Save return value
-#ifndef __ILP32__
-	# Restore bound registers.  These are nops if Intel MPX isn't
-	# avaiable or disabled.
-# ifdef HAVE_MPX_SUPPORT
-	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
-	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
-	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
-	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
-# else
-	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
-	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
-	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
-#  if REGISTER_SAVE_BND0 == 0
-	.byte 0x66,0x0f,0x1a,0x04,0x24
-#  else
-	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
-#  endif
-# endif
-#endif
-	# Get register content back.
-	movq REGISTER_SAVE_R9(%rsp), %r9
-	movq REGISTER_SAVE_R8(%rsp), %r8
-	movq REGISTER_SAVE_RDI(%rsp), %rdi
-	movq REGISTER_SAVE_RSI(%rsp), %rsi
-	movq REGISTER_SAVE_RDX(%rsp), %rdx
-	movq REGISTER_SAVE_RCX(%rsp), %rcx
-	movq REGISTER_SAVE_RAX(%rsp), %rax
-	# Adjust stack(PLT did 2 pushes)
-	addq $(REGISTER_SAVE_AREA + 16), %rsp
-	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
-	# Preserve bound registers.
-	PRESERVE_BND_REGS_PREFIX
-	jmp *%r11		# Jump to function address.
-	cfi_endproc
-	.size _dl_runtime_resolve, .-_dl_runtime_resolve
-
-
-#ifndef PROF
-	.globl _dl_runtime_profile
-	.type _dl_runtime_profile, @function
-	.align 16
-	cfi_startproc
-
-_dl_runtime_profile:
-	cfi_adjust_cfa_offset(16) # Incorporate PLT
-	/* The La_x86_64_regs data structure pointed to by the
-	   fourth paramater must be 16-byte aligned.  This must
-	   be explicitly enforced.  We have the set up a dynamically
-	   sized stack frame.  %rbx points to the top half which
-	   has a fixed size and preserves the original stack pointer.  */
-
-	subq $32, %rsp		# Allocate the local storage.
-	cfi_adjust_cfa_offset(32)
-	movq %rbx, (%rsp)
-	cfi_rel_offset(%rbx, 0)
-
-	/* On the stack:
-		56(%rbx)	parameter #1
-		48(%rbx)	return address
-
-		40(%rbx)	reloc index
-		32(%rbx)	link_map
-
-		24(%rbx)	La_x86_64_regs pointer
-		16(%rbx)	framesize
-		 8(%rbx)	rax
-		  (%rbx)	rbx
-	*/
-
-	movq %rax, 8(%rsp)
-	movq %rsp, %rbx
-	cfi_def_cfa_register(%rbx)
-
-	/* Actively align the La_x86_64_regs structure.  */
-	andq $0xfffffffffffffff0, %rsp
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
-	   to detect if any xmm0-xmm7 registers are changed by audit
-	   module.  */
-	subq $(LR_SIZE + XMM_SIZE*8), %rsp
-# else
-	subq $LR_SIZE, %rsp		# sizeof(La_x86_64_regs)
-# endif
-	movq %rsp, 24(%rbx)
-
-	/* Fill the La_x86_64_regs structure.  */
-	movq %rdx, LR_RDX_OFFSET(%rsp)
-	movq %r8,  LR_R8_OFFSET(%rsp)
-	movq %r9,  LR_R9_OFFSET(%rsp)
-	movq %rcx, LR_RCX_OFFSET(%rsp)
-	movq %rsi, LR_RSI_OFFSET(%rsp)
-	movq %rdi, LR_RDI_OFFSET(%rsp)
-	movq %rbp, LR_RBP_OFFSET(%rsp)
-
-	leaq 48(%rbx), %rax
-	movq %rax, LR_RSP_OFFSET(%rsp)
-
-	/* We always store the XMM registers even if AVX is available.
-	   This is to provide backward binary compatibility for existing
-	   audit modules.  */
-	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
-	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
-	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
-	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
-	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
-	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
-	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
-	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
-
-# ifndef __ILP32__
-#  ifdef HAVE_MPX_SUPPORT
-	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
-	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
-	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
-	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
-#  else
-	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
-	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
-	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
-	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
-#  endif
-# endif
-
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	.data
-L(have_avx):
-	.zero 4
-	.size L(have_avx), 4
-	.previous
-
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	10f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in processor?
-	movq	%rbx, %r11		# Save rbx
-	xorl	%ecx, %ecx
-	mov	$0x7, %eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	20f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	cmpl	$0xe6, %eax
-	jne	20f
-	movl	%eax, L(have_avx)(%rip)
-L(avx512):
-#   define RESTORE_AVX
-#   define VMOV    vmovdqu64
-#   define VEC(i)  zmm##i
-#   define MORE_CODE
-#   include "dl-trampoline.h"
-#   undef VMOV
-#   undef VEC
-#   undef RESTORE_AVX
-#  endif
-20:	andl	$0x6, %eax
-10:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined):
-	js	L(no_avx)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512)
-#  endif
-
-#  define RESTORE_AVX
-#  define VMOV    vmovdqu
-#  define VEC(i)  ymm##i
-#  define MORE_CODE
-#  include "dl-trampoline.h"
-
-	.align 16
-L(no_avx):
-# endif
-
+/* VMOV is used on 16-byte aligned memory and VMOVA is used on 32-byte
+   aligned memory.  */
+# define VMOV			vmovdqu64
+# define VMOVA			vmovdqu64
+# define VEC_SIZE		64
+# define VEC(i)			zmm##i
+# define _dl_runtime_resolve	_dl_runtime_resolve_avx512
+# define _dl_runtime_profile	_dl_runtime_profile_avx512
+# define RESTORE_AVX
+# include "dl-trampoline.h"
+# undef _dl_runtime_resolve
+# undef _dl_runtime_profile
+# undef VMOV
+# undef VEC_SIZE
+# undef VEC
+
+# define VMOV			vmovdqu
+# define VMOVA			vmovdqa
+# define VEC_SIZE		32
+# define VEC(i)			ymm##i
+# define _dl_runtime_resolve	_dl_runtime_resolve_avx
+# define _dl_runtime_profile	_dl_runtime_profile_avx
+# include "dl-trampoline.h"
+# undef _dl_runtime_resolve
+# undef _dl_runtime_profile
+# undef VMOV
+# undef VEC_SIZE
+# undef VEC
+
+/* movaps is 1-byte shorter.  */
+# define VMOV			movaps
+# define VMOVA			movaps
+# define VEC_SIZE		16
+# define VEC(i)			xmm##i
+# define _dl_runtime_resolve	_dl_runtime_resolve_sse
+# define _dl_runtime_profile	_dl_runtime_profile_sse
 # undef RESTORE_AVX
 # include "dl-trampoline.h"
-
-	cfi_endproc
-	.size _dl_runtime_profile, .-_dl_runtime_profile
-#endif
-
-
-#ifdef SHARED
-	.globl _dl_x86_64_save_sse
-	.type _dl_x86_64_save_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_save_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	jne	L(defined_5)
-	movq	%rbx, %r11		# Save rbx
-	movl	$1, %eax
-	cpuid
-	movq	%r11,%rbx		# Restore rbx
-	xorl	%eax, %eax
-	// AVX and XSAVE supported?
-	andl	$((1 << 28) | (1 << 27)), %ecx
-	cmpl	$((1 << 28) | (1 << 27)), %ecx
-	jne	1f
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	// AVX512 supported in a processor?
-	movq	%rbx, %r11              # Save rbx
-	xorl	%ecx,%ecx
-	mov	$0x7,%eax
-	cpuid
-	andl	$(1 << 16), %ebx
-#  endif
-	xorl	%ecx, %ecx
-	// Get XFEATURE_ENABLED_MASK
-	xgetbv
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	test	%ebx, %ebx
-	movq	%r11, %rbx		# Restore rbx
-	je	2f
-	// Verify that XCR0[7:5] = '111b' and
-	// XCR0[2:1] = '11b' which means
-	// that zmm state is enabled
-	andl	$0xe6, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0xe6, %eax
-	je	L(avx512_5)
-#  endif
-
-2:	andl	$0x6, %eax
-1:	subl	$0x5, %eax
-	movl	%eax, L(have_avx)(%rip)
-	cmpl	$0, %eax
-
-L(defined_5):
-	js	L(no_avx5)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_5)
-#  endif
-
-	vmovdqa %ymm0, %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE
-	vmovdqa %ymm1, %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE
-	vmovdqa %ymm2, %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE
-	vmovdqa %ymm3, %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE
-	vmovdqa %ymm4, %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE
-	vmovdqa %ymm5, %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE
-	vmovdqa %ymm6, %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE
-	vmovdqa %ymm7, %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_5):
-	vmovdqu64 %zmm0, %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE
-	vmovdqu64 %zmm1, %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE
-	vmovdqu64 %zmm2, %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE
-	vmovdqu64 %zmm3, %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE
-	vmovdqu64 %zmm4, %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE
-	vmovdqu64 %zmm5, %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE
-	vmovdqu64 %zmm6, %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE
-	vmovdqu64 %zmm7, %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE
-	ret
-#  endif
-L(no_avx5):
-# endif
-	movdqa	%xmm0, %fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE
-	movdqa	%xmm1, %fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE
-	movdqa	%xmm2, %fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE
-	movdqa	%xmm3, %fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE
-	movdqa	%xmm4, %fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE
-	movdqa	%xmm5, %fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE
-	movdqa	%xmm6, %fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE
-	movdqa	%xmm7, %fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE
-	ret
-	cfi_endproc
-	.size _dl_x86_64_save_sse, .-_dl_x86_64_save_sse
-
-
-	.globl _dl_x86_64_restore_sse
-	.type _dl_x86_64_restore_sse, @function
-	.align 16
-	cfi_startproc
-_dl_x86_64_restore_sse:
-# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0, L(have_avx)(%rip)
-	js	L(no_avx6)
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-	cmpl	$0xe6, L(have_avx)(%rip)
-	je	L(avx512_6)
-#  endif
-
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+0*YMM_SIZE, %ymm0
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+1*YMM_SIZE, %ymm1
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+2*YMM_SIZE, %ymm2
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+3*YMM_SIZE, %ymm3
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+4*YMM_SIZE, %ymm4
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+5*YMM_SIZE, %ymm5
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+6*YMM_SIZE, %ymm6
-	vmovdqa %fs:RTLD_SAVESPACE_SSE+7*YMM_SIZE, %ymm7
-	ret
-#  ifdef HAVE_AVX512_ASM_SUPPORT
-L(avx512_6):
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+0*ZMM_SIZE, %zmm0
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+1*ZMM_SIZE, %zmm1
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+2*ZMM_SIZE, %zmm2
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+3*ZMM_SIZE, %zmm3
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+4*ZMM_SIZE, %zmm4
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+5*ZMM_SIZE, %zmm5
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+6*ZMM_SIZE, %zmm6
-	vmovdqu64 %fs:RTLD_SAVESPACE_SSE+7*ZMM_SIZE, %zmm7
-	ret
-#  endif
-L(no_avx6):
-# endif
-	movdqa	%fs:RTLD_SAVESPACE_SSE+0*XMM_SIZE, %xmm0
-	movdqa	%fs:RTLD_SAVESPACE_SSE+1*XMM_SIZE, %xmm1
-	movdqa	%fs:RTLD_SAVESPACE_SSE+2*XMM_SIZE, %xmm2
-	movdqa	%fs:RTLD_SAVESPACE_SSE+3*XMM_SIZE, %xmm3
-	movdqa	%fs:RTLD_SAVESPACE_SSE+4*XMM_SIZE, %xmm4
-	movdqa	%fs:RTLD_SAVESPACE_SSE+5*XMM_SIZE, %xmm5
-	movdqa	%fs:RTLD_SAVESPACE_SSE+6*XMM_SIZE, %xmm6
-	movdqa	%fs:RTLD_SAVESPACE_SSE+7*XMM_SIZE, %xmm7
-	ret
-	cfi_endproc
-	.size _dl_x86_64_restore_sse, .-_dl_x86_64_restore_sse
-#endif
diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
index d542428..be0a9a0 100644
--- a/sysdeps/x86_64/dl-trampoline.h
+++ b/sysdeps/x86_64/dl-trampoline.h
@@ -1,5 +1,4 @@
-/* Partial PLT profile trampoline to save and restore x86-64 vector
-   registers.
+/* PLT trampolines.  x86-64 version.
    Copyright (C) 2009-2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -17,7 +16,203 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifdef RESTORE_AVX
+#ifdef __ILP32__
+/* X32 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as VEC0 to
+   VEC7.  */
+# define REGISTER_SAVE_AREA	(8 * 7 + VEC_SIZE * 8)
+#else
+/* X86-64 saves RCX, RDX, RSI, RDI, R8 and R9 plus RAX as well as
+   BND0, BND1, BND2, BND3 and VEC0 to VEC7. */
+# define REGISTER_SAVE_AREA	(8 * 7 + 16 * 4 + VEC_SIZE * 8)
+#endif
+
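+/* The PLT pushed 16 bytes on top of the 8-byte return address, so
+   %rsp % 16 == 8 on entry; subtracting an odd multiple of 8 realigns
+   the stack, and with it the vector save area, to 16 bytes.  */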
+#if (REGISTER_SAVE_AREA % 16) != 8
+# error REGISTER_SAVE_AREA must be an odd multiple of 8
+#endif
+
+	.text
+	.globl _dl_runtime_resolve
+	.hidden _dl_runtime_resolve
+	.type _dl_runtime_resolve, @function
+	.align 16
+	cfi_startproc
+_dl_runtime_resolve:
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	subq $REGISTER_SAVE_AREA, %rsp
+	cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
+	# Preserve registers otherwise clobbered.
+	movq %rax, REGISTER_SAVE_RAX(%rsp)
+	movq %rcx, REGISTER_SAVE_RCX(%rsp)
+	movq %rdx, REGISTER_SAVE_RDX(%rsp)
+	movq %rsi, REGISTER_SAVE_RSI(%rsp)
+	movq %rdi, REGISTER_SAVE_RDI(%rsp)
+	movq %r8, REGISTER_SAVE_R8(%rsp)
+	movq %r9, REGISTER_SAVE_R9(%rsp)
+	VMOV %VEC(0), (REGISTER_SAVE_VEC_OFF)(%rsp)
+	VMOV %VEC(1), (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp)
+	VMOV %VEC(2), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp)
+	VMOV %VEC(3), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp)
+	VMOV %VEC(4), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp)
+	VMOV %VEC(5), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp)
+	VMOV %VEC(6), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp)
+	VMOV %VEC(7), (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp)
+#ifndef __ILP32__
+	# We also have to preserve bound registers.  These are nops if
+	# Intel MPX isn't available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, REGISTER_SAVE_BND0(%rsp)
+	bndmov %bnd1, REGISTER_SAVE_BND1(%rsp)
+	bndmov %bnd2, REGISTER_SAVE_BND2(%rsp)
+	bndmov %bnd3, REGISTER_SAVE_BND3(%rsp)
+# else
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1b,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1b,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+	.byte 0x66,0x0f,0x1b,0x4c,0x24,REGISTER_SAVE_BND1
+	.byte 0x66,0x0f,0x1b,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1b,0x5c,0x24,REGISTER_SAVE_BND3
+# endif
+#endif
+	# Copy args pushed by PLT in register.
+	# %rdi: link_map, %rsi: reloc_index
+	movq (REGISTER_SAVE_AREA + 8)(%rsp), %rsi
+	movq REGISTER_SAVE_AREA(%rsp), %rdi
+	call _dl_fixup		# Call resolver.
+	movq %rax, %r11		# Save return value
+#ifndef __ILP32__
+	# Restore bound registers.  These are nops if Intel MPX isn't
+	# available or disabled.
+# ifdef HAVE_MPX_SUPPORT
+	bndmov REGISTER_SAVE_BND3(%rsp), %bnd3
+	bndmov REGISTER_SAVE_BND2(%rsp), %bnd2
+	bndmov REGISTER_SAVE_BND1(%rsp), %bnd1
+	bndmov REGISTER_SAVE_BND0(%rsp), %bnd0
+# else
+	.byte 0x66,0x0f,0x1a,0x5c,0x24,REGISTER_SAVE_BND3
+	.byte 0x66,0x0f,0x1a,0x54,0x24,REGISTER_SAVE_BND2
+	.byte 0x66,0x0f,0x1a,0x4c,0x24,REGISTER_SAVE_BND1
+#  if REGISTER_SAVE_BND0 == 0
+	.byte 0x66,0x0f,0x1a,0x04,0x24
+#  else
+	.byte 0x66,0x0f,0x1a,0x44,0x24,REGISTER_SAVE_BND0
+#  endif
+# endif
+#endif
+	# Get register content back.
+	movq REGISTER_SAVE_R9(%rsp), %r9
+	movq REGISTER_SAVE_R8(%rsp), %r8
+	movq REGISTER_SAVE_RDI(%rsp), %rdi
+	movq REGISTER_SAVE_RSI(%rsp), %rsi
+	movq REGISTER_SAVE_RDX(%rsp), %rdx
+	movq REGISTER_SAVE_RCX(%rsp), %rcx
+	movq REGISTER_SAVE_RAX(%rsp), %rax
+	VMOV (REGISTER_SAVE_VEC_OFF)(%rsp), %VEC(0)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE)(%rsp), %VEC(1)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 2)(%rsp), %VEC(2)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 3)(%rsp), %VEC(3)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 4)(%rsp), %VEC(4)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 5)(%rsp), %VEC(5)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 6)(%rsp), %VEC(6)
+	VMOV (REGISTER_SAVE_VEC_OFF + VEC_SIZE * 7)(%rsp), %VEC(7)
+	# Adjust stack (PLT did 2 pushes).
+	addq $(REGISTER_SAVE_AREA + 16), %rsp
+	cfi_adjust_cfa_offset(-(REGISTER_SAVE_AREA + 16))
+	# Preserve bound registers.
+	PRESERVE_BND_REGS_PREFIX
+	jmp *%r11		# Jump to function address.
+	cfi_endproc
+	.size _dl_runtime_resolve, .-_dl_runtime_resolve
+
+
+#ifndef PROF
+	.globl _dl_runtime_profile
+	.hidden _dl_runtime_profile
+	.type _dl_runtime_profile, @function
+	.align 16
+_dl_runtime_profile:
+	cfi_startproc
+	cfi_adjust_cfa_offset(16) # Incorporate PLT
+	/* The La_x86_64_regs data structure pointed to by the
+	   fourth paramater must be 16-byte aligned.  This must
+	   be explicitly enforced.  We have the set up a dynamically
+	   sized stack frame.  %rbx points to the top half which
+	   has a fixed size and preserves the original stack pointer.  */
+
+	subq $32, %rsp		# Allocate the local storage.
+	cfi_adjust_cfa_offset(32)
+	movq %rbx, (%rsp)
+	cfi_rel_offset(%rbx, 0)
+
+	/* On the stack:
+		56(%rbx)	parameter #1
+		48(%rbx)	return address
+
+		40(%rbx)	reloc index
+		32(%rbx)	link_map
+
+		24(%rbx)	La_x86_64_regs pointer
+		16(%rbx)	framesize
+		 8(%rbx)	rax
+		  (%rbx)	rbx
+	*/
+
+	movq %rax, 8(%rsp)
+	movq %rsp, %rbx
+	cfi_def_cfa_register(%rbx)
+
+	/* Actively align the La_x86_64_regs structure.  */
+	andq $0xfffffffffffffff0, %rsp
+# if defined HAVE_AVX_SUPPORT || defined HAVE_AVX512_ASM_SUPPORT
+	/* sizeof(La_x86_64_regs).  Need extra space for 8 SSE registers
+	   to detect if any xmm0-xmm7 registers are changed by audit
+	   module.  */
+	subq $(LR_SIZE + XMM_SIZE*8), %rsp
+# else
+	subq $LR_SIZE, %rsp		# sizeof(La_x86_64_regs)
+# endif
+	movq %rsp, 24(%rbx)
+
+	/* Fill the La_x86_64_regs structure.  */
+	movq %rdx, LR_RDX_OFFSET(%rsp)
+	movq %r8,  LR_R8_OFFSET(%rsp)
+	movq %r9,  LR_R9_OFFSET(%rsp)
+	movq %rcx, LR_RCX_OFFSET(%rsp)
+	movq %rsi, LR_RSI_OFFSET(%rsp)
+	movq %rdi, LR_RDI_OFFSET(%rsp)
+	movq %rbp, LR_RBP_OFFSET(%rsp)
+
+	leaq 48(%rbx), %rax
+	movq %rax, LR_RSP_OFFSET(%rsp)
+
+	/* We always store the XMM registers even if AVX is available.
+	   This is to provide backward binary compatibility for existing
+	   audit modules.  */
+	movaps %xmm0,		   (LR_XMM_OFFSET)(%rsp)
+	movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
+	movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
+	movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
+	movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
+	movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
+	movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
+	movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
+
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
+	bndmov %bnd0, 		   (LR_BND_OFFSET)(%rsp)  # Preserve bound
+	bndmov %bnd1, (LR_BND_OFFSET +   BND_SIZE)(%rsp)  # registers. Nops if
+	bndmov %bnd2, (LR_BND_OFFSET + BND_SIZE*2)(%rsp)  # MPX not available
+	bndmov %bnd3, (LR_BND_OFFSET + BND_SIZE*3)(%rsp)  # or disabled.
+#  else
+	.byte 0x66,0x0f,0x1b,0x84,0x24;.long (LR_BND_OFFSET)
+	.byte 0x66,0x0f,0x1b,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
+	.byte 0x66,0x0f,0x1b,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
+	.byte 0x66,0x0f,0x1b,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
+# endif
+
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
 	VMOV %VEC(0),		      (LR_VECTOR_OFFSET)(%rsp)
 	VMOV %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
@@ -38,7 +233,7 @@
 	vmovdqa %xmm5, (LR_SIZE + XMM_SIZE*5)(%rsp)
 	vmovdqa %xmm6, (LR_SIZE + XMM_SIZE*6)(%rsp)
 	vmovdqa %xmm7, (LR_SIZE + XMM_SIZE*7)(%rsp)
-#endif
+# endif
 
 	mov %RSP_LP, %RCX_LP	# La_x86_64_regs pointer to %rcx.
 	mov 48(%rbx), %RDX_LP	# Load return address if needed.
@@ -63,7 +258,7 @@
 	movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
 	movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if any xmm0-xmm7 registers are changed by audit
 	   module.  */
 	vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
@@ -139,21 +334,21 @@
 	vmovdqa	%xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov              (LR_BND_OFFSET)(%rsp), %bnd0  # Restore bound
 	bndmov (LR_BND_OFFSET +   BND_SIZE)(%rsp), %bnd1  # registers.
 	bndmov (LR_BND_OFFSET + BND_SIZE*2)(%rsp), %bnd2
 	bndmov (LR_BND_OFFSET + BND_SIZE*3)(%rsp), %bnd3
-# else
+#  else
 	.byte 0x66,0x0f,0x1a,0x84,0x24;.long (LR_BND_OFFSET)
 	.byte 0x66,0x0f,0x1a,0x8c,0x24;.long (LR_BND_OFFSET + BND_SIZE)
 	.byte 0x66,0x0f,0x1a,0x94,0x24;.long (LR_BND_OFFSET + BND_SIZE*2)
 	.byte 0x66,0x0f,0x1a,0x9c,0x24;.long (LR_BND_OFFSET + BND_SIZE*3)
+#  endif
 # endif
-#endif
 
 	mov  16(%rbx), %R10_LP	# Anything in framesize?
 	test %R10_LP, %R10_LP
@@ -212,14 +407,14 @@
 	   _dl_call_pltexit.  The La_x86_64_regs is being pointed by rsp now,
 	   so we just need to allocate the sizeof(La_x86_64_retval) space on
 	   the stack, since the alignment has already been taken care of. */
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* sizeof(La_x86_64_retval).  Need extra space for 2 SSE
 	   registers to detect if xmm0/xmm1 registers are changed
 	   by audit module.  */
 	subq $(LRV_SIZE + XMM_SIZE*2), %rsp
-#else
+# else
 	subq $LRV_SIZE, %rsp	# sizeof(La_x86_64_retval)
-#endif
+# endif
 	movq %rsp, %rcx		# La_x86_64_retval argument to %rcx.
 
 	/* Fill in the La_x86_64_retval structure.  */
@@ -229,7 +424,7 @@
 	movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
 	movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* This is to support AVX audit modules.  */
 	VMOV %VEC(0), LRV_VECTOR0_OFFSET(%rcx)
 	VMOV %VEC(1), LRV_VECTOR1_OFFSET(%rcx)
@@ -238,17 +433,17 @@
 	   by audit module.  */
 	vmovdqa %xmm0,		  (LRV_SIZE)(%rcx)
 	vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov %bnd0, LRV_BND0_OFFSET(%rcx)  # Preserve returned bounds.
 	bndmov %bnd1, LRV_BND1_OFFSET(%rcx)
-# else
+#  else
 	.byte  0x66,0x0f,0x1b,0x81;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1b,0x89;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fstpt LRV_ST0_OFFSET(%rcx)
 	fstpt LRV_ST1_OFFSET(%rcx)
@@ -265,7 +460,7 @@
 	movaps LRV_XMM0_OFFSET(%rsp), %xmm0
 	movaps LRV_XMM1_OFFSET(%rsp), %xmm1
 
-#ifdef RESTORE_AVX
+# ifdef RESTORE_AVX
 	/* Check if xmm0/xmm1 registers are changed by audit module.  */
 	vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
 	vpmovmskb %xmm2, %esi
@@ -280,17 +475,17 @@
 	VMOV LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
 
 1:
-#endif
+# endif
 
-#ifndef __ILP32__
-# ifdef HAVE_MPX_SUPPORT
+# ifndef __ILP32__
+#  ifdef HAVE_MPX_SUPPORT
 	bndmov LRV_BND0_OFFSET(%rsp), %bnd0  # Restore bound registers.
 	bndmov LRV_BND1_OFFSET(%rsp), %bnd1
-# else
+#  else
 	.byte  0x66,0x0f,0x1a,0x84,0x24;.long (LRV_BND0_OFFSET)
 	.byte  0x66,0x0f,0x1a,0x8c,0x24;.long (LRV_BND1_OFFSET)
+#  endif
 # endif
-#endif
 
 	fldt LRV_ST1_OFFSET(%rsp)
 	fldt LRV_ST0_OFFSET(%rsp)
@@ -306,9 +501,6 @@
 	PRESERVE_BND_REGS_PREFIX
 	retq
 
-#ifdef MORE_CODE
-	cfi_adjust_cfa_offset(48)
-	cfi_rel_offset(%rbx, 0)
-	cfi_def_cfa_register(%rbx)
-# undef MORE_CODE
+	cfi_endproc
+	.size _dl_runtime_profile, .-_dl_runtime_profile
 #endif
diff --git a/sysdeps/x86_64/nptl/tcb-offsets.sym b/sysdeps/x86_64/nptl/tcb-offsets.sym
index 729d1da..aeb7526 100644
--- a/sysdeps/x86_64/nptl/tcb-offsets.sym
+++ b/sysdeps/x86_64/nptl/tcb-offsets.sym
@@ -16,7 +16,6 @@ VGETCPU_CACHE_OFFSET	offsetof (tcbhead_t, vgetcpu_cache)
 #ifndef __ASSUME_PRIVATE_FUTEX
 PRIVATE_FUTEX		offsetof (tcbhead_t, private_futex)
 #endif
-RTLD_SAVESPACE_SSE	offsetof (tcbhead_t, rtld_savespace_sse)
 
 -- Not strictly offsets, but these values are also used in the TCB.
 TCB_CANCELSTATE_BITMASK	 CANCELSTATE_BITMASK
diff --git a/sysdeps/x86_64/nptl/tls.h b/sysdeps/x86_64/nptl/tls.h
index d7543c6..5515c54 100644
--- a/sysdeps/x86_64/nptl/tls.h
+++ b/sysdeps/x86_64/nptl/tls.h
@@ -29,9 +29,6 @@
 # include <libc-internal.h>
 # include <kernel-features.h>
 
-/* Replacement type for __m128 since this file is included by ld.so,
-   which is compiled with -mno-sse.  It must not change the alignment
-   of rtld_savespace_sse.  */
 typedef struct
 {
   int i[4];
@@ -67,14 +64,13 @@ typedef struct
 # else
   int __glibc_reserved1;
 # endif
-  int rtld_must_xmm_save;
+  int __glibc_unused1;
   /* Reservation of some values for the TM ABI.  */
   void *__private_tm[4];
   /* GCC split stack support.  */
   void *__private_ss;
   long int __glibc_reserved2;
-  /* Have space for the post-AVX register size.  */
-  __128bits rtld_savespace_sse[8][4] __attribute__ ((aligned (32)));
+  __128bits __glibc_unused2[8][4] __attribute__ ((aligned (32)));
 
   void *__padding[8];
 } tcbhead_t;
@@ -384,41 +380,6 @@ typedef struct
 # define THREAD_GSCOPE_WAIT() \
   GL(dl_wait_lookup_done) ()
 
-
-# ifdef SHARED
-/* Defined in dl-trampoline.S.  */
-extern void _dl_x86_64_save_sse (void);
-extern void _dl_x86_64_restore_sse (void);
-
-# define RTLD_CHECK_FOREIGN_CALL \
-  (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) != 0)
-
-/* NB: Don't use the xchg operation because that would imply a lock
-   prefix which is expensive and unnecessary.  The cache line is also
-   not contested at all.  */
-#  define RTLD_ENABLE_FOREIGN_CALL \
-  int old_rtld_must_xmm_save = THREAD_GETMEM (THREAD_SELF,		      \
-					      header.rtld_must_xmm_save);     \
-  THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 1)
-
-#  define RTLD_PREPARE_FOREIGN_CALL \
-  do if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save))	      \
-    {									      \
-      _dl_x86_64_save_sse ();						      \
-      THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save, 0);	      \
-    }									      \
-  while (0)
-
-#  define RTLD_FINALIZE_FOREIGN_CALL \
-  do {									      \
-    if (THREAD_GETMEM (THREAD_SELF, header.rtld_must_xmm_save) == 0)	      \
-      _dl_x86_64_restore_sse ();					      \
-    THREAD_SETMEM (THREAD_SELF, header.rtld_must_xmm_save,		      \
-		   old_rtld_must_xmm_save);				      \
-  } while (0)
-# endif
-
-
 #endif /* __ASSEMBLER__ */
 
 #endif	/* tls.h */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=17d5a73892bdf6aa960f8c6825577a12a28b6c9a

commit 17d5a73892bdf6aa960f8c6825577a12a28b6c9a
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:41:20 2015 -0700

    Align the stack before calling __errno_location
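
    Background: the x86-64 psABI requires %rsp % 16 == 0 at every call
    instruction, so a function starts with %rsp % 16 == 8 after the
    8-byte return address is pushed.  These math stubs reach the
    __errno_location call without a frame of their own, hence the
    temporary subq $8.  Worked through for one call site (assuming
    nothing else touches %rsp):

	caller's call insn:   %rsp % 16 == 0   (psABI invariant)
	stub entry:           %rsp % 16 == 8   (return address pushed)
	subq $8, %rsp:        %rsp % 16 == 0   (aligned for the call)
	call __errno_location
	addq $8, %rsp:        %rsp % 16 == 8   (entry state restored)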

diff --git a/sysdeps/x86_64/fpu/s_cosf.S b/sysdeps/x86_64/fpu/s_cosf.S
index b7868ce..bea10ef 100644
--- a/sysdeps/x86_64/fpu/s_cosf.S
+++ b/sysdeps/x86_64/fpu/s_cosf.S
@@ -310,8 +310,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_sincosf.S b/sysdeps/x86_64/fpu/s_sincosf.S
index 21db70a..a2f3133 100644
--- a/sysdeps/x86_64/fpu/s_sincosf.S
+++ b/sysdeps/x86_64/fpu/s_sincosf.S
@@ -354,8 +354,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4
diff --git a/sysdeps/x86_64/fpu/s_sinf.S b/sysdeps/x86_64/fpu/s_sinf.S
index dc92164..90afbe8 100644
--- a/sysdeps/x86_64/fpu/s_sinf.S
+++ b/sysdeps/x86_64/fpu/s_sinf.S
@@ -336,8 +336,14 @@ L(arg_inf_or_nan):
 	/* Here if |x| is Inf or NAN */
 	jne	L(skip_errno_setting)	/* in case of x is NaN */
 
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
 	/* Here if x is Inf. Set errno to EDOM.  */
 	call	JUMPTARGET(__errno_location)
+	addq	$8, %rsp
+	cfi_adjust_cfa_offset (-8)
+
 	movl	$EDOM, (%rax)
 
 	.p2align	4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=57be8082638e55130e2ad53ad690882128d253e1

commit 57be8082638e55130e2ad53ad690882128d253e1
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:40:25 2015 -0700

    Align the stack before calling __gettimeofday
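
    Same psABI alignment issue as the s_cosf/s_sincosf/s_sinf fix.
    Assuming only the callee-saved pushes implied by the epilogue's
    popq instructions, the offsets work out as:

	entry:             %rsp % 16 == 8   (return address pushed)
	pushq %r12:        %rsp % 16 == 0
	pushq %r13:        %rsp % 16 == 8
	subq $16, %rsp:    %rsp % 16 == 8   (old: misaligned at the call)
	subq $24, %rsp:    %rsp % 16 == 0   (new: aligned for __gettimeofday)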

diff --git a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
index 0935db5..23f3def 100644
--- a/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
+++ b/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S
@@ -394,8 +394,8 @@ __lll_timedwait_tid:
 	movq	%rdi, %r12
 	movq	%rsi, %r13
 
-	subq	$16, %rsp
-	cfi_adjust_cfa_offset(16)
+	subq	$24, %rsp
+	cfi_adjust_cfa_offset(24)
 
 	/* Get current time.  */
 2:	movq	%rsp, %rdi
@@ -441,8 +441,8 @@ __lll_timedwait_tid:
 	jne	1f
 4:	xorl	%eax, %eax
 
-8:	addq	$16, %rsp
-	cfi_adjust_cfa_offset(-16)
+8:	addq	$24, %rsp
+	cfi_adjust_cfa_offset(-24)
 	popq	%r13
 	cfi_adjust_cfa_offset(-8)
 	cfi_restore(%r13)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e04eb834f435e1d18ea165e1719aa257cd3c4b33

commit e04eb834f435e1d18ea165e1719aa257cd3c4b33
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Sun Jul 12 14:38:58 2015 -0700

    Align the stack before calling __setcontext

diff --git a/sysdeps/unix/sysv/linux/x86_64/__start_context.S b/sysdeps/unix/sysv/linux/x86_64/__start_context.S
index 52a5afa..57ee2b9 100644
--- a/sysdeps/unix/sysv/linux/x86_64/__start_context.S
+++ b/sysdeps/unix/sysv/linux/x86_64/__start_context.S
@@ -33,6 +33,11 @@ ENTRY(__start_context)
 
 	popq	%rdi			/* This is the next context.  */
 	cfi_adjust_cfa_offset(-8)
+
+	/* Align stack to 16 bytes.  */
+	subq	$8, %rsp
+	cfi_adjust_cfa_offset (8)
+
 	testq	%rdi, %rdi
 	je	2f			/* If it is zero exit.  */
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b94f0392eb08dda99cc955aa4d19d1803ce96910

commit b94f0392eb08dda99cc955aa4d19d1803ce96910
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Wed Jul 29 03:41:58 2015 -0700

    Compile {memcpy,strcmp}-sse2-unaligned.S only for libc
    
    {memcpy,strcmp}-sse2-unaligned.S aren't needed in ld.so.
    
    	* sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Compile
    	only for libc.
    	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Likewise.
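
    The guard is the usual multiarch pattern, sketched:

	/* Outside libc (e.g. in rtld) IS_IN (libc) is false and the
	   whole translation unit compiles to nothing.  */
	#if IS_IN (libc)
	/* ... SSE2 unaligned implementation ... */
	#endif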

diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index c5450af..5693ba7 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#if IS_IN (libc)
+
 #include <sysdep.h>
 
 #include "asm-syntax.h"
@@ -169,3 +171,5 @@ L(between_5_8):
 	movl	%eax, -4(%rdi,%rdx)
 	jmp	L(return)
 END(__memcpy_sse2_unaligned)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 20b65fa..c6606b4 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -16,6 +16,8 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#if IS_IN (libc)
+
 #include "sysdep.h"
 
 ENTRY ( __strcmp_sse2_unaligned)
@@ -207,3 +209,5 @@ L(different):
 	subl	%ecx, %eax
 	ret
 END (__strcmp_sse2_unaligned)
+
+#endif

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources

