This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch master updated. glibc-2.17-428-g37bb363


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master, has been updated
       via  37bb363f03d75e5e6f2ca45f2c686a3a0167797e (commit)
      from  f816705060415c476d8a9a0cbb683dc7a5aeef8e (commit)

Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=37bb363f03d75e5e6f2ca45f2c686a3a0167797e

commit 37bb363f03d75e5e6f2ca45f2c686a3a0167797e
Author: Ondrej Bilka <neleai@seznam.cz>
Date:   Mon Mar 18 07:39:12 2013 +0100

    Faster strlen on x64.

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4f7c070..86787ee 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -10,14 +10,12 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
 		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
-		   strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf \
+		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
-		   strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
-		   memcmp-ssse3
+		   strrchr-sse2-no-bsf strchr-sse2-no-bsf memcmp-ssse3
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cb4aba3..05315fd 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -176,11 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
 
-  /* Support sysdeps/x86_64/multiarch/strnlen.S.  */
-  IFUNC_IMPL (i, name, strnlen,
-	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2_no_bsf)
-	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
-
   /* Support sysdeps/x86_64/multiarch/strpbrk.S.  */
   IFUNC_IMPL (i, name, strpbrk,
 	      IFUNC_IMPL_ADD (array, i, strpbrk, HAS_SSE4_2,
@@ -251,14 +246,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __mempcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2))
 
-  /* Support sysdeps/x86_64/multiarch/strlen.S.  */
-  IFUNC_IMPL (i, name, strlen,
-	      IFUNC_IMPL_ADD (array, i, strlen, HAS_SSE4_2, __strlen_sse42)
-	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_pminub)
-	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2_no_bsf)
-	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)
-	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
-
   /* Support sysdeps/x86_64/multiarch/strncmp.S.  */
   IFUNC_IMPL (i, name, strncmp,
 	      IFUNC_IMPL_ADD (array, i, strncmp, HAS_SSE4_2,
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
index 72bb609..028c6d3 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S
@@ -34,10 +34,236 @@ ENTRY (STRCAT)
 	mov	%rdx, %r8
 # endif
 
-# define RETURN  jmp L(StartStrcpyPart)
-# include "strlen-sse2-pminub.S"
-# undef RETURN
+/* Inline corresponding strlen file, temporary until new strcpy
+   implementation gets merged.  */
 
+	xor	%rax, %rax
+	mov	%edi, %ecx
+	and	$0x3f, %ecx
+	pxor	%xmm0, %xmm0
+	cmp	$0x30, %ecx
+	ja	L(next)
+	movdqu	(%rdi), %xmm1
+	pcmpeqb	%xmm1, %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_less16)
+	mov	%rdi, %rax
+	and	$-16, %rax
+	jmp	L(align16_start)
+L(next):
+	mov	%rdi, %rax
+	and	$-16, %rax
+	pcmpeqb	(%rax), %xmm0
+	mov	$-1, %r10d
+	sub	%rax, %rcx
+	shl	%cl, %r10d
+	pmovmskb %xmm0, %edx
+	and	%r10d, %edx
+	jnz	L(exit)
+
+L(align16_start):
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	pcmpeqb	16(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$64, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit64)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	80(%rax), %xmm0
+	add	$80, %rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm1
+	add	$16, %rax
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm2
+	add	$16, %rax
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$0x3f, %rax
+	jz	L(align64_loop)
+
+	pcmpeqb	16(%rax), %xmm3
+	add	$16, %rax
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$16, %rax
+	.p2align 4
+	L(align64_loop):
+	movaps	(%rax),	%xmm4
+	pminub	16(%rax),	%xmm4
+	movaps	32(%rax),	%xmm5
+	pminub	48(%rax),	%xmm5
+	add	$64,	%rax
+	pminub	%xmm4,	%xmm5
+	pcmpeqb	%xmm0,	%xmm5
+	pmovmskb %xmm5,	%edx
+	test	%edx,	%edx
+	jz	L(align64_loop)
+
+	pcmpeqb	-64(%rax), %xmm0
+	sub	$80,	%rax
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	jnz	L(exit16)
+
+	pcmpeqb	32(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	jnz	L(exit32)
+
+	pcmpeqb	48(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	jnz	L(exit48)
+
+	pcmpeqb	64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax
+L(exit_less16):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit16):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$16, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit32):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$32, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit48):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$48, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit64):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$64, %rax
+
+	.p2align 4
 L(StartStrcpyPart):
 	lea	(%r9, %rax), %rdi
 	mov	%rsi, %rcx
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
index fea9d11..8101b91 100644
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S
@@ -33,11 +33,321 @@ ENTRY (STRCAT)
 	mov	%rdx, %r8
 # endif
 
-# define RETURN  jmp L(StartStrcpyPart)
-# include "strlen-sse2-no-bsf.S"
 
-# undef RETURN
+/* Inline corresponding strlen file, temporary until new strcpy
+   implementation gets merged.  */
+
+	xor	%eax, %eax
+	cmpb	$0, (%rdi)
+	jz	L(exit_tail0)
+	cmpb	$0, 1(%rdi)
+	jz	L(exit_tail1)
+	cmpb	$0, 2(%rdi)
+	jz	L(exit_tail2)
+	cmpb	$0, 3(%rdi)
+	jz	L(exit_tail3)
+
+	cmpb	$0, 4(%rdi)
+	jz	L(exit_tail4)
+	cmpb	$0, 5(%rdi)
+	jz	L(exit_tail5)
+	cmpb	$0, 6(%rdi)
+	jz	L(exit_tail6)
+	cmpb	$0, 7(%rdi)
+	jz	L(exit_tail7)
+
+	cmpb	$0, 8(%rdi)
+	jz	L(exit_tail8)
+	cmpb	$0, 9(%rdi)
+	jz	L(exit_tail9)
+	cmpb	$0, 10(%rdi)
+	jz	L(exit_tail10)
+	cmpb	$0, 11(%rdi)
+	jz	L(exit_tail11)
+
+	cmpb	$0, 12(%rdi)
+	jz	L(exit_tail12)
+	cmpb	$0, 13(%rdi)
+	jz	L(exit_tail13)
+	cmpb	$0, 14(%rdi)
+	jz	L(exit_tail14)
+	cmpb	$0, 15(%rdi)
+	jz	L(exit_tail15)
+	pxor	%xmm0, %xmm0
+	lea	16(%rdi), %rcx
+	lea	16(%rdi), %rax
+	and	$-16, %rax
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	pcmpeqb	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	test	%edx, %edx
+	lea	16(%rax), %rax
+	jnz	L(exit)
+
+	and	$-0x40, %rax
 
+	.p2align 4
+L(aligned_64):
+	pcmpeqb	(%rax), %xmm0
+	pcmpeqb	16(%rax), %xmm1
+	pcmpeqb	32(%rax), %xmm2
+	pcmpeqb	48(%rax), %xmm3
+	pmovmskb %xmm0, %edx
+	pmovmskb %xmm1, %r11d
+	pmovmskb %xmm2, %r10d
+	pmovmskb %xmm3, %r9d
+	or	%edx, %r9d
+	or	%r11d, %r9d
+	or	%r10d, %r9d
+	lea	64(%rax), %rax
+	jz	L(aligned_64)
+
+	test	%edx, %edx
+	jnz	L(aligned_64_exit_16)
+	test	%r11d, %r11d
+	jnz	L(aligned_64_exit_32)
+	test	%r10d, %r10d
+	jnz	L(aligned_64_exit_48)
+
+L(aligned_64_exit_64):
+	pmovmskb %xmm3, %edx
+	jmp	L(exit)
+
+L(aligned_64_exit_48):
+	lea	-16(%rax), %rax
+	mov	%r10d, %edx
+	jmp	L(exit)
+
+L(aligned_64_exit_32):
+	lea	-32(%rax), %rax
+	mov	%r11d, %edx
+	jmp	L(exit)
+
+L(aligned_64_exit_16):
+	lea	-48(%rax), %rax
+
+L(exit):
+	sub	%rcx, %rax
+	test	%dl, %dl
+	jz	L(exit_high)
+	test	$0x01, %dl
+	jnz	L(exit_tail0)
+
+	test	$0x02, %dl
+	jnz	L(exit_tail1)
+
+	test	$0x04, %dl
+	jnz	L(exit_tail2)
+
+	test	$0x08, %dl
+	jnz	L(exit_tail3)
+
+	test	$0x10, %dl
+	jnz	L(exit_tail4)
+
+	test	$0x20, %dl
+	jnz	L(exit_tail5)
+
+	test	$0x40, %dl
+	jnz	L(exit_tail6)
+	add	$7, %eax
+L(exit_tail0):
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_high):
+	add	$8, %eax
+	test	$0x01, %dh
+	jnz	L(exit_tail0)
+
+	test	$0x02, %dh
+	jnz	L(exit_tail1)
+
+	test	$0x04, %dh
+	jnz	L(exit_tail2)
+
+	test	$0x08, %dh
+	jnz	L(exit_tail3)
+
+	test	$0x10, %dh
+	jnz	L(exit_tail4)
+
+	test	$0x20, %dh
+	jnz	L(exit_tail5)
+
+	test	$0x40, %dh
+	jnz	L(exit_tail6)
+	add	$7, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail1):
+	add	$1, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail2):
+	add	$2, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail3):
+	add	$3, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail4):
+	add	$4, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail5):
+	add	$5, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail6):
+	add	$6, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail7):
+	add	$7, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail8):
+	add	$8, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail9):
+	add	$9, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail10):
+	add	$10, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail11):
+	add	$11, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail12):
+	add	$12, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail13):
+	add	$13, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail14):
+	add	$14, %eax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_tail15):
+	add	$15, %eax
+
+	.p2align 4
 L(StartStrcpyPart):
 	mov	%rsi, %rcx
 	lea	(%rdi, %rax), %rdx
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S
deleted file mode 100644
index ff2ab70..0000000
--- a/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S
+++ /dev/null
@@ -1,685 +0,0 @@
-/* strlen SSE2 without bsf
-   Copyright (C) 2010-2013 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* only for strlen case we don't use optimized version for STATIC build just for SHARED */
-
-#if (defined SHARED || defined USE_AS_STRCAT || defined USE_AS_STRNLEN) && !defined NOT_IN_libc
-
-# ifndef USE_AS_STRCAT
-
-#  include <sysdep.h>
-
-#  define RETURN	ret
-
-#  ifndef STRLEN
-#   define STRLEN	__strlen_sse2_no_bsf
-#  endif
-
-	atom_text_section
-ENTRY (STRLEN)
-# endif
-	xor	%eax, %eax
-#  ifdef USE_AS_STRNLEN
-	mov	%rsi, %r8
-	sub	$4, %rsi
-	jbe	L(len_less4_prolog)
-#  endif
-	cmpb	$0, (%rdi)
-	jz	L(exit_tail0)
-	cmpb	$0, 1(%rdi)
-	jz	L(exit_tail1)
-	cmpb	$0, 2(%rdi)
-	jz	L(exit_tail2)
-	cmpb	$0, 3(%rdi)
-	jz	L(exit_tail3)
-
-# ifdef USE_AS_STRNLEN
-	sub	$4, %rsi
-	jbe	L(len_less8_prolog)
-# endif
-
-	cmpb	$0, 4(%rdi)
-	jz	L(exit_tail4)
-	cmpb	$0, 5(%rdi)
-	jz	L(exit_tail5)
-	cmpb	$0, 6(%rdi)
-	jz	L(exit_tail6)
-	cmpb	$0, 7(%rdi)
-	jz	L(exit_tail7)
-
-# ifdef USE_AS_STRNLEN
-	sub	$4, %rsi
-	jbe	L(len_less12_prolog)
-# endif
-
-	cmpb	$0, 8(%rdi)
-	jz	L(exit_tail8)
-	cmpb	$0, 9(%rdi)
-	jz	L(exit_tail9)
-	cmpb	$0, 10(%rdi)
-	jz	L(exit_tail10)
-	cmpb	$0, 11(%rdi)
-	jz	L(exit_tail11)
-
-# ifdef USE_AS_STRNLEN
-	sub	$4, %rsi
-	jbe	L(len_less16_prolog)
-# endif
-
-	cmpb	$0, 12(%rdi)
-	jz	L(exit_tail12)
-	cmpb	$0, 13(%rdi)
-	jz	L(exit_tail13)
-	cmpb	$0, 14(%rdi)
-	jz	L(exit_tail14)
-	cmpb	$0, 15(%rdi)
-	jz	L(exit_tail15)
-	pxor	%xmm0, %xmm0
-	lea	16(%rdi), %rcx
-	lea	16(%rdi), %rax
-	and	$-16, %rax
-
-# ifdef USE_AS_STRNLEN
-	and	$15, %rdi
-	add	%rdi, %rsi
-	sub	$64, %rsi
-	jbe	L(len_less64)
-# endif
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	pxor	%xmm1, %xmm1
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	pxor	%xmm2, %xmm2
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	pxor	%xmm3, %xmm3
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-# ifdef USE_AS_STRNLEN
-	sub	$64, %rsi
-	jbe	L(len_less64)
-# endif
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-# ifdef USE_AS_STRNLEN
-	sub	$64, %rsi
-	jbe	L(len_less64)
-# endif
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-# ifdef USE_AS_STRNLEN
-	sub	$64, %rsi
-	jbe	L(len_less64)
-# endif
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-	pcmpeqb	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	lea	16(%rax), %rax
-	jnz	L(exit)
-
-# ifdef USE_AS_STRNLEN
-	mov	%rax, %rdx
-	and	$63, %rdx
-	add	%rdx, %rsi
-# endif
-
-	and	$-0x40, %rax
-
-	.p2align 4
-L(aligned_64):
-# ifdef USE_AS_STRNLEN
-	sub	$64, %rsi
-	jbe	L(len_less64)
-# endif
-	pcmpeqb	(%rax), %xmm0
-	pcmpeqb	16(%rax), %xmm1
-	pcmpeqb	32(%rax), %xmm2
-	pcmpeqb	48(%rax), %xmm3
-	pmovmskb %xmm0, %edx
-	pmovmskb %xmm1, %r11d
-	pmovmskb %xmm2, %r10d
-	pmovmskb %xmm3, %r9d
-	or	%edx, %r9d
-	or	%r11d, %r9d
-	or	%r10d, %r9d
-	lea	64(%rax), %rax
-	jz	L(aligned_64)
-
-	test	%edx, %edx
-	jnz	L(aligned_64_exit_16)
-	test	%r11d, %r11d
-	jnz	L(aligned_64_exit_32)
-	test	%r10d, %r10d
-	jnz	L(aligned_64_exit_48)
-L(aligned_64_exit_64):
-	pmovmskb %xmm3, %edx
-	jmp	L(aligned_64_exit)
-L(aligned_64_exit_48):
-	lea	-16(%rax), %rax
-	mov	%r10d, %edx
-	jmp	L(aligned_64_exit)
-L(aligned_64_exit_32):
-	lea	-32(%rax), %rax
-	mov	%r11d, %edx
-	jmp	L(aligned_64_exit)
-L(aligned_64_exit_16):
-	lea	-48(%rax), %rax
-L(aligned_64_exit):
-L(exit):
-	sub	%rcx, %rax
-	test	%dl, %dl
-	jz	L(exit_high)
-	test	$0x01, %dl
-	jnz	L(exit_tail0)
-
-	test	$0x02, %dl
-	jnz	L(exit_tail1)
-
-	test	$0x04, %dl
-	jnz	L(exit_tail2)
-
-	test	$0x08, %dl
-	jnz	L(exit_tail3)
-
-	test	$0x10, %dl
-	jnz	L(exit_tail4)
-
-	test	$0x20, %dl
-	jnz	L(exit_tail5)
-
-	test	$0x40, %dl
-	jnz	L(exit_tail6)
-	add	$7, %eax
-L(exit_tail0):
-	RETURN
-
-L(exit_high):
-	add	$8, %eax
-	test	$0x01, %dh
-	jnz	L(exit_tail0)
-
-	test	$0x02, %dh
-	jnz	L(exit_tail1)
-
-	test	$0x04, %dh
-	jnz	L(exit_tail2)
-
-	test	$0x08, %dh
-	jnz	L(exit_tail3)
-
-	test	$0x10, %dh
-	jnz	L(exit_tail4)
-
-	test	$0x20, %dh
-	jnz	L(exit_tail5)
-
-	test	$0x40, %dh
-	jnz	L(exit_tail6)
-	add	$7, %eax
-	RETURN
-
-# ifdef USE_AS_STRNLEN
-
-	.p2align 4
-L(len_less64):
-	pxor	%xmm0, %xmm0
-	add	$64, %rsi
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	pxor	%xmm1, %xmm1
-	lea	16(%rax), %rax
-	test	%edx, %edx
-	jnz	L(strnlen_exit)
-
-	sub	$16, %rsi
-	jbe	L(return_start_len)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	lea	16(%rax), %rax
-	test	%edx, %edx
-	jnz	L(strnlen_exit)
-
-	sub	$16, %rsi
-	jbe	L(return_start_len)
-
-	pcmpeqb	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	lea	16(%rax), %rax
-	test	%edx, %edx
-	jnz	L(strnlen_exit)
-
-	sub	$16, %rsi
-	jbe	L(return_start_len)
-
-	pcmpeqb	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	lea	16(%rax), %rax
-	test	%edx, %edx
-	jnz	L(strnlen_exit)
-
-	mov	%r8, %rax
-	ret
-
-	.p2align 4
-L(strnlen_exit):
-	sub	%rcx, %rax
-
-	test	%dl, %dl
-	jz	L(strnlen_exit_high)
-	mov	%dl, %cl
-	and	$15, %cl
-	jz	L(strnlen_exit_8)
-	test	$0x01, %dl
-	jnz	L(exit_tail0)
-	test	$0x02, %dl
-	jnz	L(strnlen_exit_tail1)
-	test	$0x04, %dl
-	jnz	L(strnlen_exit_tail2)
-	sub	$4, %rsi
-	jb	L(return_start_len)
-	lea	3(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_8):
-	test	$0x10, %dl
-	jnz	L(strnlen_exit_tail4)
-	test	$0x20, %dl
-	jnz	L(strnlen_exit_tail5)
-	test	$0x40, %dl
-	jnz	L(strnlen_exit_tail6)
-	sub	$8, %rsi
-	jb	L(return_start_len)
-	lea	7(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_high):
-	mov	%dh, %ch
-	and	$15, %ch
-	jz	L(strnlen_exit_high_8)
-	test	$0x01, %dh
-	jnz	L(strnlen_exit_tail8)
-	test	$0x02, %dh
-	jnz	L(strnlen_exit_tail9)
-	test	$0x04, %dh
-	jnz	L(strnlen_exit_tail10)
-	sub	$12, %rsi
-	jb	L(return_start_len)
-	lea	11(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_high_8):
-	test	$0x10, %dh
-	jnz	L(strnlen_exit_tail12)
-	test	$0x20, %dh
-	jnz	L(strnlen_exit_tail13)
-	test	$0x40, %dh
-	jnz	L(strnlen_exit_tail14)
-	sub	$16, %rsi
-	jb	L(return_start_len)
-	lea	15(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail1):
-	sub	$2, %rsi
-	jb	L(return_start_len)
-	lea	1(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail2):
-	sub	$3, %rsi
-	jb	L(return_start_len)
-	lea	2(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail4):
-	sub	$5, %rsi
-	jb	L(return_start_len)
-	lea	4(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail5):
-	sub	$6, %rsi
-	jb	L(return_start_len)
-	lea	5(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail6):
-	sub	$7, %rsi
-	jb	L(return_start_len)
-	lea	6(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail8):
-	sub	$9, %rsi
-	jb	L(return_start_len)
-	lea	8(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail9):
-	sub	$10, %rsi
-	jb	L(return_start_len)
-	lea	9(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail10):
-	sub	$11, %rsi
-	jb	L(return_start_len)
-	lea	10(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail12):
-	sub	$13, %rsi
-	jb	L(return_start_len)
-	lea	12(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail13):
-	sub	$14, %rsi
-	jb	L(return_start_len)
-	lea	13(%eax), %eax
-	ret
-
-	.p2align 4
-L(strnlen_exit_tail14):
-	sub	$15, %rsi
-	jb	L(return_start_len)
-	lea	14(%eax), %eax
-	ret
-
-	.p2align 4
-L(return_start_len):
-	mov	%r8, %rax
-	ret
-
-/* for prolog only */
-
-	.p2align 4
-L(len_less4_prolog):
-	add	$4, %rsi
-	jz	L(exit_tail0)
-
-	cmpb	$0, (%rdi)
-	jz	L(exit_tail0)
-	cmp	$1, %esi
-	je	L(exit_tail1)
-
-	cmpb	$0, 1(%rdi)
-	jz	L(exit_tail1)
-	cmp	$2, %esi
-	je	L(exit_tail2)
-
-	cmpb	$0, 2(%rdi)
-	jz	L(exit_tail2)
-	cmp	$3, %esi
-	je	L(exit_tail3)
-
-	cmpb	$0, 3(%rdi)
-	jz	L(exit_tail3)
-	mov	$4, %eax
-	ret
-
-	.p2align 4
-L(len_less8_prolog):
-	add	$4, %rsi
-
-	cmpb	$0, 4(%rdi)
-	jz	L(exit_tail4)
-	cmp	$1, %esi
-	je	L(exit_tail5)
-
-	cmpb	$0, 5(%rdi)
-	jz	L(exit_tail5)
-	cmp	$2, %esi
-	je	L(exit_tail6)
-
-	cmpb	$0, 6(%rdi)
-	jz	L(exit_tail6)
-	cmp	$3, %esi
-	je	L(exit_tail7)
-
-	cmpb	$0, 7(%rdi)
-	jz	L(exit_tail7)
-	mov	$8, %eax
-	ret
-
-	.p2align 4
-L(len_less12_prolog):
-	add	$4, %rsi
-
-	cmpb	$0, 8(%rdi)
-	jz	L(exit_tail8)
-	cmp	$1, %esi
-	je	L(exit_tail9)
-
-	cmpb	$0, 9(%rdi)
-	jz	L(exit_tail9)
-	cmp	$2, %esi
-	je	L(exit_tail10)
-
-	cmpb	$0, 10(%rdi)
-	jz	L(exit_tail10)
-	cmp	$3, %esi
-	je	L(exit_tail11)
-
-	cmpb	$0, 11(%rdi)
-	jz	L(exit_tail11)
-	mov	$12, %eax
-	ret
-
-	.p2align 4
-L(len_less16_prolog):
-	add	$4, %rsi
-
-	cmpb	$0, 12(%rdi)
-	jz	L(exit_tail12)
-	cmp	$1, %esi
-	je	L(exit_tail13)
-
-	cmpb	$0, 13(%rdi)
-	jz	L(exit_tail13)
-	cmp	$2, %esi
-	je	L(exit_tail14)
-
-	cmpb	$0, 14(%rdi)
-	jz	L(exit_tail14)
-	cmp	$3, %esi
-	je	L(exit_tail15)
-
-	cmpb	$0, 15(%rdi)
-	jz	L(exit_tail15)
-	mov	$16, %eax
-	ret
-# endif
-
-	.p2align 4
-L(exit_tail1):
-	add	$1, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail2):
-	add	$2, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail3):
-	add	$3, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail4):
-	add	$4, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail5):
-	add	$5, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail6):
-	add	$6, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail7):
-	add	$7, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail8):
-	add	$8, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail9):
-	add	$9, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail10):
-	add	$10, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail11):
-	add	$11, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail12):
-	add	$12, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail13):
-	add	$13, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail14):
-	add	$14, %eax
-	RETURN
-
-	.p2align 4
-L(exit_tail15):
-	add	$15, %eax
-# ifndef USE_AS_STRCAT
-	RETURN
-END (STRLEN)
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S b/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
deleted file mode 100644
index cc4bb57..0000000
--- a/sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
+++ /dev/null
@@ -1,259 +0,0 @@
-/* strlen SSE2
-   Copyright (C) 2011-2013 Free Software Foundation, Inc.
-   Contributed by Intel Corporation.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if !defined NOT_IN_libc && (defined SHARED || defined USE_AS_STRCAT)
-
-# ifndef USE_AS_STRCAT
-
-#  include <sysdep.h>
-
-#  define RETURN ret
-
-	.section .text.sse2,"ax",@progbits
-ENTRY (__strlen_sse2_pminub)
-
-# endif
-	xor	%rax, %rax
-	mov	%edi, %ecx
-	and	$0x3f, %ecx
-	pxor	%xmm0, %xmm0
-	cmp	$0x30, %ecx
-	ja	L(next)
-	movdqu	(%rdi), %xmm1
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_less16)
-	mov	%rdi, %rax
-	and	$-16, %rax
-	jmp	L(align16_start)
-L(next):
-	mov	%rdi, %rax
-	and	$-16, %rax
-	pcmpeqb	(%rax), %xmm0
-	mov	$-1, %r10d
-	sub	%rax, %rcx
-	shl	%cl, %r10d
-	pmovmskb %xmm0, %edx
-	and	%r10d, %edx
-	jnz	L(exit)
-L(align16_start):
-	pxor	%xmm0, %xmm0
-	pxor	%xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	pxor	%xmm3, %xmm3
-	pcmpeqb	16(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
-
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	jnz	L(exit64)
-
-	pcmpeqb	80(%rax), %xmm0
-	add	$64, %rax
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
-
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	jnz	L(exit64)
-
-	pcmpeqb	80(%rax), %xmm0
-	add	$64, %rax
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
-
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	jnz	L(exit64)
-
-	pcmpeqb	80(%rax), %xmm0
-	add	$64, %rax
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
-
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	jnz	L(exit64)
-
-
-	test	$0x3f, %rax
-	jz	L(align64_loop)
-
-	pcmpeqb	80(%rax), %xmm0
-	add	$80, %rax
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$0x3f, %rax
-	jz	L(align64_loop)
-
-	pcmpeqb	16(%rax), %xmm1
-	add	$16, %rax
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$0x3f, %rax
-	jz	L(align64_loop)
-
-	pcmpeqb	16(%rax), %xmm2
-	add	$16, %rax
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	test	$0x3f, %rax
-	jz	L(align64_loop)
-
-	pcmpeqb	16(%rax), %xmm3
-	add	$16, %rax
-	pmovmskb %xmm3, %edx
-	test	%edx, %edx
-	jnz	L(exit)
-
-	add	$16, %rax
-	.p2align 4
-	L(align64_loop):
-	movaps	(%rax),	%xmm4
-	pminub	16(%rax),	%xmm4
-	movaps	32(%rax),	%xmm5
-	pminub	48(%rax),	%xmm5
-	add	$64,	%rax
-	pminub	%xmm4,	%xmm5
-	pcmpeqb	%xmm0,	%xmm5
-	pmovmskb %xmm5,	%edx
-	test	%edx,	%edx
-	jz	L(align64_loop)
-
-
-	pcmpeqb	-64(%rax), %xmm0
-	sub	$80,	%rax
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
-
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
-
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
-
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$64, %rax
-	RETURN
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-L(exit_less16):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	RETURN
-	.p2align 4
-L(exit16):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$16, %rax
-	RETURN
-	.p2align 4
-L(exit32):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$32, %rax
-	RETURN
-	.p2align 4
-L(exit48):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$48, %rax
-	RETURN
-	.p2align 4
-L(exit64):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
-	add	$64, %rax
-# ifndef USE_AS_STRCAT
-	RETURN
-
-END (__strlen_sse2_pminub)
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S
deleted file mode 100644
index 8d685df..0000000
--- a/sysdeps/x86_64/multiarch/strlen-sse4.S
+++ /dev/null
@@ -1,84 +0,0 @@
-/* strlen with SSE4
-   Copyright (C) 2009-2013 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#if defined SHARED && !defined NOT_IN_libc
-
-#include <sysdep.h>
-
-	.section .text.sse4.2,"ax",@progbits
-ENTRY (__strlen_sse42)
-	pxor	%xmm1, %xmm1
-	movl	%edi, %ecx
-	movq	%rdi, %r8
-	andq	$~15, %rdi
-	xor	%edi, %ecx
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb %xmm1, %edx
-	shrl	%cl, %edx
-	shll	%cl, %edx
-	andl	%edx, %edx
-	jnz	L(less16bytes)
-	pxor	%xmm1, %xmm1
-
-	.p2align 4
-L(more64bytes_loop):
-	pcmpistri $0x08, 16(%rdi), %xmm1
-	jz	L(more32bytes)
-
-	pcmpistri $0x08, 32(%rdi), %xmm1
-	jz	L(more48bytes)
-
-	pcmpistri $0x08, 48(%rdi), %xmm1
-	jz	L(more64bytes)
-
-	add	$64, %rdi
-	pcmpistri $0x08, (%rdi), %xmm1
-	jnz	L(more64bytes_loop)
-	leaq	(%rdi,%rcx), %rax
-	subq	%r8, %rax
-	ret
-
-	.p2align 4
-L(more32bytes):
-	leaq	16(%rdi,%rcx, 1), %rax
-	subq	%r8, %rax
-	ret
-
-	.p2align 4
-L(more48bytes):
-	leaq	32(%rdi,%rcx, 1), %rax
-	subq	%r8, %rax
-	ret
-
-	.p2align 4
-L(more64bytes):
-	leaq	48(%rdi,%rcx, 1), %rax
-	subq	%r8, %rax
-	ret
-
-	.p2align 4
-L(less16bytes):
-	subq	%r8, %rdi
-	bsfl	%edx, %eax
-	addq	%rdi, %rax
-	ret
-
-END (__strlen_sse42)
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/strlen.S b/sysdeps/x86_64/multiarch/strlen.S
deleted file mode 100644
index ab29cef..0000000
--- a/sysdeps/x86_64/multiarch/strlen.S
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Multiple versions of strlen(str) -- determine the length of the string STR.
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2009-2013 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc and for
-   the DSO.  In static binaries we need strlen before the initialization
-   happened.  */
-#if defined SHARED && !defined NOT_IN_libc
-	.text
-ENTRY(strlen)
-	.type	strlen, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strlen_sse2_pminub(%rip), %rax
-	testl	$bit_Prefer_PMINUB_for_stringop, __cpu_features+FEATURE_OFFSET+index_Prefer_PMINUB_for_stringop(%rip)
-	jnz	2f
-	leaq	__strlen_sse2(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-	jz	2f
-	leaq	__strlen_sse42(%rip), %rax
-	ret
-2:	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
-	jz	3f
-	leaq    __strlen_sse2_no_bsf(%rip), %rax
-3:	ret
-END(strlen)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __strlen_sse2, @function; \
-	.align 16; \
-	.globl __strlen_sse2; \
-	.hidden __strlen_sse2; \
-	__strlen_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __strlen_sse2, .-__strlen_sse2
-# undef libc_hidden_builtin_def
-/* It doesn't make sense to send libc-internal strlen calls through a PLT.
-   The speedup we get from using SSE4.2 instruction is likely eaten away
-   by the indirect call in the PLT.  */
-# define libc_hidden_builtin_def(name) \
-	.globl __GI_strlen; __GI_strlen = __strlen_sse2
-#endif
-
-#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
deleted file mode 100644
index 248328d..0000000
--- a/sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNLEN
-#define STRLEN __strnlen_sse2_no_bsf
-#include "strlen-sse2-no-bsf.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen.S b/sysdeps/x86_64/multiarch/strnlen.S
deleted file mode 100644
index 124f845..0000000
--- a/sysdeps/x86_64/multiarch/strnlen.S
+++ /dev/null
@@ -1,57 +0,0 @@
-/* multiple version of strnlen
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2011-2013 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc.  */
-#ifndef NOT_IN_libc
-
-	.text
-ENTRY(__strnlen)
-	.type	__strnlen, @gnu_indirect_function
-	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-	jne	1f
-	call	__init_cpu_features
-1:	leaq	__strnlen_sse2(%rip), %rax
-	testl	$bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
-	jz	2f
-	leaq	__strnlen_sse2_no_bsf(%rip), %rax
-2:	ret
-END(__strnlen)
-
-# undef ENTRY
-# define ENTRY(name) \
-	.type __strnlen_sse2, @function; \
-	.align 16; \
-	.globl __strnlen_sse2; \
-	.hidden __strnlen_sse2; \
-	__strnlen_sse2: cfi_startproc; \
-	CALL_MCOUNT
-# undef END
-# define END(name) \
-	cfi_endproc; .size __strnlen_sse2, .-__strnlen_sse2
-
-# undef libc_hidden_def
-# define libc_hidden_def(name) \
-	.globl __GI_strnlen; __GI_strnlen = __strnlen_sse2
-#endif
-
-#include "../strnlen.S"
diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
index 287ffd2..8bea6fb 100644
--- a/sysdeps/x86_64/strcat.S
+++ b/sysdeps/x86_64/strcat.S
@@ -21,6 +21,7 @@
 #include <sysdep.h>
 #include "asm-syntax.h"
 
+/* Will be removed when new strcpy implementation gets merged.  */
 
 	.text
 ENTRY (strcat)
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 4bdca0a..eeb1092 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,6 +1,5 @@
-/* strlen(str) -- determine the length of the string STR.
-   Copyright (C) 2009-2013 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
+/* SSE2 version of strlen.
+   Copyright (C) 2012-2013 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -19,83 +18,222 @@
 
 #include <sysdep.h>
 
+/* Long lived registers in strlen(s), strnlen(s, n) are:
 
-	.text
+	%xmm11 - zero
+	%rdi   - s
+	%r10   - (s+n) & (~(64-1))
+	%r11   - s+n
+*/
+
+
+.text
 ENTRY(strlen)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
+#define FIND_ZERO	\
+	pcmpeqb	(%rax), %xmm8;	\
+	pcmpeqb	16(%rax), %xmm9;	\
+	pcmpeqb	32(%rax), %xmm10;	\
+	pcmpeqb	48(%rax), %xmm11;	\
+	pmovmskb	%xmm8, %esi;	\
+	pmovmskb	%xmm9, %edx;	\
+	pmovmskb	%xmm10, %r8d;	\
+	pmovmskb	%xmm11, %ecx;	\
+	salq	$16, %rdx;	\
+	salq	$16, %rcx;	\
+	orq	%rsi, %rdx;	\
+	orq	%r8, %rcx;	\
+	salq	$32, %rcx;	\
+	orq	%rcx, %rdx;
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0.  */
+	test	%rsi, %rsi
+	jne	L(n_nonzero)
 	xor	%rax, %rax
-	mov	%edi, %ecx
-	and	$0x3f, %ecx
-	pxor	%xmm0, %xmm0
-	cmp	$0x30, %ecx
-	ja	L(next)
-	movdqu	(%rdi), %xmm1
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit_less16)
-	mov	%rdi, %rax
-	and	$-16, %rax
-	jmp	L(align16_start)
-L(next):
-	mov	%rdi, %rax
-	and	$-16, %rax
-	pcmpeqb	(%rax), %xmm0
-	mov	$-1, %esi
-	sub	%rax, %rcx
-	shl	%cl, %esi
-	pmovmskb %xmm0, %edx
-	and	%esi, %edx
-	jnz	L(exit)
-L(align16_start):
-	pxor	%xmm0, %xmm0
-	pxor	%xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	pxor	%xmm3, %xmm3
-	.p2align 4
-L(align16_loop):
-	pcmpeqb	16(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	test	%edx, %edx
-	jnz	L(exit16)
+	ret
+L(n_nonzero):
 
-	pcmpeqb	32(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	test	%edx, %edx
-	jnz	L(exit32)
+/* Initialize long lived registers.  */
 
-	pcmpeqb	48(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	test	%edx, %edx
-	jnz	L(exit48)
+	add	%rdi, %rsi
+	mov	%rsi, %r10
+	and	$-64, %r10
+	mov	%rsi, %r11
+#endif
 
-	pcmpeqb	64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	lea	64(%rax), %rax
+	pxor	%xmm8, %xmm8
+	pxor	%xmm9, %xmm9
+	pxor	%xmm10, %xmm10
+	pxor	%xmm11, %xmm11
+	movq	%rdi, %rax
+	movq	%rdi, %rcx
+	andq	$4095, %rcx
+/* Offsets 4032-4047 get aligned down to 4032, so the reads below still fit into the page.  */
+	cmpq	$4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower.  */
+	ja	L(cross_page)
+
+#ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes.  */
+# define STRNLEN_PROLOG	\
+	mov	%r11, %rsi;	\
+	subq	%rax, %rsi;	\
+	andq	$-64, %rax;	\
+	testq	$-64, %rsi;	\
+	je	L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG  andq $-64, %rax;
+#endif
+
+/* Ignore bits in mask that come before start of string.  */
+#define PROLOG(lab)	\
+	movq	%rdi, %rcx;	\
+	xorq	%rax, %rcx;	\
+	STRNLEN_PROLOG;	\
+	sarq	%cl, %rdx;	\
+	test	%rdx, %rdx;	\
+	je	L(lab);	\
+	bsfq	%rdx, %rax;	\
+	ret
+
+#ifdef AS_STRNLEN
+	andq	$-16, %rax
+	FIND_ZERO
+#else
+	/* Test first 16 bytes unaligned.  */
+	movdqu	(%rax), %xmm12
+	pcmpeqb	%xmm8, %xmm12
+	pmovmskb	%xmm12, %edx
 	test	%edx, %edx
-	jz	L(align16_loop)
-L(exit):
-	sub	%rdi, %rax
-L(exit_less16):
-	bsf	%rdx, %rdx
-	add	%rdx, %rax
+	je 	L(next48_bytes)
+	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+	ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes.  */
+	andq	$-16, %rax
+	pcmpeqb 16(%rax), %xmm9
+	pcmpeqb 32(%rax), %xmm10
+	pcmpeqb 48(%rax), %xmm11
+	pmovmskb	%xmm9, %edx
+	pmovmskb	%xmm10, %r8d
+	pmovmskb	%xmm11, %ecx
+	salq	$16, %rdx
+	salq	$16, %rcx
+	orq	%r8, %rcx
+	salq	$32, %rcx
+	orq	%rcx, %rdx
+#endif
+
+	/* When no zero byte is found xmm9-11 are zero so we do not have to
+	   zero them.  */
+	PROLOG(loop)
+
+	.p2align 4
+L(cross_page):
+	andq	$-64, %rax
+	FIND_ZERO
+	PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1).  */
+L(strnlen_ret):
+	bts	%rsi, %rdx
+	sarq	%cl, %rdx
+	test	%rdx, %rdx
+	je	L(loop_init)
+	bsfq	%rdx, %rax
 	ret
+#endif
+	.p2align 4
+L(loop_init):
+	pxor	%xmm9, %xmm9
+	pxor	%xmm10, %xmm10
+	pxor	%xmm11, %xmm11
+#ifdef AS_STRNLEN
+	.p2align 4
+L(loop):
+
+	addq	$64, %rax
+	cmpq	%rax, %r10
+	je	L(exit_end)
+
+	movdqa	(%rax), %xmm8
+	pminub	16(%rax), %xmm8
+	pminub	32(%rax), %xmm8
+	pminub	48(%rax), %xmm8
+	pcmpeqb	%xmm11, %xmm8
+	pmovmskb	%xmm8, %edx
+	testl	%edx, %edx
+	jne	L(exit)
+	jmp	L(loop)
+
 	.p2align 4
-L(exit16):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	lea	16(%rdx,%rax), %rax
+L(exit_end):
+	cmp	%rax, %r11
+	je	L(first) /* Do not read when end is 64-byte aligned; it may be at a page boundary.  */
+	pxor	%xmm8, %xmm8
+	FIND_ZERO
+
+L(first):
+	bts	%r11, %rdx
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
 	ret
+
 	.p2align 4
-L(exit32):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	lea	32(%rdx,%rax), %rax
+L(exit):
+	pxor	%xmm8, %xmm8
+	FIND_ZERO
+
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
 	ret
+
+#else
+
+	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
+	.p2align 4
+L(loop):
+
+	movdqa	64(%rax), %xmm8
+	pminub	80(%rax), %xmm8
+	pminub	96(%rax), %xmm8
+	pminub	112(%rax), %xmm8
+	pcmpeqb	%xmm11, %xmm8
+	pmovmskb	%xmm8, %edx
+	testl	%edx, %edx
+	jne	L(exit64)
+
+	subq	$-128, %rax
+
+	movdqa	(%rax), %xmm8
+	pminub	16(%rax), %xmm8
+	pminub	32(%rax), %xmm8
+	pminub	48(%rax), %xmm8
+	pcmpeqb	%xmm11, %xmm8
+	pmovmskb	%xmm8, %edx
+	testl	%edx, %edx
+	jne	L(exit0)
+	jmp	L(loop)
+
 	.p2align 4
-L(exit48):
-	sub	%rdi, %rax
-	bsf	%rdx, %rdx
-	lea	48(%rdx,%rax), %rax
+L(exit64):
+	addq	$64, %rax
+L(exit0):
+	pxor	%xmm8, %xmm8
+	FIND_ZERO
+
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
 	ret
+
+#endif
+
 END(strlen)
 libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
index 6e53503..d3c43ac 100644
--- a/sysdeps/x86_64/strnlen.S
+++ b/sysdeps/x86_64/strnlen.S
@@ -1,63 +1,6 @@
-/* strnlen(str,maxlen) -- determine the length of the string STR up to MAXLEN.
-   Copyright (C) 2010-2013 Free Software Foundation, Inc.
-   Contributed by Ulrich Drepper <drepper@redhat.com>.
-   This file is part of the GNU C Library.
+#define AS_STRNLEN
+#define strlen __strnlen
+#include "strlen.S"
 
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-
-	.text
-ENTRY(__strnlen)
-	movq	%rsi, %rax
-	testq	%rsi, %rsi
-	jz	3f
-	pxor	%xmm2, %xmm2
-	movq	%rdi, %rcx
-	movq	%rdi, %r8
-	movq	$16, %r9
-	andq	$~15, %rdi
-	movdqa	%xmm2, %xmm1
-	pcmpeqb	(%rdi), %xmm2
-	orl	$0xffffffff, %r10d
-	subq	%rdi, %rcx
-	shll	%cl, %r10d
-	subq	%rcx, %r9
-	pmovmskb %xmm2, %edx
-	andl	%r10d, %edx
-	jnz	1f
-	subq	%r9, %rsi
-	jbe	3f
-
-2:	movdqa	16(%rdi), %xmm0
-	leaq	16(%rdi), %rdi
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb %xmm0, %edx
-	testl	%edx, %edx
-	jnz	1f
-	subq	$16, %rsi
-	jnbe	2b
-3:	ret
-
-1:	subq	%r8, %rdi
-	bsfl	%edx, %edx
-	addq	%rdi, %rdx
-	cmpq	%rdx, %rax
-	cmovnbq	%rdx, %rax
-	ret
-END(__strnlen)
-weak_alias (__strnlen, strnlen)
-libc_hidden_def (strnlen)
+weak_alias (__strnlen, strnlen);
+libc_hidden_builtin_def (strnlen)

-----------------------------------------------------------------------

Summary of changes:
 sysdeps/x86_64/multiarch/Makefile                |    6 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c       |   13 -
 sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S |  232 ++++++++-
 sysdeps/x86_64/multiarch/strcat-ssse3.S          |  316 ++++++++++-
 sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S    |  685 ----------------------
 sysdeps/x86_64/multiarch/strlen-sse2-pminub.S    |  259 --------
 sysdeps/x86_64/multiarch/strlen-sse4.S           |   84 ---
 sysdeps/x86_64/multiarch/strlen.S                |   68 ---
 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S   |    3 -
 sysdeps/x86_64/multiarch/strnlen.S               |   57 --
 sysdeps/x86_64/strcat.S                          |    1 +
 sysdeps/x86_64/strlen.S                          |  272 +++++++---
 sysdeps/x86_64/strnlen.S                         |   67 +--
 13 files changed, 755 insertions(+), 1308 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S
 delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse2-pminub.S
 delete mode 100644 sysdeps/x86_64/multiarch/strlen-sse4.S
 delete mode 100644 sysdeps/x86_64/multiarch/strlen.S
 delete mode 100644 sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S
 delete mode 100644 sysdeps/x86_64/multiarch/strnlen.S


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]