This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch master updated. glibc-2.20-534-g8bedcb5


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  8bedcb5f03c62bf6001396dafdd82fbd4da7c2db (commit)
       via  f06a4faf8a2b4d046eb40e94b47948cc47d79902 (commit)
       via  9f2f36e5a91c2ce6edba5415e176155eb1008ae1 (commit)
       via  94c9680945369d63ef9ed266a29f28ebaaaeb5ce (commit)
       via  96d6fd6c4060d739abb1822e7ad633af749532b2 (commit)
      from  0f9e585480edcdf1e30dc3d79e24b84aeee516fa (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8bedcb5f03c62bf6001396dafdd82fbd4da7c2db

commit 8bedcb5f03c62bf6001396dafdd82fbd4da7c2db
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Wed Jan 7 07:18:30 2015 -0500

    powerpc: Optimized strcmp for POWER8/PPC64
    
    This patch adds an optimized POWER8 strcmp using unaligned accesses.
    The algorithm first checks the initial 16 bytes, then aligns the first
    source argument and uses unaligned loads on the second argument only.
    Additional checks for page boundaries are done for unaligned cases.

diff --git a/ChangeLog b/ChangeLog
index 20aded4..3fa5e3b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,16 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Add strcmp-power8 object.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strcmp_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/strcmp.c (strcmp): Add
+	__strcmp_power8 implementation.
+	* sysdeps/powerpc/powerpc64/power8/strcmp.S: New file.
+	* NEWS: Update.
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
 	Add strncpy-power8 and stpncpy-power8 objects.
 	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 	(__libc_ifunc_impl_list): Add __strncpy_power8 and stpncpy_power8
diff --git a/NEWS b/NEWS
index 08b3daa..e9f5034 100644
--- a/NEWS
+++ b/NEWS
@@ -19,8 +19,9 @@ Version 2.21
   17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
   17791, 17793, 17796, 17797, 17803, 17806, 17834
 
-* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for
+* Optimized strcpy, stpcpy, strncpy, stpncpy, and strcmp implementations for
   powerpc64/powerpc64le.
+  Implemented by Adhemerval Zanella (IBM).
 
 * Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
   and powerpc64le.  This may improve lock scaling of existing programs on
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 18d3378..ec4fca7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -18,7 +18,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
-		   strcmp-power7 strcmp-ppc64 \
+		   strcmp-power8 strcmp-power7 strcmp-ppc64 \
 		   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
 		   memmove-ppc64 bcopy-ppc64 strncpy-power8
 
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 132cb13..2c03060 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c.  */
   IFUNC_IMPL (i, name, strcmp,
 	      IFUNC_IMPL_ADD (array, i, strcmp,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strcmp_power8)
+	      IFUNC_IMPL_ADD (array, i, strcmp,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcmp_power7)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcmp.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
index 9b2922f..dc4bfac 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcmp. PowerPC64 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+/* Optimized strcmp implementation for POWER8/PPC64.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if defined SHARED && IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
 
-extern __typeof (strcmp) __strcmp_ppc attribute_hidden;
-extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strcmp_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strcmp_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strcmp_power8)
 
-libc_ifunc (strcmp,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcmp_power7
-            : __strcmp_ppc);
-#endif
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strcmp_power8)					\
+  END_2(__strcmp_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strcmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index 9b2922f..b45ba1f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -23,9 +23,12 @@
 
 extern __typeof (strcmp) __strcmp_ppc attribute_hidden;
 extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
+extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
 
 libc_ifunc (strcmp,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcmp_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+              ? __strcmp_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strcmp_power7
             : __strcmp_ppc);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
new file mode 100644
index 0000000..223d891
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -0,0 +1,257 @@
+/* Optimized strcmp implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+EALIGN (strcmp, 4, 0)
+	li	r0,0
+
+	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+	   the code:
+
+	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+	   with PAGE_SIZE being 4096 and ITER_SIZE being 32.  */
+
+	rldicl	r7,r3,0,52
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r7,4096-32
+	bgt	cr7,L(pagecross_check)
+	cmpldi	cr5,r9,4096-32
+	bgt	cr5,L(pagecross_check)
+
+	/* For short string up to 32 bytes, load both s1 and s2 using
+	   unaligned dwords and compare.  */
+	ld	r8,0(r3)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,8(r3)
+	ld	r10,8(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,16(r3)
+	ld	r10,16(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,24(r3)
+	ld	r10,24(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	addi	r7,r3,32
+	addi	r4,r4,32
+
+L(align_8b):
+	/* Now it has checked for first 32 bytes, align source1 to doubleword
+	   and adjust source2 address.  */
+	rldicl	r9,r7,0,61	/* source1 alignment to doubleword  */
+	subf	r4,r9,r4	/* Adjust source2 address based on source1
+				   alignment.  */
+	rldicr	r7,r7,0,60	/* Align source1 to doubleword.  */
+
+	/* At this point, source1 alignment is 0 and source2 alignment is
+	   between 0 and 7.  Check is source2 alignment is 0, meaning both
+	   sources have the same alignment.  */
+	andi.	r9,r4,0x7
+	bne	cr0,L(loop_diff_align)
+
+	/* If both source1 and source2 are doubleword aligned, there is no
+	   need for page boundary cross checks.  */
+
+	ld	r8,0(r7)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	.align 4
+L(loop_equal_align):
+	ld	r8,8(r7)
+	ld	r10,8(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,16(r7)
+	ld	r10,16(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ldu	r8,24(r7)
+	ldu	r10,24(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	b	L(loop_equal_align)
+
+	/* A zero byte was found in r8 (s1 dword), r9 contains the cmpb
+	   result and r10 the dword from s2.  To code isolate the byte
+	   up to end (including the '\0'), masking with 0xFF the remaining
+	   ones:
+
+           #if __LITTLE_ENDIAN__
+	     (__builtin_ffsl (x) - 1) = counting trailing zero bits
+	     r9 = (__builtin_ffsl (r9) - 1) + 8;
+	     r9 = -1UL << r9
+	   #else
+	     r9  = __builtin_clzl (r9) + 8;
+	     r9  = -1UL >> r9
+	   #endif
+	     r8  = r8  | r9
+	     r10 = r10 | r9  */
+
+#ifdef __LITTLE_ENDIAN__
+	nor 	r9,r9,r9
+L(different_nocmpb):
+	neg	r3,r9
+	and	r9,r9,r3
+	cntlzd	r9,r9
+	subfic	r9,r9,63
+#else
+	not	r9,r9
+L(different_nocmpb):
+	cntlzd	r9,r9
+	subfic	r9,r9,56
+#endif
+	srd	r3,r8,r9
+	srd	r10,r10,r9
+	rldicl	r10,r10,0,56
+	rldicl	r3,r3,0,56
+	subf	r3,r10,r3
+	extsw	r3,r3
+	blr
+
+	.align	4
+L(pagecross_check):
+	subfic	r9,r9,4096
+	subfic	r7,r7,4096
+	cmpld	cr7,r7,r9
+	bge	cr7,L(pagecross)
+	mr	r7,r9
+
+	/* If the unaligned 32-byte read would cross a 4K page boundary, it
+	   uses a simple byte-by-byte comparison until the page alignment for
+	   s1 is reached.  */
+L(pagecross):
+	add	r7,r3,r7
+	subf	r9,r3,r7
+	mtctr	r9
+
+	.align	4
+L(pagecross_loop):
+	/* Load one byte from each of s1 and s2, then check whether they
+	   differ (cr7) and whether the s1 byte is '\0' (cr5).  */
+	lbz	r9,0(r3)
+	lbz	r10,0(r4)
+	addi	r3,r3,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,r10
+	cmpdi	cr5,r9,0
+	bne	cr7,L(pagecross_ne)
+	beq	cr5,L(pagecross_nullfound)
+	bdnz	L(pagecross_loop)
+	b	L(align_8b)
+
+	.align	4
+	/* The unaligned read of source2 would cross a 4K page boundary,
+	   and the differing byte or null byte may be in the remaining page
+	   bytes.  Since it cannot use the unaligned load, the algorithm reads
+	   and compares 8 single bytes to keep source1 doubleword aligned.  */
+L(check_source2_byte):
+	li	r9,8
+	mtctr	r9
+
+	.align	4
+L(check_source2_byte_loop):
+	lbz	r9,0(r7)		/* Next byte of source1.  */
+	lbz	r10,0(r4)		/* Next byte of source2.  */
+	addi	r7,r7,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,r10		/* cr7: do the two bytes differ?  */
+	cmpdi	cr5,r9,0		/* cr5: is the source1 byte '\0'?  */
+	bne	cr7,L(pagecross_ne)
+	beq	cr5,L(pagecross_nullfound)
+	bdnz	L(check_source2_byte_loop)
+
+	/* If source2 is unaligned to doubleword, the code needs to check
+	   on each iteration if the unaligned doubleword access will cross
+	   a 4k page boundary.  */
+	.align	5
+L(loop_unaligned):
+	ld	r8,0(r7)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+	addi	r7,r7,8
+	addi	r4,r4,8
+
+L(loop_diff_align):
+	/* Check if [src2]+8 cross a 4k page boundary:
+
+	     srcin2 % PAGE_SIZE > (PAGE_SIZE - 8)
+
+	     with PAGE_SIZE being 4096.  */
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r9,4088
+	ble	cr7,L(loop_unaligned)
+	b	L(check_source2_byte)
+
+	.align	4
+L(pagecross_ne):
+	extsw	r3,r9
+	mr	r9,r10
+L(pagecross_retdiff):
+	subf	r9,r9,r3
+	extsw	r3,r9
+	blr
+
+	.align	4
+L(pagecross_nullfound):
+	li	r3,0
+	b	L(pagecross_retdiff)
+END (strcmp)
+libc_hidden_builtin_def (strcmp)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f06a4faf8a2b4d046eb40e94b47948cc47d79902

commit f06a4faf8a2b4d046eb40e94b47948cc47d79902
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Wed Dec 31 11:47:41 2014 -0500

    powerpc: Optimized st{r,p}ncpy for POWER8/PPC64
    
    This patch adds an optimized POWER8 st{r,p}ncpy using unaligned accesses.
    It shows 10%-80% improvement over the optimized POWER7 one that uses
    only aligned accesses, especially on unaligned inputs.
    
    The algorithm first reads and checks 16 bytes (if inputs do not cross a 4K
    page boundary).  Then it realigns the source to 16 bytes and issues a 16-byte
    read and compare loop to speed up null byte checks for large strings.  Also,
    different from the POWER7 optimization, the null padding is done inline in
    the implementation using possibly unaligned accesses, instead of relying on
    a memset call.  A special case is added for page cross reads.

diff --git a/ChangeLog b/ChangeLog
index 16199e3..20aded4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Add strncpy-power8 and stpncpy-power8 objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strncpy_power8 and stpncpy_power8
+	implementations.
+	* sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/stpncpy.c (__stpncpy): Add
+	__stpncpy_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/strncpy.c (strncpy): Add
+	__strncpy_power8 implementation.
+	* sysdeps/powerpc/powerpc64/power8/stpncpy.S: New file.
+	* sysdeps/powerpc/powerpc64/power8/strncpy.S: New file.
+	* NEWS: Update.
+
 	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
 	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
 	* sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
diff --git a/NEWS b/NEWS
index e020918..08b3daa 100644
--- a/NEWS
+++ b/NEWS
@@ -19,7 +19,8 @@ Version 2.21
   17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
   17791, 17793, 17796, 17797, 17803, 17806, 17834
 
-* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for
+  powerpc64/powerpc64le.
 
 * Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
   and powerpc64le.  This may improve lock scaling of existing programs on
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 74b2daa..18d3378 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -17,9 +17,10 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   stpcpy-power7 stpcpy-ppc64 \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
-		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
+		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
+		   strcmp-power7 strcmp-ppc64 \
 		   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
-		   memmove-ppc64 bcopy-ppc64
+		   memmove-ppc64 bcopy-ppc64 strncpy-power8
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index dbb21fd..132cb13 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -279,6 +279,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
 	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strncpy_power8)
+	      IFUNC_IMPL_ADD (array, i, strncpy,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strncpy_power7)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
@@ -287,6 +290,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __stpncpy_power8)
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __stpncpy_power7)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
similarity index 55%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
index 9e5a270..d5d835d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+/* Optimized stpncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,24 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
 
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+#define USE_AS_STPNCPY
 
-libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
-            : __stpncpy_ppc);
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__stpncpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__stpncpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__stpncpy_power8)
 
-weak_alias (__stpncpy, stpncpy)
-#endif
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__stpncpy_power8)					\
+  END_2(__stpncpy_power8)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 9e5a270..0f4072f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -23,10 +23,13 @@
 
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
 
 libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __stpncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __stpncpy_power7
             : __stpncpy_ppc);
 
 weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
index 9e5a270..ed906a4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+/* Optimized strncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,25 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
 
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strncpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strncpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strncpy_power8)
 
-libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
-            : __stpncpy_ppc);
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strncpy_power8)					\
+  END_2(__strncpy_power8)
 
-weak_alias (__stpncpy, stpncpy)
-#endif
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index ae4e97a..ffb0f23 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -24,12 +24,15 @@
 
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc (strncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strncpy_power7
             : __strncpy_ppc);
 
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
similarity index 59%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/power8/stpncpy.S
index 9e5a270..76a1466 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,5 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
-
-libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
-            : __stpncpy_ppc);
-
-weak_alias (__stpncpy, stpncpy)
-#endif
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000..5fda953
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -0,0 +1,424 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+	.machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+
+        /* Check if [src]+16 will cross a 4K page by checking if the bit
+           indicating the page size changes.  Basically:
+
+           uint64_t srcin = (uint64_t)src;
+           uint64_t ob = srcin & 4096UL;
+           uint64_t nb = (srcin+16UL) & 4096UL;
+           if (ob ^ nb)
+             goto pagecross;  */
+
+	addi	r10,r4,16
+	rlwinm	r9,r4,0,19,19
+
+	/* Since it is a leaf function, save some non-volatile registers on the
+	   protected/red zone.  */
+	std	r26,-48(r1)
+	std	r27,-40(r1)
+
+	rlwinm	r8,r10,0,19,19
+
+	std	r28,-32(r1)
+	std	r29,-24(r1)
+
+	cmpld	cr7,r9,r8
+
+	std	r30,-16(r1)
+	std	r31,-8(r1)
+
+	beq	cr7,L(unaligned_lt_16)
+	rldicl	r9,r4,0,61
+	subfic	r8,r9,8
+	cmpld	cr7,r5,r8
+	bgt 	cr7,L(pagecross)
+
+	/* At this point there are 1 to 15 bytes to check and write.  Since they
+	   could come either from the first unaligned 16-byte access or from the
+	   bulk copy, the code uses an unrolled byte read/write instead of trying
+	   to analyze the cmpb results.  */
+L(short_path):
+	mr	r9,r3
+L(short_path_1):
+	cmpdi	cr7,r5,0
+	beq	cr7,L(short_path_loop_end_1)
+L(short_path_2):
+	lbz	r10,0(r4)
+	cmpdi	cr7,r10,0
+	stb	r10,0(r9)
+	beq	cr7,L(zero_pad_start_1)
+	cmpdi	cr0,r5,1
+	addi	r8,r9,1
+	addi	r6,r5,-1
+	beq	cr0,L(short_path_loop_end_0)
+	lbz	r10,1(r4)
+	cmpdi	cr7,r10,0
+	stb	r10,1(r9)
+	beq	cr7,L(zero_pad_start_prepare_1)
+	addi	r10,r5,-3
+	b	L(short_path_loop_1)
+
+	.align	4
+L(short_path_loop):
+	lbz	r8,0(r4)
+	addi	r7,r10,-2
+	cmpdi	cr5,r8,0
+	stb	r8,0(r9)
+	beq	cr5,L(zero_pad_start_1)		/* Null byte copied: pad.  */
+	beq	cr7,L(short_path_loop_end_0)	/* cr7 is set before entry.  */
+	lbz	r8,1(r4)
+	cmpdi	cr7,r8,0
+	stb	r8,1(r9)
+	beq	cr7,L(zero_pad_start)		/* Null byte copied: pad.  */
+	mr	r10,r7
+L(short_path_loop_1):
+	addic.	r5,r5,-2
+	addi	r9,r9,2
+	cmpdi	cr7,r10,0
+	addi	r4,r4,2
+	addi	r6,r9,1
+	bne	cr0,L(short_path_loop)
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+	b	L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+	addi	r3,r9,1
+	b	L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+#endif
+L(short_path_loop_end):
+	/* Restore non-volatile registers.  */
+	ld	r26,-48(r1)
+	ld	r27,-40(r1)
+	ld	r28,-32(r1)
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+	/* This code pads the remainder of dest with null bytes.  The algorithm
+	   calculates the remaining size and issues an unrolled doubleword
+	   loop followed by word, halfword and byte stores for the tail.  */
+	.align	4
+L(zero_pad_start):
+	mr	r5,r10
+	mr	r9,r6
+L(zero_pad_start_1):
+	srdi.	r8,r5,3		/* r8 = r5 / 8 (whole doublewords).  */
+	mr	r10,r9
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+#endif
+	beq-	cr0,L(zero_pad_loop_b_start)
+	cmpldi	cr7,r8,1
+	li	r7,0		/* r7 = zero doubleword stored as padding.  */
+	std	r7,0(r9)
+	beq	cr7,L(zero_pad_loop_b_prepare)
+	addic.	r8,r8,-2
+	addi	r10,r9,16
+	std	r7,8(r9)
+	beq	cr0,L(zero_pad_loop_dw_2)
+	std	r7,16(r9)
+	li	r9,0
+	b	L(zero_pad_loop_dw_1)
+
+	.align	4
+L(zero_pad_loop_dw):
+	addi	r10,r10,16
+	std	r9,-8(r10)
+	beq	cr0,L(zero_pad_loop_dw_2)
+	std	r9,0(r10)
+L(zero_pad_loop_dw_1):
+	cmpldi	cr7,r8,1
+	std	r9,0(r10)
+	addic.	r8,r8,-2
+	bne	cr7,L(zero_pad_loop_dw)
+	addi	r10,r10,8
+L(zero_pad_loop_dw_2):
+	rldicl	r5,r5,0,61
+L(zero_pad_loop_b_start):
+	cmpdi	cr7,r5,0
+	addi	r5,r5,-1
+	addi	r9,r10,-1
+	add	r10,r10,r5
+	subf	r10,r9,r10	/* r10 = the r5 value compared above.  */
+	li	r8,0
+	beq-	cr7,L(short_path_loop_end)
+
+	/* Write the remaining 1-7 bytes, selected by the low bits of r10.  */
+	.align	4
+	addi	r9,r9,1
+	mtocrf	0x1,r10
+	bf	29,4f
+	stw	r8,0(r9)
+	addi	r9,r9,4
+
+	.align	4
+4:	bf	30,2f
+	sth	r8,0(r9)
+	addi	r9,r9,2
+
+	.align	4
+2:	bf	31,1f
+	stb	r8,0(r9)
+
+	/* Restore non-volatile registers.  */
+1:	ld	r26,-48(r1)
+	ld	r27,-40(r1)
+	ld	r28,-32(r1)
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+	/* The common case where [src]+16 will not cross a 4K page boundary.
+	   In this case the code fast-checks the first 16 bytes using doubleword
+	   read/compares and updates the destination if neither the total size
+	   is reached nor a null byte is found in the source.  */
+	.align	4
+L(unaligned_lt_16):
+	cmpldi	cr7,r5,7
+	ble	cr7,L(short_path)
+	ld	r7,0(r4)
+	li	r8,0
+	cmpb	r8,r7,r8
+	cmpdi	cr7,r8,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r6,r5,-8
+	std	r7,0(r3)
+	addi	r9,r3,8
+	cmpldi	cr7,r6,7
+	addi	r7,r4,8
+	ble	cr7,L(short_path_prepare_1_1)
+	ld	r4,8(r4)
+	cmpb	r8,r4,r8
+	cmpdi	cr7,r8,0
+	bne	cr7,L(short_path_prepare_2_1)
+	std	r4,8(r3)
+	addi	r29,r3,16
+	addi	r5,r5,-16
+	/* Neither a null byte was found nor the total length was reached;
+	   align to 16 bytes and issue a bulk copy/compare.  */
+	b	L(align_to_16b)
+
+	/* In the case of 4k page boundary cross, the algorithm first align
+	   the address to a doubleword, calculate a mask based on alignment
+	   to ignore the bytes and continue using doubleword.  */
+	.align	4
+L(pagecross):
+	rldicr	r11,r4,0,59	/* Align src down (clear its low 4 bits).  */
+	li	r6,-1		/* MASK = 0xffffffffffffffffUL.  */
+	sldi	r9,r9,3		/* Calculate padding.  */
+	ld	r7,0(r11)	/* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r9,r6,r9	/* MASK = MASK << padding.  */
+#else
+	srd	r9,r6,r9	/* MASK = MASK >> padding.  */
+#endif
+	orc	r9,r7,r9	/* Mask bits that are not part of the
+				   string.  */
+	li	r7,0
+	cmpb	r9,r9,r7	/* Check for null bytes in DWORD1.  */
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	subf	r8,r8,r5	/* Adjust total length.  */
+	cmpldi	cr7,r8,8	/* Check if length was reached.  */
+	ble	cr7,L(short_path_prepare_2)
+
+	/* For the next checks the address is aligned, so check three more
+	   doublewords to make sure 16 unaligned bytes can be read before
+	   starting the bulk copy with 16-byte aligned addresses.  */
+	ld	r7,8(r11)
+	cmpb	r9,r7,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r7,r8,-8
+	cmpldi	cr7,r7,8
+	ble	cr7,L(short_path_prepare_2)
+	ld	r7,16(r11)
+	cmpb	r9,r7,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r8,r8,-16
+	cmpldi	cr7,r8,8
+	ble	cr7,L(short_path_prepare_2)
+	ld	r8,24(r11)
+	cmpb	r9,r8,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+
+	/* No null byte found in the 32 bytes read and length not reached,
+	   read source again using unaligned loads and store them.  */
+	ld	r9,0(r4)
+	addi	r29,r3,16
+	addi	r5,r5,-16
+	std	r9,0(r3)
+	ld	r9,8(r4)
+	std	r9,8(r3)
+
+	/* Align source to 16 bytes and adjust destiny and size.  */
+L(align_to_16b):
+	rldicl	r9,r10,0,60
+	rldicr	r28,r10,0,59
+	add	r12,r5,r9
+	subf	r29,r9,r29
+
+	/* The bulk read/compare/copy loads two doublewords, compares them and
+	   merges the results in a single register for speed.  This is an
+	   attempt to speed up the null-checking process for bigger strings.  */
+
+	cmpldi	cr7,r12,15
+	ble	cr7,L(short_path_prepare_1_2)
+
+	/* Main loop for large sizes, unrolled 2 times to get better use of
+	   pipeline.  */
+	ld	r8,0(r28)
+	ld	r10,8(r28)
+	li	r9,0
+	cmpb	r7,r8,r9
+	cmpb	r9,r10,r9
+	or.	r6,r9,r7
+	bne	cr0,L(short_path_prepare_2_3)
+	addi	r5,r12,-16
+	addi	r4,r28,16
+	std	r8,0(r29)
+	std	r10,8(r29)
+	cmpldi	cr7,r5,15
+	addi	r9,r29,16
+	ble	cr7,L(short_path_1)
+	mr	r11,r28
+	mr	r6,r29
+	li	r30,0
+	subfic	r26,r4,48
+	subfic	r27,r9,48
+
+	b	L(loop_16b)
+
+	.align	4
+L(loop_start):
+	ld	r31,0(r11)
+	ld	r10,8(r11)
+	cmpb	r0,r31,r7
+	cmpb	r8,r10,r7
+	or.	r7,r0,r8
+	addi	r5,r5,-32
+	cmpldi	cr7,r5,15
+	add	r4,r4,r26
+	add	r9,r9,r27
+	bne	cr0,L(short_path_prepare_2_2)
+	add	r4,r28,r4
+	std	r31,0(r6)
+	add	r9,r29,r9
+	std	r10,8(r6)
+	ble	cr7,L(short_path_1)
+
+L(loop_16b):
+	ld	r10,16(r11)
+	ld	r0,24(r11)
+	cmpb	r8,r10,r30
+	cmpb	r7,r0,r30
+	or.	r7,r8,r7		/* cr0: any null byte in the 16 bytes?  */
+	addi	r12,r12,-32
+	cmpldi	cr7,r12,15		/* cr7: more than 15 bytes left?  */
+	addi	r11,r11,32
+	bne	cr0,L(short_path_2)
+	std	r10,16(r6)
+	addi	r6,r6,32
+	std	r0,-8(r6)
+	bgt	cr7,L(loop_start)
+
+	mr	r5,r12
+	mr	r4,r11
+	mr	r9,r6
+	b	L(short_path_1)
+
+	.align	4
+L(short_path_prepare_1_1):
+	mr	r5,r6
+	mr	r4,r7
+	b	L(short_path_1)
+L(short_path_prepare_1_2):
+	mr	r5,r12
+	mr	r4,r28
+	mr	r9,r29
+	b	L(short_path_1)
+L(short_path_prepare_2):
+	mr	r9,r3
+	b	L(short_path_2)
+L(short_path_prepare_2_1):
+	mr	r5,r6
+	mr	r4,r7
+	b	L(short_path_2)
+L(short_path_prepare_2_2):
+	mr	r5,r12
+	mr	r4,r11
+	mr	r9,r6
+	b	L(short_path_2)
+L(short_path_prepare_2_3):
+	mr	r5,r12
+	mr	r4,r28
+	mr	r9,r29
+	b	L(short_path_2)
+L(zero_pad_loop_b_prepare):
+	addi	r10,r9,8
+	rldicl	r5,r5,0,61
+	b	L(zero_pad_loop_b_start)
+L(zero_pad_start_prepare_1):
+	mr	r5,r6
+	mr	r9,r8
+	b	L(zero_pad_start_1)
+END (FUNC_NAME)
+
+#ifdef USE_AS_STPNCPY
+libc_hidden_def (__stpncpy)
+#else
+libc_hidden_builtin_def (strncpy)
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9f2f36e5a91c2ce6edba5415e176155eb1008ae1

commit 9f2f36e5a91c2ce6edba5415e176155eb1008ae1
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Tue Dec 23 13:39:23 2014 -0500

    powerpc: Optimized strncat for POWER7/PPC64
    
    With 3eb38795dbbbd816 (Simplify strncat) the generic algorithm uses
    strlen, strnlen, and memcpy.  This is faster than the current POWER7
    implementation, especially for unaligned strings (where the POWER7 code
    uses byte-by-byte operations).
    
    This patch removes the assembly implementation and uses a multiarch
    specialization based on default algorithm calling optimized POWER7
    symbols.

diff --git a/ChangeLog b/ChangeLog
index 744632a..16199e3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
+	* sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
+
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
 	strncat-power8 object.
 	* sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
deleted file mode 100644
index 6216284..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Optimized strncat implementation for POWER7.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#undef EALIGN
-#define EALIGN(name, alignt, words)				\
-  .section ".text";						\
-  ENTRY_2(__strncat_power7)					\
-  .align ALIGNARG(alignt);					\
-  EALIGN_W_##words;						\
-  BODY_LABEL(__strncat_power7):					\
-  cfi_startproc;						\
-  LOCALENTRY(__strncat_power7)
-
-#undef END
-#define END(name)						\
-  cfi_endproc;							\
-  TRACEBACK(__strncat_power7)					\
-  END_2(__strncat_power7)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#define STRLEN __strlen_power7
-
-#include <sysdeps/powerpc/powerpc64/power7/strncat.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
new file mode 100644
index 0000000..39b1aeb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
@@ -0,0 +1,31 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+
+#define STRNCAT __strncat_power7
+
+extern __typeof (strncat) __strncat_power7 attribute_hidden;
+extern __typeof (strlen) __strlen_power7 attribute_hidden;
+extern __typeof (strnlen) __strnlen_power7 attribute_hidden;
+extern __typeof (memcpy) __memcpy_power7 attribute_hidden;
+
+#define strlen    __strlen_power7
+#define __strnlen __strnlen_power7
+#define memcpy    __memcpy_power7
+
+#include <string/strncat.c>
diff --git a/sysdeps/powerpc/powerpc64/power7/strncat.S b/sysdeps/powerpc/powerpc64/power7/strncat.S
deleted file mode 100644
index 05502ac..0000000
--- a/sysdeps/powerpc/powerpc64/power7/strncat.S
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Optimized strncat implementation for PowerPC64/POWER7.
-
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* The algorithm is as follows for aligned memory access :
-
-   if address of s2 is divisible by 0x7UL,
-       perform aligned doubleword catenation
-   else
-       perform unaligned catenation
-
-   The aligned comparison are made using cmpb instructions.  */
-
-/* char* [r3] strncat (const char *s1 [r3],
-                       const char *s2 [r4],
-                       size_t size [r5])  */
-
-#include <sysdep.h>
-
-#ifndef STRNCAT
-# undef strncat
-# define STRNCAT  strncat
-#endif
-
-#ifndef STRLEN
-/* For builds with no IFUNC support, local calls should be made to internal
-   GLIBC symbol (created by libc_hidden_builtin_def).  */
-# ifdef SHARED
-#  define STRLEN   __GI_strlen
-# else
-#  define STRLEN   strlen
-# endif
-#endif
-
-#define	FRAMESIZE	(FRAME_MIN_SIZE+32)
-
-	.machine  power7
-EALIGN(STRNCAT, 4, 0)
-	CALL_MCOUNT 3
-
-	mflr r0				/* Load link register LR to r0.  */
-
-/* We shall use r29, r30 and r31 non volatile register for retention.
-   Save all the callee registers in the GPR save area.  */
-	std r29, -24(r1)		/* Save callers register r29.  */
-	std r30, -16(r1)		/* Save callers register r30.  */
-	std r31, -8(r1)			/* Save callers register r31.  */
-
-	std r0, 16(r1)			/* Store the link register.  */
-	stdu r1, -FRAMESIZE(r1)		/* Create the stack frame.  */
-
-/* Improve performance with CPU pre-fetch.  */
-	dcbt 0, r3			/* Pre-fetch str to avoid cache
-					   miss.  */
-	dcbt 0, r4			/* Pre-fetch accept to avoid cache
-					   miss.  */
-
-	mr. r29, r5			/* Save "n" in r29.  */
-	mr r30, r3			/* Save "s1" in r30 from r3.  */
-	beq cr0,L(done)
-
-	mr r31, r4			/* Save "s2" in r31 from r4.  */
-	bl STRLEN			/* Call optimized strlen on s1; goto
-					   end of s1.  */
-	nop
-	cmpldi cr7, r29, 7		/* If s2 is <=7 process
-					    byte-by-byte.  */
-	add r3, r30, r3			/* Grab the last character of s1.  */
-	bgt cr7,L(alignment)		/* Process by aligned strings.  */
-
-	cmpldi cr7, r29, 3		/* If n is >= 4, we can
-					   byte-unroll.  */
-	addi r9, r3, -1			/* Make "s1" point before next
-					   character, increment when read.  */
-	bgt cr7, L(bytes_unroll)	/* Process each byte.  */
-
-L(byte_by_byte):
-	lbz r10, 0(r31)
-	addi r8, r9, 1
-	cmpdi cr7, r10, 0		/* Check for NULL in "s2".  */
-	stb r10, 1(r9)
-	beq cr7, L(done)
-	add r9, r9, r29
-	subf r9, r8, r9
-	addi r9, r9, 1
-	mtctr r9
-	b L(branch2)
-	.p2align 4
-L(branch1):
-	lbzu r10, 1(r31)
-	cmpdi cr7, r10, 0
-	stbu r10, 1(r8)
-	beq cr7,L(done)
-L(branch2):
-	mr r9, r8
-	bdnz L(branch1)
-	beq cr7,L(done)
-L(nullTerminate):
-	li r10, 0			/* Load NULL for termination.  */
-	stb r10, 1(r9)			/* Append or terminate s1 with
-					   NULL.  */
-	.p2align 4			/* A small section here.  */
-L(done):				/* We return now.   */
-	addi r1, r1, FRAMESIZE		/* Restore stack pointer.  */
-	mr r3, r30			/* Set the return value length of
-					   string.  */
-	ld r0, 16(r1)			/* Read the saved link register.  */
-	ld r29, -24(r1)			/* Restore save register r29.  */
-	ld r30, -16(r1)			/* Restore save register r30.  */
-	ld r31, -8(r1)			/* Restore save register r31.  */
-	mtlr r0				/* Restore link register.  */
-	blr				/* Branch to link register.  */
-
-	.p2align 4
-L(alignment):
-	rldicl. r9, r31, 0, 61		/* Check if s2 is 8byte aligned  */
-	beq cr0,L(dwordAligned)
-
-	.p2align 4
-/* Unaligned bytes in string, so process byte by byte.
-   POWER7 has performance gains over loop unroll.  */
-L(bytes_unroll):
-	addi r9, r3, -1
-	srdi r10, r29, 2
-	mtctr r10
-	b L(L10)
-	.p2align 4
-L(L44):
-	lbz r10, 1(r31)			/* Load byte.  */
-	cmpdi cr7, r10, 0		/* Compare ; if byte not zero,
-					   continue.  */
-	stb r10, 2(r9)			/* Store byte  */
-	beq cr7, L(done)
-	addi r31, r31, 4
-
-	lbz r10, -2(r31)		/* Perform loop unroll here on byte
-					   load and store.  */
-	cmpdi cr7, r10, 0
-	stb r10, 3(r9)
-	beq cr7, L(done)
-
-	lbz r10, -1(r31)		/* Loop unroll here.  */
-	cmpdi cr7, r10, 0
-	stbu r10, 4(r9)
-	beq cr7, L(done)
-
-	bdz L(leftNbytes)
-
-L(L10):
-	lbz r10, 0(r31)			/* Loop unroll here.  */
-	cmpdi cr7, r10, 0
-	stb r10, 1(r9)
-	bne cr7,L(L44)
-	b L(done)
-	.p2align 4
-/* If s2 is double word aligned, we load and store double word.  */
-L(dwordAligned):
-/* read, write 8 bytes at a time  */
-	srdi r8, r29, 3			/* Compute count for CTR to loop;
-					   count = n/8.  */
-	li r7, 0			/* Load r7 with NULL.  */
-	li r10, 0			/* Load r10 with MASK '0'.  */
-
-	mtctr r8			/* Move count to CTR.  */
-L(loop8):
-	ld r9, 0(r31)			/* Read double word from s2.  */
-	cmpb r6, r9, r10		/* Compare bytes in s2 we read
-					   just now.  */
-	cmpdi r6, 0			/* If cmpb returned NULL,
-					   we continue.  */
-	bne+ L(a8)
-	std r9, 0(r3)			/* Append double word from s2
-					   with s1.  */
-	addi r3, r3, 8			/* Increment s1.  */
-	addi r31, r31, 8		/* Increment s2.  */
-	subi r29, r29, 8		/* Decrement count by 8.  */
-	bdnz L(loop8)			/* Continue until "count" is
-					   non zero.  */
-
-L(a8):
-	cmpdi r29, 0			/* If "n" is already zero, we skip. */
-	beq+ L(align8align)
-
-	mtctr r29			/* Process left over bytes in "n".  */
-L(unaligned0):
-	lbz r9, 0(r31)			/* Read a byte from s2.  */
-	cmpw r9, r7			/* If byte is NULL, we stop here . */
-	beq+ L(align8align)		/* Skip processing further if NULL.  */
-	stb  r9, 0(r3)			/* If not NULL, store byte into s1.  */
-	addi r3, r3, 1			/* Increment s1 by 1.  */
-	addi r31, r31, 1		/* Increment s2 by 1.  */
-	bdnz L(unaligned0)		/* Decrement counter "n" and loop
-					   until non zero.  */
-L(align8align):
-	stb r7, 0(r3)			/* Terminate s1 with NULL.  */
-
-	addi r1, r1, FRAMESIZE		/* Restore stack pointer.  */
-	mr r3, r30			/* Set the return value, length of
-					   string.  */
-	ld r0, 16(r1)			/* Read the saved link register.  */
-	ld r29, -24(r1)			/* Restore save register r29.  */
-	ld r30, -16(r1)			/* Restore save register r30.  */
-	ld r31, -8(r1)			/* Restore save register r31.  */
-	mtlr r0				/* Restore link register.  */
-	blr				/* Branch to link register  */
-
-	.p2align 4
-L(leftNbytes):
-	rldicl. r29, r29, 0, 62		/* Check if n>0 and n < 4 bytes.  */
-	bne cr0,L(byte_by_byte)		/* Process bytes one by one. */
-	b L(nullTerminate)		/* Now, finish catenation with
-					   NULL termination.  */
-END(STRNCAT)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=94c9680945369d63ef9ed266a29f28ebaaaeb5ce

commit 94c9680945369d63ef9ed266a29f28ebaaaeb5ce
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Tue Dec 23 13:36:34 2014 -0500

    powerpc: Optimized strcat for POWER8/PPC64
    
    With the new optimized strcpy for POWER8, this patch adds an optimized
    strcat which uses it along with the default implementation at strings/.

diff --git a/ChangeLog b/ChangeLog
index 7204573..744632a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,15 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+	strncat-power8 object.
+	* sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
+	__strcat_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strcat_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c: New file:
+	optimized strcat for power8.
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
 	strcpy-power8 and stpcpy-power8 objects.
 	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 	(__libc_ifunc_impl_list): Add __strcpy_power8 and __stpcpy_power8
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index f170551..74b2daa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -18,8 +18,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
-		   strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
-		   bcopy-ppc64
+		   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
+		   memmove-ppc64 bcopy-ppc64
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 88c8234..dbb21fd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -303,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
 	      IFUNC_IMPL_ADD (array, i, strcat,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strcat_power8)
+	      IFUNC_IMPL_ADD (array, i, strcat,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcat_power7)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcat.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
index af188d3..6c7544c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
@@ -1,5 +1,4 @@
-/* Multiple versions of strcat. PowerPC64 version.
-   Copyright (C) 2014-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -9,23 +8,23 @@
 
    The GNU C Library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
    Lesser General Public License for more details.
 
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
+   <http://www.gnu.org/licenses/ >.  */
 
-#if IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <string.h>
 
-extern __typeof (strcat) __strcat_ppc attribute_hidden;
-extern __typeof (strcat) __strcat_power7 attribute_hidden;
+#define STRCAT __strcat_power8
 
-libc_ifunc (strcat,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcat_power7
-            : __strcat_ppc);
-#endif
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+extern typeof (strcpy) __strcpy_power8;
+extern typeof (strlen) __strlen_power7;
+
+#define strcpy __strcpy_power8
+#define strlen __strlen_power7
+#include <sysdeps/powerpc/strcat.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
index af188d3..4708a9a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
@@ -23,9 +23,12 @@
 
 extern __typeof (strcat) __strcat_ppc attribute_hidden;
 extern __typeof (strcat) __strcat_power7 attribute_hidden;
+extern __typeof (strcat) __strcat_power8 attribute_hidden;
 
 libc_ifunc (strcat,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcat_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strcat_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strcat_power7
             : __strcat_ppc);
 #endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=96d6fd6c4060d739abb1822e7ad633af749532b2

commit 96d6fd6c4060d739abb1822e7ad633af749532b2
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Tue Dec 23 05:59:44 2014 -0600

    powerpc: Optimized st{r,p}cpy for POWER8/PPC64
    
    This patch adds an optimized POWER8 strcpy using unaligned accesses.
    For strings up to 16 bytes the implementation first calculates the
    string size, like strlen, and issues a memcpy.  For larger strings, the
    source is first aligned to 16 bytes and then tested in a loop that
    reads 16 bytes and combines the cmpb results for speedup.  A special
    case is added for page-crossing reads.
    
    It shows 30%-60% improvement over the optimized POWER7 one that uses
    only aligned accesses.

diff --git a/ChangeLog b/ChangeLog
index 09f1a80..7204573 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+	strcpy-power8 and stpcpy-power8 objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strcpy_power8 and __stpcpy_power8
+	implementations.
+	* sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S: New file:
+	multiarch stpcpy implementation for POWER8.
+	* sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S: New file;
+	multiarch strcpy implementation for POWER8.
+	* sysdeps/powerpc/powerpc64/multiarch/strcpy.c (strcpy): Add
+	__strcpy_power8 function.
+	* sysdeps/powerpc/powerpc64/power8/stpcpy.S: New file: optimized
+	stpcpy for POWER8.
+	* sysdeps/powerpc/powerpc64/power8/strcpy.S: New file: optimized
+	strcpy for POWER8.
+	* NEWS: Update.
+
 2015-01-13  Leonhard Holz  <leonhard.holz@web.de>
 
 	[BZ #16009]
diff --git a/NEWS b/NEWS
index fbf133e..e020918 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,8 @@ Version 2.21
   17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
   17791, 17793, 17796, 17797, 17803, 17806, 17834
 
+* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+
 * Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
   and powerpc64le.  This may improve lock scaling of existing programs on
   HTM capable systems.  The lock elision code is only enabled with
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 39e441b..f170551 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -13,7 +13,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \
 		   wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \
 		   wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
-		   strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
+		   strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
+		   stpcpy-power7 stpcpy-ppc64 \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 47e3398..88c8234 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -83,6 +83,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strcpy_power8)
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1,
@@ -90,6 +92,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpcpy.c.  */
   IFUNC_IMPL (i, name, stpcpy,
+	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __stpcpy_power8)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __stpcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
index cd47bf6..66e6f70 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
+/* Optimized stpcpy implementation for POWER8/PPC64.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if defined SHARED && IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
 
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__stpcpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__stpcpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__stpcpy_power8)
 
-libc_ifunc (strcpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcpy_power7
-            : __strcpy_ppc);
-#endif
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__stpcpy_power8)					\
+  END_2(__stpcpy_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
index cd47bf6..64cbc16 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
+/* Optimized strcpy implementation for POWER8/PPC64.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if defined SHARED && IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
 
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strcpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strcpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strcpy_power8)
 
-libc_ifunc (strcpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcpy_power7
-            : __strcpy_ppc);
-#endif
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strcpy_power8)					\
+  END_2(__strcpy_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
index cd47bf6..fd0afd4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
@@ -23,9 +23,12 @@
 
 extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
 extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
 
 libc_ifunc (strcpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strcpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strcpy_power7
             : __strcpy_ppc);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
similarity index 61%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/power8/stpcpy.S
index cd47bf6..bf72065 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
-   Copyright (C) 2013-2015 Free Software Foundation, Inc.
+/* Optimized stpcpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,9 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if defined SHARED && IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#define USE_AS_STPCPY
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
 
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
-
-libc_ifunc (strcpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcpy_power7
-            : __strcpy_ppc);
-#endif
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
new file mode 100644
index 0000000..d3e9a10
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -0,0 +1,262 @@
+/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPCPY
+# define FUNC_NAME __stpcpy
+#else
+# define FUNC_NAME strcpy
+#endif
+
+/* Implements the function
+
+   char * [r3] strcpy (char *dest [r3], const char *src [r4])
+
+   or
+
+   char * [r3] stpcpy (char *dest [r3], const char *src [r4])
+
+   if USE_AS_STPCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending on data alignment.  Although recent powerpc64 uses
+   64K pages by default, the page cross handling assumes a minimum page
+   size of 4K.  */
+
+	.machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+        li      r0,0          /* Doubleword with null chars to use
+                                 with cmpb.  */
+
+	/* Check if the [src]+15 will cross a 4K page by checking if the bit
+	   indicating the page size changes.  Basically:
+
+	   uint64_t srcin = (uint64_t)src;
+	   uint64_t ob = srcin & 4096UL;
+	   uint64_t nb = (srcin+15UL) & 4096UL;
+	   if (ob ^ nb)
+	     goto pagecross;  */
+
+	addi	r9,r4,15
+	xor	r9,r9,r4
+	rlwinm.	r9,r9,0,19,19
+	bne	L(pagecross)
+
+	/* For short strings (less than 16 bytes), just calculate the size as
+	   strlen does and issue a memcpy-like copy if a null is found.  */
+	mr	r7,r4
+        ld      r12,0(r7)     /* Load doubleword from memory.  */
+        cmpb    r10,r12,r0    /* Check for null bytes in DWORD1.  */
+        cmpdi   cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+        bne     cr7,L(done)
+
+        ldu     r8,8(r7)
+        cmpb    r10,r8,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+	b	L(loop_before)
+
+	.align	4
+L(pagecross):
+	clrrdi  r7,r4,3       /* Align the address to doubleword boundary.  */
+	rlwinm  r6,r4,3,26,28 /* Calculate padding.  */
+	li      r5,-1         /* MASK = 0xffffffffffffffff.  */
+        ld      r12,0(r7)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+        sld     r5,r5,r6
+#else
+        srd     r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
+        orc     r9,r12,r5     /* Mask bits that are not part of the string.  */
+        cmpb    r10,r9,r0     /* Check for null bytes in DWORD1.  */
+        cmpdi   cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+        bne     cr7,L(done)
+
+        ldu     r6,8(r7)
+        cmpb    r10,r6,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+        ld      r12,0(r7)
+        cmpb    r10,r12,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+        ldu     r6,8(r7)
+        cmpb    r10,r6,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+	/* We checked for 24 - x bytes, with x being the source misalignment
+	   within a doubleword (0 <= x <= 7), and no zero has been found.  It
+	   is now safe to reload the first 16 bytes from the unaligned source.  */
+	mr	r7,r4
+	ld	r12, 0(r7)
+	ldu	r8, 8(r7)
+
+L(loop_before):
+	/* Save the two doublewords read from source and align the source
+	   to 16 bytes for the loop.  */
+	mr	r11,r3
+	std	r12,0(r11)
+	std	r8,8(r11)
+	addi	r11,r11,16
+	rldicl	r9,r4,0,60
+	subf	r7,r9,r7
+	subf	r11,r9,r11
+	b	L(loop_start)
+
+        .align  5
+L(loop):
+        std     r12, 0(r11)
+        std     r6, 8(r11)
+	addi	r11,r11,16
+L(loop_start):
+        /* Load two doublewords, compare and merge in a
+           single register for speed.  This is an attempt
+           to speed up the null-checking process for bigger strings.  */
+
+        ld      r12, 8(r7)
+        ldu     r6, 16(r7)
+        cmpb    r10,r12,r0
+        cmpb    r9,r6,r0
+        or      r8,r9,r10     /* Merge everything in one doubleword.  */
+        cmpdi   cr7,r8,0
+        beq     cr7,L(loop)
+
+
+        /* OK, one (or both) of the doublewords contains a null byte.  Check
+           the first doubleword and decrement the address in case the first
+           doubleword really contains a null byte.  */
+
+	addi	r4,r7,-8
+        cmpdi   cr6,r10,0
+        addi    r7,r7,-8
+        bne     cr6,L(done2)
+
+        /* The null byte must be in the second doubleword.  Adjust the address
+           again and move the result of cmpb to r10 so we can calculate the
+           length.  */
+
+        mr      r10,r9
+        addi    r7,r7,8
+	b	L(done2)
+
+        /* r10 has the output of the cmpb instruction, that is, it contains
+           0xff in the same position as the null byte in the original
+           doubleword from the string.  Use that to calculate the length.  */
+L(done):
+	mr	r11,r3
+L(done2):
+#ifdef __LITTLE_ENDIAN__
+        addi    r9, r10, -1   /* Form a mask from trailing zeros.  */
+        andc    r9, r9, r10
+        popcntd r6, r9        /* Count the bits in the mask.  */
+#else
+        cntlzd  r6,r10        /* Count leading zeros before the match.  */
+#endif
+        subf    r5,r4,r7
+        srdi    r6,r6,3       /* Convert leading/trailing zeros to bytes.  */
+        add     r8,r5,r6      /* Compute final length.  */
+#ifdef USE_AS_STPCPY
+	/* stpcpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
+	addi	r8,r8,1       /* Final '\0'.  */
+
+	cmpldi	cr6,r8,8
+	mtocrf	0x01,r8
+	ble	cr6,L(copy_LE_8)
+
+	cmpldi	cr1,r8,16
+	blt	cr1,8f
+
+	/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	/* At least 16 bytes to go when reached by fall-through.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	ld	r6,0(r4)
+	ld	r8,8(r4)
+	addi	r4,r4,16
+	std	r6,0(r11)
+	std	r8,8(r11)
+	addi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	ld	r6,0(r4)
+	addi	r4,r4,8
+	std	r6,0(r11)
+	addi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	r6,0(r4)
+	stw	r6,0(r11)
+	bf	30,L(tail5)
+	lhz	r7,4(r4)
+	sth	r7,4(r11)
+	bflr	31
+	lbz	r8,6(r4)
+	stb	r8,6(r11)
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	r6,0(r4)
+	sth	r6,0(r11)
+	bflr	31
+	lbz	r7,2(r4)
+	stb	r7,2(r11)
+	blr
+
+	.align	4
+L(tail5):
+	bf	31,1f
+	lbz	r6,4(r4)
+	stb	r6,4(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	r6,0(r4)
+	stb	r6,0(r11)
+	blr
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+	ld	r6,0(r4)
+	std	r6,0(r11)
+	blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                          |   57 +++
 NEWS                                               |    4 +
 sysdeps/powerpc/powerpc64/multiarch/Makefile       |   10 +-
 .../powerpc/powerpc64/multiarch/ifunc-impl-list.c  |   16 +
 .../powerpc/powerpc64/multiarch/stpcpy-power8.S    |   40 ++
 .../powerpc/powerpc64/multiarch/stpncpy-power8.S   |   39 ++
 sysdeps/powerpc/powerpc64/multiarch/stpncpy.c      |    7 +-
 .../powerpc/powerpc64/multiarch/strcat-power8.c    |   30 ++
 sysdeps/powerpc/powerpc64/multiarch/strcat.c       |    7 +-
 .../powerpc/powerpc64/multiarch/strcmp-power8.S    |   40 ++
 sysdeps/powerpc/powerpc64/multiarch/strcmp.c       |    7 +-
 .../powerpc/powerpc64/multiarch/strcpy-power8.S    |   40 ++
 sysdeps/powerpc/powerpc64/multiarch/strcpy.c       |    7 +-
 .../powerpc/powerpc64/multiarch/strncat-power7.S   |   42 --
 .../powerpc/powerpc64/multiarch/strncat-power7.c   |   31 ++
 .../powerpc/powerpc64/multiarch/strncpy-power8.S   |   40 ++
 sysdeps/powerpc/powerpc64/multiarch/strncpy.c      |    7 +-
 sysdeps/powerpc/powerpc64/power7/strncat.S         |  228 -----------
 sysdeps/powerpc/powerpc64/power8/stpcpy.S          |   24 ++
 sysdeps/powerpc/powerpc64/power8/stpncpy.S         |   20 +
 sysdeps/powerpc/powerpc64/power8/strcmp.S          |  257 ++++++++++++
 sysdeps/powerpc/powerpc64/power8/strcpy.S          |  262 ++++++++++++
 sysdeps/powerpc/powerpc64/power8/strncpy.S         |  424 ++++++++++++++++++++
 23 files changed, 1355 insertions(+), 284 deletions(-)
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
 delete mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
 delete mode 100644 sysdeps/powerpc/powerpc64/power7/strncat.S
 create mode 100644 sysdeps/powerpc/powerpc64/power8/stpcpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/power8/stpncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strcmp.S
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strcpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/power8/strncpy.S


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]