This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch ibm/2.20/master created. glibc-2.20-31-g6831ddb


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, ibm/2.20/master has been created
        at  6831ddb38379c1924bd19b3203d161a4c3ed1e2e (commit)

- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6831ddb38379c1924bd19b3203d161a4c3ed1e2e

commit 6831ddb38379c1924bd19b3203d161a4c3ed1e2e
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Sun Jan 11 19:33:17 2015 -0600

    powerpc: Fix POWER7/PPC64 performance regression on LE
    
    This patch fixes a performance regression on the POWER7/PPC64 memcmp
    porting for Little Endian.  The LE code uses 'ldbrx' instruction to read
    the memory on byte reversed form, however ISA 2.06 just provide the indexed
    form which uses a register value as additional index, instead of a fixed value
    enconded in the instruction.
    
    And the port strategy for LE uses r0 index value and update the address
    value on each compare loop interation.  For large compare size values,
    it adds 8 more instructions plus some more depending of trailing
    size.  This patch fixes it by adding pre-calculate indexes to remove the
    address update on loops and tailing sizes.
    
    For large sizes it shows a considerable gain, with double performance
    pairing with BE.

diff --git a/ChangeLog b/ChangeLog
index a8b90fe..dbaa566 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+	* sysdeps/powerpc/powerpc64/power7/memcmp.S (memcmp): Fix performance
+	regression on LE.
+
 	* sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S: New file.
 	* sysdeps/powerpc/powerpc64/power8/strncmp.S: New file.
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
index 09bff69..98b9e54 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -26,18 +26,48 @@
 EALIGN (memcmp, 4, 0)
 	CALL_MCOUNT 3
 
-#define rRTN	r3
-#define rSTR1	r3	/* first string arg */
-#define rSTR2	r4	/* second string arg */
-#define rN	r5	/* max string length */
-#define rWORD1	r6	/* current word in s1 */
-#define rWORD2	r7	/* current word in s2 */
-#define rWORD3	r8	/* next word in s1 */
-#define rWORD4	r9	/* next word in s2 */
-#define rWORD5	r10	/* next word in s1 */
-#define rWORD6	r11	/* next word in s2 */
-#define rWORD7	r30	/* next word in s1 */
-#define rWORD8	r31	/* next word in s2 */
+#define rRTN		r3
+#define rSTR1		r3	/* first string arg */
+#define rSTR2		r4	/* second string arg */
+#define rN		r5	/* max string length */
+#define rWORD1		r6	/* current word in s1 */
+#define rWORD2		r7	/* current word in s2 */
+#define rWORD3		r8	/* next word in s1 */
+#define rWORD4		r9	/* next word in s2 */
+#define rWORD5		r10	/* next word in s1 */
+#define rWORD6		r11	/* next word in s2 */
+
+#define rOFF8		r20	/* 8 bytes offset.  */
+#define rOFF16  	r21	/* 16 bytes offset.  */
+#define rOFF24		r22	/* 24 bytes offset.  */
+#define rOFF32		r23	/* 24 bytes offset.  */
+#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
+#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
+#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
+#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
+#define rSHR		r28	/* Unaligned shift right count.  */
+#define rSHL		r29	/* Unaligned shift left count.  */
+#define rWORD7		r30	/* next word in s1 */
+#define rWORD8		r31	/* next word in s2 */
+
+#define rWORD8SAVE	(-8)
+#define rWORD7SAVE	(-16)
+#define rOFF8SAVE	(-24)
+#define rOFF16SAVE	(-32)
+#define rOFF24SAVE	(-40)
+#define rOFF32SAVE	(-48)
+#define rSHRSAVE	(-56)
+#define rSHLSAVE	(-64)
+#define rWORD8SHIFTSAVE	(-72)
+#define rWORD2SHIFTSAVE	(-80)
+#define rWORD4SHIFTSAVE	(-88)
+#define rWORD6SHIFTSAVE	(-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD	ldbrx
+#else
+# define LD	ldx
+#endif
 
 	xor	r0, rSTR2, rSTR1
 	cmpldi	cr6, rN, 0
@@ -51,10 +81,24 @@ EALIGN (memcmp, 4, 0)
 /* If less than 8 bytes or not aligned, use the unaligned
    byte loop.  */
 	blt	cr1, L(bytealigned)
-	std	rWORD8, -8(r1)
-	cfi_offset(rWORD8, -8)
-	std	rWORD7, -16(r1)
-	cfi_offset(rWORD7, -16)
+	std	rWORD8, rWORD8SAVE(r1)
+	cfi_offset(rWORD8, rWORD8SAVE)
+	std	rWORD7, rWORD7SAVE(r1)
+	cfi_offset(rWORD7, rWORD7SAVE)
+	std	rOFF8, rOFF8SAVE(r1)
+	cfi_offset(rWORD7, rOFF8SAVE)
+	std	rOFF16, rOFF16SAVE(r1)
+	cfi_offset(rWORD7, rOFF16SAVE)
+	std	rOFF24, rOFF24SAVE(r1)
+	cfi_offset(rWORD7, rOFF24SAVE)
+	std	rOFF32, rOFF32SAVE(r1)
+	cfi_offset(rWORD7, rOFF32SAVE)
+
+	li	rOFF8,8
+	li	rOFF16,16
+	li	rOFF24,24
+	li	rOFF32,32
+
 	bne	L(unaligned)
 /* At this point we know both strings have the same alignment and the
    compare length is at least 8 bytes.  r12 contains the low order
@@ -79,15 +123,8 @@ L(samealignment):
 	sldi	rWORD6, r12, 3
 	srdi	r0, rN, 5	/* Divide by 32 */
 	andi.	r12, rN, 24	/* Get the DW remainder */
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 0(rSTR1)
-	ld	rWORD2, 0(rSTR2)
-#endif
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
 	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	clrldi	rN, rN, 61
@@ -104,15 +141,8 @@ L(dsP1):
 	cmpld	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 	b	L(dP1e)
 /* Remainder is 16 */
@@ -123,15 +153,8 @@ L(dPs2):
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
 /* Do something useful in this cycle since we have to branch anyway.  */
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD7, 8(rSTR1)
-	ld	rWORD8, 8(rSTR2)
-#endif
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	b	L(dP2e)
 /* Remainder is 24 */
@@ -173,72 +196,43 @@ L(dP1):
    change any on the early exit path.  The key here is the non-early
    exit path only cares about the condition code (cr5), not about which
    register pair was used.  */
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 0(rSTR1)
-	ld	rWORD6, 0(rSTR2)
-#endif
+	LD	rWORD5, 0, rSTR1
+	LD	rWORD6, 0, rSTR2
 	cmpld	cr5, rWORD5, rWORD6
 	blt	cr7, L(dP1x)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 L(dP1e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-#endif
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-#endif
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(dLcr5x)
 	bne	cr7, L(dLcr7x)
 
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ldu	rWORD7, 32(rSTR1)
-	ldu	rWORD8, 32(rSTR2)
-#endif
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
 	bne	cr1, L(dLcr1)
 	cmpld	cr5, rWORD7, rWORD8
 	bdnz	L(dLoop)
 	bne	cr6, L(dLcr6)
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	.align	3
 L(dP1x):
 	sldi.	r12, rN, 3
 	bne	cr5, L(dLcr5x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 0
 	blr
 
@@ -246,79 +240,41 @@ L(dP1x):
 	.align	4
 L(dP2):
 	mtctr	r0
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 0(rSTR1)
-	ld	rWORD6, 0(rSTR2)
-#endif
+	LD	rWORD5, 0, rSTR1
+	LD	rWORD6, 0, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP2x)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD7, 8(rSTR1)
-	ld	rWORD8, 8(rSTR2)
-#endif
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 L(dP2e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 16(rSTR1)
-	ld	rWORD2, 16(rSTR2)
-#endif
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 24(rSTR1)
-	ld	rWORD4, 24(rSTR2)
-#endif
+	LD	rWORD3, rOFF24, rSTR1
+	LD	rWORD4, rOFF24, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-#endif
 	bne	cr6, L(dLcr6)
 	bne	cr5, L(dLcr5)
 	b	L(dLoop2)
-/* Again we are on a early exit path (16-23 byte compare), we want to
-   only use volatile registers and avoid restoring non-volatile
-   registers.  */
 	.align	4
 L(dP2x):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 8(rSTR1)
-	ld	rWORD4, 8(rSTR2)
-#endif
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	sldi.	r12, rN, 3
 	bne	cr6, L(dLcr6x)
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-#endif
 	bne	cr1, L(dLcr1x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 0
 	blr
 
@@ -326,52 +282,22 @@ L(dP2x):
 	.align	4
 L(dP3):
 	mtctr	r0
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 0(rSTR1)
-	ld	rWORD4, 0(rSTR2)
-#endif
+	LD	rWORD3, 0, rSTR1
+	LD	rWORD4, 0, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 L(dP3e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 8(rSTR1)
-	ld	rWORD6, 8(rSTR2)
-#endif
+	LD	rWORD5, rOFF8, rSTR1
+	LD	rWORD6, rOFF8, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	blt	cr7, L(dP3x)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD7, 16(rSTR1)
-	ld	rWORD8, 16(rSTR2)
-#endif
+	LD	rWORD7, rOFF16, rSTR1
+	LD	rWORD8, rOFF16, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 24(rSTR1)
-	ld	rWORD2, 24(rSTR2)
-#endif
+	LD	rWORD1, rOFF24, rSTR1
+	LD	rWORD2, rOFF24, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-#endif
 	bne	cr1, L(dLcr1)
 	bne	cr6, L(dLcr6)
 	b	L(dLoop1)
@@ -380,26 +306,21 @@ L(dP3e):
    registers.  */
 	.align	4
 L(dP3x):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 16(rSTR1)
-	ld	rWORD2, 16(rSTR2)
-#endif
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 	sldi.	r12, rN, 3
 	bne	cr1, L(dLcr1x)
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-#endif
 	bne	cr6, L(dLcr6x)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
 	bne	cr7, L(dLcr7x)
 	bne	L(d00)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 0
 	blr
 
@@ -407,46 +328,20 @@ L(dP3x):
 	.align	4
 L(dP4):
 	mtctr	r0
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 0(rSTR1)
-	ld	rWORD2, 0(rSTR2)
-#endif
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 L(dP4e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 8(rSTR1)
-	ld	rWORD4, 8(rSTR2)
-#endif
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 16(rSTR1)
-	ld	rWORD6, 16(rSTR2)
-#endif
+	LD	rWORD5, rOFF16, rSTR1
+	LD	rWORD6, rOFF16, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ldu	rWORD7, 24(rSTR1)
-	ldu	rWORD8, 24(rSTR2)
-#endif
+	LD	rWORD7, rOFF24, rSTR1
+	LD	rWORD8, rOFF24, rSTR2
+	addi	rSTR1, rSTR1, 24
+	addi	rSTR2, rSTR2, 24
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr7, L(dLcr7)
 	bne	cr1, L(dLcr1)
@@ -454,51 +349,25 @@ L(dP4e):
 /* This is the primary loop */
 	.align	4
 L(dLoop):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(dLcr6)
 L(dLoop1):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-#endif
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(dLcr5)
 L(dLoop2):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-#endif
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr7, L(dLcr7)
 L(dLoop3):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ldu	rWORD7, 32(rSTR1)
-	ldu	rWORD8, 32(rSTR2)
-#endif
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
 	bne	cr1, L(dLcr1)
 	cmpld	cr7, rWORD1, rWORD2
 	bdnz	L(dLoop)
@@ -519,62 +388,75 @@ L(d14):
 	sldi.	r12, rN, 3
 	bne	cr5, L(dLcr5)
 L(d04):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
-	beq	L(zeroLength)
+	beq	L(duzeroLength)
 /* At this point we have a remainder of 1 to 7 bytes to compare.  Since
    we are aligned it is safe to load the whole double word, and use
    shift right double to eliminate bits beyond the compare length.  */
 L(d00):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	srd	rWORD1, rWORD1, rN
 	srd	rWORD2, rWORD2, rN
 	cmpld	cr7, rWORD1, rWORD2
 	bne	cr7, L(dLcr7x)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 0
 	blr
 
 	.align	4
 L(dLcr7):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 L(dLcr7x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 1
 	bgtlr	cr7
 	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr1):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 L(dLcr1x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 1
 	bgtlr	cr1
 	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr6):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 L(dLcr6x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 1
 	bgtlr	cr6
 	li	rRTN, -1
 	blr
 	.align	4
 L(dLcr5):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 L(dLcr5x):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 1
 	bgtlr	cr5
 	li	rRTN, -1
@@ -583,10 +465,6 @@ L(dLcr5x):
 	.align	4
 L(bytealigned):
 	mtctr	rN
-#if 0
-/* Huh?  We've already branched on cr6!  */
-	beq	cr6, L(zeroLength)
-#endif
 
 /* We need to prime this loop.  This loop is swing modulo scheduled
    to avoid pipe delays.  The dependent instruction latencies (load to
@@ -685,6 +563,7 @@ L(b11):
 L(bx12):
 	sub	rRTN, rWORD1, rWORD2
 	blr
+
 	.align	4
 L(zeroLength):
 	li	rRTN, 0
@@ -705,42 +584,36 @@ L(zeroLength):
    we need to adjust the length (rN) and special case the loop
    versioning for the first DW. This ensures that the loop count is
    correct and the first DW (shifted) is in the expected resister pair.  */
-#define rSHL		r29	/* Unaligned shift left count.  */
-#define rSHR		r28	/* Unaligned shift right count.  */
-#define rWORD8_SHIFT	r27	/* Left rotation temp for rWORD2.  */
-#define rWORD2_SHIFT	r26	/* Left rotation temp for rWORD4.  */
-#define rWORD4_SHIFT	r25	/* Left rotation temp for rWORD6.  */
-#define rWORD6_SHIFT	r24	/* Left rotation temp for rWORD8.  */
 L(unaligned):
-	std	rSHL, -24(r1)
-	cfi_offset(rSHL, -24)
+	std	rSHL, rSHLSAVE(r1)
+	cfi_offset(rSHL, rSHLSAVE)
 	clrldi	rSHL, rSTR2, 61
 	beq	cr6, L(duzeroLength)
-	std	rSHR, -32(r1)
-	cfi_offset(rSHR, -32)
+	std	rSHR, rSHRSAVE(r1)
+	cfi_offset(rSHR, rSHRSAVE)
 	beq	cr5, L(DWunaligned)
-	std	rWORD8_SHIFT, -40(r1)
-	cfi_offset(rWORD8_SHIFT, -40)
+	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
 /* Adjust the logical start of rSTR2 to compensate for the extra bits
    in the 1st rSTR1 DW.  */
 	sub	rWORD8_SHIFT, rSTR2, r12
 /* But do not attempt to address the DW before that DW that contains
    the actual start of rSTR2.  */
 	clrrdi	rSTR2, rSTR2, 3
-	std	rWORD2_SHIFT, -48(r1)
-	cfi_offset(rWORD2_SHIFT, -48)
+	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
 /* Compute the left/right shift counts for the unaligned rSTR2,
    compensating for the logical (DW aligned) start of rSTR1.  */
 	clrldi	rSHL, rWORD8_SHIFT, 61
 	clrrdi	rSTR1, rSTR1, 3
-	std	rWORD4_SHIFT, -56(r1)
-	cfi_offset(rWORD4_SHIFT, -56)
+	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
 	sldi	rSHL, rSHL, 3
 	cmpld	cr5, rWORD8_SHIFT, rSTR2
 	add	rN, rN, r12
 	sldi	rWORD6, r12, 3
-	std	rWORD6_SHIFT, -64(r1)
-	cfi_offset(rWORD6_SHIFT, -64)
+	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
 	subfic	rSHR, rSHL, 64
 	srdi	r0, rN, 5	/* Divide by 32 */
 	andi.	r12, rN, 24	/* Get the DW remainder */
@@ -750,25 +623,13 @@ L(unaligned):
    this may cross a page boundary and cause a page fault.  */
 	li	rWORD8, 0
 	blt	cr5, L(dus0)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD8, 0, rSTR2
+	LD	rWORD8, 0, rSTR2
 	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD8, 0(rSTR2)
-	addi	rSTR2, rSTR2, 8
-#endif
 	sld	rWORD8, rWORD8, rSHL
 
 L(dus0):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 0(rSTR1)
-	ld	rWORD2, 0(rSTR2)
-#endif
+	LD	rWORD1, 0, rSTR1
+	LD	rWORD2, 0, rSTR2
 	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	srd	r12, rWORD2, rSHR
@@ -796,12 +657,7 @@ L(dusP1):
 	beq	L(duZeroReturn)
 	li	r0, 0
 	ble	cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD2, rOFF8, rSTR2
 	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
@@ -832,27 +688,21 @@ L(duPs4):
    compare length is at least 8 bytes.  */
 	.align	4
 L(DWunaligned):
-	std	rWORD8_SHIFT, -40(r1)
-	cfi_offset(rWORD8_SHIFT, -40)
+	std	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
 	clrrdi	rSTR2, rSTR2, 3
-	std	rWORD2_SHIFT, -48(r1)
-	cfi_offset(rWORD2_SHIFT, -48)
+	std	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
 	srdi	r0, rN, 5	/* Divide by 32 */
-	std	rWORD4_SHIFT, -56(r1)
-	cfi_offset(rWORD4_SHIFT, -56)
+	std	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+	cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
 	andi.	r12, rN, 24	/* Get the DW remainder */
-	std	rWORD6_SHIFT, -64(r1)
-	cfi_offset(rWORD6_SHIFT, -64)
+	std	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
 	sldi	rSHL, rSHL, 3
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD6, 0, rSTR2
+	LD	rWORD6, 0, rSTR2
+	LD	rWORD8, rOFF8, rSTR2
 	addi	rSTR2, rSTR2, 8
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD6, 0(rSTR2)
-	ldu	rWORD8, 8(rSTR2)
-#endif
 	cmpldi	cr1, r12, 16
 	cmpldi	cr7, rN, 32
 	clrldi	rN, rN, 61
@@ -867,52 +717,26 @@ L(DWunaligned):
 	.align	4
 L(duP1):
 	srd	r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	addi	rSTR1, rSTR1, 8
-#else
-	ld	rWORD7, 0(rSTR1)
-#endif
+	LD	rWORD7, 0, rSTR1
 	sld	rWORD8_SHIFT, rWORD8, rSHL
 	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP1x)
 L(duP1e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	srd	r0, rWORD2, rSHR
 	sld	rWORD2_SHIFT, rWORD2, rSHL
 	or	rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-#endif
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 	srd	r12, rWORD4, rSHR
 	sld	rWORD4_SHIFT, rWORD4, rSHL
 	bne	cr5, L(duLcr5)
 	or	rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-#endif
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	srd	r0, rWORD6, rSHR
 	sld	rWORD6_SHIFT, rWORD6, rSHL
@@ -932,82 +756,47 @@ L(duP1x):
 	beq	L(duZeroReturn)
 	li	r0, 0
 	ble	cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD2, rOFF8, rSTR2
 	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 /* Remainder is 16 */
 	.align	4
 L(duP2):
 	srd	r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	addi	rSTR1, rSTR1, 8
-#else
-	ld	rWORD5, 0(rSTR1)
-#endif
+	LD	rWORD5, 0, rSTR1
 	or	rWORD6, r0, rWORD6_SHIFT
 	sld	rWORD6_SHIFT, rWORD8, rSHL
 L(duP2e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD7, 8(rSTR1)
-	ld	rWORD8, 8(rSTR2)
-#endif
+	LD	rWORD7, rOFF8, rSTR1
+	LD	rWORD8, rOFF8, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	srd	r12, rWORD8, rSHR
 	sld	rWORD8_SHIFT, rWORD8, rSHL
 	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP2x)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 16(rSTR1)
-	ld	rWORD2, 16(rSTR2)
-#endif
+	LD	rWORD1, rOFF16, rSTR1
+	LD	rWORD2, rOFF16, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
 	srd	r0, rWORD2, rSHR
 	sld	rWORD2_SHIFT, rWORD2, rSHL
 	or	rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 24(rSTR1)
-	ld	rWORD4, 24(rSTR2)
-#endif
+	LD	rWORD3, rOFF24, rSTR1
+	LD	rWORD4, rOFF24, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 	bne	cr5, L(duLcr5)
 	srd	r12, rWORD4, rSHR
 	sld	rWORD4_SHIFT, rWORD4, rSHL
 	or	rWORD4, r12, rWORD2_SHIFT
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-#endif
 	cmpld	cr1, rWORD3, rWORD4
 	b	L(duLoop2)
 	.align	4
 L(duP2x):
 	cmpld	cr5, rWORD7, rWORD8
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 8
 	addi	rSTR2, rSTR2, 8
-#endif
 	bne	cr6, L(duLcr6)
 	sldi.	rN, rN, 3
 	bne	cr5, L(duLcr5)
@@ -1015,12 +804,7 @@ L(duP2x):
 	beq	L(duZeroReturn)
 	li	r0, 0
 	ble	cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD2, rOFF8, rSTR2
 	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
@@ -1028,73 +812,39 @@ L(duP2x):
 	.align	4
 L(duP3):
 	srd	r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	addi	rSTR1, rSTR1, 8
-#else
-	ld	rWORD3, 0(rSTR1)
-#endif
+	LD	rWORD3, 0, rSTR1
 	sld	rWORD4_SHIFT, rWORD8, rSHL
 	or	rWORD4, r12, rWORD6_SHIFT
 L(duP3e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 8(rSTR1)
-	ld	rWORD6, 8(rSTR2)
-#endif
+	LD	rWORD5, rOFF8, rSTR1
+	LD	rWORD6, rOFF8, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	srd	r0, rWORD6, rSHR
 	sld	rWORD6_SHIFT, rWORD6, rSHL
 	or	rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD7, 16(rSTR1)
-	ld	rWORD8, 16(rSTR2)
-#endif
+	LD	rWORD7, rOFF16, rSTR1
+	LD	rWORD8, rOFF16, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
 	srd	r12, rWORD8, rSHR
 	sld	rWORD8_SHIFT, rWORD8, rSHL
 	or	rWORD8, r12, rWORD6_SHIFT
 	blt	cr7, L(duP3x)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 24(rSTR1)
-	ld	rWORD2, 24(rSTR2)
-#endif
+	LD	rWORD1, rOFF24, rSTR1
+	LD	rWORD2, rOFF24, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
 	srd	r0, rWORD2, rSHR
 	sld	rWORD2_SHIFT, rWORD2, rSHL
 	or	rWORD2, r0, rWORD8_SHIFT
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-#endif
 	cmpld	cr7, rWORD1, rWORD2
 	b	L(duLoop1)
 	.align	4
 L(duP3x):
-#ifndef __LITTLE_ENDIAN__
 	addi	rSTR1, rSTR1, 16
 	addi	rSTR2, rSTR2, 16
-#endif
-#if 0
-/* Huh?  We've already branched on cr1!  */
-	bne	cr1, L(duLcr1)
-#endif
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr6, L(duLcr6)
 	sldi.	rN, rN, 3
@@ -1103,12 +853,7 @@ L(duP3x):
 	beq	L(duZeroReturn)
 	li	r0, 0
 	ble	cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD2, rOFF8, rSTR2
 	srd	r0, rWORD2, rSHR
 	b	L(dutrim)
 
@@ -1117,51 +862,27 @@ L(duP3x):
 L(duP4):
 	mtctr	r0
 	srd	r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	addi	rSTR1, rSTR1, 8
-#else
-	ld	rWORD1, 0(rSTR1)
-#endif
+	LD	rWORD1, 0, rSTR1
 	sld	rWORD2_SHIFT, rWORD8, rSHL
 	or	rWORD2, r0, rWORD6_SHIFT
 L(duP4e):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 8(rSTR1)
-	ld	rWORD4, 8(rSTR2)
-#endif
+	LD	rWORD3, rOFF8, rSTR1
+	LD	rWORD4, rOFF8, rSTR2
 	cmpld	cr7, rWORD1, rWORD2
 	srd	r12, rWORD4, rSHR
 	sld	rWORD4_SHIFT, rWORD4, rSHL
 	or	rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 16(rSTR1)
-	ld	rWORD6, 16(rSTR2)
-#endif
+	LD	rWORD5, rOFF16, rSTR1
+	LD	rWORD6, rOFF16, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr7, L(duLcr7)
 	srd	r0, rWORD6, rSHR
 	sld	rWORD6_SHIFT, rWORD6, rSHL
 	or	rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ldu	rWORD7, 24(rSTR1)
-	ldu	rWORD8, 24(rSTR2)
-#endif
+	LD	rWORD7, rOFF24, rSTR1
+	LD	rWORD8, rOFF24, rSTR2
+	addi	rSTR1, rSTR1, 24
+	addi	rSTR2, rSTR2, 24
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr1, L(duLcr1)
 	srd	r12, rWORD8, rSHR
@@ -1172,60 +893,34 @@ L(duP4e):
 /* This is the primary loop */
 	.align	4
 L(duLoop):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD1, 8(rSTR1)
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
+	LD	rWORD2, rOFF8, rSTR2
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
 	srd	r0, rWORD2, rSHR
 	sld	rWORD2_SHIFT, rWORD2, rSHL
 	or	rWORD2, r0, rWORD8_SHIFT
 L(duLoop1):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD3, 0, rSTR1
-	ldbrx	rWORD4, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD3, 16(rSTR1)
-	ld	rWORD4, 16(rSTR2)
-#endif
+	LD	rWORD3, rOFF16, rSTR1
+	LD	rWORD4, rOFF16, rSTR2
 	cmpld	cr6, rWORD5, rWORD6
 	bne	cr5, L(duLcr5)
 	srd	r12, rWORD4, rSHR
 	sld	rWORD4_SHIFT, rWORD4, rSHL
 	or	rWORD4, r12, rWORD2_SHIFT
 L(duLoop2):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD5, 0, rSTR1
-	ldbrx	rWORD6, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD5, 24(rSTR1)
-	ld	rWORD6, 24(rSTR2)
-#endif
+	LD	rWORD5, rOFF24, rSTR1
+	LD	rWORD6, rOFF24, rSTR2
 	cmpld	cr5, rWORD7, rWORD8
 	bne	cr7, L(duLcr7)
 	srd	r0, rWORD6, rSHR
 	sld	rWORD6_SHIFT, rWORD6, rSHL
 	or	rWORD6, r0, rWORD4_SHIFT
 L(duLoop3):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD7, 0, rSTR1
-	ldbrx	rWORD8, 0, rSTR2
-	addi	rSTR1, rSTR1, 8
-	addi	rSTR2, rSTR2, 8
-#else
-	ldu	rWORD7, 32(rSTR1)
-	ldu	rWORD8, 32(rSTR2)
-#endif
+	LD	rWORD7, rOFF32, rSTR1
+	LD	rWORD8, rOFF32, rSTR2
+	addi	rSTR1, rSTR1, 32
+	addi	rSTR2, rSTR2, 32
 	cmpld	cr7, rWORD1, rWORD2
 	bne	cr1, L(duLcr1)
 	srd	r12, rWORD8, rSHR
@@ -1234,10 +929,6 @@ L(duLoop3):
 	bdnz	L(duLoop)
 
 L(duL4):
-#if 0
-/* Huh?  We've already branched on cr1!  */
-	bne	cr1, L(duLcr1)
-#endif
 	cmpld	cr1, rWORD3, rWORD4
 	bne	cr6, L(duLcr6)
 	cmpld	cr6, rWORD5, rWORD6
@@ -1264,99 +955,102 @@ L(du14):
 	beq	L(duZeroReturn)
 	li	r0, 0
 	ble	cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD2, 0, rSTR2
-	addi	rSTR2, rSTR2, 8
-#else
-	ld	rWORD2, 8(rSTR2)
-#endif
+	LD	rWORD2, rOFF8, rSTR2
 	srd	r0, rWORD2, rSHR
 	.align	4
 L(dutrim):
-#ifdef __LITTLE_ENDIAN__
-	ldbrx	rWORD1, 0, rSTR1
-#else
-	ld	rWORD1, 8(rSTR1)
-#endif
+	LD	rWORD1, rOFF8, rSTR1
 	ld	rWORD8, -8(r1)
 	subfic	rN, rN, 64	/* Shift count is 64 - (rN * 8).  */
 	or	rWORD2, r0, rWORD8_SHIFT
-	ld	rWORD7, -16(r1)
-	ld	rSHL, -24(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
+	ld	rSHL, rSHLSAVE(r1)
 	srd	rWORD1, rWORD1, rN
 	srd	rWORD2, rWORD2, rN
-	ld	rSHR, -32(r1)
-	ld	rWORD8_SHIFT, -40(r1)
+	ld	rSHR, rSHRSAVE(r1)
+	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
 	li	rRTN, 0
 	cmpld	cr7, rWORD1, rWORD2
-	ld	rWORD2_SHIFT, -48(r1)
-	ld	rWORD4_SHIFT, -56(r1)
+	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
 	beq	cr7, L(dureturn24)
 	li	rRTN, 1
-	ld	rWORD6_SHIFT, -64(r1)
+	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	bgtlr	cr7
 	li	rRTN, -1
 	blr
 	.align	4
 L(duLcr7):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	li	rRTN, 1
 	bgt	cr7, L(dureturn29)
-	ld	rSHL, -24(r1)
-	ld	rSHR, -32(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr1):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	li	rRTN, 1
 	bgt	cr1, L(dureturn29)
-	ld	rSHL, -24(r1)
-	ld	rSHR, -32(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr6):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	li	rRTN, 1
 	bgt	cr6, L(dureturn29)
-	ld	rSHL, -24(r1)
-	ld	rSHR, -32(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
 	.align	4
 L(duLcr5):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 	li	rRTN, 1
 	bgt	cr5, L(dureturn29)
-	ld	rSHL, -24(r1)
-	ld	rSHR, -32(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
 	li	rRTN, -1
 	b	L(dureturn27)
+
 	.align	3
 L(duZeroReturn):
 	li	rRTN, 0
 	.align	4
 L(dureturn):
-	ld	rWORD8, -8(r1)
-	ld	rWORD7, -16(r1)
+	ld	rWORD8, rWORD8SAVE(r1)
+	ld	rWORD7, rWORD7SAVE(r1)
 L(dureturn29):
-	ld	rSHL, -24(r1)
-	ld	rSHR, -32(r1)
+	ld	rSHL, rSHLSAVE(r1)
+	ld	rSHR, rSHRSAVE(r1)
 L(dureturn27):
-	ld	rWORD8_SHIFT, -40(r1)
-L(dureturn26):
-	ld	rWORD2_SHIFT, -48(r1)
-L(dureturn25):
-	ld	rWORD4_SHIFT, -56(r1)
+	ld	rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+	ld	rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+	ld	rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
 L(dureturn24):
-	ld	rWORD6_SHIFT, -64(r1)
+	ld	rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	blr
+
 L(duzeroLength):
+	ld	rOFF8,  rOFF8SAVE(r1)
+	ld	rOFF16, rOFF16SAVE(r1)
+	ld	rOFF24, rOFF24SAVE(r1)
+	ld	rOFF32, rOFF32SAVE(r1)
 	li	rRTN, 0
 	blr
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=473b6083820fd156985bf7b2cb60db9d4031b536

commit 473b6083820fd156985bf7b2cb60db9d4031b536
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Fri Jan 9 16:04:26 2015 -0500

    powerpc: Optimized strncmp for POWER8/PPC64
    
    This patch adds an optimized POWER8 strncmp.  The implementation focus
    on speeding up unaligned cases follwing the ideas of power8 strcmp.
    
    The algorithm first check the initial 16 bytes, then align the first
    function source and uses unaligned loads on second argument only.
    Aditional checks for page boundaries are done for unaligned cases
    (where sources alignment are different).

diff --git a/ChangeLog b/ChangeLog
index 621daa4..a8b90fe 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/power8/strncmp.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+	strncmp-power8 object.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strncmp_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strncmp.c (strncmp): Likewise.
+	* NEWS: Update.
+
 2015-01-13  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
 	    Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
diff --git a/NEWS b/NEWS
index 0a4fa77..4b52bd0 100644
--- a/NEWS
+++ b/NEWS
@@ -11,8 +11,8 @@ Version 2.20.1
 
   16617, 17266, 17370, 17371, 17625, 17630.
 
-* Optimized strcpy, stpcpy, strncpy, stpncpy, and strcmp implementations for
-  powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, stpncpy, strcmp, and strncmp
+  implementations for powerpc64/powerpc64le.
   Implemented by Adhemerval Zanella (IBM).
 
 * CVE-2104-7817 The wordexp function could ignore the WRDE_NOCMD flag
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index ec4fca7..b7ea284 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -7,8 +7,9 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
 		   rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
 		   strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \
-		   strncase-power7 strncase_l-power7 strncmp-power7 \
-		   strncmp-power4 strncmp-ppc64 strchr-power7 strchr-ppc64 \
+		   strncase-power7 strncase_l-power7 \
+		   strncmp-power8 strncmp-power7 strncmp-power4 strncmp-ppc64 \
+		   strchr-power7 strchr-ppc64 \
 		   strchrnul-power7 strchrnul-ppc64 wcschr-power7 \
 		   wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \
 		   wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 36c5149..bd92cf6 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -108,6 +108,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c.  */
   IFUNC_IMPL (i, name, strncmp,
+	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strncmp_power8)
 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_HAS_VSX,
 			      __strncmp_power7)
 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_POWER4,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
new file mode 100644
index 0000000..8d7223d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
@@ -0,0 +1,40 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name,alignt,words)				\
+  .section ".text";						\
+  ENTRY_2(__strncmp_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strncmp_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strncmp_power8)
+
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strncmp_power8)					\
+  END_2(__strncmp_power8)
+
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
index 9829d69..5e76783 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
@@ -25,13 +25,16 @@
 extern __typeof (strncmp) __strncmp_ppc attribute_hidden;
 extern __typeof (strncmp) __strncmp_power4 attribute_hidden;
 extern __typeof (strncmp) __strncmp_power7 attribute_hidden;
+extern __typeof (strncmp) __strncmp_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (strncmp,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strncmp_power7 :
-	      (hwcap & PPC_FEATURE_POWER4)
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strncmp_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strncmp_power7 :
+		(hwcap & PPC_FEATURE_POWER4)
 		? __strncmp_power4
             : __strncmp_ppc);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strncmp.S b/sysdeps/powerpc/powerpc64/power8/strncmp.S
new file mode 100644
index 0000000..56c814b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncmp.S
@@ -0,0 +1,323 @@
+/* Optimized strncmp implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+	.machine  power7
+EALIGN (strncmp, 4, 0)
+	/* Check if size is 0.  */
+	mr.	r10,r5
+	beq	cr0,L(ret0)
+
+	/* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
+	   the code:
+
+	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+	   with PAGE_SIZE being 4096 and ITER_SIZE begin 16.  */
+	rldicl	r8,r3,0,52
+	cmpldi	cr7,r8,4096-16
+	bgt	cr7,L(pagecross)
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r9,4096-16
+	bgt	cr7,L(pagecross)
+
+	/* For short string up to 16 bytes, load both s1 and s2 using
+	   unaligned dwords and compare.  */
+	ld	r7,0(r3)
+	ld	r9,0(r4)
+	li	r8,0
+	cmpb	r8,r7,r8
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different1)
+
+	/* If the string compared are equal, but size is less or equal
+	   to 8, return 0.  */
+	cmpldi	cr7,r10,8
+	li	r9,0
+	ble	cr7,L(ret1)
+	addi	r5,r10,-8
+
+	ld	r7,8(r3)
+	ld	r9,8(r4)
+	cmpb	r8,r7,r8
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different0)
+
+	cmpldi	cr7,r5,8
+	mr	r9,r8
+	ble	cr7,L(ret1)
+
+	/* Update pointers and size.  */
+	addi	r10,r10,-16
+	addi	r3,r3,16
+	addi	r4,r4,16
+
+	/* Now it has checked for first 16 bytes, align source1 to doubleword
+	   and adjust source2 address.  */
+L(align_8b):
+	rldicl	r5,r3,0,61
+	rldicr	r3,r3,0,60
+	subf	r4,r5,r4
+	add	r10,r10,r5
+
+	/* At this point, source1 alignment is 0 and source2 alignment is
+	   between 0 and 7.  Check is source2 alignment is 0, meaning both
+	   sources have the same alignment.  */
+	andi.	r8,r4,0x7
+	beq	cr0,L(loop_eq_align_0)
+
+	li	r5,0
+	b	L(loop_ne_align_1)
+
+	/* If source2 is unaligned to doubleword, the code needs to check
+	   on each interation if the unaligned doubleword access will cross
+	   a 4k page boundary.  */
+	.align 4
+L(loop_ne_align_0):
+	ld	r7,0(r3)
+	ld	r9,0(r4)
+	cmpb	r8,r7,r5
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different1)
+
+	cmpldi	cr7,r10,8
+	ble	cr7,L(ret0)
+	addi	r10,r10,-8
+	addi	r3,r3,8
+	addi	r4,r4,8
+L(loop_ne_align_1):
+	rldicl	r9,r4,0,52
+	cmpldi	r7,r9,4088
+	ble	cr7,L(loop_ne_align_0)
+	cmpdi	cr7,r10,0
+	beq	cr7,L(ret0)
+
+	lbz	r9,0(r3)
+	lbz	r8,0(r4)
+	cmplw	cr7,r9,r8
+	bne	cr7,L(byte_ne_4)
+	cmpdi	cr7,r9,0
+	beq	cr7,L(size_reached_0)
+
+	li	r9,r7
+	addi	r8,r3,1
+	mtctr	r9
+	addi	r4,r4,1
+	addi	r10,r10,-1
+	addi	r3,r3,8
+
+	/* The unaligned read of source2 will cross a 4K page boundary,
+	   and the different byte or NULL maybe be in the remaining page
+	   bytes.  Since it can not use the unaligned load the algorithm
+	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
+	.align 4
+L(loop_ne_align_byte):
+	cmpdi	cr7,r10,0
+	addi	r10,r10,-1
+	beq	cr7,L(ret0)
+	lbz	r9,0(r8)
+	lbz	r7,0(r4)
+	addi	r8,r8,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,r7
+	cmpdi	cr5,r9,0
+	bne	cr7,L(size_reached_2)
+	beq	cr5,L(size_reached_0)
+	bdnz	L(loop_ne_align_byte)
+
+	cmpdi	cr7,r10,0
+	bne+	cr7,L(loop_ne_align_0)
+
+	.align 4
+L(ret0):
+	li	r9,0
+L(ret1):
+	mr	r3,r9
+	blr
+
+	/* The code now check if r8 and r10 are different by issuing a
+	   cmpb and shift the result based on its output:
+
+	#ifdef __LITTLE_ENDIAN__
+	  leadzero = (__builtin_ffsl (z1) - 1);
+	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+	  r1 = (r1 >> leadzero) & 0xFFUL;
+	  r2 = (r2 >> leadzero) & 0xFFUL;
+	#else
+	  leadzero = __builtin_clzl (z1);
+	  leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+	  r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
+	  r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
+	#endif
+	  return r1 - r2;  */
+
+	.align 4
+L(different0):
+	mr	r10,r5
+#ifdef __LITTLE_ENDIAN__
+L(different1):
+        neg	r11,r8
+        sldi	r10,r10,3
+        and	r8,r11,r8
+        addi	r10,r10,-8
+        cntlzd	r8,r8
+        subfic	r8,r8,63
+        extsw 	r8,r8
+        cmpld	cr7,r8,r10
+        ble	cr7,L(different2)
+        mr	r8,r10
+L(different2):
+        extsw	r8,r8
+#else
+L(different1):
+	addi	r10,r10,-1
+	cntlzd	r8,r8
+	sldi	r10,r10,3
+	cmpld	cr7,r8,r10
+	blt	cr7,L(different2)
+	mr	r8,r10
+L(different2):
+	subfic	r8,r8,56
+#endif
+	srd	r7,r7,r8
+	srd	r9,r9,r8
+	rldicl	r3,r7,0,56
+	rldicl	r9,r9,0,56
+	subf	r9,r9,3
+	extsw	r9,r9
+	mr	r3,r9
+	blr
+
+	/* If unaligned 16 bytes reads across a 4K page boundary, it uses
+	   a simple byte a byte comparison until the page alignment for s1
+	   is reached.  */
+	.align 4
+L(pagecross):
+	lbz	r7,0(r3)
+	lbz	r9,0(r4)
+	subfic	r8,r8,4095
+	cmplw	cr7,r9,r7
+	bne	cr7,L(byte_ne_3)
+	cmpdi	cr7,r9,0
+	beq	cr7,L(byte_ne_0)
+	addi	r10,r10,-1
+	subf	r7,r8,r10
+	subf	r9,r7,r10
+	addi	r9,r9,1
+	mtctr	r9
+	b	L(pagecross_loop1)
+
+	.align 4
+L(pagecross_loop0):
+	beq	cr7,L(ret0)
+	lbz	r9,0(r3)
+	lbz	r8,0(r4)
+	addi	r10,r10,-1
+	cmplw	cr7,r9,r8
+	cmpdi	cr5,r9,0
+	bne	r7,L(byte_ne_2)
+	beq	r5,L(byte_ne_0)
+L(pagecross_loop1):
+	cmpdi	cr7,r10,0
+	addi	r3,r3,1
+	addi	r4,r4,1
+	bdnz	L(pagecross_loop0)
+	cmpdi	cr7,r7,0
+	li	r9,0
+	bne+	cr7,L(align_8b)
+	b	L(ret1)
+
+	/* If both source1 and source2 are doubleword aligned, there is no
+	   need for page boundary cross checks.  */
+	.align 4
+L(loop_eq_align_0):
+	ld	r7,0(r3)
+	ld	r9,0(r4)
+	cmpb	r8,r7,r8
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	bne	cr0,L(different1)
+
+	cmpldi	cr7,r10,8
+	ble	cr7,L(ret0)
+	addi	r9,r10,-9
+
+	li	r5,0
+	srdi	r9,r9,3
+	addi	r9,r9,1
+	mtctr	r9
+	b	L(loop_eq_align_2)
+
+	.align 4
+L(loop_eq_align_1):
+	bdz	L(ret0)
+L(loop_eq_align_2):
+	ldu	r7,8(r3)
+	addi	r10,r10,-8
+	ldu	r9,8(r4)
+	cmpb	r8,r7,r5
+	cmpb	r6,r7,r9
+	orc.	r8,r8,r6
+	beq	cr0,L(loop_eq_align_1)
+	b	L(different1)
+
+	.align 4
+L(byte_ne_0):
+	li	r7,0
+L(byte_ne_1):
+	subf	r9,r9,r7
+	extsw	r9,r9
+	b	L(ret1)
+
+	.align 4
+L(byte_ne_2):
+	extsw	r7,r9
+	mr	r9,r8
+	b	L(byte_ne_1)
+L(size_reached_0):
+	li	r10,0
+L(size_reached_1):
+	subf	r9,r9,r10
+	extsw	r9,r9
+	b	L(ret1)
+L(size_reached_2):
+	extsw	r10,r9
+	mr	r9,r7
+	b	L(size_reached_1)
+L(byte_ne_3):
+	extsw	r7,r7
+	b	L(byte_ne_1)
+L(byte_ne_4):
+	extsw	r10,r9
+	mr	r9,r8
+	b	L(size_reached_1)
+END(strncmp)
+libc_hidden_builtin_def(strncmp)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=299b9464d9a1a48bbcfbc1c7a99604091ec5248f

commit 299b9464d9a1a48bbcfbc1c7a99604091ec5248f
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date:   Fri Jan 9 11:56:35 2015 -0500

    powerpc: Optimize POWER7 strcmp trailing checks
    
    This patch optimized the POWER7 trailing check by avoiding using byte
    read operations and instead use the doubleword already readed with
    bitwise operations.

diff --git a/ChangeLog b/ChangeLog
index b947d5b..621daa4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2015-01-13  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+	    Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power7/strcmp.S (strcmp): Optimize
+	trailing byte check.
+
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S
index f16a9d8..ade2811 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S
@@ -25,122 +25,96 @@
 
 /* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])  */
 
+	.machine	power7
 EALIGN (strcmp, 4, 0)
 	CALL_MCOUNT 2
 
 	or r9, r3, r4
 	rldicl. r10, r9, 0, 61	/* are s1 and s2 8 byte aligned..?  */
 	bne cr0, L(process_unaligned_bytes)
+	li	r5, 0
 
+	.align 4
 /* process input parameters on double word aligned boundary  */
-	ld r9, 0(r4)		/* load s2 at offset=0  */
-	li r10, 0		/* load mask=0  */
-	cmpb r10, r9, r10	/* compare bytes at s2 with mask  */
-	cmpdi cr7, r10, 0	/* is NULL found ..? is end of string HIT  */
-	bne cr7, L(process_unaligned_bytes)	/* process byte by byte  */
-
-	ld r10, 0(r3)		/* load s1 at offset=0  */
-	li r8, 0		/* load mask=0  */
-	cmpb r8, r10, r8	/* compare bytes at s1 with mask  */
-	cmpdi cr7, r8, 0	/* is NULL found ..? is end of string HIT  */
-	bne cr7, L(process_unaligned_bytes)	/* process byte by byte  */
-
-/*s1 and s2 does not contain NULL now , so compare all 8 bytes in a GO  */
-	cmpb r9, r10, r9	/* compare s1 and s2  */
-	cmpdi cr7, r9, -1	/* compare result with 0xFFFFFFFFFFFFFFFF  */
-	bne cr7, L(process_unaligned_bytes)	/* s1,s2 mismatch found  */
-
-	addi r5, r3, 8		/* save next offset of s2  */
-	addi r11, r4, 8		/* save next offset of s1  */
-	ld r8, 8(r4)		/* load s2 at offset=8  */
-	li r9, 0		/* load mask=0  */
-	cmpb r9, r8, r9		/* compare bytes at s2 with mask  */
-	cmpdi cr7, r9, 0	/* NULL found ..?  */
-	bne cr7, L(processBytes)/* update input and process bytes one by one  */
-
-	mr r9, r4		/* save s2  */
-	li r10, 0		/* load mask=0  */
-
-	ld r7, 8(r3)		/* load s1 at offset=8  */
-	cmpb r6, r7, r10	/* compare bytes at s1 with mask  */
-	cmpdi cr7, r6, 0	/* is NULL found  */
-	bne cr7, L(processBytes)/* mismatch, so process one by one  */
-
 L(unrollDword):
-	cmpb r8, r7, r8		/* compare s1 and s2  */
-	cmpdi cr7, r8, -1	/* compare result with 0xFFFFFFFFFFFFFFFF  */
-	bne cr7, L(processBytes)/* mismatch with s1 and s2  */
-
-	addi r5, r3, 16		/* save offset=16 of s1  */
-	addi r4, r9, 16		/* save offset=16 of s2  */
-	ld r8, 16(r9)		/* load s2 at offset=16  */
-	cmpb r7, r8, r10	/* compare bytes at s2 with mask  */
-	cmpdi cr7, r7, 0	/* NULL found  ..?  */
-	bne cr7, L(update2processBytes)
-
-	ld r7, 16(r3)		/* load s1 at offset=16  */
-	cmpb r6, r7, r10	/* check s1 for end of string  */
-	cmpdi cr7, r6, 0	/* end of s1 ?,then handle byte by byte  */
-	bne 7,L(update2processBytes)
-
-	cmpb r8, r7, r8		/* compare s1 and s2 double words  */
-	cmpdi cr7, r8, -1	/* compare results with 0xFFFFFFFFFFFFFFFF  */
-	bne cr7,L(update2processBytes)
-
-	addi r5, r3, 24		/* update s1 to offset=24  */
-	addi r4, r9, 24		/* update s2 to offset=24  */
-
-	ld r8, 24(r9)		/* load s2  */
-	cmpb r7, r8, r10	/* compare s2 for NULL  */
-	cmpdi cr7, r7, 0	/* verify if s2 is ending now  */
-	bne cr7,L(update2processBytes)
-
-	ld r7, 24(r3)		/* load s1 at offset=24  */
-	cmpb r6, r7, r10	/* verify for NULL  */
-	cmpdi cr7, r6, 0	/* is NULL found  */
-	bne cr7, L(update2processBytes)
-
-	cmpb r8, r7, r8		/* compare s1 and s2  */
-	cmpdi cr7, r8, -1	/* are s1 and s2 same ..?  */
-	bne cr7, L(update2processBytes)
-
-	addi r7, r9, 32		/* update s2 to next double word  */
-	addi r3, r3, 32		/* update s1 to next double word  */
-
-	ld r8, 32(r9)		/* load s2  */
-	mr r4, r7		/* save s2  */
-	cmpb r6, r8, r10	/* compare s2 with NULL  */
-	cmpdi cr7, r6, 0	/* end of s2 ..? */
-	bne cr7, L(process_unaligned_bytes)
-
-	ld r6, 0(r3)		/* load and compare s1 for NULL  */
-	cmpb r5, r6, r10
-	cmpdi cr7, r5, 0
-	bne cr7, L(process_unaligned_bytes)
-
-	cmpb r8, r6, r8		/* compare s1 and s2  */
-	cmpdi cr7, r8, -1
-	bne cr7, L(process_unaligned_bytes)
-
-	addi r5, r3, 8		/* increment s1 and d2 here  */
-	addi r11, r9, 40
-
-	ld r8, 40(r9)		/* process s2 now  */
-	cmpb r9, r8, r10
-	cmpdi cr7, r9, 0
-	bne cr7, L(processBytes)
-
-	mr r9, r7
-	ld r7, 8(r3)		/* process s1 now  */
-	cmpb r6, r7, r10
-	cmpdi cr7, r6, 0
-	beq cr7, L(unrollDword)	/* unroll to compare s1 and s2  */
-
-L(processBytes):
-	mr r4, r11		/* update input params  */
-	mr r3, r5
-
-	.p2align 4
+	ld	r8,0(r3)
+	ld	r10,0(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	ld	r8,8(r3)
+	ld	r10,8(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	ld	r8,16(r3)
+	ld	r10,16(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	ld	r8,24(r3)
+	ld	r10,24(r4)
+	cmpb	r7,r8,r5
+	cmpdi	cr7,r7,0
+	mr	r9,r7
+	bne 	cr7,L(null_found)
+	cmpld	cr7,r8,r10
+	bne	cr7,L(different)
+
+	addi r3, r3, 32
+	addi r4, r4, 32
+	beq cr7, L(unrollDword)
+
+	.align 4
+L(null_found):
+#ifdef __LITTLE_ENDIAN__
+	neg	r7,r9
+	and	r9,r9,r7
+	li	r7,-1
+	cntlzd	r9,r9
+	subfic	r9,r9,71
+	sld	r9,r7,r9
+#else
+	cntlzd	r9,r9
+	li	r7,-1
+	addi	r9,r9,8
+	srd	r9,r7,r9
+#endif
+	or	r8,r8,r9
+	or	r10,r10,r9
+
+L(different):
+	cmpb	r9,r8,r10
+#ifdef __LITTLE_ENDIAN__
+	addi	r7,r9,1
+	andc	r9,r7,r9
+	cntlzd	r9,r9
+	subfic	r9,r9,63
+#else
+	not	r9,r9
+	cntlzd	r9,r9
+	subfic	r9,r9,56
+#endif
+	srd	r3,r8,r9
+	srd	r10,r10,r9
+	rldicl	r10,r10,0,56
+	rldicl	r3,r3,0,56
+	subf	r3,r10,r3
+	blr
+
+	.align 4
 L(process_unaligned_bytes):
 	lbz r9, 0(r3)		/* load byte from s1  */
 	lbz r10, 0(r4)		/* load byte from s2  */
@@ -172,24 +146,19 @@ L(process_unaligned_bytes):
 	addi r4, r4, 4		/* increment s2 by unroll factor  */
 	beq cr6, L(process_unaligned_bytes)	/* unroll byte processing  */
 
-	.p2align 4
+	.align 4
 L(ComputeDiff):
 	extsw r9, r9
 	subf r10, r10, r9	/* compute s1 - s2  */
 	extsw r3, r10
 	blr			/* return  */
 
-	.p2align 4
+	.align 4
 L(diffOfNULL):
 	li r9, 0
 	subf r10, r10, r9	/* compute s1 - s2  */
 	extsw r3, r10		/* sign extend result  */
 	blr			/* return  */
 
-	.p2align 4
-L(update2processBytes):
-	mr r3, r5		/* update and proceed  */
-	b L(process_unaligned_bytes)
-
 END (strcmp)
 libc_hidden_builtin_def (strcmp)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6f0685edc6676c7266fdc30fd0769fb88d058f04

commit 6f0685edc6676c7266fdc30fd0769fb88d058f04
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Wed Jan 7 07:18:30 2015 -0500

    powerpc: Optimized strcmp for POWER8/PPC64
    
    This patch adds an optimized POWER8 strcmp using unaligned accesses.
    The algorithm first check the initial 16 bytes, then align the first
    function source and uses unaligned loads on second argument only.
    Aditional checks for page boundaries are done for unaligned cases

diff --git a/ChangeLog b/ChangeLog
index 383f1f5..b947d5b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,16 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Add strcmp-power8 object.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strcmp_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/strcmp.c (strcmp): Add
+	__strcmp_power8 implementation.
+	* sysdeps/powerpc/powerpc64/power8/strcmp.S: New file.
+	* NEWS: Update.
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
 	Add strncpy-power8 and stpncpy-power8 objects.
 	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 	(__libc_ifunc_impl_list): Add __strncpy_power8 and stpncpy_power8
diff --git a/NEWS b/NEWS
index afb48df..0a4fa77 100644
--- a/NEWS
+++ b/NEWS
@@ -11,8 +11,9 @@ Version 2.20.1
 
   16617, 17266, 17370, 17371, 17625, 17630.
 
-* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for
+* Optimized strcpy, stpcpy, strncpy, stpncpy, and strcmp implementations for
   powerpc64/powerpc64le.
+  Implemented by Adhemerval Zanella (IBM).
 
 * CVE-2104-7817 The wordexp function could ignore the WRDE_NOCMD flag
   under certain input conditions resulting in the execution of a shell for
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 18d3378..ec4fca7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -18,7 +18,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
-		   strcmp-power7 strcmp-ppc64 \
+		   strcmp-power8 strcmp-power7 strcmp-ppc64 \
 		   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
 		   memmove-ppc64 bcopy-ppc64 strncpy-power8
 
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index b698b90..36c5149 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c.  */
   IFUNC_IMPL (i, name, strcmp,
 	      IFUNC_IMPL_ADD (array, i, strcmp,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strcmp_power8)
+	      IFUNC_IMPL_ADD (array, i, strcmp,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcmp_power7)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcmp.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
index 2013301..dc4bfac 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcmp. PowerPC64 version.
-   Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized strcmp implementation for POWER8/PPC64.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if defined SHARED && !defined NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
 
-extern __typeof (strcmp) __strcmp_ppc attribute_hidden;
-extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strcmp_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strcmp_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strcmp_power8)
 
-libc_ifunc (strcmp,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcmp_power7
-            : __strcmp_ppc);
-#endif
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strcmp_power8)					\
+  END_2(__strcmp_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strcmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index 2013301..c711969 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -23,9 +23,12 @@
 
 extern __typeof (strcmp) __strcmp_ppc attribute_hidden;
 extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
+extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
 
 libc_ifunc (strcmp,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcmp_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+              ? __strcmp_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strcmp_power7
             : __strcmp_ppc);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
new file mode 100644
index 0000000..223d891
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -0,0 +1,257 @@
+/* Optimized strcmp implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+   size_t [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+EALIGN (strcmp, 4, 0)
+	li	r0,0
+
+	/* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+	   the code:
+
+	    (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+	   with PAGE_SIZE being 4096 and ITER_SIZE begin 32.  */
+
+	rldicl	r7,r3,0,52
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r7,4096-32
+	bgt	cr7,L(pagecross_check)
+	cmpldi	cr5,r9,4096-32
+	bgt	cr5,L(pagecross_check)
+
+	/* For short string up to 32 bytes, load both s1 and s2 using
+	   unaligned dwords and compare.  */
+	ld	r8,0(r3)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,8(r3)
+	ld	r10,8(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,16(r3)
+	ld	r10,16(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,24(r3)
+	ld	r10,24(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	addi	r7,r3,32
+	addi	r4,r4,32
+
+L(align_8b):
+	/* Now it has checked for first 32 bytes, align source1 to doubleword
+	   and adjust source2 address.  */
+	rldicl	r9,r7,0,61	/* source1 alignment to doubleword  */
+	subf	r4,r9,r4	/* Adjust source2 address based on source1
+				   alignment.  */
+	rldicr	r7,r7,0,60	/* Align source1 to doubleword.  */
+
+	/* At this point, source1 alignment is 0 and source2 alignment is
+	   between 0 and 7.  Check is source2 alignment is 0, meaning both
+	   sources have the same alignment.  */
+	andi.	r9,r4,0x7
+	bne	cr0,L(loop_diff_align)
+
+	/* If both source1 and source2 are doubleword aligned, there is no
+	   need for page boundary cross checks.  */
+
+	ld	r8,0(r7)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	.align 4
+L(loop_equal_align):
+	ld	r8,8(r7)
+	ld	r10,8(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ld	r8,16(r7)
+	ld	r10,16(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	ldu	r8,24(r7)
+	ldu	r10,24(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+
+	b	L(loop_equal_align)
+
+	/* A zero byte was found in r8 (s1 dword), r9 contains the cmpb
+	   result and r10 the dword from s2.  To code isolate the byte
+	   up to end (including the '\0'), masking with 0xFF the remaining
+	   ones:
+
+           #if __LITTLE_ENDIAN__
+	     (__builtin_ffsl (x) - 1) = counting trailing zero bits
+	     r9 = (__builtin_ffsl (r9) - 1) + 8;
+	     r9 = -1UL << r9
+	   #else
+	     r9  = __builtin_clzl (r9) + 8;
+	     r9  = -1UL >> r9
+	   #endif
+	     r8  = r8  | r9
+	     r10 = r10 | r9  */
+
+#ifdef __LITTLE_ENDIAN__
+	nor 	r9,r9,r9
+L(different_nocmpb):
+	neg	r3,r9
+	and	r9,r9,r3
+	cntlzd	r9,r9
+	subfic	r9,r9,63
+#else
+	not	r9,r9
+L(different_nocmpb):
+	cntlzd	r9,r9
+	subfic	r9,r9,56
+#endif
+	srd	r3,r8,r9
+	srd	r10,r10,r9
+	rldicl	r10,r10,0,56
+	rldicl	r3,r3,0,56
+	subf	r3,r10,r3
+	extsw	r3,r3
+	blr
+
+	.align	4
+L(pagecross_check):
+	subfic	r9,r9,4096
+	subfic	r7,r7,4096
+	cmpld	cr7,r7,r9
+	bge	cr7,L(pagecross)
+	mr	r7,r9
+
+	/* If unaligned 16 bytes reads across a 4K page boundary, it uses
+	   a simple byte a byte comparison until the page alignment for s1
+	   is reached.  */
+L(pagecross):
+	add	r7,r3,r7
+	subf	r9,r3,r7
+	mtctr	r9
+
+	.align	4
+L(pagecross_loop):
+	/* Loads a byte from s1 and s2, compare if *s1 is equal to *s2
+	   and if *s1 is '\0'.  */
+	lbz	r9,0(r3)
+	lbz	r10,0(r4)
+	addi	r3,r3,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,r10
+	cmpdi	cr5,r9,r0
+	bne	cr7,L(pagecross_ne)
+	beq	cr5,L(pagecross_nullfound)
+	bdnz	L(pagecross_loop)
+	b	L(align_8b)
+
+	.align	4
+	/* The unaligned read of source2 will cross a 4K page boundary,
+	   and the different byte or NULL maybe be in the remaining page
+	   bytes. Since it can not use the unaligned load, the algorithm
+	   reads and compares 8 bytes to keep source1 doubleword aligned.  */
+L(check_source2_byte):
+	li	r9,8
+	mtctr	r9
+
+	.align	4
+L(check_source2_byte_loop):
+	lbz	r9,0(r7)
+	lbz	r10,0(r4)
+	addi	r7,r7,1
+	addi	r4,r4,1
+	cmplw	cr7,r9,10
+	cmpdi	r5,r9,0
+	bne	cr7,L(pagecross_ne)
+	beq	cr5,L(pagecross_nullfound)
+	bdnz	L(check_source2_byte_loop)
+
+	/* If source2 is unaligned to doubleword, the code needs to check
+	   on each interation if the unaligned doubleword access will cross
+	   a 4k page boundary.  */
+	.align	5
+L(loop_unaligned):
+	ld	r8,0(r7)
+	ld	r10,0(r4)
+	cmpb	r12,r8,r0
+	cmpb	r11,r8,r10
+	orc.	r9,r12,r11
+	bne	cr0,L(different_nocmpb)
+	addi	r7,r7,8
+	addi	r4,r4,8
+
+L(loop_diff_align):
+	/* Check if [src2]+8 cross a 4k page boundary:
+
+	     srcin2 % PAGE_SIZE > (PAGE_SIZE - 8)
+
+	     with PAGE_SIZE being 4096.  */
+	rldicl	r9,r4,0,52
+	cmpldi	cr7,r9,4088
+	ble	cr7,L(loop_unaligned)
+	b	L(check_source2_byte)
+
+	.align	4
+L(pagecross_ne):
+	extsw	r3,r9
+	mr	r9,r10
+L(pagecross_retdiff):
+	subf	r9,r9,r3
+	extsw	r3,r9
+	blr
+
+	.align	4
+L(pagecross_nullfound):
+	li	r3,0
+	b	L(pagecross_retdiff)
+END (strcmp)
+libc_hidden_builtin_def (strcmp)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a38f68f12fd03374d599eeb0b6943e50b0ff7348

commit a38f68f12fd03374d599eeb0b6943e50b0ff7348
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Wed Dec 31 11:47:41 2014 -0500

    powerpc: Optimized st{r,p}ncpy for POWER8/PPC64
    
    This patch adds an optimized POWER8 st{r,p}ncpy using unaligned accesses.
    It shows 10%-80% improvement over the optimized POWER7 one that uses
    only aligned accesses, specially on unaligned inputs.
    
    The algorithm first read and check 16 bytes (if inputs do not cross a 4K
    page size).  The it realign source to 16-bytes and issue a 16 bytes read
    and compare loop to speedup null byte checks for large strings.  Also,
    different from POWER7 optimization, the null pad is done inline in the
    implementation using possible unaligned accesses, instead of realying on
    a memset call.  Special case is added for page cross reads.

diff --git a/ChangeLog b/ChangeLog
index 0c3f78d..383f1f5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Add strncpy-power8 and stpncpy-power8 objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strncpy_power8 and stpncpy_power8
+	implementations.
+	* sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/stpncpy.c (__stpncpy): Add
+	__stpncpy_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/strncpy.c (strncpy): Add
+	__strncpy_power8 implementation.
+	* sysdeps/powerpc/powerpc64/power8/stpncpy.S: New file.
+	* sysdeps/powerpc/powerpc64/power8/strncpy.S: New file.
+	* NEWS: Update.
+
 	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
 	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
 	* sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
diff --git a/NEWS b/NEWS
index 769e841..afb48df 100644
--- a/NEWS
+++ b/NEWS
@@ -11,7 +11,8 @@ Version 2.20.1
 
   16617, 17266, 17370, 17371, 17625, 17630.
 
-* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for
+  powerpc64/powerpc64le.
 
 * CVE-2104-7817 The wordexp function could ignore the WRDE_NOCMD flag
   under certain input conditions resulting in the execution of a shell for
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 74b2daa..18d3378 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -17,9 +17,10 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   stpcpy-power7 stpcpy-ppc64 \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
-		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
+		   stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
+		   strcmp-power7 strcmp-ppc64 \
 		   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
-		   memmove-ppc64 bcopy-ppc64
+		   memmove-ppc64 bcopy-ppc64 strncpy-power8
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index d5b2184..b698b90 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -279,6 +279,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
 	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strncpy_power8)
+	      IFUNC_IMPL_ADD (array, i, strncpy,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strncpy_power7)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
@@ -287,6 +290,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __stpncpy_power8)
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __stpncpy_power7)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
similarity index 55%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
index dbf8521..d5d835d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
-   Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized stpncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,24 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
 
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+#define USE_AS_STPNCPY
 
-libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
-            : __stpncpy_ppc);
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__stpncpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__stpncpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__stpncpy_power8)
 
-weak_alias (__stpncpy, stpncpy)
-#endif
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__stpncpy_power8)					\
+  END_2(__stpncpy_power8)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index dbf8521..3ee50e5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -23,10 +23,13 @@
 
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
 
 libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __stpncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __stpncpy_power7
             : __stpncpy_ppc);
 
 weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
index dbf8521..ed906a4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
-   Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized strncpy implementation for POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,25 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
 
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strncpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strncpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strncpy_power8)
 
-libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
-            : __stpncpy_ppc);
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strncpy_power8)					\
+  END_2(__strncpy_power8)
 
-weak_alias (__stpncpy, stpncpy)
-#endif
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 8fd5e4b..19927bc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -24,12 +24,15 @@
 
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc (strncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strncpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strncpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strncpy_power7
             : __strncpy_ppc);
 
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
similarity index 59%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/power8/stpncpy.S
index dbf8521..76a1466 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
-   Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,5 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
-
-libc_ifunc (__stpncpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __stpncpy_power7
-            : __stpncpy_ppc);
-
-weak_alias (__stpncpy, stpncpy)
-#endif
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000..5fda953
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -0,0 +1,424 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+	.machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+
+        /* Check if the [src]+15 will cross a 4K page by checking if the bit
+           indicating the page size changes.  Basically:
+
+           uint64_t srcin = (uint64_t)src;
+           uint64_t ob = srcin & 4096UL;
+           uint64_t nb = (srcin+15UL) & 4096UL;
+           if (ob ^ nb)
+             goto pagecross;  */
+
+	addi	r10,r4,16
+	rlwinm	r9,r4,0,19,19
+
+	/* Since it is a leaf function, save some non-volatile registers on the
+	   protected/red zone.  */
+	std	r26,-48(r1)
+	std	r27,-40(r1)
+
+	rlwinm	r8,r10,0,19,19
+
+	std	r28,-32(r1)
+	std	r29,-24(r1)
+
+	cmpld	r7,r9,r8
+
+	std	r30,-16(r1)
+	std	r31,-8(r1)
+
+	beq	cr7,L(unaligned_lt_16)
+	rldicl	r9,r4,0,61
+	subfic	r8,r9,8
+	cmpld	cr7,r5,r8
+	bgt 	cr7,L(pagecross)
+
+	/* At this points there is 1 to 15 bytes to check and write.  Since it could
+	   be either from first unaligned 16 bytes access or from bulk copy, the code
+	   uses an unrolled byte read/write instead of trying to analyze the cmpb
+	   results.  */
+L(short_path):
+	mr	r9,r3
+L(short_path_1):
+	cmpdi	cr7,r5,0
+	beq	cr7,L(short_path_loop_end_1)
+L(short_path_2):
+	lbz	r10,0(r4)
+	cmpdi	cr7,r10,0
+	stb	r10,0(r9)
+	beq	cr7,L(zero_pad_start_1)
+	cmpdi	cr0,r5,1
+	addi	r8,r9,1
+	addi	r6,r5,-1
+	beq	cr0,L(short_path_loop_end_0)
+	lbz	r10,1(r4)
+	cmpdi	cr7,r10,0
+	stb	r10,1(r9)
+	beq	cr7,L(zero_pad_start_prepare_1)
+	addi	r10,r5,-3
+	b	L(short_path_loop_1)
+
+	.align	4
+L(short_path_loop):
+	lbz	r8,0(r4)
+	addi	r7,r10,-2
+	cmpdi	cr5,r8,0
+	stb	r8,0(r9)
+	beq	cr5,L(zero_pad_start_1)
+	beq	r7,L(short_path_loop_end_0)
+	lbz	r8,1(r4)
+	cmpdi	cr7,r8,0
+	stb	r8,1(r9)
+	beq	cr7,L(zero_pad_start)
+	mr	r10,r7
+L(short_path_loop_1):
+	addic.	r5,r5,-2
+	addi	r9,r9,2
+	cmpdi	cr7,r10,0
+	addi	r4,r4,2
+	addi	r6,r9,1
+	bne	cr0,L(short_path_loop)
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+	b	L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+	addi	r3,r9,1
+	b	L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+#endif
+L(short_path_loop_end):
+	/* Restore non-volatile registers.  */
+	ld	r26,-48(r1)
+	ld	r27,-40(r1)
+	ld	r28,-32(r1)
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+	/* This code pads the remainder dest with NULL bytes.  The algorithm
+	   calculate the remanining size and issues a doubleword unrolled
+	   loops followed by a byte a byte set.  */
+	.align	4
+L(zero_pad_start):
+	mr	r5,r10
+	mr	r9,r6
+L(zero_pad_start_1):
+	srdi.	r8,r5,r3
+	mr	r10,r9
+#ifdef USE_AS_STPNCPY
+	mr	r3,r9
+#endif
+	beq-	cr0,L(zero_pad_loop_b_start)
+	cmpldi	cr7,r8,1
+	li	cr7,0
+	std	r7,0(r9)
+	beq	cr7,L(zero_pad_loop_b_prepare)
+	addic.	r8,r8,-2
+	addi	r10,r9,r16
+	std	r7,8(r9)
+	beq	cr0,L(zero_pad_loop_dw_2)
+	std	r7,16(r9)
+	li	r9,0
+	b	L(zero_pad_loop_dw_1)
+
+	.align	4
+L(zero_pad_loop_dw):
+	addi	r10,r10,16
+	std	r9,-8(r10)
+	beq	cr0,L(zero_pad_loop_dw_2)
+	std	r9,0(r10)
+L(zero_pad_loop_dw_1):
+	cmpldi	cr7,r8,1
+	std	r9,0(r10)
+	addic.	r8,r8,-2
+	bne	cr7,L(zero_pad_loop_dw)
+	addi	r10,r10,8
+L(zero_pad_loop_dw_2):
+	rldicl	r5,r5,0,61
+L(zero_pad_loop_b_start):
+	cmpdi	cr7,r5,0
+	addi	r5,r5,-1
+	addi	r9,r10,-1
+	add	r10,r10,5
+	subf	r10,r9,r10
+	li	r8,0
+	beq-	cr7,L(short_path_loop_end)
+
+	/* Write remaining 1-8 bytes.  */
+        .align  4
+	addi	r9,r9,1
+	mtocrf	0x1,r10
+	bf	29,4f
+        stw     r8,0(r9)
+        addi	r9,r9,4
+
+        .align  4
+4:      bf      30,2f
+        sth     r8,0(r9)
+        addi	r9,r9,2
+
+        .align  4
+2:      bf	31,1f
+        stb	r8,0(r9)
+
+	/* Restore non-volatile registers.  */
+1:	ld	r26,-48(r1)
+	ld	r27,-40(r1)
+	ld	r28,-32(r1)
+	ld	r29,-24(r1)
+	ld	r30,-16(r1)
+	ld	r31,-8(r1)
+	blr
+
+	/* The common case where [src]+16 will not cross a 4K page boundary.
+	   In this case the code fast check the first 16 bytes by using doubleword
+	   read/compares and update destiny if neither total size or null byte
+	   is found in destiny. */
+	.align	4
+L(unaligned_lt_16):
+	cmpldi	cr7,r5,7
+	ble	cr7,L(short_path)
+	ld	r7,0(r4)
+	li	r8,0
+	cmpb	r8,r7,r8
+	cmpdi	cr7,r8,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r6,r5,-8
+	std	r7,0(r3)
+	addi	r9,r3,r8
+	cmpldi	cr7,r6,7
+	addi	r7,r4,8
+	ble	cr7,L(short_path_prepare_1_1)
+	ld	r4,8(r4)
+	cmpb	r8,r4,r8
+	cmpdi	cr7,r8,0
+	bne	cr7,L(short_path_prepare_2_1)
+	std	r4,8(r3)
+	addi	r29,r3,16
+	addi	r5,r5,-16
+	/* Neither the null byte was found or total length was reached,
+	   align to 16 bytes and issue a bulk copy/compare.  */
+	b	L(align_to_16b)
+
+	/* In the case of 4k page boundary cross, the algorithm first align
+	   the address to a doubleword, calculate a mask based on alignment
+	   to ignore the bytes and continue using doubleword.  */
+	.align	4
+L(pagecross):
+	rldicr	r11,r4,0,59	/* Align the address to 8 bytes boundary.  */
+	li	r6,-1		/* MASK = 0xffffffffffffffffUL.  */
+	sldi	r9,r9,3		/* Calculate padding.  */
+	ld	r7,0(r11)	/* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+	sld	r9,r6,r9	/* MASK = MASK << padding.  */
+#else
+	srd	r9,r6,r9	/* MASK = MASK >> padding.  */
+#endif
+	orc	r9,r7,r9	/* Mask bits that are not part of the
+				   string.  */
+	li	cr7,0
+	cmpb	r9,r9,r7	/* Check for null bytes in DWORD1.  */
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	subf	r8,r8,r5	/* Adjust total length.  */
+	cmpldi	cr7,r8,8	/* Check if length was reached.  */
+	ble	cr7,L(short_path_prepare_2)
+
+	/* For next checks we have aligned address, so we check for more
+	   three doublewords to make sure we can read 16 unaligned bytes
+	   to start the bulk copy with 16 aligned addresses.  */
+	ld	cr7,8(r11)
+	cmpb	r9,r7,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	cr7,r8,-8
+	cmpldi	cr7,r7,8
+	ble	cr7,L(short_path_prepare_2)
+	ld	cr7,16(r11)
+	cmpb	r9,r7,r9
+	cmpdi	cr7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+	addi	r8,r8,-16
+	cmpldi	r7,r8,8
+	ble	cr7,L(short_path_prepare_2)
+	ld	r8,24(r11)
+	cmpb	r9,r8,r9
+	cmpdi	r7,r9,0
+	bne	cr7,L(short_path_prepare_2)
+
+	/* No null byte found in the 32 bytes readed and length not reached,
+	   read source again using unaligned loads and store them.  */
+	ld	r9,0(r4)
+	addi	r29,r3,16
+	addi	r5,r5,-16
+	std	r9,0(r3)
+	ld	r9,8(r4)
+	std	r9,8(r3)
+
+	/* Align source to 16 bytes and adjust destiny and size.  */
+L(align_to_16b):
+	rldicl	r9,r10,0,60
+	rldicr	r28,r10,0,59
+	add	r12,r5,r9
+	subf	r29,r9,r29
+
+	/* The bulk read/compare/copy loads two doublewords, compare and merge
+	   in a single register for speed.  This is an attempt to speed up the
+	   null-checking process for bigger strings.  */
+
+	cmpldi	cr7,r12,15
+	ble	cr7,L(short_path_prepare_1_2)
+
+	/* Main loop for large sizes, unrolled 2 times to get better use of
+	   pipeline.  */
+	ld	r8,0(28)
+	ld	r10,8(28)
+	li	r9,0
+	cmpb	r7,r8,r9
+	cmpb	r9,r10,r9
+	or.	r6,r9,r7
+	bne	cr0,L(short_path_prepare_2_3)
+	addi	r5,r12,-16
+	addi	r4,r28,16
+	std	r8,0(r29)
+	std	r10,8(r29)
+	cmpldi	cr7,r5,15
+	addi	r9,r29,16
+	ble	cr7,L(short_path_1)
+	mr	r11,r28
+	mr	r6,r29
+	li	r30,0
+	subfic	r26,r4,48
+	subfic	r27,r9,48
+
+	b	L(loop_16b)
+
+	.align	4
+L(loop_start):
+	ld	r31,0(r11)
+	ld	r10,8(r11)
+	cmpb	r0,r31,r7
+	cmpb	r8,r10,r7
+	or.	r7,r0,r8
+	addi	r5,r5,-32
+	cmpldi	cr7,r5,15
+	add	r4,r4,r26
+	add	r9,r9,r27
+	bne	cr0,L(short_path_prepare_2_2)
+	add	r4,r28,r4
+	std	r31,0(r6)
+	add	r9,r29,r9
+	std	r10,8(r6)
+	ble	cr7,L(short_path_1)
+
+L(loop_16b):
+	ld	r10,16(r11)
+	ld	r0,24(r11)
+	cmpb	r8,r10,r30
+	cmpb	r7,r0,r30
+	or.	r7,r8,r7
+	addi	r12,r12,-32
+	cmpldi	r7,r12,15
+	addi	r11,r11,32
+	bne	cr0,L(short_path_2)
+	std	r10,16(r6)
+	addi	r6,r6,32
+	std	r0,-8(r6)
+	bgt	cr7,L(loop_start)
+
+	mr	r5,r12
+	mr	r4,r11
+	mr	r9,r6
+	b	L(short_path_1)
+
+	.align	4
+L(short_path_prepare_1_1):
+	mr	r5,r6
+	mr	r4,r7
+	b	L(short_path_1)
+L(short_path_prepare_1_2):
+	mr	r5,r12
+	mr	r4,r28
+	mr	r9,r29
+	b	L(short_path_1)
+L(short_path_prepare_2):
+	mr	r9,r3
+	b	L(short_path_2)
+L(short_path_prepare_2_1):
+	mr	r5,r6
+	mr	r4,r7
+	b	L(short_path_2)
+L(short_path_prepare_2_2):
+	mr	r5,r12
+	mr	r4,r11
+	mr	r9,r6
+	b	L(short_path_2)
+L(short_path_prepare_2_3):
+	mr	r5,r12
+	mr	r4,r28
+	mr	r9,r29
+	b	L(short_path_2)
+L(zero_pad_loop_b_prepare):
+	addi	r10,r9,8
+	rldicl	r5,r5,0,61
+	b	L(zero_pad_loop_b_start)
+L(zero_pad_start_prepare_1):
+	mr	r5,r6
+	mr	r9,r8
+	b	L(zero_pad_start_1)
+END (FUNC_NAME)
+
+#ifdef USE_AS_STPNCPY
+libc_hidden_def (__stpncpy)
+#else
+libc_hidden_builtin_def (strncpy)
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4242356131256e54ca3e96b0c6f2af773b7a69c8

commit 4242356131256e54ca3e96b0c6f2af773b7a69c8
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Tue Dec 23 13:39:23 2014 -0500

    powerpc: Optimized strncat for POWER7/PPC64
    
    With 3eb38795dbbbd816 (Simplify strncat) the generic algorithms uses
    strlen, strnlen, and memcpy.  This is faster than POWER7 current
    implementation, especially for unaligned strings (where POWER7 code
    uses byte-byte operations).
    
    This patch removes the assembly implementation and uses a multiarch
    specialization based on default algorithm calling optimized POWER7
    symbols.

diff --git a/ChangeLog b/ChangeLog
index 4ff5e7d..0c3f78d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
+	* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
+	* sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
+
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
 	strncat-power8 object.
 	* sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
deleted file mode 100644
index ead4a9a..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Optimized strncat implementation for POWER7.
-   Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#undef EALIGN
-#define EALIGN(name, alignt, words)				\
-  .section ".text";						\
-  ENTRY_2(__strncat_power7)					\
-  .align ALIGNARG(alignt);					\
-  EALIGN_W_##words;						\
-  BODY_LABEL(__strncat_power7):					\
-  cfi_startproc;						\
-  LOCALENTRY(__strncat_power7)
-
-#undef END
-#define END(name)						\
-  cfi_endproc;							\
-  TRACEBACK(__strncat_power7)					\
-  END_2(__strncat_power7)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#define STRLEN __strlen_power7
-
-#include <sysdeps/powerpc/powerpc64/power7/strncat.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
new file mode 100644
index 0000000..39b1aeb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
@@ -0,0 +1,31 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/ >.  */
+
+#include <string.h>
+
+#define STRNCAT __strncat_power7
+
+extern __typeof (strncat) __strncat_power7 attribute_hidden;
+extern __typeof (strlen) __strlen_power7 attribute_hidden;
+extern __typeof (strnlen) __strnlen_power7 attribute_hidden;
+extern __typeof (memcpy) __memcpy_power7 attribute_hidden;
+
+#define strlen    __strlen_power7
+#define __strnlen __strnlen_power7
+#define memcpy    __memcpy_power7
+
+#include <string/strncat.c>
diff --git a/sysdeps/powerpc/powerpc64/power7/strncat.S b/sysdeps/powerpc/powerpc64/power7/strncat.S
deleted file mode 100644
index f5ea52d..0000000
--- a/sysdeps/powerpc/powerpc64/power7/strncat.S
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Optimized strncat implementation for PowerPC64/POWER7.
-
-   Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-/* The algorithm is as follows for aligned memory access :
-
-   if address of s2 is divisible by 0x7UL,
-       perform aligned doubleword catenation
-   else
-       perform unaligned catenation
-
-   The aligned comparison are made using cmpb instructions.  */
-
-/* char* [r3] strncat (const char *s1 [r3],
-                       const char *s2 [r4],
-                       size_t size [r5])  */
-
-#include <sysdep.h>
-
-#ifndef STRNCAT
-# undef strncat
-# define STRNCAT  strncat
-#endif
-
-#ifndef STRLEN
-/* For builds with no IFUNC support, local calls should be made to internal
-   GLIBC symbol (created by libc_hidden_builtin_def).  */
-# ifdef SHARED
-#  define STRLEN   __GI_strlen
-# else
-#  define STRLEN   strlen
-# endif
-#endif
-
-#define	FRAMESIZE	(FRAME_MIN_SIZE+32)
-
-	.machine  power7
-EALIGN(STRNCAT, 4, 0)
-	CALL_MCOUNT 3
-
-	mflr r0				/* Load link register LR to r0.  */
-
-/* We shall use r29, r30 and r31 non volatile register for retention.
-   Save all the callee registers in the GPR save area.  */
-	std r29, -24(r1)		/* Save callers register r29.  */
-	std r30, -16(r1)		/* Save callers register r30.  */
-	std r31, -8(r1)			/* Save callers register r31.  */
-
-	std r0, 16(r1)			/* Store the link register.  */
-	stdu r1, -FRAMESIZE(r1)		/* Create the stack frame.  */
-
-/* Improve performance with CPU pre-fetch.  */
-	dcbt 0, r3			/* Pre-fetch str to avoid cache
-					   miss.  */
-	dcbt 0, r4			/* Pre-fetch accept to avoid cache
-					   miss.  */
-
-	mr. r29, r5			/* Save "n" in r29.  */
-	mr r30, r3			/* Save "s1" in r30 from r3.  */
-	beq cr0,L(done)
-
-	mr r31, r4			/* Save "s2" in r31 from r4.  */
-	bl STRLEN			/* Call optimized strlen on s1; goto
-					   end of s1.  */
-	nop
-	cmpldi cr7, r29, 7		/* If s2 is <=7 process
-					    byte-by-byte.  */
-	add r3, r30, r3			/* Grab the last character of s1.  */
-	bgt cr7,L(alignment)		/* Process by aligned strings.  */
-
-	cmpldi cr7, r29, 3		/* If n is >= 4, we can
-					   byte-unroll.  */
-	addi r9, r3, -1			/* Make "s1" point before next
-					   character, increment when read.  */
-	bgt cr7, L(bytes_unroll)	/* Process each byte.  */
-
-L(byte_by_byte):
-	lbz r10, 0(r31)
-	addi r8, r9, 1
-	cmpdi cr7, r10, 0		/* Check for NULL in "s2".  */
-	stb r10, 1(r9)
-	beq cr7, L(done)
-	add r9, r9, r29
-	subf r9, r8, r9
-	addi r9, r9, 1
-	mtctr r9
-	b L(branch2)
-	.p2align 4
-L(branch1):
-	lbzu r10, 1(r31)
-	cmpdi cr7, r10, 0
-	stbu r10, 1(r8)
-	beq cr7,L(done)
-L(branch2):
-	mr r9, r8
-	bdnz L(branch1)
-	beq cr7,L(done)
-L(nullTerminate):
-	li r10, 0			/* Load NULL for termination.  */
-	stb r10, 1(r9)			/* Append or terminate s1 with
-					   NULL.  */
-	.p2align 4			/* A small section here.  */
-L(done):				/* We return now.   */
-	addi r1, r1, FRAMESIZE		/* Restore stack pointer.  */
-	mr r3, r30			/* Set the return value length of
-					   string.  */
-	ld r0, 16(r1)			/* Read the saved link register.  */
-	ld r29, -24(r1)			/* Restore save register r29.  */
-	ld r30, -16(r1)			/* Restore save register r30.  */
-	ld r31, -8(r1)			/* Restore save register r31.  */
-	mtlr r0				/* Restore link register.  */
-	blr				/* Branch to link register.  */
-
-	.p2align 4
-L(alignment):
-	rldicl. r9, r31, 0, 61		/* Check if s2 is 8byte aligned  */
-	beq cr0,L(dwordAligned)
-
-	.p2align 4
-/* Unaligned bytes in string, so process byte by byte.
-   POWER7 has performance gains over loop unroll.  */
-L(bytes_unroll):
-	addi r9, r3, -1
-	srdi r10, r29, 2
-	mtctr r10
-	b L(L10)
-	.p2align 4
-L(L44):
-	lbz r10, 1(r31)			/* Load byte.  */
-	cmpdi cr7, r10, 0		/* Compare ; if byte not zero,
-					   continue.  */
-	stb r10, 2(r9)			/* Store byte  */
-	beq cr7, L(done)
-	addi r31, r31, 4
-
-	lbz r10, -2(r31)		/* Perform loop unroll here on byte
-					   load and store.  */
-	cmpdi cr7, r10, 0
-	stb r10, 3(r9)
-	beq cr7, L(done)
-
-	lbz r10, -1(r31)		/* Loop unroll here.  */
-	cmpdi cr7, r10, 0
-	stbu r10, 4(r9)
-	beq cr7, L(done)
-
-	bdz L(leftNbytes)
-
-L(L10):
-	lbz r10, 0(r31)			/* Loop unroll here.  */
-	cmpdi cr7, r10, 0
-	stb r10, 1(r9)
-	bne cr7,L(L44)
-	b L(done)
-	.p2align 4
-/* If s2 is double word aligned, we load and store double word.  */
-L(dwordAligned):
-/* read, write 8 bytes at a time  */
-	srdi r8, r29, 3			/* Compute count for CTR to loop;
-					   count = n/8.  */
-	li r7, 0			/* Load r7 with NULL.  */
-	li r10, 0			/* Load r10 with MASK '0'.  */
-
-	mtctr r8			/* Move count to CTR.  */
-L(loop8):
-	ld r9, 0(r31)			/* Read double word from s2.  */
-	cmpb r6, r9, r10		/* Compare bytes in s2 we read
-					   just now.  */
-	cmpdi r6, 0			/* If cmpb returned NULL,
-					   we continue.  */
-	bne+ L(a8)
-	std r9, 0(r3)			/* Append double word from s2
-					   with s1.  */
-	addi r3, r3, 8			/* Increment s1.  */
-	addi r31, r31, 8		/* Increment s2.  */
-	subi r29, r29, 8		/* Decrement count by 8.  */
-	bdnz L(loop8)			/* Continue until "count" is
-					   non zero.  */
-
-L(a8):
-	cmpdi r29, 0			/* If "n" is already zero, we skip. */
-	beq+ L(align8align)
-
-	mtctr r29			/* Process left over bytes in "n".  */
-L(unaligned0):
-	lbz r9, 0(r31)			/* Read a byte from s2.  */
-	cmpw r9, r7			/* If byte is NULL, we stop here . */
-	beq+ L(align8align)		/* Skip processing further if NULL.  */
-	stb  r9, 0(r3)			/* If not NULL, store byte into s1.  */
-	addi r3, r3, 1			/* Increment s1 by 1.  */
-	addi r31, r31, 1		/* Increment s2 by 1.  */
-	bdnz L(unaligned0)		/* Decrement counter "n" and loop
-					   until non zero.  */
-L(align8align):
-	stb r7, 0(r3)			/* Terminate s1 with NULL.  */
-
-	addi r1, r1, FRAMESIZE		/* Restore stack pointer.  */
-	mr r3, r30			/* Set the return value, length of
-					   string.  */
-	ld r0, 16(r1)			/* Read the saved link register.  */
-	ld r29, -24(r1)			/* Restore save register r29.  */
-	ld r30, -16(r1)			/* Restore save register r30.  */
-	ld r31, -8(r1)			/* Restore save register r31.  */
-	mtlr r0				/* Restore link register.  */
-	blr				/* Branch to link register  */
-
-	.p2align 4
-L(leftNbytes):
-	rldicl. r29, r29, 0, 62		/* Check if n>0 and n < 4 bytes.  */
-	bne cr0,L(byte_by_byte)		/* Process bytes one by one. */
-	b L(nullTerminate)		/* Now, finish catenation with
-					   NULL termination.  */
-END(STRNCAT)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=11ce06e589429143595a6c4b60ac7ab6372201b1

commit 11ce06e589429143595a6c4b60ac7ab6372201b1
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Tue Dec 23 13:36:34 2014 -0500

    powerpc: Optimized strcat for POWER8/PPC64
    
    With new optimized strcpy for POWER8, this patch adds an optimized
    strcat which uses it along with default implementation at strings/.

diff --git a/ChangeLog b/ChangeLog
index b542cf0..4ff5e7d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,15 @@
 2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+	strncat-power8 object.
+	* sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
+	__strcat_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strcat_power8 implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c: New file:
+	optimized strcat for power8.
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
 	strcpy-power8 and stpcpy-power8 objects.
 	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 	(__libc_ifunc_impl_list): Add __strcpy_power8 and __stpcpy_power8
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index f170551..74b2daa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -18,8 +18,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
-		   strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
-		   bcopy-ppc64
+		   strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
+		   memmove-ppc64 bcopy-ppc64
 
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 2a7e7f5..d5b2184 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -303,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
 	      IFUNC_IMPL_ADD (array, i, strcat,
+			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strcat_power8)
+	      IFUNC_IMPL_ADD (array, i, strcat,
 			      hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcat_power7)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcat.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
index 847a62d..6c7544c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
@@ -1,5 +1,4 @@
-/* Multiple versions of strcat. PowerPC64 version.
-   Copyright (C) 2014 Free Software Foundation, Inc.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -9,23 +8,23 @@
 
    The GNU C Library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
    Lesser General Public License for more details.
 
    You should have received a copy of the GNU Lesser General Public
    License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
+   <http://www.gnu.org/licenses/ >.  */
 
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <string.h>
 
-extern __typeof (strcat) __strcat_ppc attribute_hidden;
-extern __typeof (strcat) __strcat_power7 attribute_hidden;
+#define STRCAT __strcat_power8
 
-libc_ifunc (strcat,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcat_power7
-            : __strcat_ppc);
-#endif
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+extern typeof (strcpy) __strcpy_power8;
+extern typeof (strlen) __strlen_power7;
+
+#define strcpy __strcpy_power8
+#define strlen __strlen_power7
+#include <sysdeps/powerpc/strcat.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
index 847a62d..289e9b2 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
@@ -23,9 +23,12 @@
 
 extern __typeof (strcat) __strcat_ppc attribute_hidden;
 extern __typeof (strcat) __strcat_power7 attribute_hidden;
+extern __typeof (strcat) __strcat_power8 attribute_hidden;
 
 libc_ifunc (strcat,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcat_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strcat_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strcat_power7
             : __strcat_ppc);
 #endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a9728856f02f74b60a546499c5bd8492d1726f98

commit a9728856f02f74b60a546499c5bd8492d1726f98
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Tue Dec 23 05:59:44 2014 -0600

    powerpc: Optimized st{r,p}cpy for POWER8/PPC64
    
    This patch adds an optimized POWER8 strcpy using unaligned accesses.
    For strings up to 16 bytes the implementation first calculate the
    string size, like strlen, and issues a memcpy.  For larger strings,
    source is first aligned to 16 bytes and then tested over a loop that
    reads 16 bytes am combine the cmpb results for speedup.  Special case is
    added for page cross reads.
    
    It shows 30%-60% improvement over the optimized POWER7 one that uses
    only aligned accesses.

diff --git a/ChangeLog b/ChangeLog
index 73bf51e..b542cf0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2015-01-13  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+	strcpy-power8 and stpcpy-power8 objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add __strcpy_power8 and __stpcpy_power8
+	implementations.
+	* sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S: New file:
+	multiarch stpcpy implementation for POWER8.
+	* sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S: New file;
+	multiarch strcpy implementation for POWER8.
+	* sysdeps/powerpc/powerpc64/multiarch/strcpy.c (strcpy): Add
+	__strcpy_power8 function.
+	* sysdeps/powerpc/powerpc64/power8/stpcpy.S: New file: optimized
+	stpcpy for POWER8.
+	* sysdeps/powerpc/powerpc64/power8/strcpy.S: New file: optimized
+	strcpy for POWER8.
+	* NEWS: Update.
+
 2014-12-31  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
 	    Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
diff --git a/NEWS b/NEWS
index f9beb9f..769e841 100644
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,8 @@ Version 2.20.1
 
   16617, 17266, 17370, 17371, 17625, 17630.
 
+* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+
 * CVE-2104-7817 The wordexp function could ignore the WRDE_NOCMD flag
   under certain input conditions resulting in the execution of a shell for
   command substitution when the applicaiton did not request it. The
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 39e441b..f170551 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -13,7 +13,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \
 		   wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \
 		   wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
-		   strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
+		   strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
+		   stpcpy-power7 stpcpy-ppc64 \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
 		   strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 8f1e3e1..2a7e7f5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -83,6 +83,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __strcpy_power8)
 	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1,
@@ -90,6 +92,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpcpy.c.  */
   IFUNC_IMPL (i, name, stpcpy,
+	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __stpcpy_power8)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __stpcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
index 1b6e9e0..66e6f70 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+/* Optimized stpcpy implementation for POWER8/PPC64.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if defined SHARED && !defined NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
 
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__stpcpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__stpcpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__stpcpy_power8)
 
-libc_ifunc (strcpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcpy_power7
-            : __strcpy_ppc);
-#endif
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__stpcpy_power8)					\
+  END_2(__stpcpy_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
index 1b6e9e0..64cbc16 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+/* Optimized strcpy implementation for POWER8/PPC64.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if defined SHARED && !defined NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
 
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__strcpy_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__strcpy_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__strcpy_power8)
 
-libc_ifunc (strcpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcpy_power7
-            : __strcpy_ppc);
-#endif
+#undef END
+#define END(name)						\
+  cfi_endproc;							\
+  TRACEBACK(__strcpy_power8)					\
+  END_2(__strcpy_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
index 1b6e9e0..20ef73f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
@@ -23,9 +23,12 @@
 
 extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
 extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
 
 libc_ifunc (strcpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcpy_power7
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __strcpy_power8 :
+              (hwcap & PPC_FEATURE_HAS_VSX)
+              ? __strcpy_power7
             : __strcpy_ppc);
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
similarity index 61%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/power8/stpcpy.S
index 1b6e9e0..bf72065 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
+/* Optimized stpcpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,9 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#if defined SHARED && !defined NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#define USE_AS_STPCPY
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
 
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
-
-libc_ifunc (strcpy,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strcpy_power7
-            : __strcpy_ppc);
-#endif
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
new file mode 100644
index 0000000..d3e9a10
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -0,0 +1,262 @@
+/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8.
+   Copyright (C) 2015 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPCPY
+# define FUNC_NAME __stpcpy
+#else
+# define FUNC_NAME strcpy
+#endif
+
+/* Implements the function
+
+   char * [r3] strcpy (char *dest [r3], const char *src [r4])
+
+   or
+
+   char * [r3] stpcpy (char *dest [r3], const char *src [r4])
+
+   if USE_AS_STPCPY is defined.
+
+   The implementation uses unaligned doubleword access to avoid specialized
+   code paths depending of data alignment.  Although recent powerpc64 uses
+   64K as default, the page cross handling assumes minimum page size of
+   4k.  */
+
+	.machine  power7
+EALIGN (FUNC_NAME, 4, 0)
+        li      r0,0          /* Doubleword with null chars to use
+                                 with cmpb.  */
+
+	/* Check if the [src]+15 will cross a 4K page by checking if the bit
+	   indicating the page size changes.  Basically:
+
+	   uint64_t srcin = (uint64_t)src;
+	   uint64_t ob = srcin & 4096UL;
+	   uint64_t nb = (srcin+15UL) & 4096UL;
+	   if (ob ^ nb)
+	     goto pagecross;  */
+
+	addi	r9,r4,15
+	xor	r9,r9,r4
+	rlwinm.	r9,r9,0,19,19
+	bne	L(pagecross)
+
+	/* For short string (less than 16 bytes), just calculate its size as
+	   strlen and issues a memcpy if null is found.  */
+	mr	r7,r4
+        ld      r12,0(r7)     /* Load doubleword from memory.  */
+        cmpb    r10,r12,r0    /* Check for null bytes in DWORD1.  */
+        cmpdi   cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+        bne     cr7,L(done)
+
+        ldu     r8,8(r7)
+        cmpb    r10,r8,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+	b	L(loop_before)
+
+	.align	4
+L(pagecross):
+	clrrdi  r7,r4,3       /* Align the address to doubleword boundary.  */
+	rlwinm  r6,r4,3,26,28 /* Calculate padding.  */
+	li      r5,-1         /* MASK = 0xffffffffffffffff.  */
+        ld      r12,0(r7)     /* Load doubleword from memory.  */
+#ifdef __LITTLE_ENDIAN__
+        sld     r5,r5,r6
+#else
+        srd     r5,r5,r6      /* MASK = MASK >> padding.  */
+#endif
+        orc     r9,r12,r5     /* Mask bits that are not part of the string.  */
+        cmpb    r10,r9,r0     /* Check for null bytes in DWORD1.  */
+        cmpdi   cr7,r10,0     /* If r10 == 0, no null's have been found.  */
+        bne     cr7,L(done)
+
+        ldu     r6,8(r7)
+        cmpb    r10,r6,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+        ld      r12,0(r7)
+        cmpb    r10,r12,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+        ldu     r6,8(r7)
+        cmpb    r10,r6,r0
+        cmpdi   cr7,r10,0
+        bne     cr7,L(done)
+
+	/* We checked for 24 - x bytes, with x being the source alignment
+	   (0 <= x <= 16), and no zero has been found.  Start the loop
+	   copy with doubleword aligned address.  */
+	mr	r7,r4
+	ld	r12, 0(r7)
+	ldu	r8, 8(r7)
+
+L(loop_before):
+	/* Save the two doublewords readed from source and align the source
+	   to 16 bytes for the loop.  */
+	mr	r11,r3
+	std	r12,0(r11)
+	std	r8,8(r11)
+	addi	r11,r11,16
+	rldicl	r9,r4,0,60
+	subf	r7,r9,r7
+	subf	r11,r9,r11
+	b	L(loop_start)
+
+        .align  5
+L(loop):
+        std     r12, 0(r11)
+        std     r6, 8(r11)
+	addi	r11,r11,16
+L(loop_start):
+        /* Load two doublewords, compare and merge in a
+           single register for speed.  This is an attempt
+           to speed up the null-checking process for bigger strings.  */
+
+        ld      r12, 8(r7)
+        ldu     r6, 16(r7)
+        cmpb    r10,r12,r0
+        cmpb    r9,r6,r0
+        or      r8,r9,r10     /* Merge everything in one doubleword.  */
+        cmpdi   cr7,r8,0
+        beq     cr7,L(loop)
+
+
+        /* OK, one (or both) of the doublewords contains a null byte.  Check
+           the first doubleword and decrement the address in case the first
+           doubleword really contains a null byte.  */
+
+	addi	r4,r7,-8
+        cmpdi   cr6,r10,0
+        addi    r7,r7,-8
+        bne     cr6,L(done2)
+
+        /* The null byte must be in the second doubleword.  Adjust the address
+           again and move the result of cmpb to r10 so we can calculate the
+           length.  */
+
+        mr      r10,r9
+        addi    r7,r7,8
+	b	L(done2)
+
+        /* r10 has the output of the cmpb instruction, that is, it contains
+           0xff in the same position as the null byte in the original
+           doubleword from the string.  Use that to calculate the length.  */
+L(done):
+	mr	r11,r3
+L(done2):
+#ifdef __LITTLE_ENDIAN__
+        addi    r9, r10, -1   /* Form a mask from trailing zeros.  */
+        andc    r9, r9, r10
+        popcntd r6, r9        /* Count the bits in the mask.  */
+#else
+        cntlzd  r6,r10        /* Count leading zeros before the match.  */
+#endif
+        subf    r5,r4,r7
+        srdi    r6,r6,3       /* Convert leading/trailing zeros to bytes.  */
+        add     r8,r5,r6      /* Compute final length.  */
+#ifdef USE_AS_STPCPY
+	/* stpcpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
+	addi	r8,r8,1       /* Final '/0'.  */
+
+	cmpldi	cr6,r8,8
+	mtocrf	0x01,r8
+	ble	cr6,L(copy_LE_8)
+
+	cmpldi	cr1,r8,16
+	blt	cr1,8f
+
+	/* Handle copies of 0~31 bytes.  */
+	.align	4
+L(copy_LT_32):
+	/* At least 6 bytes to go.  */
+	blt	cr1,8f
+
+	/* Copy 16 bytes.  */
+	ld	r6,0(r4)
+	ld	r8,8(r4)
+	addi	r4,r4,16
+	std	r6,0(r11)
+	std	r8,8(r11)
+	addi	r11,r11,16
+8:	/* Copy 8 bytes.  */
+	bf	28,L(tail4)
+	ld	r6,0(r4)
+	addi	r4,r4,8
+	std	r6,0(r11)
+	addi	r11,r11,8
+
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	r6,0(r4)
+	stw	r6,0(r11)
+	bf	30,L(tail5)
+	lhz	r7,4(r4)
+	sth	r7,4(r11)
+	bflr	31
+	lbz	r8,6(r4)
+	stb	r8,6(r11)
+	blr
+
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	lhz	r6,0(r4)
+	sth	r6,0(r11)
+	bflr	31
+	lbz	r7,2(r4)
+	stb	r7,2(r11)
+	blr
+
+	.align	4
+L(tail5):
+	bf	31,1f
+	lbz	r6,4(r4)
+	stb	r6,4(r11)
+	blr
+
+	.align	4
+1:
+	bflr	31
+	lbz	r6,0(r4)
+	stb	r6,0(r11)
+	blr
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(copy_LE_8):
+	bne	cr6,L(tail4)
+	ld	r6,0(r4)
+	std	r6,0(r11)
+	blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b53db929e654aaf97a2a239e17a87b04c768b854

commit b53db929e654aaf97a2a239e17a87b04c768b854
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date:   Wed Dec 31 14:05:00 2014 -0500

    powerpc: POWER7 strcpy optimization for unaligned strings
    
    This patch optimizes strcpy for ppc64/power7 for unaligned source or
    destination address.  The source or destination address is aligned
    to doubleword and data is shifted based on the alignment and
    added with the previous loaded data to be written as a doubleword.
    For each load, cmpb instruction is used for faster null check.
    
    The word aligned optimization is also removed, since the new unaligned
    code path shows better results handling word-aligned strings.
    
    More combination of unaligned inputs is also added in benchtest
    to measure the improvement.The new optimization shows 2 to 80% of
    performance improvement for longer string though it does not show
    big difference on string size less than 16 due to additional checks.

diff --git a/ChangeLog b/ChangeLog
index 991ba72..73bf51e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2014-12-31  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+	    Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power7/strcpy.S (strcpy): Optimize unaligned
+	path.
+	* benchtests/bench-strcpy.c (test_main): Add more unaligned inputs.
+
 2014-12-16  Florian Weimer  <fweimer@redhat.com>
 
 	[BZ #17630]
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index c3ab4cf..e9445f2 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -171,6 +171,22 @@ test_main (void)
       do_test (i, i, 8 << i, BIG_CHAR);
     }
 
+  for (i = 16; i <= 512; i+=4)
+    {
+      do_test (0, 4, i, SMALL_CHAR);
+      do_test (4, 0, i, BIG_CHAR);
+      do_test (4, 4, i, SMALL_CHAR);
+      do_test (2, 2, i, BIG_CHAR);
+      do_test (2, 6, i, SMALL_CHAR);
+      do_test (6, 2, i, BIG_CHAR);
+      do_test (1, 7, i, SMALL_CHAR);
+      do_test (7, 1, i, BIG_CHAR);
+      do_test (3, 4, i, SMALL_CHAR);
+      do_test (4, 3, i, BIG_CHAR);
+      do_test (5, 7, i, SMALL_CHAR);
+      do_test (7, 5, i, SMALL_CHAR);
+    }
+
   return ret;
 }
 
diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S
index ce71982..115f98a 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S
@@ -31,8 +31,6 @@
 
    if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
      goto aligned_doubleword_copy;
-   if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
-     goto aligned_word_copy;
    if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
      goto same_alignment;
    goto unaligned;
@@ -70,9 +68,18 @@ EALIGN (FUNC_NAME, 4, 0)
 #endif
 	or	rTMP, rSRC, rRTN
 	clrldi.	rTMP, rTMP, 61
-	bne	L(check_word_alignment)
+	bne	L(check_alignment)
 	b	L(aligned_doubleword_copy)
 
+	.align 4
+L(check_alignment):
+	rldicl	rRTNAL, rRTN, 0, 61
+	rldicl	rSRCAL, rSRC, 0, 61
+	cmpld	cr7, rSRCAL, rRTNAL
+	beq	cr7, L(same_alignment)
+	b	L(unaligned)
+
+	.align 4
 L(same_alignment):
 /* Src and dst with same alignment: align both to doubleword.  */
 	mr	rALCNT, rRTN
@@ -180,93 +187,249 @@ L(g1):
 #endif
 	blr
 
-L(check_word_alignment):
-	clrldi. rTMP, rTMP, 62
-	beq	L(aligned_word_copy)
-	rldicl	rRTNAL, rRTN, 0, 61
-	rldicl	rSRCAL, rSRC, 0, 61
-	cmpld	cr7, rSRCAL, rRTNAL
-	beq	cr7, L(same_alignment)
-	b	L(unaligned)
-
-/* For word aligned memory, operate using word load and stores.  */
 	.align	4
-L(aligned_word_copy):
-	li	rMASK, 0
-	addi	rRTN, rRTN, -4
-	lwz	rWORD, 0(rSRC)
-	b	L(g5)
+L(unaligned):
+	cmpdi	rSRCAL, 0		/* Check src alignment */
+	beq	L(srcaligndstunalign)
+	/* src is unaligned */
+	rlwinm	r10, rSRC, 3,26,28	/* Calculate padding.  */
+	clrrdi	rSRC, rSRC, 3		/* Align the addr to dw boundary */
+	ld	rWORD, 0(rSRC)		/* Load doubleword from memory.  */
+	li	rTMP, 0
+	/* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+	srd	rALT, rWORD, r10
+#else
+	sld	rALT, rWORD, r10
+#endif
+	cmpb	rTMP, rALT, rTMP	/* Compare each byte against null */
+	/* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+	sld	rTMP, rTMP, r10
+#else
+	srd	rTMP, rTMP, r10
+#endif
+	cmpdi	rTMP, 0
+	bne	L(bytebybyte)		/* if it has null, copy byte by byte */
+	subfic	r8, r9, 8
+	rlwinm	r5, rRTN, 3,26,28	/* Calculate padding in bits.  */
+	rldicl	r9, rRTN, 0, 61		/* Calculate padding in bytes. */
+	addi	rRTN, rRTN, -1
 
-	.align	4
-L(g3):	lwzu	rALT, 4(rSRC)
-	stwu	rWORD, 4(rRTN)
-	cmpb	rTMP, rALT, rMASK
-	cmpwi	rTMP, 0
-	bne	L(g4)
-	lwzu	rWORD, 4(rSRC)
-	stwu	rALT, 4(rRTN)
-L(g5):	cmpb	rTMP, rWORD, rMASK
-	cmpwi	rTMP, 0		/* If rTMP is 0, no null in word.  */
-	beq	L(g3)
-
-	mr      rALT, rWORD
-/* We've hit the end of the string.  Do the rest byte-by-byte.  */
-L(g4):
+	cmpdi	r5, 0			/* check dest alignment */
+	beq	L(srcunaligndstalign)
+
+	/* both src and dst unaligned */
 #ifdef __LITTLE_ENDIAN__
-	rlwinm.	rTMP, rALT, 0, 24, 31
-	stbu	rALT, 4(rRTN)
-	beqlr-
-	rlwinm.	rTMP, rALT, 24, 24, 31
-	stbu	rTMP, 1(rRTN)
-	beqlr-
-	rlwinm.	rTMP, rALT, 16, 24, 31
-	stbu	rTMP, 1(rRTN)
-	beqlr-
-	rlwinm	rTMP, rALT, 8, 24, 31
-	stbu	rTMP, 1(rRTN)
+	sld	rWORD, rALT, r10
+	mr 	r11, r10
+	addi	r11, r11, -8		/* Adjust byte pointer on loaded dw */
 #else
-	rlwinm. rTMP, rALT, 8, 24, 31
-	stbu    rTMP, 4(rRTN)
-	beqlr
-	rlwinm. rTMP, rALT, 16, 24, 31
-	stbu    rTMP, 1(rRTN)
-	beqlr
-	rlwinm. rTMP, rALT, 24, 24, 31
-	stbu    rTMP, 1(rRTN)
-	beqlr
-	stbu    rALT, 1(rRTN)
+	srd	rWORD, rALT, r10
+	subfic	r11, r10, 64
 #endif
-	blr
+	/* dst alignment is greater then src alignment? */
+	cmpd	cr7, r5, r10
+	blt	cr7, L(dst_align_small)
+	/* src alignment is less than dst */
 
-/* Oh well.  In this case, we just do a byte-by-byte copy.  */
-	.align	4
-L(unaligned):
-	lbz	rWORD, 0(rSRC)
-	addi	rRTN, rRTN, -1
-	cmpdi	rWORD, 0
-	beq	L(u2)
-
-	.align 	5
-L(u0):	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rRTN)
-	cmpdi	rALT, 0
-	beq	L(u1)
-	lbzu	rWORD, 1(rSRC)
+	/* Calculate the dst alignment differnce */
+	subfic	rALT, r9, 8
+	mtctr	rALT
+
+	/* Write till dst is aligned */
+	cmpdi	rTMP, rALT, 4
+	blt	L(storebyte1)		/* less than 4, store byte by byte */
+	beq	L(equal1)		/* if its 4, store word */
+	addi	rTMP, rALT, -4		/* greater than 4, so stb and stw */
+	mtctr	rTMP
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on loaded dw */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	rALT, rWORD, r11
+	stbu	rALT, 1(rRTN)
+	bdnz	L(storebyte1)
+
+	subfic	rALT, r9, 8		/* Check the remaining bytes */
+	cmpdi	rTMP, rALT, 4
+	blt	L(proceed)
+
+	.align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on loaded dw */
+	srd	rALT, rWORD, r11
+#else
+	subfic	r11, r11, 64
+	sld	rALT, rWORD, r11
+	srdi	rALT, rALT, 32
+#endif
+	stw	rALT, 1(rRTN)
+	addi	rRTN, rRTN, 4
+
+L(proceed):
+	mr	rALT, rWORD
+	/* calculate the Left over bytes to be written */
+	subfic	r11, r10, 64
+	subfic	r5, r5, 64
+	subf	r5, r5, r11		/* remaining bytes on second dw */
+        subfic	r10, r5, 64		/* remaining bytes on first dw */
+	subfic	r9, r9, 8
+	subf	r8, r9, r8		/* recalculate padding */
+L(srcunaligndstalign):
+	addi	rRTN, rRTN, 1
+	subfic	r5, r10, 64		/* remaining bytes on second dw */
+	addi	rSRC, rSRC, 8
+	li	rTMP,0
+	b	L(storedouble)
+
+	.align 4
+L(dst_align_small):
+	mtctr	r8
+	/* Write till src is aligned */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on dw */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	rALT, rWORD, r11
 	stbu	rALT, 1(rRTN)
-	cmpdi	rWORD, 0
-	beq	L(u2)
-	lbzu	rALT, 1(rSRC)
-	stbu	rWORD, 1(rRTN)
-	cmpdi	rALT, 0
-	beq	L(u1)
-	lbzu	rWORD, 1(rSRC)
+	bdnz	L(storebyte2)
+
+	addi	rSRC, rSRC, 8		/* Increment src pointer */
+	addi	rRTN, rRTN, 1		/* Increment dst pointer */
+	rldicl	r8, rRTN, 0, 61		/* Recalculate padding */
+
+	/* src is aligned */
+L(srcaligndstunalign):
+	ld	rWORD, 0(rSRC)
+	mr	rALT, rWORD
+	li	rTMP, 0			/* Check null */
+	cmpb	rTMP, rWORD, rTMP
+	cmpdi	rTMP, 0
+	bne	L(bytebybyte)		/* Do byte by byte if there is NULL */
+	rlwinm	r5, rRTN, 3,26,28	/* Calculate padding */
+	addi	rRTN, rRTN, -1
+	subfic	r10, r8, 8
+	/* write byte by byte till aligned */
+#ifdef __LITTLE_ENDIAN__
+	li	r11, -8
+#else
+	li	r11, 64
+#endif
+	mtctr	r10
+	cmpdi	rTMP, r10, 4
+	blt	L(storebyte)
+	beq	L(equal)
+	addi	rTMP, r10, -4
+	mtctr	rTMP
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8		/* Adjust byte pointer on  dw */
+#else
+	addi	r11, r11, -8
+#endif
+	srd	rALT, rWORD, r11
 	stbu	rALT, 1(rRTN)
-	cmpdi	rWORD, 0
-	bne	L(u0)
-L(u2):	stbu	rWORD, 1(rRTN)
-	blr
-L(u1):	stbu	rALT, 1(rRTN)
-	blr
+	bdnz	L(storebyte)
+
+	cmpdi	rTMP, r10, 4
+	blt	L(align)
+
+	.align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+	addi	r11, r11, 8
+	srd	rALT, rWORD, r11
+#else
+	subfic	r11, r11, 64
+	sld	rALT, rWORD, r11
+	srdi	rALT, rALT, 32
+#endif
+	stw	rALT, 1(rRTN)
+	addi	rRTN, rRTN, 4
+L(align):
+	addi	rRTN, rRTN, 1
+	addi	rSRC, rSRC, 8		/* Increment src pointer */
+	subfic	r10, r5, 64
+	li	rTMP, 0
+	/* dst addr aligned to 8 */
+L(storedouble):
+	ld	rALT, 0(rSRC)		/* load next dw */
+	cmpb	rTMP, rALT, rTMP
+	cmpdi	rTMP, 0			/* check for null on each new dw */
+	bne	L(null)
+#ifdef __LITTLE_ENDIAN__
+	srd	r9, rWORD, r10		/* bytes from first dw */
+	sld	r11, rALT, r5		/* bytes from second dw */
+#else
+	sld	r9, rWORD, r10
+	srd	r11, rALT, r5
+#endif
+	or	r11, r9, r11		/* make as a single dw */
+	std	r11, 0(rRTN)		/* store as std on aligned addr */
+	mr	rWORD, rALT		/* still few bytes left to be written */
+	addi	rRTN, rRTN, 8		/* increment dst addr */
+	addi	rSRC, rSRC, 8		/* increment src addr */
+	b	L(storedouble)		/* Loop till NULL */
+
+	.align 4
+
+/* We've hit the end of the string.  Do the rest byte-by-byte.  */
+L(null):
+	addi	rRTN, rRTN, -1
+	mr	r10, r5
+	mtctr	r8
+#ifdef __LITTLE_ENDIAN__
+	subfic	r10, r10, 64
+	addi	r10, r10, -8
+#endif
+	cmpdi	rTMP, r8, 4
+	blt	L(loop)
+
+	/* we can still use stw if leftover >= 4*/
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, 8
+	srd	r11, rWORD, r10
+#else
+	subfic	r10, r10, 64
+	sld	r11, rWORD, r10
+	srdi	r11, r11, 32
+#endif
+	stw	r11, 1(rRTN)
+	addi	rRTN, rRTN, 4
+
+	beq	L(bytebybyte1)
+	addi	r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, -8
+#else
+	subfic	r10, r10, 64
+#endif
+	addi	rTMP, r8, -4
+	mtctr	rTMP
+	/* remaining byte by byte part of first dw */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+	addi	r10, r10, 8
+#else
+	addi	r10, r10, -8
+#endif
+	srd	rTMP, rWORD, r10
+	stbu	rTMP, 1(rRTN)
+	bdnz	L(loop)
+
+L(bytebybyte1):
+	addi	rRTN, rRTN, 1
+	/* remaining byte by byte part of second dw */
+L(bytebybyte):
+	addi	rRTN, rRTN, -8
+	b	L(g1)
+
 END (FUNC_NAME)
 
 #ifndef USE_AS_STPCPY

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f4f9fb08d49740d9f18918bcf9d45ca594f416ee

commit f4f9fb08d49740d9f18918bcf9d45ca594f416ee
Author: Florian Weimer <fweimer@redhat.com>
Date:   Mon Dec 15 17:41:13 2014 +0100

    Avoid infinite loop in nss_dns getnetbyname [BZ #17630]

diff --git a/ChangeLog b/ChangeLog
index 0462e8c..991ba72 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2014-12-16  Florian Weimer  <fweimer@redhat.com>
+
+	[BZ #17630]
+	* resolv/nss_dns/dns-network.c (getanswer_r): Iterate over alias
+	names.
+
 2014-12-15  Jeff Law  <law@redhat.com>
 
 	[BZ #16617]
diff --git a/NEWS b/NEWS
index a46ee05..f9beb9f 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,7 @@ Version 2.20.1
 
 * The following bugs are resolved with this release:
 
-  16617, 17266, 17370, 17371, 17625.
+  16617, 17266, 17370, 17371, 17625, 17630.
 
 * CVE-2104-7817 The wordexp function could ignore the WRDE_NOCMD flag
   under certain input conditions resulting in the execution of a shell for
@@ -19,6 +19,10 @@ Version 2.20.1
 
 * CVE-2012-3406 printf-style functions could run into a stack overflow when
   processing format strings with a large number of format specifiers.
+
+* CVE-2014-9402 The nss_dns implementation of getnetbyname could run into an
+  infinite loopif the DNS response contained a PTR record of an unexpected
+  format.
 
 Version 2.20
 
diff --git a/resolv/nss_dns/dns-network.c b/resolv/nss_dns/dns-network.c
index 0a77c8b..08cf0a6 100644
--- a/resolv/nss_dns/dns-network.c
+++ b/resolv/nss_dns/dns-network.c
@@ -398,8 +398,8 @@ getanswer_r (const querybuf *answer, int anslen, struct netent *result,
 
 	case BYNAME:
 	  {
-	    char **ap = result->n_aliases++;
-	    while (*ap != NULL)
+	    char **ap;
+	    for (ap = result->n_aliases; *ap != NULL; ++ap)
 	      {
 		/* Check each alias name for being of the forms:
 		   4.3.2.1.in-addr.arpa		= net 1.2.3.4

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5985c6ea868db23380977a35a2167549f9a3653b

commit 5985c6ea868db23380977a35a2167549f9a3653b
Author: Jeff Law <law@redhat.com>
Date:   Mon Dec 15 10:09:32 2014 +0100

    CVE-2012-3406: Stack overflow in vfprintf [BZ #16617]
    
    A larger number of format specifiers coudld cause a stack overflow,
    potentially allowing to bypass _FORTIFY_SOURCE format string
    protection.

diff --git a/ChangeLog b/ChangeLog
index c5ced23..0462e8c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2014-12-15  Jeff Law  <law@redhat.com>
+
+	[BZ #16617]
+	* stdio-common/vfprintf.c (vfprintf): Allocate large specs array
+	on the heap.  (CVE-2012-3406)
+	* stdio-common/bug23-2.c, stdio-common/bug23-3.c: New file.
+	* stdio-common/bug23-4.c: New file.  Test case by Joseph Myers.
+	* stdio-common/Makefile (tests): Add bug23-2, bug23-3, bug23-4.
+
 2014-12-02  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
diff --git a/NEWS b/NEWS
index 20106dc..a46ee05 100644
--- a/NEWS
+++ b/NEWS
@@ -9,13 +9,16 @@ Version 2.20.1
 
 * The following bugs are resolved with this release:
 
-  17266, 17370, 17371, 17625.
+  16617, 17266, 17370, 17371, 17625.
 
 * CVE-2104-7817 The wordexp function could ignore the WRDE_NOCMD flag
   under certain input conditions resulting in the execution of a shell for
   command substitution when the applicaiton did not request it. The
   implementation now checks WRDE_NOCMD immediately before executing the
   shell and returns the error WRDE_CMDSUB as expected.
+
+* CVE-2012-3406 printf-style functions could run into a stack overflow when
+  processing format strings with a large number of format specifiers.
 
 Version 2.20
 
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 5f8e534..24e8496 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -57,7 +57,7 @@ tests := tstscanf test_rdwr test-popen tstgetln test-fseek \
 	 bug19 bug19a tst-popen2 scanf13 scanf14 scanf15 bug20 bug21 bug22 \
 	 scanf16 scanf17 tst-setvbuf1 tst-grouping bug23 bug24 \
 	 bug-vfprintf-nargs tst-long-dbl-fphex tst-fphex-wide tst-sprintf3 \
-	 bug25 tst-printf-round bug26
+	 bug25 tst-printf-round bug23-2 bug23-3 bug23-4
 
 test-srcs = tst-unbputc tst-printf
 
diff --git a/stdio-common/bug23-2.c b/stdio-common/bug23-2.c
new file mode 100644
index 0000000..9e0cfe6
--- /dev/null
+++ b/stdio-common/bug23-2.c
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+static const char expected[] = "\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
+
+static int
+do_test (void)
+{
+  char *buf = malloc (strlen (expected) + 1);
+  snprintf (buf, strlen (expected) + 1,
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+	    "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n",
+	    "a", "b", "c", "d", 5);
+  return strcmp (buf, expected) != 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/stdio-common/bug23-3.c b/stdio-common/bug23-3.c
new file mode 100644
index 0000000..57c8cef
--- /dev/null
+++ b/stdio-common/bug23-3.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+int
+do_test (void)
+{
+  size_t instances = 16384;
+#define X0 "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+  const char *item = "\na\nabbcd55";
+#define X3 X0 X0 X0 X0 X0 X0 X0 X0
+#define X6 X3 X3 X3 X3 X3 X3 X3 X3
+#define X9 X6 X6 X6 X6 X6 X6 X6 X6
+#define X12 X9 X9 X9 X9 X9 X9 X9 X9
+#define X14 X12 X12 X12 X12
+#define TRAILER "%%%%%%%%%%%%%%%%%%%%%%%%%%"
+#define TRAILER2 TRAILER TRAILER
+  size_t length = instances * strlen (item) + strlen (TRAILER) + 1;
+
+  char *buf = malloc (length + 1);
+  snprintf (buf, length + 1,
+	    X14 TRAILER2 "\n",
+	    "a", "b", "c", "d", 5);
+
+  const char *p = buf;
+  size_t i;
+  for (i = 0; i < instances; ++i)
+    {
+      const char *expected;
+      for (expected = item; *expected; ++expected)
+	{
+	  if (*p != *expected)
+	    {
+	      printf ("mismatch at offset %zu (%zu): expected %d, got %d\n",
+		      (size_t) (p - buf), i, *expected & 0xFF, *p & 0xFF);
+	      return 1;
+	    }
+	  ++p;
+	}
+    }
+  if (strcmp (p, TRAILER "\n") != 0)
+    {
+      printf ("mismatch at trailer: [%s]\n", p);
+      return 1;
+    }
+  free (buf);
+  return 0;
+}
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/stdio-common/bug23-4.c b/stdio-common/bug23-4.c
new file mode 100644
index 0000000..a478564
--- /dev/null
+++ b/stdio-common/bug23-4.c
@@ -0,0 +1,31 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+
+#define LIMIT 1000000
+
+int
+main (void)
+{
+  struct rlimit lim;
+  getrlimit (RLIMIT_STACK, &lim);
+  lim.rlim_cur = 1048576;
+  setrlimit (RLIMIT_STACK, &lim);
+  char *fmtstr = malloc (4 * LIMIT + 1);
+  if (fmtstr == NULL)
+    abort ();
+  char *output = malloc (LIMIT + 1);
+  if (output == NULL)
+    abort ();
+  for (size_t i = 0; i < LIMIT; i++)
+    memcpy (fmtstr + 4 * i, "%1$d", 4);
+  fmtstr[4 * LIMIT] = '\0';
+  int ret = snprintf (output, LIMIT + 1, fmtstr, 0);
+  if (ret != LIMIT)
+    abort ();
+  for (size_t i = 0; i < LIMIT; i++)
+    if (output[i] != '0')
+      abort ();
+  return 0;
+}
diff --git a/stdio-common/vfprintf.c b/stdio-common/vfprintf.c
index c4ff833..429a3d1 100644
--- a/stdio-common/vfprintf.c
+++ b/stdio-common/vfprintf.c
@@ -263,6 +263,12 @@ vfprintf (FILE *s, const CHAR_T *format, va_list ap)
   /* For the argument descriptions, which may be allocated on the heap.  */
   void *args_malloced = NULL;
 
+  /* For positional argument handling.  */
+  struct printf_spec *specs;
+
+  /* Track if we malloced the SPECS array and thus must free it.  */
+  bool specs_malloced = false;
+
   /* This table maps a character into a number representing a
      class.  In each step there is a destination label for each
      class.  */
@@ -1679,8 +1685,8 @@ do_positional:
     size_t nspecs = 0;
     /* A more or less arbitrary start value.  */
     size_t nspecs_size = 32 * sizeof (struct printf_spec);
-    struct printf_spec *specs = alloca (nspecs_size);
 
+    specs = alloca (nspecs_size);
     /* The number of arguments the format string requests.  This will
        determine the size of the array needed to store the argument
        attributes.  */
@@ -1721,11 +1727,39 @@ do_positional:
 	if (nspecs * sizeof (*specs) >= nspecs_size)
 	  {
 	    /* Extend the array of format specifiers.  */
+	    if (nspecs_size * 2 < nspecs_size)
+	      {
+		__set_errno (ENOMEM);
+		done = -1;
+		goto all_done;
+	      }
 	    struct printf_spec *old = specs;
-	    specs = extend_alloca (specs, nspecs_size, 2 * nspecs_size);
+	    if (__libc_use_alloca (2 * nspecs_size))
+	      specs = extend_alloca (specs, nspecs_size, 2 * nspecs_size);
+	    else
+	      {
+		nspecs_size *= 2;
+		specs = malloc (nspecs_size);
+		if (specs == NULL)
+		  {
+		    __set_errno (ENOMEM);
+		    specs = old;
+		    done = -1;
+		    goto all_done;
+		  }
+	      }
 
 	    /* Copy the old array's elements to the new space.  */
 	    memmove (specs, old, nspecs * sizeof (*specs));
+
+	    /* If we had previously malloc'd space for SPECS, then
+	       release it after the copy is complete.  */
+	    if (specs_malloced)
+	      free (old);
+
+	    /* Now set SPECS_MALLOCED if needed.  */
+	    if (!__libc_use_alloca (nspecs_size))
+	      specs_malloced = true;
 	  }
 
 	/* Parse the format specifier.  */
@@ -2046,6 +2080,8 @@ do_positional:
   }
 
 all_done:
+  if (specs_malloced)
+    free (specs);
   if (__glibc_unlikely (args_malloced != NULL))
     free (args_malloced);
   if (__glibc_unlikely (workstart != NULL))

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8647419a62d1d5641488ca6430bb679cf1e766e0

commit 8647419a62d1d5641488ca6430bb679cf1e766e0
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Wed Nov 19 16:27:56 2014 -0500

    powerpc: Add powerpc64 strpbrk optimization
    
    This patch makes the POWER7 optimized strpbrk generic by using
    default doubleword stores to zero the hash, instead of VSX
    instructions.  Performance on POWER7/POWER8 does not change.

diff --git a/ChangeLog b/ChangeLog
index 89ee40b..c5ced23 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
 2014-12-02  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
-	
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Remove strpbrk objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Remove strpbrk implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c: Remove file.
+	* sysdeps/powerpc/powerpc64/multiarch/strpbrk.c: Remove file.
+	* sysdeps/powerpc/powerpc64/power7/strpbrk.S: Remove file.
+	* sysdeps/powerpc/powerpc64/strpbrk.S: New file.
+
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
 	Remove strcspn objects.
 	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 05dab25..39e441b 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -15,7 +15,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
 		   strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
-		   strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
+		   strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
 		   strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
 		   bcopy-ppc64
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 1a2e38d..8f1e3e1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -272,14 +272,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
 			      __strncat_ppc))
 
-  /* Support sysdeps/powerpc/powerpc64/multiarch/strpbrk.c.  */
-  IFUNC_IMPL (i, name, strpbrk,
-	      IFUNC_IMPL_ADD (array, i, strpbrk,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-			      __strpbrk_power7)
-	      IFUNC_IMPL_ADD (array, i, strpbrk, 1,
-			     __strpbrk_ppc))
-
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
 	      IFUNC_IMPL_ADD (array, i, strncpy,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S
deleted file mode 100644
index 663ca36..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Optimized strpbrk implementation for POWER7.
-   Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#undef EALIGN
-#define EALIGN(name, alignt, words)				\
-  .section ".text";						\
-  ENTRY_2(__strpbrk_power7)					\
-  .align ALIGNARG(alignt);					\
-  EALIGN_W_##words;						\
-  BODY_LABEL(__strpbrk_power7):					\
-  cfi_startproc;						\
-  LOCALENTRY(__strpbrk_power7)
-
-#undef END
-#define END(name)						\
-  cfi_endproc;							\
-  TRACEBACK(__strpbrk_power7)					\
-  END_2(__strpbrk_power7)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#include <sysdeps/powerpc/powerpc64/power7/strpbrk.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c
deleted file mode 100644
index 8dea70e..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-
-#define STRPBRK __strpbrk_ppc
-#ifdef SHARED
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__strpbrk_ppc, __GI_strpbrk, __strpbrk_ppc);
-#endif
-
-extern __typeof (strpbrk) __strpbrk_ppc attribute_hidden;
-
-#include <string/strpbrk.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c b/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c
deleted file mode 100644
index 8b05536..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Multiple versions of strpbrk. PowerPC64 version.
-   Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (strpbrk) __strpbrk_ppc attribute_hidden;
-extern __typeof (strpbrk) __strpbrk_power7 attribute_hidden;
-
-libc_ifunc (strpbrk,
-	    (hwcap & PPC_FEATURE_HAS_VSX)
-	    ? __strpbrk_power7
-	    : __strpbrk_ppc);
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strpbrk.S b/sysdeps/powerpc/powerpc64/strpbrk.S
similarity index 78%
rename from sysdeps/powerpc/powerpc64/power7/strpbrk.S
rename to sysdeps/powerpc/powerpc64/strpbrk.S
index d6204a7..6b2ad4d 100644
--- a/sysdeps/powerpc/powerpc64/power7/strpbrk.S
+++ b/sysdeps/powerpc/powerpc64/strpbrk.S
@@ -1,4 +1,4 @@
-/* Optimized strpbrk implementation for PowerPC64/POWER7.
+/* Optimized strpbrk implementation for PowerPC64.
    Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
@@ -20,7 +20,6 @@
 
 /* char [r3] *strpbrk(const char [r4] *s, const char [r5] *accept)  */
 
-	.machine power7
 EALIGN (strpbrk, 4, 0)
 	CALL_MCOUNT 3
 
@@ -32,43 +31,31 @@ EALIGN (strpbrk, 4, 0)
 	   for fast check if input character should be considered.  For ASCII
 	   or ISO-8859-X character sets it has 256 positions.  */
 
-	/* First the table should be cleared and to avoid unaligned accesses
-	   when using the VSX stores the table address is aligned to 16
-	   bytes.  */
-	xxlxor	v0,v0,v0
-
-	/* PPC64 ELF ABI stack is aligned to 16 bytes  */
+	/* PPC64 ELF ABI stack is aligned to 16 bytes.  */
 	addi 	r9,r1,-256
-
-	li	r5,16
-	li	r6,32
-	li	r8,48
-	addi	r12,r9,64
 	/* Clear the table with 0 values  */
-	stxvw4x v0,r0,r9
-	addi	r11,r9,128
-	addi	r7,r9,192
-	stxvw4x	v0,r9,r5
-	li	r0,1
-	stxvw4x v0,r9,r6
-	stxvw4x v0,r9,r8
-	stxvw4x v0,r0,r12
-	stxvw4x v0,r12,r5
-	stxvw4x v0,r12,r6
-	stxvw4x v0,r12,r8
-	stxvw4x v0,r0,r11
-	stxvw4x v0,r11,r5
-	stxvw4x v0,r11,r6
-	stxvw4x v0,r11,r8
-	stxvw4x v0,r0,r7
-	stxvw4x v0,r7,r5
-	stxvw4x v0,r7,r6
-	stxvw4x v0,r7,r8
+	li	r6, 0
+	li	r7, 4
+	mtctr	r7
+	mr	r8, r9
+	.align 	4
+L(zerohash):
+	std	r6, 0(r8)
+	std	r6, 8(r8)
+	std	r6, 16(r8)
+	std	r6, 24(r8)
+	std	r6, 32(r8)
+	std	r6, 40(r8)
+	std	r6, 48(r8)
+	std	r6, 56(r8)
+	addi	r8, r8, 64
+	bdnz	L(zerohash)
 
 	/* Initialize the table as:
 	   for (i=0; accept[i]; i++
 	     table[accept[i]]] = 1  */
-	.p2align 4,,15
+	li      r0,1
+	.align 4
 L(init_table):
 	stbx	r0,r9,r10
 	lbzu	r10,1(r4)
@@ -93,7 +80,7 @@ L(finish_table):
 	       if (table[input[i++]] == 1)
 	         return (s[i -1] ? s + i - 1: NULL);
 	     }  */
-	.p2align 4
+	.align 4
 L(unroll):
 	lbz	r0,1(r3)
 	lbzx	r8,r9,r0
@@ -121,7 +108,7 @@ L(mainloop):
 L(end):
 	blr
 
-	.p2align 4
+	.align 4
 L(checkend):
 	cmpdi	cr1,r12,0
 	mr	r3,r7
@@ -131,14 +118,14 @@ L(nullfound):
 	li 3,0
 	blr
 
-	.p2align 4
+	.align 4
 L(checkend2):
 	cmpdi	cr7,r0,0
 	mr	r3,r11
 	beq	cr7,L(nullfound)
 	blr
 
-	.p2align 4
+	.align 4
 L(checkend3):
 	cmpdi	cr6,r10,0
 	mr	r3,r5

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f9f30622819b4d3685c0d448f3a3d49032472b07

commit f9f30622819b4d3685c0d448f3a3d49032472b07
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Wed Nov 19 15:24:55 2014 -0500

    powerpc: Add powerpc64 strcspn optimization
    
    This patch makes the POWER7 optimized strcspn generic by using
    default doubleword stores to zero the hash, instead of VSX
    instructions.  Performance on POWER7/POWER8 does not change.

diff --git a/ChangeLog b/ChangeLog
index d3b8947..89ee40b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,13 @@
 2014-12-02  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+	
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Remove strcspn objects.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Remove strcspn implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c: Remove file.
+	* sysdeps/powerpc/powerpc64/multiarch/strcspn.c: Remove file.
+	* sysdeps/powerpc/powerpc64/power7/strcspn.S: Remove file.
+	* sysdeps/powerpc/powerpc64/strcspn.S: New file.
 
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
 	Remove strspn objetcs.
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index d6de5a5..05dab25 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -15,7 +15,6 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
 		   strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
-		   strcspn-power7 strcspn-ppc64 \
 		   strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
 		   strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 4a9e523..1a2e38d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -272,14 +272,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
 			      __strncat_ppc))
 
-  /* Support sysdeps/powerpc/powerpc64/multiarch/strcspn.c.  */
-  IFUNC_IMPL (i, name, strcspn,
-	      IFUNC_IMPL_ADD (array, i, strcspn,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-			      __strcspn_power7)
-	      IFUNC_IMPL_ADD (array, i, strcspn, 1,
-			     __strcspn_ppc))
-
   /* Support sysdeps/powerpc/powerpc64/multiarch/strpbrk.c.  */
   IFUNC_IMPL (i, name, strpbrk,
 	      IFUNC_IMPL_ADD (array, i, strpbrk,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S
deleted file mode 100644
index 02ffcc8..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Optimized strcspn implementation for POWER7.
-   Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#undef EALIGN
-#define EALIGN(name, alignt, words)				\
-  .section ".text";						\
-  ENTRY_2(__strcspn_power7)					\
-  .align ALIGNARG(alignt);					\
-  EALIGN_W_##words;						\
-  BODY_LABEL(__strcspn_power7):					\
-  cfi_startproc;						\
-  LOCALENTRY(__strcspn_power7)
-
-#undef END
-#define END(name)						\
-  cfi_endproc;							\
-  TRACEBACK(__strcspn_power7)					\
-  END_2(__strcspn_power7)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#include <sysdeps/powerpc/powerpc64/power7/strcspn.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c
deleted file mode 100644
index 5f8b610..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-
-#define STRCSPN __strcspn_ppc
-#ifdef SHARED
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
-  __hidden_ver1 (__strcspn_ppc, __GI_strcspn, __strcspn_ppc);
-#endif
-
-extern __typeof (strcspn) __strcspn_ppc attribute_hidden;
-
-#include <string/strcspn.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c b/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
deleted file mode 100644
index 3609d93..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Multiple versions of strcspn. PowerPC64 version.
-   Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (strcspn) __strcspn_ppc attribute_hidden;
-extern __typeof (strcspn) __strcspn_power7 attribute_hidden;
-
-libc_ifunc (strcspn,
-	    (hwcap & PPC_FEATURE_HAS_VSX)
-	    ? __strcspn_power7
-	    : __strcspn_ppc);
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strcspn.S b/sysdeps/powerpc/powerpc64/strcspn.S
similarity index 80%
rename from sysdeps/powerpc/powerpc64/power7/strcspn.S
rename to sysdeps/powerpc/powerpc64/strcspn.S
index 3f6aa0a..1121930 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcspn.S
+++ b/sysdeps/powerpc/powerpc64/strcspn.S
@@ -20,54 +20,42 @@
 
 /* size_t [r3] strcspn (const char [r4] *s, const char [r5] *reject)  */
 
-	.machine power7
 EALIGN (strcspn, 4, 0)
 	CALL_MCOUNT 3
 
 	/* The idea to speed up the algorithm is to create a lookup table
 	   for fast check if input character should be considered.  For ASCII
 	   or ISO-8859-X character sets it has 256 positions.  */
-	lbz	r10,0(r4)
-
-	/* First the table should be cleared and to avoid unaligned accesses
-	   when using the VSX stores the table address is aligned to 16
-	   bytes.  */
-	xxlxor	v0,v0,v0
 
 	/* PPC64 ELF ABI stack is aligned to 16 bytes.  */
 	addi 	r9,r1,-256
+	/* Clear the table with 0 values  */
+	li	r6, 0
+	li	r8, 4
+	mtctr	r8
+	mr	r10, r9
+	.align 	4
+L(zerohash):
+	std	r6, 0(r10)
+	std	r6, 8(r10)
+	std	r6, 16(r10)
+	std	r6, 24(r10)
+	std	r6, 32(r10)
+	std	r6, 40(r10)
+	std	r6, 48(r10)
+	std	r6, 56(r10)
+	addi	r10, r10, 64
+	bdnz	L(zerohash)
 
-	li	r8,48
-	li	r5,16
-	li	r6,32
+	lbz	r10,0(r4)
 	cmpdi	cr7,r10,0	/* reject[0] == '\0' ?  */
-	addi	r12,r9,64
-	/* Clear the table with 0 values  */
-	stxvw4x	v0,r0,r9
-	addi	r11,r9,128
-	addi	r7,r9,192
-	stxvw4x v0,r9,r5
-	stxvw4x v0,r9,r6
-	stxvw4x v0,r9,r8
-	stxvw4x v0,r0,r12
-	stxvw4x v0,r12,r5
-	stxvw4x v0,r12,r6
-	stxvw4x v0,r12,r8
-	stxvw4x v0,r0,r11
-	stxvw4x v0,r11,r5
-	stxvw4x v0,r11,r6
-	stxvw4x v0,r11,r8
-	stxvw4x v0,r0,r7
-	stxvw4x v0,r7,r5
-	stxvw4x v0,r7,r6
-	stxvw4x v0,r7,r8
 	li	r8,1
 	beq     cr7,L(finish_table)  /* If reject[0] == '\0' skip  */
 
 	/* Initialize the table as:
 	   for (i=0; reject[i]; i++
 	     table[reject[i]]] = 1  */
-	.p2align 4,,15
+	.align	4
 L(init_table):
 	stbx	r8,r9,r10
 	lbzu	r10,1(r4)
@@ -93,7 +81,7 @@ L(finish_table):
 	       if (table[input[i++]] == 1)
 	         return i - 1;
 	     }  */
-	.p2align 4,,15
+	.align 4
 L(unroll):
 	lbz	r8,1(r3)
 	addi	r10,r10,4
@@ -121,17 +109,17 @@ L(mainloop):
 	mr	r3,r10
 	blr
 
-	.p2align 4,,15
+	.align 4
 L(end):
 	mr	r3,r6
 	blr
 
-	.p2align 4,,15
+	.align 4
 L(end2):
 	mr	r3,r4
 	blr
 
-	.p2align 4,,15
+	.align 4
 L(end3):
 	mr	r3,r5
 	blr

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=97104a4e2b866aae6a6593286b6c584339ef29d3

commit 97104a4e2b866aae6a6593286b6c584339ef29d3
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Wed Nov 19 14:24:18 2014 -0500

    powerpc: Add powerpc64 strspn optimization
    
    This patch makes the POWER7 optimized strspn generic by using
    default doubleword stores to zero the hash, instead of VSX
    instructions. Performance on POWER7/POWER8 machines does not changed.

diff --git a/ChangeLog b/ChangeLog
index f31179d..d3b8947 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2014-12-02  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Remove strspn objetcs.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Remove strspn implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S: Remove file.
+	* sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/strspn.S: Remove file.
+	* sysdeps/powerpc/powerpc64/strspn.S: New file.
+
 2014-12-01  Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/strtok.S: New file.
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index abc9d2e..d6de5a5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -15,7 +15,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
 		   strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
 		   strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
-		   strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \
+		   strcspn-power7 strcspn-ppc64 \
 		   strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
 		   stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
 		   strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 06d5be9..4a9e523 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -272,14 +272,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
 			      __strncat_ppc))
 
-  /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c.  */
-  IFUNC_IMPL (i, name, strspn,
-	      IFUNC_IMPL_ADD (array, i, strspn,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-			      __strspn_power7)
-	      IFUNC_IMPL_ADD (array, i, strspn, 1,
-			      __strspn_ppc))
-
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcspn.c.  */
   IFUNC_IMPL (i, name, strcspn,
 	      IFUNC_IMPL_ADD (array, i, strcspn,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S
deleted file mode 100644
index 889dfee..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Optimized strspn implementation for POWER7.
-   Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#undef EALIGN
-#define EALIGN(name, alignt, words)				\
-  .section ".text";						\
-  ENTRY_2(__strspn_power7)					\
-  .align ALIGNARG(alignt);					\
-  EALIGN_W_##words;						\
-  BODY_LABEL(__strspn_power7):					\
-  cfi_startproc;						\
-  LOCALENTRY(__strspn_power7)
-
-#undef END
-#define END(name)						\
-  cfi_endproc;							\
-  TRACEBACK(__strspn_power7)					\
-  END_2(__strspn_power7)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#include <sysdeps/powerpc/powerpc64/power7/strspn.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c
deleted file mode 100644
index d543772..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <string.h>
-
-#define STRSPN __strspn_ppc
-#undef weak_alias
-#define weak_alias(name, aliasname) \
-  extern __typeof (__strspn_ppc) aliasname \
-    __attribute__ ((weak, alias ("__strspn_ppc")));
-#if !defined(NOT_IN_libc) && defined(SHARED)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
-  __hidden_ver1(__strspn_ppc, __GI_strspn, __strspn_ppc);
-#endif
-
-extern __typeof (strspn) __strspn_ppc attribute_hidden;
-
-#include <string/strspn.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
deleted file mode 100644
index bf8c877..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Multiple versions of strspn. PowerPC64 version.
-   Copyright (C) 2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (strspn) __strspn_ppc attribute_hidden;
-extern __typeof (strspn) __strspn_power7 attribute_hidden;
-
-libc_ifunc (strspn,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __strspn_power7
-            : __strspn_ppc);
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strspn.S b/sysdeps/powerpc/powerpc64/strspn.S
similarity index 75%
rename from sysdeps/powerpc/powerpc64/power7/strspn.S
rename to sysdeps/powerpc/powerpc64/strspn.S
index d587a67..daf5d5d 100644
--- a/sysdeps/powerpc/powerpc64/power7/strspn.S
+++ b/sysdeps/powerpc/powerpc64/strspn.S
@@ -1,4 +1,4 @@
-/* Optimized strspn implementation for PowerPC64/POWER7.
+/* Optimized strspn implementation for PowerPC64.
 
    Copyright (C) 2014 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
@@ -25,8 +25,6 @@
    > hashing of needle.
    > hashing avoids scanning of duplicate entries in needle
      across the string.
-   > initializing the hash table with Vector instructions
-     by quadword access.
    > unrolling when scanning for character in string
      across hash table.  */
 
@@ -46,55 +44,36 @@
 
 #include <sysdep.h>
 
-#undef strspn
-
-	.machine  power7
 EALIGN(strspn, 4, 0)
-	CALL_MCOUNT 2
-
-	lbz r10, 0(r4)		/* load r10 with needle (r4)  */
-	addi r9, r1, -256	/* r9 is a hash of 256 bytes  */
-
-	li r5, 16		/* set r5 = 16 as offset  */
-	li r6, 32		/* set r6 = 32 as offset  */
-	li r8, 48		/* set r8 = 48 as offset  */
-
-/*Iniatliaze hash table with Zeroes in double indexed quadword accesses  */
-	xxlxor v0, v0, v0	/* prepare for initializing hash  */
-
-	stxvd2x v0, r0, r9	/* initialize 1st quadword  */
-	stxvd2x v0, r9, r5
-	stxvd2x v0, r9, r6
-	stxvd2x v0, r9, r8	/* initialize 4th quadword  */
-
-	addi r11, r9, 64	/* r11 is index to hash  */
-
-	stxvd2x v0, r0, r11	/* initialize 5th quadword  */
-	stxvd2x v0, r11, r5
-	stxvd2x v0, r11, r6
-	stxvd2x v0, r11, r8	/* initialize 8th quadword  */
-
-	addi r11, r9, 128	/* r11 is index to hash  */
-
-	stxvd2x v0, r0, r11	/* initialize 9th quadword  */
-	stxvd2x v0, r11, r5
-	stxvd2x v0, r11, r6
-	stxvd2x v0, r11, r8	/* initialize 12th quadword  */
-
-	addi r11, r9, 192	/* r11 is index to hash  */
-
-	stxvd2x v0, r0, r11	/* initialize 13th quadword  */
-	stxvd2x v0, r11, r5
-	stxvd2x v0, r11, r6
-	stxvd2x v0, r11, r8	/* initialize 16th quadword  */
-
+	CALL_MCOUNT 3
+
+	/* PPC64 ELF ABI stack is aligned to 16 bytes.  */
+	addi 	r9,r1,-256
+	/* Clear the table with 0 values  */
+	li	r6, 0
+	li	r8, 4
+	mtctr	r8
+	mr	r10, r9
+	.align 	4
+L(zerohash):
+	std	r6, 0(r10)
+	std	r6, 8(r10)
+	std	r6, 16(r10)
+	std	r6, 24(r10)
+	std	r6, 32(r10)
+	std	r6, 40(r10)
+	std	r6, 48(r10)
+	std	r6, 56(r10)
+	addi	r10, r10, 64
+	bdnz	L(zerohash)
+
+	lbz	r10,0(r4)
 	li r8, 1		/* r8=1, marker into hash if found in
 				   needle  */
-
 	cmpdi cr7, r10, 0	/* accept needle is NULL  */
 	beq cr7, L(skipHashing)	/* if needle is NULL, skip hashing  */
 
-	.p2align 4		/* align section to 16 byte boundary  */
+	.align 4		/* align section to 16 byte boundary  */
 L(hashing):
 	stbx r8, r9, r10	/* update hash with marker for the pivot of
 				   the needle  */
@@ -106,7 +85,7 @@ L(skipHashing):
 	li r10, 0		/* load counter = 0  */
 	b L(beginScan)
 
-	.p2align 4		/* align section to 16 byte boundary  */
+	.align 4		/* align section to 16 byte boundary  */
 L(scanUnroll):
 	lbzx r8, r9, r8		/* load r8 with hash value at index  */
 	cmpwi cr7, r8, 0	/* if we hit marker in hash, we have found

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d3e53c8246952898cd1fd23dfa0657b03db0e36b

commit d3e53c8246952898cd1fd23dfa0657b03db0e36b
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date:   Mon Dec 1 09:03:58 2014 -0500

    powerpc: strtok{_r} optimization for powerpc64
    
    This patch optimizes strtok and strtok_r for POWERPC64.
    A table of 256 characters is created and marked based on
    the 'accept' argument and used to check for any occurance on
    the input string.Loop unrolling is also used to gain improvements.

diff --git a/ChangeLog b/ChangeLog
index 814486e..f31179d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2014-12-01  Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/strtok.S: New file.
+	* sysdeps/powerpc/powerpc64/strtok_r.S: New file.
+
 2014-11-26  Adhemerval Zanella  <azanella@linux.ibm.com>
 
 	* csu/tst-atomic.c (do_test): Add atomic_exchange_and_add_{acq,rel}
diff --git a/sysdeps/powerpc/powerpc64/strtok.S b/sysdeps/powerpc/powerpc64/strtok.S
new file mode 100644
index 0000000..fa816f2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/strtok.S
@@ -0,0 +1,226 @@
+/* Optimized strtok implementation for PowerPC64.
+
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Performance gains are grabbed through following techniques:
+
+   > hashing of needle.
+   > hashing avoids scanning of duplicate entries in needle
+     across the string.
+   > unrolling when scanning for character in string
+     across hash table.  */
+
+/* Algorithm is as below:
+   1. A empty hash table/dictionary is created comprising of
+      256 ascii character set
+   2. When hash entry is found in needle , the hash index
+      is initialized to 1
+   3. The string is scanned until end and for every character,
+      its corresponding hash index is compared.
+   4. initial length of string (count) until first hit of
+      accept needle is calculated and moved.(strspn)
+   5. The string is again scanned until end and for every character,
+      its corresponding hash index is compared.(strpbrk)
+   6. If hash index is set to 1 for the index of string,
+      set it to null and set the saveptr to point to the next char.
+   7. Otherwise count is incremented and scanning continues
+      until end of string.  */
+
+#include <sysdep.h>
+#ifdef USE_AS_STRTOK_R
+# define FUNC_NAME __strtok_r
+#else
+# define FUNC_NAME strtok
+#endif
+
+EALIGN(FUNC_NAME, 4, 0)
+#ifdef USE_AS_STRTOK_R
+	CALL_MCOUNT	3
+	cmpdi	cr7, r3, 0		/* Is input null? */
+	bne	cr7, L(inputnotNull)
+	ld	r3, 0(r5)		/* Load from r5 */
+#else
+	CALL_MCOUNT	2
+	addis	r5, r2, .LANCHOR0@toc@ha
+	cmpdi	cr7, r3, 0		/* Is r3 NULL? */
+	bne	cr7, L(inputnotNull)
+	ld	r3, .LANCHOR0@toc@l(r5)	/* Load from saveptr */
+#endif
+L(inputnotNull):
+	mr	r7, r3
+	cmpdi	cr7, r3, 0
+	beq	cr7, L(returnNULL)
+	lbz	r8, 0(r3)
+	cmpdi	cr7, r8, 0
+	beq	cr7, L(returnNULL)
+
+	addi	r9, r1, -256	/* r9 is a hash of 256 bytes  */
+
+	/*Iniatliaze hash table with Zeroes */
+	li	r6, 0
+	li	r8, 4
+	mtctr	r8
+	mr	r10, r9
+	.align	4
+L(zerohash):
+	std	r6, 0(r10)
+	std	r6, 8(r10)
+	std	r6, 16(r10)
+	std	r6, 24(r10)
+	std	r6, 32(r10)
+	std	r6, 40(r10)
+	std	r6, 48(r10)
+	std	r6, 56(r10)
+	addi	r10, r10, 64
+	bdnz	L(zerohash)
+
+
+	lbz	r10, 0(r4)	/* load r10 with needle (r4)  */
+	li	r8, 1		/* r8=1, marker into hash if found in
+				   needle  */
+
+	cmpdi	cr7, r10, 0	/* accept needle is NULL  */
+	beq	cr7, L(skipHashing)	/* if needle is NULL, skip hashing  */
+
+	.align 4		/* align section to 16 byte boundary  */
+L(hashing):
+	stbx	r8, r9, r10	/* update hash with marker for the pivot of
+				   the needle  */
+	lbzu	r10, 1(r4)	/* load needle into r10 and update to next  */
+	cmpdi	cr7, r10, 0	/* if needle is has reached NULL, continue  */
+	bne	cr7, L(hashing)	/* loop to hash the needle  */
+
+L(skipHashing):
+	b	L(beginScan)
+
+	.align 4		/* align section to 16 byte boundary  */
+L(scanUnroll):
+	lbzx	r8, r9, r8	/* load r8 with hash value at index  */
+	cmpwi	cr7, r8, 0	/* check the hash  value */
+	beq	cr7, L(ret1stIndex)	/* we have hit accept needle */
+
+	lbz	r8, 1(r7)	/* load string[1] into r8  */
+	lbzx	r8, r9, r8	/* load r8 with hash value at index  */
+	cmpwi	cr7, r8, 0	/* check the hash  value */
+	beq	cr7, L(ret2ndIndex)	/* we have hit accept needle */
+
+	lbz	r8, 2(r7)	/* load string[1] into r8  */
+	lbzx	r8, r9, r8	/* load r8 with hash value at index  */
+	cmpwi	cr7, r8, 0	/* check the hash  value */
+	beq	cr7, L(ret3rdIndex)	/* we have hit accept needle */
+
+	lbz	r8, 3(r7)	/* load string[1] into r8  */
+	addi	r7, r7, 4
+	lbzx	r8, r9, r8	/* load r8 with hash value at index  */
+	cmpwi	cr7, r8, 0	/* check the hash  value */
+	beq	cr7,L(ret4thIndex)	/* we have hit accept needle */
+
+L(beginScan):
+	lbz	r8, 0(r7)	/* load string[0] into r8  */
+	addi	r6, r7, 1
+	addi	r11, r7, 2
+	addi	r4, r7, 3
+	cmpdi	cr7, r8, 0	/*  check if its null */
+	bne	cr7, L(scanUnroll)	/* continue scanning  */
+
+L(ret1stIndex):
+	mr 	r3, r7
+	b 	L(next)
+L(ret2ndIndex):
+	mr 	r3, r6
+	b 	L(next)
+L(ret3rdIndex):
+	mr 	r3, r11
+	b 	L(next)
+L(ret4thIndex):
+	mr 	r3, r4
+L(next):
+	mr	r7, r3
+	lbz	r8, 0(r7)
+	cmpdi	cr7, r8, 0
+	beq	cr7, L(returnNULL)
+	li	r8, 1
+	li	r10, 0		/* load counter = 0  */
+	stbx	r8, r9, r10	/* update hash for NULL */
+	b	L(mainloop)
+
+L(unroll):
+	lbz	r8, 1(r7)	/* load string[1] into r8  */
+	lbzx	r8, r9, r8	/* load r8 with hash value at index  */
+	cmpwi	r7, r8, 1	/* check the hash */
+	beq	cr7, L(foundat1st)	/* we have hit accept needle */
+	lbz	r8, 2(r7)
+	lbzx	r8, r9, r8
+	cmpwi	cr7, r8, 1
+	beq	cr7, L(foundat2nd)
+	lbz	r8, 3(r7)
+	addi	r7, r7, 4
+	lbzx	r8, r9, r8
+	cmpwi	cr7, r8, 1
+	beq	cr7, L(foundat3rd)
+L(mainloop):
+	lbz	r8, 0(r7)
+	addi	r6, r7, 1
+	addi	r11, r7, 2
+	addi	r4, r7, 3
+	lbzx	r8, r9, r8
+	cmpwi	cr7, r8, 1
+	bne	cr7, L(unroll)	/* continue scanning  */
+
+	b	L(found)
+L(foundat1st):
+	mr	r7, r6
+	b	L(found)
+L(foundat2nd):
+	mr	r7, r11
+	b	L(found)
+L(foundat3rd):
+	mr	r7, r4
+L(found):
+	lbz	r8, 0(r7)
+	cmpdi	cr7, r8, 0
+	beq	cr7, L(end)
+	li	r10, 0
+	stb	r10, 0(r7)	/* Terminate string */
+	addi	r7, r7, 1	/* Store the pointer to the next char */
+L(end):
+#ifdef USE_AS_STRTOK_R
+	std	r7, 0(r5)	/* Update saveptr */
+#else
+	std	r7, .LANCHOR0@toc@l(r5)
+#endif
+	blr			/* done  */
+L(returnNULL):
+#ifndef USE_AS_STRTOK_R
+	li	r7, 0
+#endif
+	li	r3, 0		/* return NULL */
+	b	L(end)
+END(FUNC_NAME)
+#ifdef USE_AS_STRTOK_R
+libc_hidden_builtin_def (strtok_r)
+#else
+	.section        ".bss"
+	.align 3
+	.set    .LANCHOR0,. + 0
+	.type   olds, @object
+	.size   olds, 8
+olds:
+	.zero   8
+libc_hidden_builtin_def (strtok)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/strtok_r.S b/sysdeps/powerpc/powerpc64/strtok_r.S
new file mode 100644
index 0000000..6e5d301
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/strtok_r.S
@@ -0,0 +1,24 @@
+/* Optimized strtok_r implementation for PowerPC64.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STRTOK_R
+#include <sysdeps/powerpc/powerpc64/strtok.S>
+
+weak_alias (__strtok_r, strtok_r)
+libc_hidden_def (__strtok_r)
+libc_hidden_builtin_def (strtok_r)

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8b063985c1a750a1947fcf60e4606a3b0d7d0f37

commit 8b063985c1a750a1947fcf60e4606a3b0d7d0f37
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Tue Nov 25 14:32:54 2014 -0500

    powerpc: Fix missing barriers in atomic_exchange_and_add_{acq,rel}
    
    On powerpc, atomic_exchange_and_add is implemented without any
    barriers.  This patchs adds the missing instruction and memory barrier
    for acquire and release semanthics.

diff --git a/ChangeLog b/ChangeLog
index 103f1ed..814486e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2014-11-26  Adhemerval Zanella  <azanella@linux.ibm.com>
+
+	* csu/tst-atomic.c (do_test): Add atomic_exchange_and_add_{acq,rel}
+	tests.
+	* sysdeps/powerpc/bits/atomic.h
+	(__arch_atomic_exchange_and_add_32_acq): Add definition.
+	(__arch_atomic_exchange_and_add_32_rel): Likewise.
+	(atomic_exchange_and_add_acq): Likewise.
+	(atomic_exchange_and_add_rel): Likewise.
+	* sysdeps/powerpc/powerpc32/bits/atomic.h
+	(__arch_atomic_exchange_and_add_64_acq): Add definition.
+	(__arch_atomic_exchange_and_add_64_rel): Likewise.
+	* sysdeps/powerpc/powerpc64/bits/atomic.h
+	(__arch_atomic_exchange_and_add_64_acq): Add definition.
+	(__arch_atomic_exchange_and_add_64_rel): Likewise.
+
 2014-11-25  Anton Blanchard <anton@samba.org>
 
 	* sysdeps/powerpc/bits/atomic.h
diff --git a/csu/tst-atomic.c b/csu/tst-atomic.c
index d16c66d..ab6db45 100644
--- a/csu/tst-atomic.c
+++ b/csu/tst-atomic.c
@@ -113,6 +113,22 @@ do_test (void)
       ret = 1;
     }
 
+  mem = 2;
+  if (atomic_exchange_and_add_acq (&mem, 11) != 2
+      || mem != 13)
+    {
+      puts ("atomic_exchange_and_add test failed");
+      ret = 1;
+    }
+
+  mem = 2;
+  if (atomic_exchange_and_add_rel (&mem, 11) != 2
+      || mem != 13)
+    {
+      puts ("atomic_exchange_and_add test failed");
+      ret = 1;
+    }
+
   mem = -21;
   atomic_add (&mem, 22);
   if (mem != 1)
diff --git a/sysdeps/powerpc/bits/atomic.h b/sysdeps/powerpc/bits/atomic.h
index f312676..b05b0f7 100644
--- a/sysdeps/powerpc/bits/atomic.h
+++ b/sysdeps/powerpc/bits/atomic.h
@@ -152,6 +152,34 @@ typedef uintmax_t uatomic_max_t;
     __val;								      \
   })
 
+#define __arch_atomic_exchange_and_add_32_acq(mem, value) \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile ("1:	lwarx	%0,0,%3" MUTEX_HINT_ACQ "\n"	      \
+		      "		add	%1,%0,%4\n"			      \
+		      "		stwcx.	%1,0,%3\n"			      \
+		      "		bne-	1b\n"				      \
+		      __ARCH_ACQ_INSTR					      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
+#define __arch_atomic_exchange_and_add_32_rel(mem, value) \
+  ({									      \
+    __typeof (*mem) __val, __tmp;					      \
+    __asm __volatile (__ARCH_REL_INSTR "\n"				      \
+		      "1:	lwarx	%0,0,%3" MUTEX_HINT_REL "\n"	      \
+		      "		add	%1,%0,%4\n"			      \
+		      "		stwcx.	%1,0,%3\n"			      \
+		      "		bne-	1b"				      \
+		      : "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+		      : "b" (mem), "r" (value), "m" (*mem)		      \
+		      : "cr0", "memory");				      \
+    __val;								      \
+  })
+
 #define __arch_atomic_increment_val_32(mem) \
   ({									      \
     __typeof (*(mem)) __val;						      \
@@ -252,6 +280,28 @@ typedef uintmax_t uatomic_max_t;
        abort ();							      \
     __result;								      \
   })
+#define atomic_exchange_and_add_acq(mem, value) \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (sizeof (*mem) == 4)						      \
+      __result = __arch_atomic_exchange_and_add_32_acq (mem, value);	      \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_atomic_exchange_and_add_64_acq (mem, value);	      \
+    else 								      \
+       abort ();							      \
+    __result;								      \
+  })
+#define atomic_exchange_and_add_rel(mem, value) \
+  ({									      \
+    __typeof (*(mem)) __result;						      \
+    if (sizeof (*mem) == 4)						      \
+      __result = __arch_atomic_exchange_and_add_32_rel (mem, value);	      \
+    else if (sizeof (*mem) == 8)					      \
+      __result = __arch_atomic_exchange_and_add_64_rel (mem, value);	      \
+    else 								      \
+       abort ();							      \
+    __result;								      \
+  })
 
 #define atomic_increment_val(mem) \
   ({									      \
diff --git a/sysdeps/powerpc/powerpc32/bits/atomic.h b/sysdeps/powerpc/powerpc32/bits/atomic.h
index a3dd09c..7422262 100644
--- a/sysdeps/powerpc/powerpc32/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc32/bits/atomic.h
@@ -95,6 +95,12 @@
 #define __arch_atomic_exchange_and_add_64(mem, value) \
     ({ abort (); (*mem) = (value); })
 
+#define __arch_atomic_exchange_and_add_64_acq(mem, value) \
+    ({ abort (); (*mem) = (value); })
+
+#define __arch_atomic_exchange_and_add_64_rel(mem, value) \
+    ({ abort (); (*mem) = (value); })
+
 #define __arch_atomic_increment_val_64(mem) \
     ({ abort (); (*mem)++; })
 
diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h
index 9cab0a2..e64cb9f 100644
--- a/sysdeps/powerpc/powerpc64/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc64/bits/atomic.h
@@ -183,6 +183,34 @@
       __val;								      \
     })
 
+#define __arch_atomic_exchange_and_add_64_acq(mem, value) \
+    ({									      \
+      __typeof (*mem) __val, __tmp;					      \
+      __asm __volatile ("1:	ldarx	%0,0,%3" MUTEX_HINT_ACQ "\n"	      \
+			"	add	%1,%0,%4\n"			      \
+			"	stdcx.	%1,0,%3\n"			      \
+			"	bne-	1b\n"				      \
+			__ARCH_ACQ_INSTR				      \
+			: "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+			: "b" (mem), "r" (value), "m" (*mem)		      \
+			: "cr0", "memory");				      \
+      __val;								      \
+    })
+
+#define __arch_atomic_exchange_and_add_64_rel(mem, value) \
+    ({									      \
+      __typeof (*mem) __val, __tmp;					      \
+      __asm __volatile (__ARCH_REL_INSTR "\n"				      \
+			"1:	ldarx	%0,0,%3" MUTEX_HINT_REL "\n"	      \
+			"	add	%1,%0,%4\n"			      \
+			"	stdcx.	%1,0,%3\n"			      \
+			"	bne-	1b"				      \
+			: "=&b" (__val), "=&r" (__tmp), "=m" (*mem)	      \
+			: "b" (mem), "r" (value), "m" (*mem)		      \
+			: "cr0", "memory");				      \
+      __val;								      \
+    })
+
 #define __arch_atomic_increment_val_64(mem) \
     ({									      \
       __typeof (*(mem)) __val;						      \

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=364c58517bdcc91c5bf1fcb57b4befff8951a51b

commit 364c58517bdcc91c5bf1fcb57b4befff8951a51b
Author: Anton Blanchard <anton@samba.org>
Date:   Tue Nov 25 07:26:12 2014 -0500

    powerpc: Fix __arch_compare_and_exchange_bool_64_rel
    
    Fix a typo in the inline assembly.

diff --git a/ChangeLog b/ChangeLog
index 9cd75d5..103f1ed 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2014-11-25  Anton Blanchard <anton@samba.org>
+
+	* sysdeps/powerpc/bits/atomic.h
+	(__arch_compare_and_exchange_bool_64_rel): Load from mem.
+
 2014-11-19  Carlos O'Donell  <carlos@redhat.com>
 	    Florian Weimer  <fweimer@redhat.com>
 	    Joseph Myers  <joseph@codesourcery.com>
diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h
index ed26b72..9cab0a2 100644
--- a/sysdeps/powerpc/powerpc64/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc64/bits/atomic.h
@@ -97,7 +97,7 @@
 ({									      \
   unsigned long	__tmp;							      \
   __asm __volatile (__ARCH_REL_INSTR "\n"				      \
-		    "1:	ldarx	%0,0,%2" MUTEX_HINT_REL "\n"		      \
+		    "1:	ldarx	%0,0,%1" MUTEX_HINT_REL "\n"		      \
 		    "	subf.	%0,%2,%0\n"				      \
 		    "	bne	2f\n"					      \
 		    "	stdcx.	%3,0,%1\n"				      \

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=133a24ba079abf1e762bd4d85670e0bd8df660c4

commit 133a24ba079abf1e762bd4d85670e0bd8df660c4
Author: Carlos O'Donell <carlos@redhat.com>
Date:   Wed Nov 19 11:44:12 2014 -0500

    CVE-2014-7817: wordexp fails to honour WRDE_NOCMD.
    
    The function wordexp() fails to properly handle the WRDE_NOCMD
    flag when processing arithmetic inputs in the form of "$((... ``))"
    where "..." can be anything valid. The backticks in the arithmetic
    epxression are evaluated by in a shell even if WRDE_NOCMD forbade
    command substitution. This allows an attacker to attempt to pass
    dangerous commands via constructs of the above form, and bypass
    the WRDE_NOCMD flag. This patch fixes this by checking for WRDE_NOCMD
    in exec_comm(), the only place that can execute a shell. All other
    checks for WRDE_NOCMD are superfluous and removed.
    
    We expand the testsuite and add 3 new regression tests of roughly
    the same form but with a couple of nested levels.
    
    On top of the 3 new tests we add fork validation to the WRDE_NOCMD
    testing. If any forks are detected during the execution of a wordexp()
    call with WRDE_NOCMD, the test is marked as failed. This is slightly
    heuristic since vfork might be used in the future, but it provides a
    higher level of assurance that no shells were executed as part of
    command substitution with WRDE_NOCMD in effect. In addition it doesn't
    require libpthread or libdl, instead we use the public implementation
    namespace function __register_atfork (already part of the public ABI
    for libpthread).
    
    Tested on x86_64 with no regressions.

diff --git a/ChangeLog b/ChangeLog
index e2239f3..9cd75d5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,25 @@
+2014-11-19  Carlos O'Donell  <carlos@redhat.com>
+	    Florian Weimer  <fweimer@redhat.com>
+	    Joseph Myers  <joseph@codesourcery.com>
+	    Adam Conrad  <adconrad@0c3.net>
+	    Andreas Schwab  <schwab@suse.de>
+	    Brooks  <bmoses@google.com>
+
+	[BZ #17625]
+	* wordexp-test.c (__dso_handle): Add prototype.
+	(__register_atfork): Likewise.
+	(__app_register_atfork): New function.
+	(registered_forks): New global.
+	(register_fork): New function.
+	(test_case): Add 3 new tests for WRDE_CMDSUB.
+	(main): Call __app_register_atfork.
+	(testit): If WRDE_NOCMD set registered_forks to zero, run test, and if
+	fork count is non-zero fail the test.
+	* posix/wordexp.c (exec_comm): Return WRDE_CMDSUB if WRDE_NOCMD flag
+	is set.
+	(parse_dollars): Remove check for WRDE_NOCMD.
+	(parse_dquote): Likewise.
+
 2014-11-05  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Simplify
diff --git a/NEWS b/NEWS
index c555f75..20106dc 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,13 @@ Version 2.20.1
 
 * The following bugs are resolved with this release:
 
-  17266, 17370, 17371.
+  17266, 17370, 17371, 17625.
+
+* CVE-2104-7817 The wordexp function could ignore the WRDE_NOCMD flag
+  under certain input conditions resulting in the execution of a shell for
+  command substitution when the applicaiton did not request it. The
+  implementation now checks WRDE_NOCMD immediately before executing the
+  shell and returns the error WRDE_CMDSUB as expected.
 
 Version 2.20
 
diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
index 4957006..bdd65e4 100644
--- a/posix/wordexp-test.c
+++ b/posix/wordexp-test.c
@@ -27,6 +27,25 @@
 
 #define IFS " \n\t"
 
+extern void *__dso_handle __attribute__ ((__weak__, __visibility__ ("hidden")));
+extern int __register_atfork (void (*) (void), void (*) (void), void (*) (void), void *);
+
+static int __app_register_atfork (void (*prepare) (void), void (*parent) (void), void (*child) (void))
+{
+  return __register_atfork (prepare, parent, child,
+			    &__dso_handle == NULL ? NULL : __dso_handle);
+}
+
+/* Number of forks seen.  */
+static int registered_forks;
+
+/* For each fork increment the fork count.  */
+static void
+register_fork (void)
+{
+  registered_forks++;
+}
+
 struct test_case_struct
 {
   int retval;
@@ -206,6 +225,12 @@ struct test_case_struct
     { WRDE_SYNTAX, NULL, "$((2+))", 0, 0, { NULL, }, IFS },
     { WRDE_SYNTAX, NULL, "`", 0, 0, { NULL, }, IFS },
     { WRDE_SYNTAX, NULL, "$((010+4+))", 0, 0, { NULL }, IFS },
+    /* Test for CVE-2014-7817. We test 3 combinations of command
+       substitution inside an arithmetic expression to make sure that
+       no commands are executed and error is returned.  */
+    { WRDE_CMDSUB, NULL, "$((`echo 1`))", WRDE_NOCMD, 0, { NULL, }, IFS },
+    { WRDE_CMDSUB, NULL, "$((1+`echo 1`))", WRDE_NOCMD, 0, { NULL, }, IFS },
+    { WRDE_CMDSUB, NULL, "$((1+$((`echo 1`))))", WRDE_NOCMD, 0, { NULL, }, IFS },
 
     { -1, NULL, NULL, 0, 0, { NULL, }, IFS },
   };
@@ -258,6 +283,15 @@ main (int argc, char *argv[])
 	  return -1;
     }
 
+  /* If we are not allowed to do command substitution, we install
+     fork handlers to verify that no forks happened.  No forks should
+     happen at all if command substitution is disabled.  */
+  if (__app_register_atfork (register_fork, NULL, NULL) != 0)
+    {
+      printf ("Failed to register fork handler.\n");
+      return -1;
+    }
+
   for (test = 0; test_case[test].retval != -1; test++)
     if (testit (&test_case[test]))
       ++fail;
@@ -367,6 +401,9 @@ testit (struct test_case_struct *tc)
 
   printf ("Test %d (%s): ", ++tests, tc->words);
 
+  if (tc->flags & WRDE_NOCMD)
+    registered_forks = 0;
+
   if (tc->flags & WRDE_APPEND)
     {
       /* initial wordexp() call, to be appended to */
@@ -378,6 +415,13 @@ testit (struct test_case_struct *tc)
     }
   retval = wordexp (tc->words, &we, tc->flags);
 
+  if ((tc->flags & WRDE_NOCMD)
+      && (registered_forks > 0))
+    {
+	  printf ("FAILED fork called for WRDE_NOCMD\n");
+	  return 1;
+    }
+
   if (tc->flags & WRDE_DOOFFS)
       start_offs = sav_we.we_offs;
 
diff --git a/posix/wordexp.c b/posix/wordexp.c
index b6b65dd..26f3a26 100644
--- a/posix/wordexp.c
+++ b/posix/wordexp.c
@@ -893,6 +893,10 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
   pid_t pid;
   int noexec = 0;
 
+  /* Do nothing if command substitution should not succeed.  */
+  if (flags & WRDE_NOCMD)
+    return WRDE_CMDSUB;
+
   /* Don't fork() unless necessary */
   if (!comm || !*comm)
     return 0;
@@ -2082,9 +2086,6 @@ parse_dollars (char **word, size_t *word_length, size_t *max_length,
 	    }
 	}
 
-      if (flags & WRDE_NOCMD)
-	return WRDE_CMDSUB;
-
       (*offset) += 2;
       return parse_comm (word, word_length, max_length, words, offset, flags,
 			 quoted? NULL : pwordexp, ifs, ifs_white);
@@ -2196,9 +2197,6 @@ parse_dquote (char **word, size_t *word_length, size_t *max_length,
 	  break;
 
 	case '`':
-	  if (flags & WRDE_NOCMD)
-	    return WRDE_CMDSUB;
-
 	  ++(*offset);
 	  error = parse_backtick (word, word_length, max_length, words,
 				  offset, flags, NULL, NULL, NULL);
@@ -2357,12 +2355,6 @@ wordexp (const char *words, wordexp_t *pwordexp, int flags)
 	break;
 
       case '`':
-	if (flags & WRDE_NOCMD)
-	  {
-	    error = WRDE_CMDSUB;
-	    goto do_error;
-	  }
-
 	++words_offset;
 	error = parse_backtick (&word, &word_length, &max_length, words,
 				&words_offset, flags, pwordexp, ifs,

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f8fbd413672816a429adc6b6c191ec8ea73421e8

commit f8fbd413672816a429adc6b6c191ec8ea73421e8
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Wed Nov 5 08:01:09 2014 -0500

    powerpc: Simplify encoding of POWER8 instruction

diff --git a/ChangeLog b/ChangeLog
index e4de9d8..e2239f3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2014-11-05  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Simplify
+	definition.
+	* sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S (MFVSRD_R3_V1):
+	Likwise.
+	* sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S (MFVSRD_R3_V1):
+	Likewise.
+	* sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S (MFVSRD_R3_V1):
+	Likewise.
+	* sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S (MFVSRD_R3_V1):
+	Likewise.
+	* sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S (MFVSRD_R3_V1):
+	Likewise.
+
 2014-11-03  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Encode
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
index 2b27e7b..3e98126 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
@@ -17,14 +17,9 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <endian.h>
 #include <math_ldbl_opt.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1  .byte 0x66,0x00,0x23,0x7c     /* mfvsrd  r3,vs1  */
-#else
-#define MFVSRD_R3_V1  .byte 0x7c,0x23,0x00,0x66     /* mfvsrd  r3,vs1  */
-#endif
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* int [r3] __finite ([fp1] x)  */
 
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
index d09b7fc..125de39 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
@@ -17,14 +17,9 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <endian.h>
 #include <math_ldbl_opt.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1  .byte 0x66,0x00,0x23,0x7c     /* mfvsrd  r3,vs1  */
-#else
-#define MFVSRD_R3_V1  .byte 0x7c,0x23,0x00,0x66     /* mfvsrd  r3,vs1  */
-#endif
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* int [r3] __isinf([fp1] x)  */
 
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
index cf119e5..2c7b2d1 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
@@ -17,14 +17,9 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <endian.h>
 #include <math_ldbl_opt.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1  .byte 0x66,0x00,0x23,0x7c     /* mfvsrd  r3,vs1  */
-#else
-#define MFVSRD_R3_V1  .byte 0x7c,0x23,0x00,0x66     /* mfvsrd  r3,vs1  */
-#endif
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* int [r3] __isnan([f1] x)  */
 
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
index 9a55d93..ce48d4e 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
@@ -17,14 +17,9 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <endian.h>
 #include <math_ldbl_opt.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1  .byte 0x66,0x00,0x23,0x7c     /* mfvsrd  r3,vs1  */
-#else
-#define MFVSRD_R3_V1  .byte 0x7c,0x23,0x00,0x66     /* mfvsrd  r3,vs1  */
-#endif
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* long long int[r3] __llrint (double x[fp1])  */
 ENTRY (__llrint)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
index f10c06a..17cf30e 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
@@ -20,11 +20,7 @@
 #include <endian.h>
 #include <math_ldbl_opt.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1  .byte 0x66,0x00,0x23,0x7c     /* mfvsrd  r3,vs1  */
-#else
-#define MFVSRD_R3_V1  .byte 0x7c,0x23,0x00,0x66     /* mfvsrd  r3,vs1  */
-#endif
+#define MFVSRD_R3_V1  .long 0x7c230066     /* mfvsrd  r3,vs1  */
 
 /* long long [r3] llround (float x [fp1])  */
 
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
index cebcbdf..d7324dc 100644
--- a/sysdeps/powerpc/powerpc64/power8/memset.S
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -17,13 +17,8 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
-#include <endian.h>
 
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MTVSRD_V1_R4  .byte 0x66,0x01,0x24,0x7c     /* mtvsrd  v1,r4  */
-#else
-#define MTVSRD_V1_R4  .byte 0x7c,0x24,0x01,0x66
-#endif
+#define MTVSRD_V1_R4  .long 0x7c240166     /* mtvsrd  v1,r4  */
 
 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
    Returns 's'.  */

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e154589132de915ef165a1e26f89ba6997170c2b

commit e154589132de915ef165a1e26f89ba6997170c2b
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Mon Nov 3 07:26:33 2014 -0500

    powerpc: Fix encoding of POWER8 instruction
    
    This patch adds a binary encoding for 'mtvsrd' instruction to avoid
    build failures when assembler does not support POWER8.

diff --git a/ChangeLog b/ChangeLog
index bcc7072..e4de9d8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2014-11-03  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Encode
+	mtvsrd instruction in binary form.
+
 2014-10-31  Torvald Riegel  <triegel@redhat.com>
 
 	* sysdeps/powerpc/bits/atomic.h (atomic_write_barrier): Remove and...
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
index 191a4df..cebcbdf 100644
--- a/sysdeps/powerpc/powerpc64/power8/memset.S
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -17,6 +17,13 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#include <endian.h>
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define MTVSRD_V1_R4  .byte 0x66,0x01,0x24,0x7c     /* mtvsrd  v1,r4  */
+#else
+#define MTVSRD_V1_R4  .byte 0x7c,0x24,0x01,0x66
+#endif
 
 /* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
    Returns 's'.  */
@@ -142,7 +149,7 @@ L(tail_bytes):
 	   vector instruction to achieve best throughput.  */
 L(huge_vector):
 	/* Replicate set byte to quadword in VMX register.  */
-	mtvsrd	 v1,r4
+	MTVSRD_V1_R4
 	xxpermdi 32,v0,v1,0
 	vspltb	 v2,v0,15
 

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=46f58099960f7a2603c37c540d2644e392f0fdc7

commit 46f58099960f7a2603c37c540d2644e392f0fdc7
Author: Torvald Riegel <triegel@redhat.com>
Date:   Sat Oct 18 01:01:58 2014 +0200

    powerpc: Change atomic_write_barrier to have release semantics.

diff --git a/ChangeLog b/ChangeLog
index e67dd7c..bcc7072 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2014-10-31  Torvald Riegel  <triegel@redhat.com>
+
+	* sysdeps/powerpc/bits/atomic.h (atomic_write_barrier): Remove and...
+	* sysdeps/powerpc/powerpc32/bits/atomic.h (atomic_write_barrier):
+	... add here and use lwsync or sync ...
+	* sysdeps/powerpc/powerpc64/bits/atomic.h (atomic_write_barrier):
+	... and add here using lwsync.
+
 2014-09-10  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
 	* benchtests/bench-memset.c (test_main): Add more test from size
diff --git a/sysdeps/powerpc/bits/atomic.h b/sysdeps/powerpc/bits/atomic.h
index 2ffba48..f312676 100644
--- a/sysdeps/powerpc/bits/atomic.h
+++ b/sysdeps/powerpc/bits/atomic.h
@@ -77,7 +77,6 @@ typedef uintmax_t uatomic_max_t;
 #endif
 
 #define atomic_full_barrier()	__asm ("sync" ::: "memory")
-#define atomic_write_barrier()	__asm ("eieio" ::: "memory")
 
 #define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval)	      \
   ({									      \
diff --git a/sysdeps/powerpc/powerpc32/bits/atomic.h b/sysdeps/powerpc/powerpc32/bits/atomic.h
index 7613bdc..a3dd09c 100644
--- a/sysdeps/powerpc/powerpc32/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc32/bits/atomic.h
@@ -117,6 +117,7 @@
 # ifndef UP
 #  define __ARCH_REL_INSTR	"lwsync"
 # endif
+# define atomic_write_barrier()	__asm ("lwsync" ::: "memory")
 #else
 /*
  * Older powerpc32 processors don't support the new "light weight"
@@ -124,6 +125,7 @@
  * for all powerpc32 applications.
  */
 # define atomic_read_barrier()	__asm ("sync" ::: "memory")
+# define atomic_write_barrier()	__asm ("sync" ::: "memory")
 #endif
 
 /*
diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h
index 527fe7c..ed26b72 100644
--- a/sysdeps/powerpc/powerpc64/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc64/bits/atomic.h
@@ -234,6 +234,7 @@
 #ifndef UP
 # define __ARCH_REL_INSTR	"lwsync"
 #endif
+#define atomic_write_barrier()	__asm ("lwsync" ::: "memory")
 
 /*
  * Include the rest of the atomic ops macros which are common to both

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5f892cacbdf50322bc3ee2e131c105c71b495086

commit 5f892cacbdf50322bc3ee2e131c105c71b495086
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Tue Jul 15 12:19:09 2014 -0400

    PowerPC: memset optimization for POWER8/PPC64
    
    This patch adds an optimized memset implementation for POWER8.  For
    sizes from 0 to 255 bytes, a word/doubleword algorithm similar to
    POWER7 optimized one is used.
    
    For size higher than 255 two strategies are used:
    
    1. If the constant is different than 0, the memory is written with
       altivec vector instruction;
    
    2. If constant is 0, dbcz instructions are used.  The loop is unrolled
       to clear 512 byte at time.
    
    Using vector instructions increases throughput considerable, with a
    double performance for sizes larger than 1024.  The dcbz loops unrolls
    also shows performance improvement, by doubling throughput for sizes
    larger than 8192 bytes.

diff --git a/ChangeLog b/ChangeLog
index 85024b2..e67dd7c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
 2014-09-10  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
 
+	* benchtests/bench-memset.c (test_main): Add more test from size
+	from 32 to 512 bytes.
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Add POWER8 memset object.
+	* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+	(__libc_ifunc_impl_list): Add POWER8 memset and bzero implementations.
+	* sysdeps/powerpc/powerpc64/multiarch/bzero.c (__bzero): Add POWER8
+	implementation.
+	* sysdeps/powerpc/powerpc64/multiarch/memset.c (__libc_memset):
+	Likewise.
+	* sysdeps/powerpc/powerpc64/multiarch/memset-power8.S: New file:
+	multiarch POWER8 memset optimization.
+	* sysdeps/powerpc/powerpc64/power8/memset.S: New file: optimized
+	POWER8 memset optimization.
+
 	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
 	Remove bzero multiarch objects.
 	* sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S: Remove file.
diff --git a/benchtests/bench-memset.c b/benchtests/bench-memset.c
index 5304113..2026593 100644
--- a/benchtests/bench-memset.c
+++ b/benchtests/bench-memset.c
@@ -150,6 +150,11 @@ test_main (void)
 	  if (i & (i - 1))
 	    do_test (0, c, i);
 	}
+      for (i = 32; i < 512; i+=32)
+	{
+	  do_test (0, c, i);
+	  do_test (i, c, i);
+	}
       do_test (1, c, 14);
       do_test (3, c, 1024);
       do_test (4, c, 64);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 0de3804..abc9d2e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -2,7 +2,7 @@ ifeq ($(subdir),string)
 sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
 		   memcmp-ppc64 memset-power7 memset-power6 memset-power4 \
-		   memset-ppc64 \
+		   memset-ppc64 memset-power8 \
 		   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
 		   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
 		   rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index ed83541..298cf00 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -26,14 +26,17 @@ extern __typeof (bzero) __bzero_ppc attribute_hidden;
 extern __typeof (bzero) __bzero_power4 attribute_hidden;
 extern __typeof (bzero) __bzero_power6 attribute_hidden;
 extern __typeof (bzero) __bzero_power7 attribute_hidden;
+extern __typeof (bzero) __bzero_power8 attribute_hidden;
 
 libc_ifunc (__bzero,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __bzero_power7 :
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __bzero_power8 :
+	      (hwcap & PPC_FEATURE_HAS_VSX)
+	      ? __bzero_power7 :
+		(hwcap & PPC_FEATURE_ARCH_2_05)
 		? __bzero_power6 :
 		  (hwcap & PPC_FEATURE_POWER4)
-		? __bzero_power4
+		  ? __bzero_power4
             : __bzero_ppc);
 
 weak_alias (__bzero, bzero)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index a574487..06d5be9 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -34,6 +34,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   size_t i = 0;
 
   unsigned long int hwcap = GLRO(dl_hwcap);
+  unsigned long int hwcap2 = GLRO(dl_hwcap2);
+
   /* hwcap contains only the latest supported ISA, the code checks which is
      and fills the previous supported ones.  */
   if (hwcap & PPC_FEATURE_ARCH_2_06)
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c.  */
   IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __memset_power8)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
 			      __memset_power7)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
@@ -134,6 +138,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c.  */
   IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+			      __bzero_power8)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
 			      __bzero_power7)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
new file mode 100644
index 0000000..e8a604b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
@@ -0,0 +1,43 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words)				\
+  .section ".text";						\
+  ENTRY_2(__memset_power8)					\
+  .align ALIGNARG(alignt);					\
+  EALIGN_W_##words;						\
+  BODY_LABEL(__memset_power8):					\
+  cfi_startproc;						\
+  LOCALENTRY(__memset_power8)
+
+#undef END_GEN_TB
+#define END_GEN_TB(name, mask)					\
+  cfi_endproc;							\
+  TRACEBACK_MASK(__memset_power8,mask)				\
+  END_2(__memset_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bzero
+#define __bzero __bzero_power8
+
+#include <sysdeps/powerpc/powerpc64/power8/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index aa2ae70..9c7ed10 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -32,16 +32,19 @@ extern __typeof (__redirect_memset) __memset_ppc attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (__libc_memset,
-            (hwcap & PPC_FEATURE_HAS_VSX)
-            ? __memset_power7 :
-	      (hwcap & PPC_FEATURE_ARCH_2_05)
+            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+            ? __memset_power8 :
+	      (hwcap & PPC_FEATURE_HAS_VSX)
+	      ? __memset_power7 :
+		(hwcap & PPC_FEATURE_ARCH_2_05)
 		? __memset_power6 :
 		  (hwcap & PPC_FEATURE_POWER4)
-		? __memset_power4
+		  ? __memset_power4
             : __memset_ppc);
 
 #undef memset
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
new file mode 100644
index 0000000..191a4df
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -0,0 +1,449 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+   Copyright (C) 2014 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
+   Returns 's'.  */
+
+	.machine power8
+EALIGN (memset, 5, 0)
+	CALL_MCOUNT 3
+
+L(_memset):
+	cmpldi	cr7,r5,31
+	neg	r0,r3
+	mr	r10,r3
+
+	insrdi	r4,r4,8,48
+	insrdi	r4,r4,16,32	/* Replicate byte to word.  */
+	ble	cr7,L(write_LT_32)
+
+	andi.	r11,r10,15	/* Check alignment of DST.  */
+	insrdi	r4,r4,32,0	/* Replicate word to double word.  */
+
+	beq	L(big_aligned)
+
+	mtocrf	0x01,r0
+	clrldi	r0,r0,60
+
+	/* Get DST aligned to 16 bytes.  */
+1:	bf	31,2f
+	stb	r4,0(r10)
+	addi	r10,r10,1
+
+2:	bf	30,4f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+4:	bf	29,8f
+	stw	r4,0(r10)
+	addi	r10,r10,4
+
+8:	bf      28,16f
+	std     r4,0(r10)
+	addi    r10,r10,8
+
+16:	subf	r5,r0,r5
+
+	.align	4
+L(big_aligned):
+	/* For sizes larger than 255 two possible paths:
+	   - if constant is '0', zero full cache lines with dcbz
+	   - otherwise uses vector instructions.  */
+	cmpldi	cr5,r5,255
+	dcbtst	0,r10
+	cmpldi	cr6,r4,0
+	crand	27,26,21
+	bt	27,L(huge_dcbz)
+	bge	cr5,L(huge_vector)
+
+
+	/* Size between 32 and 255 bytes with constant different than 0, use
+	   doubleword store instruction to achieve best throughput.  */
+	srdi    r8,r5,5
+	clrldi  r11,r5,59
+	cmpldi  cr6,r11,0
+	cmpdi	r8,0
+	beq     L(tail_bytes)
+	mtctr   r8
+
+	/* Main aligned write loop, writes 32-bytes at a time.  */
+	.align  4
+L(big_loop):
+	std     r4,0(r10)
+	std     r4,8(r10)
+	std     r4,16(r10)
+	std     r4,24(r10)
+	addi    r10,r10,32
+	bdz     L(tail_bytes)
+
+	std     r4,0(r10)
+	std     r4,8(r10)
+	std     r4,16(r10)
+	std     r4,24(r10)
+	addi    r10,10,32
+	bdnz    L(big_loop)
+
+	b       L(tail_bytes)
+
+	/* Write remaining 1~31 bytes.  */
+	.align  4
+L(tail_bytes):
+	beqlr   cr6
+
+	srdi    r7,r11,4
+	clrldi  r8,r11,60
+	mtocrf  0x01,r7
+
+	.align	4
+	bf	31,8f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+
+	.align	4
+8:	mtocrf	0x1,r8
+	bf	28,4f
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+	.align	4
+4:	bf      29,2f
+	stw     4,0(10)
+	addi    10,10,4
+
+	.align 	4
+2:	bf      30,1f
+	sth     4,0(10)
+	addi    10,10,2
+
+	.align  4
+1:      bflr    31
+	stb     4,0(10)
+	blr
+
+	/* Size larger than 255 bytes with constant different than 0, use
+	   vector instruction to achieve best throughput.  */
+L(huge_vector):
+	/* Replicate set byte to quadword in VMX register.  */
+	mtvsrd	 v1,r4
+	xxpermdi 32,v0,v1,0
+	vspltb	 v2,v0,15
+
+	/* Main aligned write loop: 128 bytes at a time.  */
+	li	r6,16
+	li	r7,32
+	li	r8,48
+	mtocrf	0x02,r5
+	srdi	r12,r5,7
+	cmpdi	r12,0
+	beq	L(aligned_tail)
+	mtctr	r12
+	b	L(aligned_128loop)
+
+	.align  4
+L(aligned_128loop):
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+	bdnz	L(aligned_128loop)
+
+	/* Write remaining 1~127 bytes.  */
+L(aligned_tail):
+	mtocrf	0x01,r5
+	bf	25,32f
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	stvx	v2,r10,r7
+	stvx	v2,r10,r8
+	addi	r10,r10,64
+
+32:	bf	26,16f
+	stvx	v2,0,r10
+	stvx	v2,r10,r6
+	addi	r10,r10,32
+
+16:	bf	27,8f
+	stvx	v2,0,r10
+	addi	r10,r10,16
+
+8:	bf	28,4f
+	std     r4,0(r10)
+	addi	r10,r10,8
+
+	/* Copies 4~7 bytes.  */
+4:	bf	29,L(tail2)
+	stw     r4,0(r10)
+	bf      30,L(tail5)
+	sth     r4,4(r10)
+	bflr	31
+	stb     r4,6(r10)
+	/* Return original DST pointer.  */
+	blr
+
+	/* Special case when value is 0 and we have a long length to deal
+	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
+	   Before using dcbz though, we need to get the destination 128-byte
+	   aligned.  */
+	.align	4
+L(huge_dcbz):
+	andi.	r11,r10,127
+	neg	r0,r10
+	beq	L(huge_dcbz_aligned)
+
+	clrldi	r0,r0,57
+	subf	r5,r0,r5
+	srdi	r0,r0,3
+	mtocrf	0x01,r0
+
+	/* Write 1~128 bytes until DST is aligned to 128 bytes.  */
+8:	bf	28,4f
+
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	std	r4,32(r10)
+	std	r4,40(r10)
+	std	r4,48(r10)
+	std	r4,56(r10)
+	addi	r10,r10,64
+
+	.align	4
+4:	bf	29,2f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	addi	r10,r10,32
+
+	.align	4
+2:	bf	30,1f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+
+	.align	4
+1:	bf	31,L(huge_dcbz_aligned)
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+L(huge_dcbz_aligned):
+	/* Setup dcbz unroll offsets and count numbers.  */
+	srdi	r8,r5,9
+	clrldi	r11,r5,55
+	cmpldi	cr6,r11,0
+	li	r9,128
+	cmpdi	r8,0
+	beq     L(huge_tail)
+	li	r7,256
+	li	r6,384
+	mtctr	r8
+
+	.align	4
+L(huge_loop):
+	/* Sets 512 bytes to zero in each iteration, the loop unrolling shows
+	   a throughput boost for large sizes (2048 bytes or higher).  */
+	dcbz	0,r10
+	dcbz	r9,r10
+	dcbz	r7,r10
+	dcbz	r6,r10
+	addi	r10,r10,512
+	bdnz	L(huge_loop)
+
+	beqlr	cr6
+
+L(huge_tail):
+	srdi    r6,r11,8
+	srdi    r7,r11,4
+	clrldi  r8,r11,4
+	cmpldi  cr6,r8,0
+	mtocrf  0x01,r6
+
+	beq	cr6,L(tail)
+
+	/* We have 1~511 bytes remaining.  */
+	.align	4
+32:	bf	31,16f
+	dcbz	0,r10
+	dcbz	r9,r10
+	addi	r10,r10,256
+
+	.align	4
+16:	mtocrf  0x01,r7
+	bf	28,8f
+	dcbz	0,r10
+	addi	r10,r10,128
+
+	.align 	4
+8:	bf	29,4f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	std	r4,32(r10)
+	std	r4,40(r10)
+	std	r4,48(r10)
+	std	r4,56(r10)
+	addi	r10,r10,64
+
+	.align	4
+4:	bf	30,2f
+	std	r4,0(r10)
+	std	r4,8(r10)
+	std	r4,16(r10)
+	std	r4,24(r10)
+	addi	r10,r10,32
+
+	.align	4
+2:	bf	31,L(tail)
+	std	r4,0(r10)
+	std	r4,8(r10)
+	addi	r10,r10,16
+	.align	4
+
+	/* Remaining 1~15 bytes.  */
+L(tail):
+	mtocrf  0x01,r8
+
+	.align
+8:	bf	28,4f
+	std	r4,0(r10)
+	addi	r10,r10,8
+
+	.align	4
+4:	bf	29,2f
+	stw	r4,0(r10)
+	addi	r10,r10,4
+
+	.align	4
+2:	bf	30,1f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+	.align	4
+1:	bflr	31
+	stb	r4,0(r10)
+	blr
+
+	/* Handle short copies of 0~31 bytes.  Best throughput is achieved
+	   by just unrolling all operations.  */
+	.align	4
+L(write_LT_32):
+	cmpldi	cr6,5,8
+	mtocrf	0x01,r5
+	ble	cr6,L(write_LE_8)
+
+	/* At least 9 bytes to go.  */
+	neg	r8,r4
+	andi.	r0,r8,3
+	cmpldi	cr1,r5,16
+	beq	L(write_LT_32_aligned)
+
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,r0
+	subf	r5,r0,r5
+
+2:	bf	30,1f
+	sth	r4,0(r10)
+	addi	r10,r10,2
+
+1:	bf	31,L(end_4bytes_alignment)
+	stb	r4,0(r10)
+	addi	r10,r10,1
+
+	.align	4
+L(end_4bytes_alignment):
+	cmpldi	cr1,r5,16
+	mtocrf	0x01,r5
+
+L(write_LT_32_aligned):
+	blt	cr1,8f
+
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	stw	r4,8(r10)
+	stw	r4,12(r10)
+	addi	r10,r10,16
+
+8:	bf	28,L(tail4)
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	addi	r10,r10,8
+
+	.align	4
+	/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	stw	r4,0(r10)
+	bf	30,L(tail5)
+	sth	r4,4(r10)
+	bflr	31
+	stb	r4,6(r10)
+	blr
+
+	.align	4
+	/* Copies 2~3 bytes.  */
+L(tail2):
+	bf	30,1f
+	sth	r4,0(r10)
+	bflr	31
+	stb	r4,2(r10)
+	blr
+
+	.align	4
+L(tail5):
+	bflr	31
+	stb	r4,4(r10)
+	blr
+
+	.align	4
+1: 	bflr	31
+	stb	r4,0(r10)
+	blr
+
+	/* Handles copies of 0~8 bytes.  */
+	.align	4
+L(write_LE_8):
+	bne	cr6,L(tail4)
+
+	stw	r4,0(r10)
+	stw	r4,4(r10)
+	blr
+END_GEN_TB (memset,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+   between bzero and memset.  */
+ENTRY (__bzero)
+	CALL_MCOUNT 3
+	mr	r5,r4
+	li	r4,0
+	b	L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e6bb56b6914e6435e251814a3a0ccd7fb65a7e36

commit e6bb56b6914e6435e251814a3a0ccd7fb65a7e36
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date:   Tue Jul 15 16:54:46 2014 -0400

    PowerPC: multiarch bzero cleanup for PPC64
    
    This patch cleanups the multiarch bzero for powerpc64 by remove
    the multiarch objects and use instead the the memset embedded
    implementation presented in each multiarch optimization.  The
    code generate is essentially the same, but the TB_TOCLESS (which
    is not essential).
    
    Conflicts:
    	ChangeLog
    
    This is backport of 3b473fecdf4c52989cd915b649bb6d26c042d048.

diff --git a/ChangeLog b/ChangeLog
index 890c3c6..85024b2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2014-09-10  Adhemerval Zanella  <azanella@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+	Remove bzero multiarch objects.
+	* sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S: Remove file.
+	* sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S: Likewise.
+	* sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S: Likewise.
+	* sysdeps/powerpc/powerpc64/multiarch/memset-power4.S [NO_BZERO_IMPL]:
+	Remove define.
+	[__bzero]: Redefine to specific name.
+	* sysdeps/powerpc/powerpc64/multiarch/memset-power6.S: Likewise.
+	* sysdeps/powerpc/powerpc64/multiarch/memset-power7.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power4/memset.S [NO_BZERO_IMPL]: Remove
+	define.
+	* sysdeps/powerpc/powerpc64/power6/memset.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/memset.S: Likewise.
+
 2014-09-16  Siddhesh Poyarekar  <siddhesh@redhat.com>
 
 	[BZ #17370]
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 82722fb..0de3804 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -2,7 +2,7 @@ ifeq ($(subdir),string)
 sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
 		   memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
 		   memcmp-ppc64 memset-power7 memset-power6 memset-power4 \
-		   memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \
+		   memset-ppc64 \
 		   mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
 		   memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
 		   rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S b/sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S
deleted file mode 100644
index 72b75ac..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Optimized bzero implementation for PowerPC64/POWER4.
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-ENTRY (__bzero_power4)
-	CALL_MCOUNT 3
-	mr	r5,r4
-	li	r4,0
-	b	__memset_power4
-END_GEN_TB (__bzero_power4,TB_TOCLESS)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S b/sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S
deleted file mode 100644
index d0917c5..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Optimized bzero implementation for PowerPC64/POWER6.
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-ENTRY (__bzero_power6)
-	CALL_MCOUNT 3
-	mr	r5,r4
-	li	r4,0
-	b	__memset_power6
-END_GEN_TB (__bzero_power6,TB_TOCLESS)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S b/sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S
deleted file mode 100644
index 0ec285a..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Optimized bzero implementation for PowerPC64/POWER7.
-   Copyright (C) 2013-2014 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-ENTRY (__bzero_power7)
-	CALL_MCOUNT 3
-	mr	r5,r4
-	li	r4,0
-	b	__memset_power7
-END_GEN_TB (__bzero_power7,TB_TOCLESS)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S
index 968dc24..1291fb7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S
@@ -37,5 +37,7 @@
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#define NO_BZERO_IMPL
+#undef __bzero
+#define __bzero __bzero_power4
+
 #include <sysdeps/powerpc/powerpc64/power4/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S
index 65519b9..3dc199c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S
@@ -37,5 +37,7 @@
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#define NO_BZERO_IMPL
+#undef __bzero
+#define __bzero __bzero_power6
+
 #include <sysdeps/powerpc/powerpc64/power6/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S
index 86765e7..fb1a342 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S
@@ -37,5 +37,6 @@
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 
-#define NO_BZERO_IMPL
+#undef __bzero
+#define __bzero __bzero_power7
 #include <sysdeps/powerpc/powerpc64/power7/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/power4/memset.S b/sysdeps/powerpc/powerpc64/power4/memset.S
index 3a1e9dc..b433d49 100644
--- a/sysdeps/powerpc/powerpc64/power4/memset.S
+++ b/sysdeps/powerpc/powerpc64/power4/memset.S
@@ -235,7 +235,6 @@ L(medium_28t):
 END_GEN_TB (memset,TB_TOCLESS)
 libc_hidden_builtin_def (memset)
 
-#ifndef NO_BZERO_IMPL
 /* Copied from bzero.S to prevent the linker from inserting a stub
    between bzero and memset.  */
 ENTRY (__bzero)
@@ -243,7 +242,7 @@ ENTRY (__bzero)
 	mr	r5,r4
 	li	r4,0
 	b	L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
 weak_alias (__bzero, bzero)
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power6/memset.S b/sysdeps/powerpc/powerpc64/power6/memset.S
index b5115a7..6fffe0e 100644
--- a/sysdeps/powerpc/powerpc64/power6/memset.S
+++ b/sysdeps/powerpc/powerpc64/power6/memset.S
@@ -379,7 +379,6 @@ L(medium_28t):
 END_GEN_TB (memset,TB_TOCLESS)
 libc_hidden_builtin_def (memset)
 
-#ifndef NO_BZERO_IMPL
 /* Copied from bzero.S to prevent the linker from inserting a stub
    between bzero and memset.  */
 ENTRY (__bzero)
@@ -387,7 +386,7 @@ ENTRY (__bzero)
 	mr	r5,r4
 	li	r4,0
 	b	L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
 weak_alias (__bzero, bzero)
 #endif
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
index 6b8999d..14df042 100644
--- a/sysdeps/powerpc/powerpc64/power7/memset.S
+++ b/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -383,7 +383,6 @@ L(small):
 END_GEN_TB (memset,TB_TOCLESS)
 libc_hidden_builtin_def (memset)
 
-#ifndef NO_BZERO_IMPL
 /* Copied from bzero.S to prevent the linker from inserting a stub
    between bzero and memset.  */
 ENTRY (__bzero)
@@ -391,7 +390,7 @@ ENTRY (__bzero)
 	mr	r5,r4
 	li	r4,0
 	b	L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
 weak_alias (__bzero, bzero)
 #endif

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=10f5f4c8edc35b4c3912456ffee820975e20a50b

commit 10f5f4c8edc35b4c3912456ffee820975e20a50b
Author: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Date:   Fri Nov 15 07:44:20 2013 -0600

    Partially revert commit 2663b74f8103a2a8a46b4896439b7a452480fc7c
    
    This change is necessary in order to avoid the issue documented at
    http://sourceware.org/ml/libc-alpha/2013-05/msg00350.html.

diff --git a/localedata/locales/bo_CN b/localedata/locales/bo_CN
index d813c10..c573d3f 100644
--- a/localedata/locales/bo_CN
+++ b/localedata/locales/bo_CN
@@ -145,8 +145,7 @@ END LC_MEASUREMENT
 
 LC_NAME
 % FIXME
-
-name_fmt  ""
+name_fmt	"FIXME"
 % name_gen	"FIXME"
 % name_miss	"FIXME"
 % name_mr	"FIXME"
diff --git a/localedata/locales/bo_IN b/localedata/locales/bo_IN
index 8ab793c..a1a6280 100644
--- a/localedata/locales/bo_IN
+++ b/localedata/locales/bo_IN
@@ -71,7 +71,7 @@ END LC_MEASUREMENT
 
 LC_NAME
 % FIXME
-name_fmt	""
+name_fmt	"FIXME"
 % name_gen	"FIXME"
 % name_miss	"FIXME"
 % name_mr	"FIXME"

http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e6f905009b29769bd27077389ce4379d5de80df2

commit e6f905009b29769bd27077389ce4379d5de80df2
Author: Ryan S. Arnold <rsa@linux.vnet.ibm.com>
Date:   Fri Nov 15 07:42:33 2013 -0600

    Remove assert() if DT_RUNPATH and DT_RPATH flags are found in ld.so.

diff --git a/elf/get-dynamic-info.h b/elf/get-dynamic-info.h
index 20ccf30..7f51d90 100644
--- a/elf/get-dynamic-info.h
+++ b/elf/get-dynamic-info.h
@@ -130,8 +130,8 @@ elf_get_dynamic_info (struct link_map *l, ElfW(Dyn) *temp)
   assert (info[DT_FLAGS] == NULL
 	  || (info[DT_FLAGS]->d_un.d_val & ~DF_BIND_NOW) == 0);
   /* Flags must not be set for ld.so.  */
-  assert (info[DT_RUNPATH] == NULL);
-  assert (info[DT_RPATH] == NULL);
+  info[DT_RUNPATH] == NULL;
+  info[DT_RPATH] == NULL;
 #else
   if (info[DT_FLAGS] != NULL)
     {

-----------------------------------------------------------------------


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]