This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch ibm/2.20/master created. glibc-2.20-31-g6831ddb
- From: azanella at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 16 Jan 2015 17:01:09 -0000
- Subject: GNU C Library master sources branch ibm/2.20/master created. glibc-2.20-31-g6831ddb
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, ibm/2.20/master has been created
at 6831ddb38379c1924bd19b3203d161a4c3ed1e2e (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6831ddb38379c1924bd19b3203d161a4c3ed1e2e
commit 6831ddb38379c1924bd19b3203d161a4c3ed1e2e
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Sun Jan 11 19:33:17 2015 -0600
powerpc: Fix POWER7/PPC64 performance regression on LE
This patch fixes a performance regression on the POWER7/PPC64 memcmp
porting for Little Endian. The LE code uses 'ldbrx' instruction to read
the memory in byte-reversed form; however, ISA 2.06 only provides the indexed
form, which uses a register value as an additional index, instead of a fixed value
encoded in the instruction.
And the port strategy for LE uses the r0 index value and updates the address
value on each compare loop iteration. For large compare size values,
it adds 8 more instructions plus some more depending on the trailing
size. This patch fixes it by adding pre-calculate indexes to remove the
address update on loops and trailing sizes.
For large sizes it shows a considerable gain, with double performance
pairing with BE.
diff --git a/ChangeLog b/ChangeLog
index a8b90fe..dbaa566 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+ * sysdeps/powerpc/powerpc64/power7/memcmp.S (memcmp): Fix performance
+ regression on LE.
+
* sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S: New file.
* sysdeps/powerpc/powerpc64/power8/strncmp.S: New file.
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
index 09bff69..98b9e54 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -26,18 +26,48 @@
EALIGN (memcmp, 4, 0)
CALL_MCOUNT 3
-#define rRTN r3
-#define rSTR1 r3 /* first string arg */
-#define rSTR2 r4 /* second string arg */
-#define rN r5 /* max string length */
-#define rWORD1 r6 /* current word in s1 */
-#define rWORD2 r7 /* current word in s2 */
-#define rWORD3 r8 /* next word in s1 */
-#define rWORD4 r9 /* next word in s2 */
-#define rWORD5 r10 /* next word in s1 */
-#define rWORD6 r11 /* next word in s2 */
-#define rWORD7 r30 /* next word in s1 */
-#define rWORD8 r31 /* next word in s2 */
+#define rRTN r3
+#define rSTR1 r3 /* first string arg */
+#define rSTR2 r4 /* second string arg */
+#define rN r5 /* max string length */
+#define rWORD1 r6 /* current word in s1 */
+#define rWORD2 r7 /* current word in s2 */
+#define rWORD3 r8 /* next word in s1 */
+#define rWORD4 r9 /* next word in s2 */
+#define rWORD5 r10 /* next word in s1 */
+#define rWORD6 r11 /* next word in s2 */
+
+#define rOFF8 r20 /* 8 bytes offset. */
+#define rOFF16 r21 /* 16 bytes offset. */
+#define rOFF24 r22 /* 24 bytes offset. */
+#define rOFF32 r23 /* 24 bytes offset. */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rSHR r28 /* Unaligned shift right count. */
+#define rSHL r29 /* Unaligned shift left count. */
+#define rWORD7 r30 /* next word in s1 */
+#define rWORD8 r31 /* next word in s2 */
+
+#define rWORD8SAVE (-8)
+#define rWORD7SAVE (-16)
+#define rOFF8SAVE (-24)
+#define rOFF16SAVE (-32)
+#define rOFF24SAVE (-40)
+#define rOFF32SAVE (-48)
+#define rSHRSAVE (-56)
+#define rSHLSAVE (-64)
+#define rWORD8SHIFTSAVE (-72)
+#define rWORD2SHIFTSAVE (-80)
+#define rWORD4SHIFTSAVE (-88)
+#define rWORD6SHIFTSAVE (-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD ldbrx
+#else
+# define LD ldx
+#endif
xor r0, rSTR2, rSTR1
cmpldi cr6, rN, 0
@@ -51,10 +81,24 @@ EALIGN (memcmp, 4, 0)
/* If less than 8 bytes or not aligned, use the unaligned
byte loop. */
blt cr1, L(bytealigned)
- std rWORD8, -8(r1)
- cfi_offset(rWORD8, -8)
- std rWORD7, -16(r1)
- cfi_offset(rWORD7, -16)
+ std rWORD8, rWORD8SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ std rWORD7, rWORD7SAVE(r1)
+ cfi_offset(rWORD7, rWORD7SAVE)
+ std rOFF8, rOFF8SAVE(r1)
+ cfi_offset(rWORD7, rOFF8SAVE)
+ std rOFF16, rOFF16SAVE(r1)
+ cfi_offset(rWORD7, rOFF16SAVE)
+ std rOFF24, rOFF24SAVE(r1)
+ cfi_offset(rWORD7, rOFF24SAVE)
+ std rOFF32, rOFF32SAVE(r1)
+ cfi_offset(rWORD7, rOFF32SAVE)
+
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+
bne L(unaligned)
/* At this point we know both strings have the same alignment and the
compare length is at least 8 bytes. r12 contains the low order
@@ -79,15 +123,8 @@ L(samealignment):
sldi rWORD6, r12, 3
srdi r0, rN, 5 /* Divide by 32 */
andi. r12, rN, 24 /* Get the DW remainder */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
clrldi rN, rN, 61
@@ -104,15 +141,8 @@ L(dsP1):
cmpld cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
b L(dP1e)
/* Remainder is 16 */
@@ -123,15 +153,8 @@ L(dPs2):
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
b L(dP2e)
/* Remainder is 24 */
@@ -173,72 +196,43 @@ L(dP1):
change any on the early exit path. The key here is the non-early
exit path only cares about the condition code (cr5), not about which
register pair was used. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 0(rSTR1)
- ld rWORD6, 0(rSTR2)
-#endif
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
cmpld cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
L(dP1e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(dLcr5x)
bne cr7, L(dLcr7x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
bne cr1, L(dLcr1)
cmpld cr5, rWORD7, rWORD8
bdnz L(dLoop)
bne cr6, L(dLcr6)
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
.align 3
L(dP1x):
sldi. r12, rN, 3
bne cr5, L(dLcr5x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -246,79 +240,41 @@ L(dP1x):
.align 4
L(dP2):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 0(rSTR1)
- ld rWORD6, 0(rSTR2)
-#endif
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
L(dP2e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 24(rSTR1)
- ld rWORD4, 24(rSTR2)
-#endif
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr6, L(dLcr6)
bne cr5, L(dLcr5)
b L(dLoop2)
-/* Again we are on a early exit path (16-23 byte compare), we want to
- only use volatile registers and avoid restoring non-volatile
- registers. */
.align 4
L(dP2x):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
sldi. r12, rN, 3
bne cr6, L(dLcr6x)
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr1, L(dLcr1x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -326,52 +282,22 @@ L(dP2x):
.align 4
L(dP3):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 0(rSTR1)
- ld rWORD4, 0(rSTR2)
-#endif
+ LD rWORD3, 0, rSTR1
+ LD rWORD4, 0, rSTR2
cmpld cr1, rWORD3, rWORD4
L(dP3e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 8(rSTR1)
- ld rWORD6, 8(rSTR2)
-#endif
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP3x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 16(rSTR1)
- ld rWORD8, 16(rSTR2)
-#endif
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
cmpld cr5, rWORD7, rWORD8
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 24(rSTR1)
- ld rWORD2, 24(rSTR2)
-#endif
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
cmpld cr7, rWORD1, rWORD2
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
bne cr1, L(dLcr1)
bne cr6, L(dLcr6)
b L(dLoop1)
@@ -380,26 +306,21 @@ L(dP3e):
registers. */
.align 4
L(dP3x):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
sldi. r12, rN, 3
bne cr1, L(dLcr1x)
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
bne cr6, L(dLcr6x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne cr7, L(dLcr7x)
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -407,46 +328,20 @@ L(dP3x):
.align 4
L(dP4):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpld cr7, rWORD1, rWORD2
L(dP4e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 16(rSTR1)
- ld rWORD6, 16(rSTR2)
-#endif
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 24(rSTR1)
- ldu rWORD8, 24(rSTR2)
-#endif
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
cmpld cr5, rWORD7, rWORD8
bne cr7, L(dLcr7)
bne cr1, L(dLcr1)
@@ -454,51 +349,25 @@ L(dP4e):
/* This is the primary loop */
.align 4
L(dLoop):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr6, L(dLcr6)
L(dLoop1):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(dLcr5)
L(dLoop2):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr7, L(dLcr7)
L(dLoop3):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
bne cr1, L(dLcr1)
cmpld cr7, rWORD1, rWORD2
bdnz L(dLoop)
@@ -519,62 +388,75 @@ L(d14):
sldi. r12, rN, 3
bne cr5, L(dLcr5)
L(d04):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- beq L(zeroLength)
+ beq L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare. Since
we are aligned it is safe to load the whole double word, and use
shift right double to eliminate bits beyond the compare length. */
L(d00):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
srd rWORD1, rWORD1, rN
srd rWORD2, rWORD2, rN
cmpld cr7, rWORD1, rWORD2
bne cr7, L(dLcr7x)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
.align 4
L(dLcr7):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr7
li rRTN, -1
blr
.align 4
L(dLcr1):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr1
li rRTN, -1
blr
.align 4
L(dLcr6):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr6
li rRTN, -1
blr
.align 4
L(dLcr5):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr5
li rRTN, -1
@@ -583,10 +465,6 @@ L(dLcr5x):
.align 4
L(bytealigned):
mtctr rN
-#if 0
-/* Huh? We've already branched on cr6! */
- beq cr6, L(zeroLength)
-#endif
/* We need to prime this loop. This loop is swing modulo scheduled
to avoid pipe delays. The dependent instruction latencies (load to
@@ -685,6 +563,7 @@ L(b11):
L(bx12):
sub rRTN, rWORD1, rWORD2
blr
+
.align 4
L(zeroLength):
li rRTN, 0
@@ -705,42 +584,36 @@ L(zeroLength):
we need to adjust the length (rN) and special case the loop
versioning for the first DW. This ensures that the loop count is
correct and the first DW (shifted) is in the expected resister pair. */
-#define rSHL r29 /* Unaligned shift left count. */
-#define rSHR r28 /* Unaligned shift right count. */
-#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
-#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
-#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
-#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
L(unaligned):
- std rSHL, -24(r1)
- cfi_offset(rSHL, -24)
+ std rSHL, rSHLSAVE(r1)
+ cfi_offset(rSHL, rSHLSAVE)
clrldi rSHL, rSTR2, 61
beq cr6, L(duzeroLength)
- std rSHR, -32(r1)
- cfi_offset(rSHR, -32)
+ std rSHR, rSHRSAVE(r1)
+ cfi_offset(rSHR, rSHRSAVE)
beq cr5, L(DWunaligned)
- std rWORD8_SHIFT, -40(r1)
- cfi_offset(rWORD8_SHIFT, -40)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
in the 1st rSTR1 DW. */
sub rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before that DW that contains
the actual start of rSTR2. */
clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, -48(r1)
- cfi_offset(rWORD2_SHIFT, -48)
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
/* Compute the left/right shift counts for the unaligned rSTR2,
compensating for the logical (DW aligned) start of rSTR1. */
clrldi rSHL, rWORD8_SHIFT, 61
clrrdi rSTR1, rSTR1, 3
- std rWORD4_SHIFT, -56(r1)
- cfi_offset(rWORD4_SHIFT, -56)
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
sldi rSHL, rSHL, 3
cmpld cr5, rWORD8_SHIFT, rSTR2
add rN, rN, r12
sldi rWORD6, r12, 3
- std rWORD6_SHIFT, -64(r1)
- cfi_offset(rWORD6_SHIFT, -64)
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
subfic rSHR, rSHL, 64
srdi r0, rN, 5 /* Divide by 32 */
andi. r12, rN, 24 /* Get the DW remainder */
@@ -750,25 +623,13 @@ L(unaligned):
this may cross a page boundary and cause a page fault. */
li rWORD8, 0
blt cr5, L(dus0)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD8, 0, rSTR2
+ LD rWORD8, 0, rSTR2
addi rSTR2, rSTR2, 8
-#else
- ld rWORD8, 0(rSTR2)
- addi rSTR2, rSTR2, 8
-#endif
sld rWORD8, rWORD8, rSHL
L(dus0):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
srd r12, rWORD2, rSHR
@@ -796,12 +657,7 @@ L(dusP1):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 16 */
@@ -832,27 +688,21 @@ L(duPs4):
compare length is at least 8 bytes. */
.align 4
L(DWunaligned):
- std rWORD8_SHIFT, -40(r1)
- cfi_offset(rWORD8_SHIFT, -40)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, -48(r1)
- cfi_offset(rWORD2_SHIFT, -48)
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
srdi r0, rN, 5 /* Divide by 32 */
- std rWORD4_SHIFT, -56(r1)
- cfi_offset(rWORD4_SHIFT, -56)
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
andi. r12, rN, 24 /* Get the DW remainder */
- std rWORD6_SHIFT, -64(r1)
- cfi_offset(rWORD6_SHIFT, -64)
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
sldi rSHL, rSHL, 3
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD6, 0, rSTR2
+ LD rWORD6, 0, rSTR2
+ LD rWORD8, rOFF8, rSTR2
addi rSTR2, rSTR2, 8
- ldbrx rWORD8, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD6, 0(rSTR2)
- ldu rWORD8, 8(rSTR2)
-#endif
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
clrldi rN, rN, 61
@@ -867,52 +717,26 @@ L(DWunaligned):
.align 4
L(duP1):
srd r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD7, 0(rSTR1)
-#endif
+ LD rWORD7, 0, rSTR1
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP1x)
L(duP1e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
bne cr5, L(duLcr5)
or rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr1, rWORD3, rWORD4
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
@@ -932,82 +756,47 @@ L(duP1x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 16 */
.align 4
L(duP2):
srd r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD5, 0(rSTR1)
-#endif
+ LD rWORD5, 0, rSTR1
or rWORD6, r0, rWORD6_SHIFT
sld rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr6, rWORD5, rWORD6
srd r12, rWORD8, rSHR
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP2x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 24(rSTR1)
- ld rWORD4, 24(rSTR2)
-#endif
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
cmpld cr7, rWORD1, rWORD2
bne cr5, L(duLcr5)
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
cmpld cr1, rWORD3, rWORD4
b L(duLoop2)
.align 4
L(duP2x):
cmpld cr5, rWORD7, rWORD8
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr6, L(duLcr6)
sldi. rN, rN, 3
bne cr5, L(duLcr5)
@@ -1015,12 +804,7 @@ L(duP2x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
@@ -1028,73 +812,39 @@ L(duP2x):
.align 4
L(duP3):
srd r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD3, 0(rSTR1)
-#endif
+ LD rWORD3, 0, rSTR1
sld rWORD4_SHIFT, rWORD8, rSHL
or rWORD4, r12, rWORD6_SHIFT
L(duP3e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 8(rSTR1)
- ld rWORD6, 8(rSTR2)
-#endif
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 16(rSTR1)
- ld rWORD8, 16(rSTR2)
-#endif
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP3x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 24(rSTR1)
- ld rWORD2, 24(rSTR2)
-#endif
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
cmpld cr7, rWORD1, rWORD2
b L(duLoop1)
.align 4
L(duP3x):
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
-#if 0
-/* Huh? We've already branched on cr1! */
- bne cr1, L(duLcr1)
-#endif
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
sldi. rN, rN, 3
@@ -1103,12 +853,7 @@ L(duP3x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
@@ -1117,51 +862,27 @@ L(duP3x):
L(duP4):
mtctr r0
srd r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD1, 0(rSTR1)
-#endif
+ LD rWORD1, 0, rSTR1
sld rWORD2_SHIFT, rWORD8, rSHL
or rWORD2, r0, rWORD6_SHIFT
L(duP4e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 16(rSTR1)
- ld rWORD6, 16(rSTR2)
-#endif
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr7, L(duLcr7)
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 24(rSTR1)
- ldu rWORD8, 24(rSTR2)
-#endif
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
cmpld cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
@@ -1172,60 +893,34 @@ L(duP4e):
/* This is the primary loop */
.align 4
L(duLoop):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(duLcr5)
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr7, L(duLcr7)
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
cmpld cr7, rWORD1, rWORD2
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
@@ -1234,10 +929,6 @@ L(duLoop3):
bdnz L(duLoop)
L(duL4):
-#if 0
-/* Huh? We've already branched on cr1! */
- bne cr1, L(duLcr1)
-#endif
cmpld cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
cmpld cr6, rWORD5, rWORD6
@@ -1264,99 +955,102 @@ L(du14):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
.align 4
L(dutrim):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
-#else
- ld rWORD1, 8(rSTR1)
-#endif
+ LD rWORD1, rOFF8, rSTR1
ld rWORD8, -8(r1)
subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
or rWORD2, r0, rWORD8_SHIFT
- ld rWORD7, -16(r1)
- ld rSHL, -24(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ ld rSHL, rSHLSAVE(r1)
srd rWORD1, rWORD1, rN
srd rWORD2, rWORD2, rN
- ld rSHR, -32(r1)
- ld rWORD8_SHIFT, -40(r1)
+ ld rSHR, rSHRSAVE(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
li rRTN, 0
cmpld cr7, rWORD1, rWORD2
- ld rWORD2_SHIFT, -48(r1)
- ld rWORD4_SHIFT, -56(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
beq cr7, L(dureturn24)
li rRTN, 1
- ld rWORD6_SHIFT, -64(r1)
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
bgtlr cr7
li rRTN, -1
blr
.align 4
L(duLcr7):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr7, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr1):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr1, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr6):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr6, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr5):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr5, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
+
.align 3
L(duZeroReturn):
li rRTN, 0
.align 4
L(dureturn):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dureturn29):
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
L(dureturn27):
- ld rWORD8_SHIFT, -40(r1)
-L(dureturn26):
- ld rWORD2_SHIFT, -48(r1)
-L(dureturn25):
- ld rWORD4_SHIFT, -56(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
- ld rWORD6_SHIFT, -64(r1)
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
blr
+
L(duzeroLength):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=473b6083820fd156985bf7b2cb60db9d4031b536
commit 473b6083820fd156985bf7b2cb60db9d4031b536
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Fri Jan 9 16:04:26 2015 -0500
powerpc: Optimized strncmp for POWER8/PPC64
This patch adds an optimized POWER8 strncmp. The implementation focus
on speeding up unaligned cases following the ideas of power8 strcmp.
The algorithm first checks the initial 16 bytes, then aligns the first
source argument and uses unaligned loads on the second argument only.
Aditional checks for page boundaries are done for unaligned cases
(where sources alignment are different).
diff --git a/ChangeLog b/ChangeLog
index 621daa4..a8b90fe 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S: New file.
+ * sysdeps/powerpc/powerpc64/power8/strncmp.S: New file.
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+ strncmp-power8 object.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add __strncmp_power8 implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strncmp.c (strncmp): Likewise.
+ * NEWS: Update.
+
2015-01-13 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Adhemerval Zanella <azanella@linux.vnet.ibm.com>
diff --git a/NEWS b/NEWS
index 0a4fa77..4b52bd0 100644
--- a/NEWS
+++ b/NEWS
@@ -11,8 +11,8 @@ Version 2.20.1
16617, 17266, 17370, 17371, 17625, 17630.
-* Optimized strcpy, stpcpy, strncpy, stpncpy, and strcmp implementations for
- powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, stpncpy, strcmp, and strncmp
+ implementations for powerpc64/powerpc64le.
Implemented by Adhemerval Zanella (IBM).
* CVE-2104-7817 The wordexp function could ignore the WRDE_NOCMD flag
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index ec4fca7..b7ea284 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -7,8 +7,9 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \
- strncase-power7 strncase_l-power7 strncmp-power7 \
- strncmp-power4 strncmp-ppc64 strchr-power7 strchr-ppc64 \
+ strncase-power7 strncase_l-power7 \
+ strncmp-power8 strncmp-power7 strncmp-power4 strncmp-ppc64 \
+ strchr-power7 strchr-ppc64 \
strchrnul-power7 strchrnul-ppc64 wcschr-power7 \
wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \
wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 36c5149..bd92cf6 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -108,6 +108,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c. */
IFUNC_IMPL (i, name, strncmp,
+ IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strncmp_power8)
IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_HAS_VSX,
__strncmp_power7)
IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_POWER4,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
new file mode 100644
index 0000000..8d7223d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
@@ -0,0 +1,40 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name,alignt,words) \
+ .section ".text"; \
+ ENTRY_2(__strncmp_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strncmp_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strncmp_power8)
+
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strncmp_power8) \
+ END_2(__strncmp_power8)
+
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
index 9829d69..5e76783 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
@@ -25,13 +25,16 @@
extern __typeof (strncmp) __strncmp_ppc attribute_hidden;
extern __typeof (strncmp) __strncmp_power4 attribute_hidden;
extern __typeof (strncmp) __strncmp_power7 attribute_hidden;
+extern __typeof (strncmp) __strncmp_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (strncmp,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strncmp_power7 :
- (hwcap & PPC_FEATURE_POWER4)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strncmp_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strncmp_power7 :
+ (hwcap & PPC_FEATURE_POWER4)
? __strncmp_power4
: __strncmp_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strncmp.S b/sysdeps/powerpc/powerpc64/power8/strncmp.S
new file mode 100644
index 0000000..56c814b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncmp.S
@@ -0,0 +1,323 @@
+/* Optimized strncmp implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+ int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
+
+ The implementation uses unaligned doubleword access to avoid specialized
+ code paths depending of data alignment. Although recent powerpc64 uses
+ 64K as default, the page cross handling assumes minimum page size of
+ 4k. */
+
+ .machine power7
+EALIGN (strncmp, 4, 0)
+ /* Check if size is 0. */
+ mr. r10,r5
+ beq cr0,L(ret0)
+
+ /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
+ the code:
+
+ (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+ with PAGE_SIZE being 4096 and ITER_SIZE begin 16. */
+ rldicl r8,r3,0,52
+ cmpldi cr7,r8,4096-16
+ bgt cr7,L(pagecross)
+ rldicl r9,r4,0,52
+ cmpldi cr7,r9,4096-16
+ bgt cr7,L(pagecross)
+
+ /* For short string up to 16 bytes, load both s1 and s2 using
+ unaligned dwords and compare. */
+ ld r7,0(r3)
+ ld r9,0(r4)
+ li r8,0
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ /* If the string compared are equal, but size is less or equal
+ to 8, return 0. */
+ cmpldi cr7,r10,8
+ li r9,0
+ ble cr7,L(ret1)
+ addi r5,r10,-8
+
+ ld r7,8(r3)
+ ld r9,8(r4)
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different0)
+
+ cmpldi cr7,r5,8
+ mr r9,r8
+ ble cr7,L(ret1)
+
+ /* Update pointers and size. */
+ addi r10,r10,-16
+ addi r3,r3,16
+ addi r4,r4,16
+
+ /* Now it has checked for first 16 bytes, align source1 to doubleword
+ and adjust source2 address. */
+L(align_8b):
+ rldicl r5,r3,0,61
+ rldicr r3,r3,0,60
+ subf r4,r5,r4
+ add r10,r10,r5
+
+ /* At this point, source1 alignment is 0 and source2 alignment is
+ between 0 and 7. Check is source2 alignment is 0, meaning both
+ sources have the same alignment. */
+ andi. r8,r4,0x7
+ beq cr0,L(loop_eq_align_0)
+
+ li r5,0
+ b L(loop_ne_align_1)
+
+ /* If source2 is unaligned to doubleword, the code needs to check
+ on each interation if the unaligned doubleword access will cross
+ a 4k page boundary. */
+ .align 4
+L(loop_ne_align_0):
+ ld r7,0(r3)
+ ld r9,0(r4)
+ cmpb r8,r7,r5
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ cmpldi cr7,r10,8
+ ble cr7,L(ret0)
+ addi r10,r10,-8
+ addi r3,r3,8
+ addi r4,r4,8
+L(loop_ne_align_1):
+ rldicl r9,r4,0,52
+ cmpldi r7,r9,4088
+ ble cr7,L(loop_ne_align_0)
+ cmpdi cr7,r10,0
+ beq cr7,L(ret0)
+
+ lbz r9,0(r3)
+ lbz r8,0(r4)
+ cmplw cr7,r9,r8
+ bne cr7,L(byte_ne_4)
+ cmpdi cr7,r9,0
+ beq cr7,L(size_reached_0)
+
+ li r9,r7
+ addi r8,r3,1
+ mtctr r9
+ addi r4,r4,1
+ addi r10,r10,-1
+ addi r3,r3,8
+
+ /* The unaligned read of source2 will cross a 4K page boundary,
+ and the different byte or NULL maybe be in the remaining page
+ bytes. Since it can not use the unaligned load the algorithm
+ reads and compares 8 bytes to keep source1 doubleword aligned. */
+ .align 4
+L(loop_ne_align_byte):
+ cmpdi cr7,r10,0
+ addi r10,r10,-1
+ beq cr7,L(ret0)
+ lbz r9,0(r8)
+ lbz r7,0(r4)
+ addi r8,r8,1
+ addi r4,r4,1
+ cmplw cr7,r9,r7
+ cmpdi cr5,r9,0
+ bne cr7,L(size_reached_2)
+ beq cr5,L(size_reached_0)
+ bdnz L(loop_ne_align_byte)
+
+ cmpdi cr7,r10,0
+ bne+ cr7,L(loop_ne_align_0)
+
+ .align 4
+L(ret0):
+ li r9,0
+L(ret1):
+ mr r3,r9
+ blr
+
+ /* The code now check if r8 and r10 are different by issuing a
+ cmpb and shift the result based on its output:
+
+ #ifdef __LITTLE_ENDIAN__
+ leadzero = (__builtin_ffsl (z1) - 1);
+ leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+ r1 = (r1 >> leadzero) & 0xFFUL;
+ r2 = (r2 >> leadzero) & 0xFFUL;
+ #else
+ leadzero = __builtin_clzl (z1);
+ leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+ r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
+ r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
+ #endif
+ return r1 - r2; */
+
+ .align 4
+L(different0):
+ mr r10,r5
+#ifdef __LITTLE_ENDIAN__
+L(different1):
+ neg r11,r8
+ sldi r10,r10,3
+ and r8,r11,r8
+ addi r10,r10,-8
+ cntlzd r8,r8
+ subfic r8,r8,63
+ extsw r8,r8
+ cmpld cr7,r8,r10
+ ble cr7,L(different2)
+ mr r8,r10
+L(different2):
+ extsw r8,r8
+#else
+L(different1):
+ addi r10,r10,-1
+ cntlzd r8,r8
+ sldi r10,r10,3
+ cmpld cr7,r8,r10
+ blt cr7,L(different2)
+ mr r8,r10
+L(different2):
+ subfic r8,r8,56
+#endif
+ srd r7,r7,r8
+ srd r9,r9,r8
+ rldicl r3,r7,0,56
+ rldicl r9,r9,0,56
+ subf r9,r9,3
+ extsw r9,r9
+ mr r3,r9
+ blr
+
+ /* If unaligned 16 bytes reads across a 4K page boundary, it uses
+ a simple byte a byte comparison until the page alignment for s1
+ is reached. */
+ .align 4
+L(pagecross):
+ lbz r7,0(r3)
+ lbz r9,0(r4)
+ subfic r8,r8,4095
+ cmplw cr7,r9,r7
+ bne cr7,L(byte_ne_3)
+ cmpdi cr7,r9,0
+ beq cr7,L(byte_ne_0)
+ addi r10,r10,-1
+ subf r7,r8,r10
+ subf r9,r7,r10
+ addi r9,r9,1
+ mtctr r9
+ b L(pagecross_loop1)
+
+ .align 4
+L(pagecross_loop0):
+ beq cr7,L(ret0)
+ lbz r9,0(r3)
+ lbz r8,0(r4)
+ addi r10,r10,-1
+ cmplw cr7,r9,r8
+ cmpdi cr5,r9,0
+ bne r7,L(byte_ne_2)
+ beq r5,L(byte_ne_0)
+L(pagecross_loop1):
+ cmpdi cr7,r10,0
+ addi r3,r3,1
+ addi r4,r4,1
+ bdnz L(pagecross_loop0)
+ cmpdi cr7,r7,0
+ li r9,0
+ bne+ cr7,L(align_8b)
+ b L(ret1)
+
+ /* If both source1 and source2 are doubleword aligned, there is no
+ need for page boundary cross checks. */
+ .align 4
+L(loop_eq_align_0):
+ ld r7,0(r3)
+ ld r9,0(r4)
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ cmpldi cr7,r10,8
+ ble cr7,L(ret0)
+ addi r9,r10,-9
+
+ li r5,0
+ srdi r9,r9,3
+ addi r9,r9,1
+ mtctr r9
+ b L(loop_eq_align_2)
+
+ .align 4
+L(loop_eq_align_1):
+ bdz L(ret0)
+L(loop_eq_align_2):
+ ldu r7,8(r3)
+ addi r10,r10,-8
+ ldu r9,8(r4)
+ cmpb r8,r7,r5
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ beq cr0,L(loop_eq_align_1)
+ b L(different1)
+
+ .align 4
+L(byte_ne_0):
+ li r7,0
+L(byte_ne_1):
+ subf r9,r9,r7
+ extsw r9,r9
+ b L(ret1)
+
+ .align 4
+L(byte_ne_2):
+ extsw r7,r9
+ mr r9,r8
+ b L(byte_ne_1)
+L(size_reached_0):
+ li r10,0
+L(size_reached_1):
+ subf r9,r9,r10
+ extsw r9,r9
+ b L(ret1)
+L(size_reached_2):
+ extsw r10,r9
+ mr r9,r7
+ b L(size_reached_1)
+L(byte_ne_3):
+ extsw r7,r7
+ b L(byte_ne_1)
+L(byte_ne_4):
+ extsw r10,r9
+ mr r9,r8
+ b L(size_reached_1)
+END(strncmp)
+libc_hidden_builtin_def(strncmp)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=299b9464d9a1a48bbcfbc1c7a99604091ec5248f
commit 299b9464d9a1a48bbcfbc1c7a99604091ec5248f
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date: Fri Jan 9 11:56:35 2015 -0500
powerpc: Optimize POWER7 strcmp trailing checks
This patch optimizes the POWER7 trailing check by avoiding byte
read operations and instead using the doubleword already read, with
bitwise operations.
diff --git a/ChangeLog b/ChangeLog
index b947d5b..621daa4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2015-01-13 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
+ Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/power7/strcmp.S (strcmp): Optimize
+ trailing byte check.
+
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S
index f16a9d8..ade2811 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S
@@ -25,122 +25,96 @@
/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */
+ .machine power7
EALIGN (strcmp, 4, 0)
CALL_MCOUNT 2
or r9, r3, r4
rldicl. r10, r9, 0, 61 /* are s1 and s2 8 byte aligned..? */
bne cr0, L(process_unaligned_bytes)
+ li r5, 0
+ .align 4
/* process input parameters on double word aligned boundary */
- ld r9, 0(r4) /* load s2 at offset=0 */
- li r10, 0 /* load mask=0 */
- cmpb r10, r9, r10 /* compare bytes at s2 with mask */
- cmpdi cr7, r10, 0 /* is NULL found ..? is end of string HIT */
- bne cr7, L(process_unaligned_bytes) /* process byte by byte */
-
- ld r10, 0(r3) /* load s1 at offset=0 */
- li r8, 0 /* load mask=0 */
- cmpb r8, r10, r8 /* compare bytes at s1 with mask */
- cmpdi cr7, r8, 0 /* is NULL found ..? is end of string HIT */
- bne cr7, L(process_unaligned_bytes) /* process byte by byte */
-
-/*s1 and s2 does not contain NULL now , so compare all 8 bytes in a GO */
- cmpb r9, r10, r9 /* compare s1 and s2 */
- cmpdi cr7, r9, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */
- bne cr7, L(process_unaligned_bytes) /* s1,s2 mismatch found */
-
- addi r5, r3, 8 /* save next offset of s2 */
- addi r11, r4, 8 /* save next offset of s1 */
- ld r8, 8(r4) /* load s2 at offset=8 */
- li r9, 0 /* load mask=0 */
- cmpb r9, r8, r9 /* compare bytes at s2 with mask */
- cmpdi cr7, r9, 0 /* NULL found ..? */
- bne cr7, L(processBytes)/* update input and process bytes one by one */
-
- mr r9, r4 /* save s2 */
- li r10, 0 /* load mask=0 */
-
- ld r7, 8(r3) /* load s1 at offset=8 */
- cmpb r6, r7, r10 /* compare bytes at s1 with mask */
- cmpdi cr7, r6, 0 /* is NULL found */
- bne cr7, L(processBytes)/* mismatch, so process one by one */
-
L(unrollDword):
- cmpb r8, r7, r8 /* compare s1 and s2 */
- cmpdi cr7, r8, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */
- bne cr7, L(processBytes)/* mismatch with s1 and s2 */
-
- addi r5, r3, 16 /* save offset=16 of s1 */
- addi r4, r9, 16 /* save offset=16 of s2 */
- ld r8, 16(r9) /* load s2 at offset=16 */
- cmpb r7, r8, r10 /* compare bytes at s2 with mask */
- cmpdi cr7, r7, 0 /* NULL found ..? */
- bne cr7, L(update2processBytes)
-
- ld r7, 16(r3) /* load s1 at offset=16 */
- cmpb r6, r7, r10 /* check s1 for end of string */
- cmpdi cr7, r6, 0 /* end of s1 ?,then handle byte by byte */
- bne 7,L(update2processBytes)
-
- cmpb r8, r7, r8 /* compare s1 and s2 double words */
- cmpdi cr7, r8, -1 /* compare results with 0xFFFFFFFFFFFFFFFF */
- bne cr7,L(update2processBytes)
-
- addi r5, r3, 24 /* update s1 to offset=24 */
- addi r4, r9, 24 /* update s2 to offset=24 */
-
- ld r8, 24(r9) /* load s2 */
- cmpb r7, r8, r10 /* compare s2 for NULL */
- cmpdi cr7, r7, 0 /* verify if s2 is ending now */
- bne cr7,L(update2processBytes)
-
- ld r7, 24(r3) /* load s1 at offset=24 */
- cmpb r6, r7, r10 /* verify for NULL */
- cmpdi cr7, r6, 0 /* is NULL found */
- bne cr7, L(update2processBytes)
-
- cmpb r8, r7, r8 /* compare s1 and s2 */
- cmpdi cr7, r8, -1 /* are s1 and s2 same ..? */
- bne cr7, L(update2processBytes)
-
- addi r7, r9, 32 /* update s2 to next double word */
- addi r3, r3, 32 /* update s1 to next double word */
-
- ld r8, 32(r9) /* load s2 */
- mr r4, r7 /* save s2 */
- cmpb r6, r8, r10 /* compare s2 with NULL */
- cmpdi cr7, r6, 0 /* end of s2 ..? */
- bne cr7, L(process_unaligned_bytes)
-
- ld r6, 0(r3) /* load and compare s1 for NULL */
- cmpb r5, r6, r10
- cmpdi cr7, r5, 0
- bne cr7, L(process_unaligned_bytes)
-
- cmpb r8, r6, r8 /* compare s1 and s2 */
- cmpdi cr7, r8, -1
- bne cr7, L(process_unaligned_bytes)
-
- addi r5, r3, 8 /* increment s1 and d2 here */
- addi r11, r9, 40
-
- ld r8, 40(r9) /* process s2 now */
- cmpb r9, r8, r10
- cmpdi cr7, r9, 0
- bne cr7, L(processBytes)
-
- mr r9, r7
- ld r7, 8(r3) /* process s1 now */
- cmpb r6, r7, r10
- cmpdi cr7, r6, 0
- beq cr7, L(unrollDword) /* unroll to compare s1 and s2 */
-
-L(processBytes):
- mr r4, r11 /* update input params */
- mr r3, r5
-
- .p2align 4
+ ld r8,0(r3)
+ ld r10,0(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,8(r3)
+ ld r10,8(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,16(r3)
+ ld r10,16(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,24(r3)
+ ld r10,24(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ addi r3, r3, 32
+ addi r4, r4, 32
+ beq cr7, L(unrollDword)
+
+ .align 4
+L(null_found):
+#ifdef __LITTLE_ENDIAN__
+ neg r7,r9
+ and r9,r9,r7
+ li r7,-1
+ cntlzd r9,r9
+ subfic r9,r9,71
+ sld r9,r7,r9
+#else
+ cntlzd r9,r9
+ li r7,-1
+ addi r9,r9,8
+ srd r9,r7,r9
+#endif
+ or r8,r8,r9
+ or r10,r10,r9
+
+L(different):
+ cmpb r9,r8,r10
+#ifdef __LITTLE_ENDIAN__
+ addi r7,r9,1
+ andc r9,r7,r9
+ cntlzd r9,r9
+ subfic r9,r9,63
+#else
+ not r9,r9
+ cntlzd r9,r9
+ subfic r9,r9,56
+#endif
+ srd r3,r8,r9
+ srd r10,r10,r9
+ rldicl r10,r10,0,56
+ rldicl r3,r3,0,56
+ subf r3,r10,r3
+ blr
+
+ .align 4
L(process_unaligned_bytes):
lbz r9, 0(r3) /* load byte from s1 */
lbz r10, 0(r4) /* load byte from s2 */
@@ -172,24 +146,19 @@ L(process_unaligned_bytes):
addi r4, r4, 4 /* increment s2 by unroll factor */
beq cr6, L(process_unaligned_bytes) /* unroll byte processing */
- .p2align 4
+ .align 4
L(ComputeDiff):
extsw r9, r9
subf r10, r10, r9 /* compute s1 - s2 */
extsw r3, r10
blr /* return */
- .p2align 4
+ .align 4
L(diffOfNULL):
li r9, 0
subf r10, r10, r9 /* compute s1 - s2 */
extsw r3, r10 /* sign extend result */
blr /* return */
- .p2align 4
-L(update2processBytes):
- mr r3, r5 /* update and proceed */
- b L(process_unaligned_bytes)
-
END (strcmp)
libc_hidden_builtin_def (strcmp)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=6f0685edc6676c7266fdc30fd0769fb88d058f04
commit 6f0685edc6676c7266fdc30fd0769fb88d058f04
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Wed Jan 7 07:18:30 2015 -0500
powerpc: Optimized strcmp for POWER8/PPC64
This patch adds an optimized POWER8 strcmp using unaligned accesses.
The algorithm first checks the initial 16 bytes, then aligns the first
source and uses unaligned loads on the second argument only.
Additional checks for page boundaries are done for unaligned cases
diff --git a/ChangeLog b/ChangeLog
index 383f1f5..b947d5b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,16 @@
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+ Add strcmp-power8 object.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add __strcmp_power8 implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S: New file.
+ * sysdeps/powerpc/powerpc64/multiarch/strcmp.c (strcmp): Add
+ __strcmp_power8 implementation.
+ * sysdeps/powerpc/powerpc64/power8/strcmp.S: New file.
+ * NEWS: Update.
+
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
Add strncpy-power8 and stpncpy-power8 objects.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __strncpy_power8 and stpncpy_power8
diff --git a/NEWS b/NEWS
index afb48df..0a4fa77 100644
--- a/NEWS
+++ b/NEWS
@@ -11,8 +11,9 @@ Version 2.20.1
16617, 17266, 17370, 17371, 17625, 17630.
-* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for
+* Optimized strcpy, stpcpy, strncpy, stpncpy, and strcmp implementations for
powerpc64/powerpc64le.
+ Implemented by Adhemerval Zanella (IBM).
* CVE-2104-7817 The wordexp function could ignore the WRDE_NOCMD flag
under certain input conditions resulting in the execution of a shell for
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 18d3378..ec4fca7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -18,7 +18,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
- strcmp-power7 strcmp-ppc64 \
+ strcmp-power8 strcmp-power7 strcmp-ppc64 \
strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
memmove-ppc64 bcopy-ppc64 strncpy-power8
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index b698b90..36c5149 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strcmp_power8)
+ IFUNC_IMPL_ADD (array, i, strcmp,
hwcap & PPC_FEATURE_HAS_VSX,
__strcmp_power7)
IFUNC_IMPL_ADD (array, i, strcmp, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcmp.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
index 2013301..dc4bfac 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcmp. PowerPC64 version.
- Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized strcmp implementation for POWER8/PPC64.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if defined SHARED && !defined NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
-extern __typeof (strcmp) __strcmp_ppc attribute_hidden;
-extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__strcmp_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strcmp_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strcmp_power8)
-libc_ifunc (strcmp,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcmp_power7
- : __strcmp_ppc);
-#endif
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strcmp_power8) \
+ END_2(__strcmp_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strcmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index 2013301..c711969 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -23,9 +23,12 @@
extern __typeof (strcmp) __strcmp_ppc attribute_hidden;
extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
+extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
libc_ifunc (strcmp,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcmp_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strcmp_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strcmp_power7
: __strcmp_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
new file mode 100644
index 0000000..223d891
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -0,0 +1,257 @@
+/* Optimized strcmp implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+ size_t [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
+
+ The implementation uses unaligned doubleword access to avoid specialized
+ code paths depending of data alignment. Although recent powerpc64 uses
+ 64K as default, the page cross handling assumes minimum page size of
+ 4k. */
+
+EALIGN (strcmp, 4, 0)
+ li r0,0
+
+ /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+ the code:
+
+ (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+ with PAGE_SIZE being 4096 and ITER_SIZE begin 32. */
+
+ rldicl r7,r3,0,52
+ rldicl r9,r4,0,52
+ cmpldi cr7,r7,4096-32
+ bgt cr7,L(pagecross_check)
+ cmpldi cr5,r9,4096-32
+ bgt cr5,L(pagecross_check)
+
+ /* For short string up to 32 bytes, load both s1 and s2 using
+ unaligned dwords and compare. */
+ ld r8,0(r3)
+ ld r10,0(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,8(r3)
+ ld r10,8(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,16(r3)
+ ld r10,16(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,24(r3)
+ ld r10,24(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ addi r7,r3,32
+ addi r4,r4,32
+
+L(align_8b):
+ /* Now it has checked for first 32 bytes, align source1 to doubleword
+ and adjust source2 address. */
+ rldicl r9,r7,0,61 /* source1 alignment to doubleword */
+ subf r4,r9,r4 /* Adjust source2 address based on source1
+ alignment. */
+ rldicr r7,r7,0,60 /* Align source1 to doubleword. */
+
+ /* At this point, source1 alignment is 0 and source2 alignment is
+ between 0 and 7. Check is source2 alignment is 0, meaning both
+ sources have the same alignment. */
+ andi. r9,r4,0x7
+ bne cr0,L(loop_diff_align)
+
+ /* If both source1 and source2 are doubleword aligned, there is no
+ need for page boundary cross checks. */
+
+ ld r8,0(r7)
+ ld r10,0(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ .align 4
+L(loop_equal_align):
+ ld r8,8(r7)
+ ld r10,8(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,16(r7)
+ ld r10,16(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ldu r8,24(r7)
+ ldu r10,24(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ b L(loop_equal_align)
+
+ /* A zero byte was found in r8 (s1 dword), r9 contains the cmpb
+ result and r10 the dword from s2. To code isolate the byte
+ up to end (including the '\0'), masking with 0xFF the remaining
+ ones:
+
+ #if __LITTLE_ENDIAN__
+ (__builtin_ffsl (x) - 1) = counting trailing zero bits
+ r9 = (__builtin_ffsl (r9) - 1) + 8;
+ r9 = -1UL << r9
+ #else
+ r9 = __builtin_clzl (r9) + 8;
+ r9 = -1UL >> r9
+ #endif
+ r8 = r8 | r9
+ r10 = r10 | r9 */
+
+#ifdef __LITTLE_ENDIAN__
+ nor r9,r9,r9
+L(different_nocmpb):
+ neg r3,r9
+ and r9,r9,r3
+ cntlzd r9,r9
+ subfic r9,r9,63
+#else
+ not r9,r9
+L(different_nocmpb):
+ cntlzd r9,r9
+ subfic r9,r9,56
+#endif
+ srd r3,r8,r9
+ srd r10,r10,r9
+ rldicl r10,r10,0,56
+ rldicl r3,r3,0,56
+ subf r3,r10,r3
+ extsw r3,r3
+ blr
+
+ .align 4
+L(pagecross_check):
+ subfic r9,r9,4096
+ subfic r7,r7,4096
+ cmpld cr7,r7,r9
+ bge cr7,L(pagecross)
+ mr r7,r9
+
+ /* If unaligned 16 bytes reads across a 4K page boundary, it uses
+ a simple byte a byte comparison until the page alignment for s1
+ is reached. */
+L(pagecross):
+ add r7,r3,r7
+ subf r9,r3,r7
+ mtctr r9
+
+ .align 4
+L(pagecross_loop):
+ /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2
+ and if *s1 is '\0'. */
+ lbz r9,0(r3)
+ lbz r10,0(r4)
+ addi r3,r3,1
+ addi r4,r4,1
+ cmplw cr7,r9,r10
+ cmpdi cr5,r9,r0
+ bne cr7,L(pagecross_ne)
+ beq cr5,L(pagecross_nullfound)
+ bdnz L(pagecross_loop)
+ b L(align_8b)
+
+ .align 4
+ /* The unaligned read of source2 will cross a 4K page boundary,
+ and the different byte or NULL maybe be in the remaining page
+ bytes. Since it can not use the unaligned load, the algorithm
+ reads and compares 8 bytes to keep source1 doubleword aligned. */
+L(check_source2_byte):
+ li r9,8
+ mtctr r9
+
+ .align 4
+L(check_source2_byte_loop):
+ lbz r9,0(r7)
+ lbz r10,0(r4)
+ addi r7,r7,1
+ addi r4,r4,1
+ cmplw cr7,r9,10
+ cmpdi r5,r9,0
+ bne cr7,L(pagecross_ne)
+ beq cr5,L(pagecross_nullfound)
+ bdnz L(check_source2_byte_loop)
+
+ /* If source2 is unaligned to doubleword, the code needs to check
+ on each interation if the unaligned doubleword access will cross
+ a 4k page boundary. */
+ .align 5
+L(loop_unaligned):
+ ld r8,0(r7)
+ ld r10,0(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+ addi r7,r7,8
+ addi r4,r4,8
+
+L(loop_diff_align):
+ /* Check if [src2]+8 cross a 4k page boundary:
+
+ srcin2 % PAGE_SIZE > (PAGE_SIZE - 8)
+
+ with PAGE_SIZE being 4096. */
+ rldicl r9,r4,0,52
+ cmpldi cr7,r9,4088
+ ble cr7,L(loop_unaligned)
+ b L(check_source2_byte)
+
+ .align 4
+L(pagecross_ne):
+ extsw r3,r9
+ mr r9,r10
+L(pagecross_retdiff):
+ subf r9,r9,r3
+ extsw r3,r9
+ blr
+
+ .align 4
+L(pagecross_nullfound):
+ li r3,0
+ b L(pagecross_retdiff)
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a38f68f12fd03374d599eeb0b6943e50b0ff7348
commit a38f68f12fd03374d599eeb0b6943e50b0ff7348
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Wed Dec 31 11:47:41 2014 -0500
powerpc: Optimized st{r,p}ncpy for POWER8/PPC64
This patch adds an optimized POWER8 st{r,p}ncpy using unaligned accesses.
It shows a 10%-80% improvement over the optimized POWER7 one that uses
only aligned accesses, especially on unaligned inputs.
The algorithm first reads and checks 16 bytes (if the inputs do not cross
a 4K page boundary). It then realigns the source to 16 bytes and issues a
16-byte read-and-compare loop to speed up null byte checks for large
strings. Also, unlike the POWER7 optimization, the null padding is done
inline in the implementation using possibly unaligned accesses, instead of
relying on a memset call. A special case is added for page-cross reads.
diff --git a/ChangeLog b/ChangeLog
index 0c3f78d..383f1f5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+ Add strncpy-power8 and stpncpy-power8 objects.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add __strncpy_power8 and stpncpy_power8
+ implementations.
+ * sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S: New file.
+ * sysdeps/powerpc/powerpc64/multiarch/stpncpy.c (__stpncpy): Add
+ __stpncpy_power8 implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S: New file.
+ * sysdeps/powerpc/powerpc64/multiarch/strncpy.c (strncpy): Add
+ __strncpy_power8 implementation.
+ * sysdeps/powerpc/powerpc64/power8/stpncpy.S: New file.
+ * sysdeps/powerpc/powerpc64/power8/strncpy.S: New file.
+ * NEWS: Update.
+
* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
* sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
diff --git a/NEWS b/NEWS
index 769e841..afb48df 100644
--- a/NEWS
+++ b/NEWS
@@ -11,7 +11,8 @@ Version 2.20.1
16617, 17266, 17370, 17371, 17625, 17630.
-* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for
+ powerpc64/powerpc64le.
* CVE-2014-7817 The wordexp function could ignore the WRDE_NOCMD flag
under certain input conditions resulting in the execution of a shell for
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 74b2daa..18d3378 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -17,9 +17,10 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
stpcpy-power7 stpcpy-ppc64 \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
- stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
+ stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
+ strcmp-power7 strcmp-ppc64 \
strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
- memmove-ppc64 bcopy-ppc64
+ memmove-ppc64 bcopy-ppc64 strncpy-power8
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index d5b2184..b698b90 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -279,6 +279,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
IFUNC_IMPL_ADD (array, i, strncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strncpy_power8)
+ IFUNC_IMPL_ADD (array, i, strncpy,
hwcap & PPC_FEATURE_HAS_VSX,
__strncpy_power7)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
@@ -287,6 +290,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __stpncpy_power8)
+ IFUNC_IMPL_ADD (array, i, stpncpy,
hwcap & PPC_FEATURE_HAS_VSX,
__stpncpy_power7)
IFUNC_IMPL_ADD (array, i, stpncpy, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
similarity index 55%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
index dbf8521..d5d835d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
- Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized stpncpy implementation for POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,24 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+#define USE_AS_STPNCPY
-libc_ifunc (__stpncpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __stpncpy_power7
- : __stpncpy_ppc);
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__stpncpy_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__stpncpy_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__stpncpy_power8)
-weak_alias (__stpncpy, stpncpy)
-#endif
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__stpncpy_power8) \
+ END_2(__stpncpy_power8)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index dbf8521..3ee50e5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -23,10 +23,13 @@
extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
libc_ifunc (__stpncpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __stpncpy_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __stpncpy_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __stpncpy_power7
: __stpncpy_ppc);
weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
index dbf8521..ed906a4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
- Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized strncpy implementation for POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,25 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__strncpy_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strncpy_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strncpy_power8)
-libc_ifunc (__stpncpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __stpncpy_power7
- : __stpncpy_ppc);
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strncpy_power8) \
+ END_2(__strncpy_power8)
-weak_alias (__stpncpy, stpncpy)
-#endif
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 8fd5e4b..19927bc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -24,12 +24,15 @@
extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (strncpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strncpy_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strncpy_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strncpy_power7
: __strncpy_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
similarity index 59%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/power8/stpncpy.S
index dbf8521..76a1466 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
- Copyright (C) 2014 Free Software Foundation, Inc.
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,5 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
-
-libc_ifunc (__stpncpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __stpncpy_power7
- : __stpncpy_ppc);
-
-weak_alias (__stpncpy, stpncpy)
-#endif
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000..5fda953
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -0,0 +1,424 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+/* Implements the function
+
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ if USE_AS_STPCPY is defined.
+
+ The implementation uses unaligned doubleword access to avoid specialized
+ code paths depending of data alignment. Although recent powerpc64 uses
+ 64K as default, the page cross handling assumes minimum page size of
+ 4k. */
+
+ .machine power7
+EALIGN (FUNC_NAME, 4, 0)
+
+ /* Check if the [src]+15 will cross a 4K page by checking if the bit
+ indicating the page size changes. Basically:
+
+ uint64_t srcin = (uint64_t)src;
+ uint64_t ob = srcin & 4096UL;
+ uint64_t nb = (srcin+15UL) & 4096UL;
+ if (ob ^ nb)
+ goto pagecross; */
+
+ addi r10,r4,16
+ rlwinm r9,r4,0,19,19
+
+ /* Since it is a leaf function, save some non-volatile registers on the
+ protected/red zone. */
+ std r26,-48(r1)
+ std r27,-40(r1)
+
+ rlwinm r8,r10,0,19,19
+
+ std r28,-32(r1)
+ std r29,-24(r1)
+
+ cmpld r7,r9,r8
+
+ std r30,-16(r1)
+ std r31,-8(r1)
+
+ beq cr7,L(unaligned_lt_16)
+ rldicl r9,r4,0,61
+ subfic r8,r9,8
+ cmpld cr7,r5,r8
+ bgt cr7,L(pagecross)
+
+ /* At this points there is 1 to 15 bytes to check and write. Since it could
+ be either from first unaligned 16 bytes access or from bulk copy, the code
+ uses an unrolled byte read/write instead of trying to analyze the cmpb
+ results. */
+L(short_path):
+ mr r9,r3
+L(short_path_1):
+ cmpdi cr7,r5,0
+ beq cr7,L(short_path_loop_end_1)
+L(short_path_2):
+ lbz r10,0(r4)
+ cmpdi cr7,r10,0
+ stb r10,0(r9)
+ beq cr7,L(zero_pad_start_1)
+ cmpdi cr0,r5,1
+ addi r8,r9,1
+ addi r6,r5,-1
+ beq cr0,L(short_path_loop_end_0)
+ lbz r10,1(r4)
+ cmpdi cr7,r10,0
+ stb r10,1(r9)
+ beq cr7,L(zero_pad_start_prepare_1)
+ addi r10,r5,-3
+ b L(short_path_loop_1)
+
+ .align 4
+L(short_path_loop):
+ lbz r8,0(r4)
+ addi r7,r10,-2
+ cmpdi cr5,r8,0
+ stb r8,0(r9)
+ beq cr5,L(zero_pad_start_1)
+ beq r7,L(short_path_loop_end_0)
+ lbz r8,1(r4)
+ cmpdi cr7,r8,0
+ stb r8,1(r9)
+ beq cr7,L(zero_pad_start)
+ mr r10,r7
+L(short_path_loop_1):
+ addic. r5,r5,-2
+ addi r9,r9,2
+ cmpdi cr7,r10,0
+ addi r4,r4,2
+ addi r6,r9,1
+ bne cr0,L(short_path_loop)
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+ b L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+ addi r3,r9,1
+ b L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+#endif
+L(short_path_loop_end):
+ /* Restore non-volatile registers. */
+ ld r26,-48(r1)
+ ld r27,-40(r1)
+ ld r28,-32(r1)
+ ld r29,-24(r1)
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+ /* This code pads the remainder of dest with NULL bytes. The algorithm
+ calculates the remaining size and issues an unrolled doubleword
+ loop followed by a byte-by-byte set. */
+ .align 4
+L(zero_pad_start):
+ mr r5,r10
+ mr r9,r6
+L(zero_pad_start_1):
+ srdi. r8,r5,r3
+ mr r10,r9
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+#endif
+ beq- cr0,L(zero_pad_loop_b_start)
+ cmpldi cr7,r8,1
+ li cr7,0
+ std r7,0(r9)
+ beq cr7,L(zero_pad_loop_b_prepare)
+ addic. r8,r8,-2
+ addi r10,r9,r16
+ std r7,8(r9)
+ beq cr0,L(zero_pad_loop_dw_2)
+ std r7,16(r9)
+ li r9,0
+ b L(zero_pad_loop_dw_1)
+
+ .align 4
+L(zero_pad_loop_dw):
+ addi r10,r10,16
+ std r9,-8(r10)
+ beq cr0,L(zero_pad_loop_dw_2)
+ std r9,0(r10)
+L(zero_pad_loop_dw_1):
+ cmpldi cr7,r8,1
+ std r9,0(r10)
+ addic. r8,r8,-2
+ bne cr7,L(zero_pad_loop_dw)
+ addi r10,r10,8
+L(zero_pad_loop_dw_2):
+ rldicl r5,r5,0,61
+L(zero_pad_loop_b_start):
+ cmpdi cr7,r5,0
+ addi r5,r5,-1
+ addi r9,r10,-1
+ add r10,r10,5
+ subf r10,r9,r10
+ li r8,0
+ beq- cr7,L(short_path_loop_end)
+
+ /* Write remaining 1-8 bytes. */
+ .align 4
+ addi r9,r9,1
+ mtocrf 0x1,r10
+ bf 29,4f
+ stw r8,0(r9)
+ addi r9,r9,4
+
+ .align 4
+4: bf 30,2f
+ sth r8,0(r9)
+ addi r9,r9,2
+
+ .align 4
+2: bf 31,1f
+ stb r8,0(r9)
+
+ /* Restore non-volatile registers. */
+1: ld r26,-48(r1)
+ ld r27,-40(r1)
+ ld r28,-32(r1)
+ ld r29,-24(r1)
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+ /* The common case where [src]+16 will not cross a 4K page boundary.
+ In this case the code fast-checks the first 16 bytes by using doubleword
+ reads/compares and updates the destination if neither the total size nor
+ a null byte is found in the destination. */
+ .align 4
+L(unaligned_lt_16):
+ cmpldi cr7,r5,7
+ ble cr7,L(short_path)
+ ld r7,0(r4)
+ li r8,0
+ cmpb r8,r7,r8
+ cmpdi cr7,r8,0
+ bne cr7,L(short_path_prepare_2)
+ addi r6,r5,-8
+ std r7,0(r3)
+ addi r9,r3,r8
+ cmpldi cr7,r6,7
+ addi r7,r4,8
+ ble cr7,L(short_path_prepare_1_1)
+ ld r4,8(r4)
+ cmpb r8,r4,r8
+ cmpdi cr7,r8,0
+ bne cr7,L(short_path_prepare_2_1)
+ std r4,8(r3)
+ addi r29,r3,16
+ addi r5,r5,-16
+ /* Neither a null byte was found nor the total length was reached;
+ align to 16 bytes and issue a bulk copy/compare. */
+ b L(align_to_16b)
+
+ /* In the case of a 4k page boundary cross, the algorithm first aligns
+ the address to a doubleword, calculates a mask based on the alignment
+ to ignore the bytes, and continues using doublewords. */
+ .align 4
+L(pagecross):
+ rldicr r11,r4,0,59 /* Align the address to 8 bytes boundary. */
+ li r6,-1 /* MASK = 0xffffffffffffffffUL. */
+ sldi r9,r9,3 /* Calculate padding. */
+ ld r7,0(r11) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r9,r6,r9 /* MASK = MASK << padding. */
+#else
+ srd r9,r6,r9 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r7,r9 /* Mask bits that are not part of the
+ string. */
+ li cr7,0
+ cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ subf r8,r8,r5 /* Adjust total length. */
+ cmpldi cr7,r8,8 /* Check if length was reached. */
+ ble cr7,L(short_path_prepare_2)
+
+ /* For next checks we have aligned address, so we check for more
+ three doublewords to make sure we can read 16 unaligned bytes
+ to start the bulk copy with 16 aligned addresses. */
+ ld cr7,8(r11)
+ cmpb r9,r7,r9
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ addi cr7,r8,-8
+ cmpldi cr7,r7,8
+ ble cr7,L(short_path_prepare_2)
+ ld cr7,16(r11)
+ cmpb r9,r7,r9
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ addi r8,r8,-16
+ cmpldi r7,r8,8
+ ble cr7,L(short_path_prepare_2)
+ ld r8,24(r11)
+ cmpb r9,r8,r9
+ cmpdi r7,r9,0
+ bne cr7,L(short_path_prepare_2)
+
+ /* No null byte found in the 32 bytes read and the length not reached;
+ read the source again using unaligned loads and store them. */
+ ld r9,0(r4)
+ addi r29,r3,16
+ addi r5,r5,-16
+ std r9,0(r3)
+ ld r9,8(r4)
+ std r9,8(r3)
+
+ /* Align the source to 16 bytes and adjust the destination and size. */
+L(align_to_16b):
+ rldicl r9,r10,0,60
+ rldicr r28,r10,0,59
+ add r12,r5,r9
+ subf r29,r9,r29
+
+ /* The bulk read/compare/copy loads two doublewords, compare and merge
+ in a single register for speed. This is an attempt to speed up the
+ null-checking process for bigger strings. */
+
+ cmpldi cr7,r12,15
+ ble cr7,L(short_path_prepare_1_2)
+
+ /* Main loop for large sizes, unrolled 2 times to get better use of
+ pipeline. */
+ ld r8,0(28)
+ ld r10,8(28)
+ li r9,0
+ cmpb r7,r8,r9
+ cmpb r9,r10,r9
+ or. r6,r9,r7
+ bne cr0,L(short_path_prepare_2_3)
+ addi r5,r12,-16
+ addi r4,r28,16
+ std r8,0(r29)
+ std r10,8(r29)
+ cmpldi cr7,r5,15
+ addi r9,r29,16
+ ble cr7,L(short_path_1)
+ mr r11,r28
+ mr r6,r29
+ li r30,0
+ subfic r26,r4,48
+ subfic r27,r9,48
+
+ b L(loop_16b)
+
+ .align 4
+L(loop_start):
+ ld r31,0(r11)
+ ld r10,8(r11)
+ cmpb r0,r31,r7
+ cmpb r8,r10,r7
+ or. r7,r0,r8
+ addi r5,r5,-32
+ cmpldi cr7,r5,15
+ add r4,r4,r26
+ add r9,r9,r27
+ bne cr0,L(short_path_prepare_2_2)
+ add r4,r28,r4
+ std r31,0(r6)
+ add r9,r29,r9
+ std r10,8(r6)
+ ble cr7,L(short_path_1)
+
+L(loop_16b):
+ ld r10,16(r11)
+ ld r0,24(r11)
+ cmpb r8,r10,r30
+ cmpb r7,r0,r30
+ or. r7,r8,r7
+ addi r12,r12,-32
+ cmpldi r7,r12,15
+ addi r11,r11,32
+ bne cr0,L(short_path_2)
+ std r10,16(r6)
+ addi r6,r6,32
+ std r0,-8(r6)
+ bgt cr7,L(loop_start)
+
+ mr r5,r12
+ mr r4,r11
+ mr r9,r6
+ b L(short_path_1)
+
+ .align 4
+L(short_path_prepare_1_1):
+ mr r5,r6
+ mr r4,r7
+ b L(short_path_1)
+L(short_path_prepare_1_2):
+ mr r5,r12
+ mr r4,r28
+ mr r9,r29
+ b L(short_path_1)
+L(short_path_prepare_2):
+ mr r9,r3
+ b L(short_path_2)
+L(short_path_prepare_2_1):
+ mr r5,r6
+ mr r4,r7
+ b L(short_path_2)
+L(short_path_prepare_2_2):
+ mr r5,r12
+ mr r4,r11
+ mr r9,r6
+ b L(short_path_2)
+L(short_path_prepare_2_3):
+ mr r5,r12
+ mr r4,r28
+ mr r9,r29
+ b L(short_path_2)
+L(zero_pad_loop_b_prepare):
+ addi r10,r9,8
+ rldicl r5,r5,0,61
+ b L(zero_pad_loop_b_start)
+L(zero_pad_start_prepare_1):
+ mr r5,r6
+ mr r9,r8
+ b L(zero_pad_start_1)
+END (FUNC_NAME)
+
+#ifdef USE_AS_STPNCPY
+libc_hidden_def (__stpncpy)
+#else
+libc_hidden_builtin_def (strncpy)
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4242356131256e54ca3e96b0c6f2af773b7a69c8
commit 4242356131256e54ca3e96b0c6f2af773b7a69c8
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue Dec 23 13:39:23 2014 -0500
powerpc: Optimized strncat for POWER7/PPC64
With 3eb38795dbbbd816 (Simplify strncat) the generic algorithm uses
strlen, strnlen, and memcpy. This is faster than the current POWER7
implementation, especially for unaligned strings (where the POWER7 code
uses byte-by-byte operations).
This patch removes the assembly implementation and uses a multiarch
specialization based on default algorithm calling optimized POWER7
symbols.
diff --git a/ChangeLog b/ChangeLog
index 4ff5e7d..0c3f78d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+ * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
+ * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
+ * sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
+
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
strncat-power8 object.
* sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
deleted file mode 100644
index ead4a9a..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Optimized strncat implementation for POWER7.
- Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#undef EALIGN
-#define EALIGN(name, alignt, words) \
- .section ".text"; \
- ENTRY_2(__strncat_power7) \
- .align ALIGNARG(alignt); \
- EALIGN_W_##words; \
- BODY_LABEL(__strncat_power7): \
- cfi_startproc; \
- LOCALENTRY(__strncat_power7)
-
-#undef END
-#define END(name) \
- cfi_endproc; \
- TRACEBACK(__strncat_power7) \
- END_2(__strncat_power7)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#define STRLEN __strlen_power7
-
-#include <sysdeps/powerpc/powerpc64/power7/strncat.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
new file mode 100644
index 0000000..39b1aeb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
@@ -0,0 +1,31 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/ >. */
+
+#include <string.h>
+
+#define STRNCAT __strncat_power7
+
+extern __typeof (strncat) __strncat_power7 attribute_hidden;
+extern __typeof (strlen) __strlen_power7 attribute_hidden;
+extern __typeof (strnlen) __strnlen_power7 attribute_hidden;
+extern __typeof (memcpy) __memcpy_power7 attribute_hidden;
+
+#define strlen __strlen_power7
+#define __strnlen __strnlen_power7
+#define memcpy __memcpy_power7
+
+#include <string/strncat.c>
diff --git a/sysdeps/powerpc/powerpc64/power7/strncat.S b/sysdeps/powerpc/powerpc64/power7/strncat.S
deleted file mode 100644
index f5ea52d..0000000
--- a/sysdeps/powerpc/powerpc64/power7/strncat.S
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Optimized strncat implementation for PowerPC64/POWER7.
-
- Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* The algorithm is as follows for aligned memory access :
-
- if address of s2 is divisible by 0x7UL,
- perform aligned doubleword catenation
- else
- perform unaligned catenation
-
- The aligned comparison are made using cmpb instructions. */
-
-/* char* [r3] strncat (const char *s1 [r3],
- const char *s2 [r4],
- size_t size [r5]) */
-
-#include <sysdep.h>
-
-#ifndef STRNCAT
-# undef strncat
-# define STRNCAT strncat
-#endif
-
-#ifndef STRLEN
-/* For builds with no IFUNC support, local calls should be made to internal
- GLIBC symbol (created by libc_hidden_builtin_def). */
-# ifdef SHARED
-# define STRLEN __GI_strlen
-# else
-# define STRLEN strlen
-# endif
-#endif
-
-#define FRAMESIZE (FRAME_MIN_SIZE+32)
-
- .machine power7
-EALIGN(STRNCAT, 4, 0)
- CALL_MCOUNT 3
-
- mflr r0 /* Load link register LR to r0. */
-
-/* We shall use r29, r30 and r31 non volatile register for retention.
- Save all the callee registers in the GPR save area. */
- std r29, -24(r1) /* Save callers register r29. */
- std r30, -16(r1) /* Save callers register r30. */
- std r31, -8(r1) /* Save callers register r31. */
-
- std r0, 16(r1) /* Store the link register. */
- stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
-
-/* Improve performance with CPU pre-fetch. */
- dcbt 0, r3 /* Pre-fetch str to avoid cache
- miss. */
- dcbt 0, r4 /* Pre-fetch accept to avoid cache
- miss. */
-
- mr. r29, r5 /* Save "n" in r29. */
- mr r30, r3 /* Save "s1" in r30 from r3. */
- beq cr0,L(done)
-
- mr r31, r4 /* Save "s2" in r31 from r4. */
- bl STRLEN /* Call optimized strlen on s1; goto
- end of s1. */
- nop
- cmpldi cr7, r29, 7 /* If s2 is <=7 process
- byte-by-byte. */
- add r3, r30, r3 /* Grab the last character of s1. */
- bgt cr7,L(alignment) /* Process by aligned strings. */
-
- cmpldi cr7, r29, 3 /* If n is >= 4, we can
- byte-unroll. */
- addi r9, r3, -1 /* Make "s1" point before next
- character, increment when read. */
- bgt cr7, L(bytes_unroll) /* Process each byte. */
-
-L(byte_by_byte):
- lbz r10, 0(r31)
- addi r8, r9, 1
- cmpdi cr7, r10, 0 /* Check for NULL in "s2". */
- stb r10, 1(r9)
- beq cr7, L(done)
- add r9, r9, r29
- subf r9, r8, r9
- addi r9, r9, 1
- mtctr r9
- b L(branch2)
- .p2align 4
-L(branch1):
- lbzu r10, 1(r31)
- cmpdi cr7, r10, 0
- stbu r10, 1(r8)
- beq cr7,L(done)
-L(branch2):
- mr r9, r8
- bdnz L(branch1)
- beq cr7,L(done)
-L(nullTerminate):
- li r10, 0 /* Load NULL for termination. */
- stb r10, 1(r9) /* Append or terminate s1 with
- NULL. */
- .p2align 4 /* A small section here. */
-L(done): /* We return now. */
- addi r1, r1, FRAMESIZE /* Restore stack pointer. */
- mr r3, r30 /* Set the return value length of
- string. */
- ld r0, 16(r1) /* Read the saved link register. */
- ld r29, -24(r1) /* Restore save register r29. */
- ld r30, -16(r1) /* Restore save register r30. */
- ld r31, -8(r1) /* Restore save register r31. */
- mtlr r0 /* Restore link register. */
- blr /* Branch to link register. */
-
- .p2align 4
-L(alignment):
- rldicl. r9, r31, 0, 61 /* Check if s2 is 8byte aligned */
- beq cr0,L(dwordAligned)
-
- .p2align 4
-/* Unaligned bytes in string, so process byte by byte.
- POWER7 has performance gains over loop unroll. */
-L(bytes_unroll):
- addi r9, r3, -1
- srdi r10, r29, 2
- mtctr r10
- b L(L10)
- .p2align 4
-L(L44):
- lbz r10, 1(r31) /* Load byte. */
- cmpdi cr7, r10, 0 /* Compare ; if byte not zero,
- continue. */
- stb r10, 2(r9) /* Store byte */
- beq cr7, L(done)
- addi r31, r31, 4
-
- lbz r10, -2(r31) /* Perform loop unroll here on byte
- load and store. */
- cmpdi cr7, r10, 0
- stb r10, 3(r9)
- beq cr7, L(done)
-
- lbz r10, -1(r31) /* Loop unroll here. */
- cmpdi cr7, r10, 0
- stbu r10, 4(r9)
- beq cr7, L(done)
-
- bdz L(leftNbytes)
-
-L(L10):
- lbz r10, 0(r31) /* Loop unroll here. */
- cmpdi cr7, r10, 0
- stb r10, 1(r9)
- bne cr7,L(L44)
- b L(done)
- .p2align 4
-/* If s2 is double word aligned, we load and store double word. */
-L(dwordAligned):
-/* read, write 8 bytes at a time */
- srdi r8, r29, 3 /* Compute count for CTR to loop;
- count = n/8. */
- li r7, 0 /* Load r7 with NULL. */
- li r10, 0 /* Load r10 with MASK '0'. */
-
- mtctr r8 /* Move count to CTR. */
-L(loop8):
- ld r9, 0(r31) /* Read double word from s2. */
- cmpb r6, r9, r10 /* Compare bytes in s2 we read
- just now. */
- cmpdi r6, 0 /* If cmpb returned NULL,
- we continue. */
- bne+ L(a8)
- std r9, 0(r3) /* Append double word from s2
- with s1. */
- addi r3, r3, 8 /* Increment s1. */
- addi r31, r31, 8 /* Increment s2. */
- subi r29, r29, 8 /* Decrement count by 8. */
- bdnz L(loop8) /* Continue until "count" is
- non zero. */
-
-L(a8):
- cmpdi r29, 0 /* If "n" is already zero, we skip. */
- beq+ L(align8align)
-
- mtctr r29 /* Process left over bytes in "n". */
-L(unaligned0):
- lbz r9, 0(r31) /* Read a byte from s2. */
- cmpw r9, r7 /* If byte is NULL, we stop here . */
- beq+ L(align8align) /* Skip processing further if NULL. */
- stb r9, 0(r3) /* If not NULL, store byte into s1. */
- addi r3, r3, 1 /* Increment s1 by 1. */
- addi r31, r31, 1 /* Increment s2 by 1. */
- bdnz L(unaligned0) /* Decrement counter "n" and loop
- until non zero. */
-L(align8align):
- stb r7, 0(r3) /* Terminate s1 with NULL. */
-
- addi r1, r1, FRAMESIZE /* Restore stack pointer. */
- mr r3, r30 /* Set the return value, length of
- string. */
- ld r0, 16(r1) /* Read the saved link register. */
- ld r29, -24(r1) /* Restore save register r29. */
- ld r30, -16(r1) /* Restore save register r30. */
- ld r31, -8(r1) /* Restore save register r31. */
- mtlr r0 /* Restore link register. */
- blr /* Branch to link register */
-
- .p2align 4
-L(leftNbytes):
- rldicl. r29, r29, 0, 62 /* Check if n>0 and n < 4 bytes. */
- bne cr0,L(byte_by_byte) /* Process bytes one by one. */
- b L(nullTerminate) /* Now, finish catenation with
- NULL termination. */
-END(STRNCAT)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=11ce06e589429143595a6c4b60ac7ab6372201b1
commit 11ce06e589429143595a6c4b60ac7ab6372201b1
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue Dec 23 13:36:34 2014 -0500
powerpc: Optimized strcat for POWER8/PPC64
With new optimized strcpy for POWER8, this patch adds an optimized
strcat which uses it along with default implementation at strings/.
diff --git a/ChangeLog b/ChangeLog
index b542cf0..4ff5e7d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,15 @@
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+ strncat-power8 object.
+ * sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
+ __strcat_power8 implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add __strcat_power8 implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c: New file:
+ optimized strcat for power8.
+
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
strcpy-power8 and stpcpy-power8 objects.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __strcpy_power8 and __stpcpy_power8
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index f170551..74b2daa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -18,8 +18,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
- strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
- bcopy-ppc64
+ strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
+ memmove-ppc64 bcopy-ppc64
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 2a7e7f5..d5b2184 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -303,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c. */
IFUNC_IMPL (i, name, strcat,
IFUNC_IMPL_ADD (array, i, strcat,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strcat_power8)
+ IFUNC_IMPL_ADD (array, i, strcat,
hwcap & PPC_FEATURE_HAS_VSX,
__strcat_power7)
IFUNC_IMPL_ADD (array, i, strcat, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcat.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
index 847a62d..6c7544c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
@@ -1,5 +1,4 @@
-/* Multiple versions of strcat. PowerPC64 version.
- Copyright (C) 2014 Free Software Foundation, Inc.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -9,23 +8,23 @@
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
+ <http://www.gnu.org/licenses/ >. */
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <string.h>
-extern __typeof (strcat) __strcat_ppc attribute_hidden;
-extern __typeof (strcat) __strcat_power7 attribute_hidden;
+#define STRCAT __strcat_power8
-libc_ifunc (strcat,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcat_power7
- : __strcat_ppc);
-#endif
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+extern typeof (strcpy) __strcpy_power8;
+extern typeof (strlen) __strlen_power7;
+
+#define strcpy __strcpy_power8
+#define strlen __strlen_power7
+#include <sysdeps/powerpc/strcat.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
index 847a62d..289e9b2 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
@@ -23,9 +23,12 @@
extern __typeof (strcat) __strcat_ppc attribute_hidden;
extern __typeof (strcat) __strcat_power7 attribute_hidden;
+extern __typeof (strcat) __strcat_power8 attribute_hidden;
libc_ifunc (strcat,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcat_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strcat_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strcat_power7
: __strcat_ppc);
#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=a9728856f02f74b60a546499c5bd8492d1726f98
commit a9728856f02f74b60a546499c5bd8492d1726f98
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue Dec 23 05:59:44 2014 -0600
powerpc: Optimized st{r,p}cpy for POWER8/PPC64
This patch adds an optimized POWER8 strcpy using unaligned accesses.
For strings up to 16 bytes the implementation first calculates the
string size, like strlen, and issues a memcpy. For larger strings, the
source is first aligned to 16 bytes and then tested over a loop that
reads 16 bytes and combines the cmpb results for speedup. A special case
is added for page-cross reads.
It shows 30%-60% improvement over the optimized POWER7 one that uses
only aligned accesses.
diff --git a/ChangeLog b/ChangeLog
index 73bf51e..b542cf0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+ strcpy-power8 and stpcpy-power8 objects.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add __strcpy_power8 and __stpcpy_power8
+ implementations.
+ * sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S: New file:
+ multiarch stpcpy implementation for POWER8.
+ * sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S: New file;
+ multiarch strcpy implementation for POWER8.
+ * sysdeps/powerpc/powerpc64/multiarch/strcpy.c (strcpy): Add
+ __strcpy_power8 function.
+ * sysdeps/powerpc/powerpc64/power8/stpcpy.S: New file: optimized
+ stpcpy for POWER8.
+ * sysdeps/powerpc/powerpc64/power8/strcpy.S: New file: optimized
+ strcpy for POWER8.
+ * NEWS: Update.
+
2014-12-31 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Adhemerval Zanella <azanella@linux.vnet.ibm.com>
diff --git a/NEWS b/NEWS
index f9beb9f..769e841 100644
--- a/NEWS
+++ b/NEWS
@@ -11,6 +11,8 @@ Version 2.20.1
16617, 17266, 17370, 17371, 17625, 17630.
+* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+
* CVE-2014-7817 The wordexp function could ignore the WRDE_NOCMD flag
under certain input conditions resulting in the execution of a shell for
command substitution when the application did not request it. The
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 39e441b..f170551 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -13,7 +13,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \
wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \
wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
- strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
+ strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
+ stpcpy-power7 stpcpy-ppc64 \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 8f1e3e1..2a7e7f5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -83,6 +83,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c. */
IFUNC_IMPL (i, name, strcpy,
+ IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strcpy_power8)
IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
__strcpy_power7)
IFUNC_IMPL_ADD (array, i, strcpy, 1,
@@ -90,6 +92,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
+ IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __stpcpy_power8)
IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_HAS_VSX,
__stpcpy_power7)
IFUNC_IMPL_ADD (array, i, stpcpy, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
index 1b6e9e0..66e6f70 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
+/* Optimized stpcpy implementation for POWER8/PPC64.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if defined SHARED && !defined NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__stpcpy_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__stpcpy_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__stpcpy_power8)
-libc_ifunc (strcpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcpy_power7
- : __strcpy_ppc);
-#endif
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__stpcpy_power8) \
+ END_2(__stpcpy_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
index 1b6e9e0..64cbc16 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
+/* Optimized strcpy implementation for POWER8/PPC64.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if defined SHARED && !defined NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__strcpy_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strcpy_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strcpy_power8)
-libc_ifunc (strcpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcpy_power7
- : __strcpy_ppc);
-#endif
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strcpy_power8) \
+ END_2(__strcpy_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
index 1b6e9e0..20ef73f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
@@ -23,9 +23,12 @@
extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
libc_ifunc (strcpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcpy_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strcpy_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strcpy_power7
: __strcpy_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
similarity index 61%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/power8/stpcpy.S
index 1b6e9e0..bf72065 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
+/* Optimized stpcpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,9 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if defined SHARED && !defined NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#define USE_AS_STPCPY
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
-
-libc_ifunc (strcpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcpy_power7
- : __strcpy_ppc);
-#endif
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
new file mode 100644
index 0000000..d3e9a10
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -0,0 +1,262 @@
+/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPCPY
+# define FUNC_NAME __stpcpy
+#else
+# define FUNC_NAME strcpy
+#endif
+
+/* Implements the function
+
+ char * [r3] strcpy (char *dest [r3], const char *src [r4])
+
+ or
+
+ char * [r3] stpcpy (char *dest [r3], const char *src [r4])
+
+ if USE_AS_STPCPY is defined.
+
+ The implementation uses unaligned doubleword access to avoid specialized
+ code paths depending of data alignment. Although recent powerpc64 uses
+ 64K as default, the page cross handling assumes minimum page size of
+ 4k. */
+
+ .machine power7
+EALIGN (FUNC_NAME, 4, 0)
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ /* Check if the [src]+15 will cross a 4K page by checking if the bit
+ indicating the page size changes. Basically:
+
+ uint64_t srcin = (uint64_t)src;
+ uint64_t ob = srcin & 4096UL;
+ uint64_t nb = (srcin+15UL) & 4096UL;
+ if (ob ^ nb)
+ goto pagecross; */
+
+ addi r9,r4,15
+ xor r9,r9,r4
+ rlwinm. r9,r9,0,19,19
+ bne L(pagecross)
+
+ /* For short string (less than 16 bytes), just calculate its size as
+ strlen and issues a memcpy if null is found. */
+ mr r7,r4
+ ld r12,0(r7) /* Load doubleword from memory. */
+ cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
+ bne cr7,L(done)
+
+ ldu r8,8(r7)
+ cmpb r10,r8,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ b L(loop_before)
+
+ .align 4
+L(pagecross):
+ clrrdi r7,r4,3 /* Align the address to doubleword boundary. */
+ rlwinm r6,r4,3,26,28 /* Calculate padding. */
+ li r5,-1 /* MASK = 0xffffffffffffffff. */
+ ld r12,0(r7) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r5,r5,r6
+#else
+ srd r5,r5,r6 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r12,r5 /* Mask bits that are not part of the string. */
+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
+ bne cr7,L(done)
+
+ ldu r6,8(r7)
+ cmpb r10,r6,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ ld r12,0(r7)
+ cmpb r10,r12,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ ldu r6,8(r7)
+ cmpb r10,r6,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ /* We checked for 24 - x bytes, with x being the source alignment
+ (0 <= x <= 16), and no zero has been found. Start the loop
+ copy with doubleword aligned address. */
+ mr r7,r4
+ ld r12, 0(r7)
+ ldu r8, 8(r7)
+
+L(loop_before):
+ /* Save the two doublewords readed from source and align the source
+ to 16 bytes for the loop. */
+ mr r11,r3
+ std r12,0(r11)
+ std r8,8(r11)
+ addi r11,r11,16
+ rldicl r9,r4,0,60
+ subf r7,r9,r7
+ subf r11,r9,r11
+ b L(loop_start)
+
+ .align 5
+L(loop):
+ std r12, 0(r11)
+ std r6, 8(r11)
+ addi r11,r11,16
+L(loop_start):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+
+ ld r12, 8(r7)
+ ldu r6, 16(r7)
+ cmpb r10,r12,r0
+ cmpb r9,r6,r0
+ or r8,r9,r10 /* Merge everything in one doubleword. */
+ cmpdi cr7,r8,0
+ beq cr7,L(loop)
+
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ addi r4,r7,-8
+ cmpdi cr6,r10,0
+ addi r7,r7,-8
+ bne cr6,L(done2)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ length. */
+
+ mr r10,r9
+ addi r7,r7,8
+ b L(done2)
+
+ /* r10 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the length. */
+L(done):
+ mr r11,r3
+L(done2):
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10, -1 /* Form a mask from trailing zeros. */
+ andc r9, r9, r10
+ popcntd r6, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r6,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r4,r7
+ srdi r6,r6,3 /* Convert leading/trailing zeros to bytes. */
+ add r8,r5,r6 /* Compute final length. */
+#ifdef USE_AS_STPCPY
+ /* stpcpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
+ addi r8,r8,1 /* Final '/0'. */
+
+ cmpldi cr6,r8,8
+ mtocrf 0x01,r8
+ ble cr6,L(copy_LE_8)
+
+ cmpldi cr1,r8,16
+ blt cr1,8f
+
+ /* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ /* At least 6 bytes to go. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ ld r6,0(r4)
+ ld r8,8(r4)
+ addi r4,r4,16
+ std r6,0(r11)
+ std r8,8(r11)
+ addi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4)
+ ld r6,0(r4)
+ addi r4,r4,8
+ std r6,0(r11)
+ addi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ lwz r6,0(r4)
+ stw r6,0(r11)
+ bf 30,L(tail5)
+ lhz r7,4(r4)
+ sth r7,4(r11)
+ bflr 31
+ lbz r8,6(r4)
+ stb r8,6(r11)
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ lhz r6,0(r4)
+ sth r6,0(r11)
+ bflr 31
+ lbz r7,2(r4)
+ stb r7,2(r11)
+ blr
+
+ .align 4
+L(tail5):
+ bf 31,1f
+ lbz r6,4(r4)
+ stb r6,4(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz r6,0(r4)
+ stb r6,0(r11)
+ blr
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+ ld r6,0(r4)
+ std r6,0(r11)
+ blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=b53db929e654aaf97a2a239e17a87b04c768b854
commit b53db929e654aaf97a2a239e17a87b04c768b854
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date: Wed Dec 31 14:05:00 2014 -0500
powerpc: POWER7 strcpy optimization for unaligned strings
This patch optimizes strcpy for ppc64/power7 for unaligned source or
destination address. The source or destination address is aligned
to doubleword and data is shifted based on the alignment and
added with the previous loaded data to be written as a doubleword.
For each load, cmpb instruction is used for faster null check.
The word aligned optimization is also removed, since the new unaligned
code path shows better results handling word-aligned strings.
More combinations of unaligned inputs are also added in benchtests
to measure the improvement. The new optimization shows 2% to 80%
performance improvement for longer strings, though it does not show
a big difference for string sizes less than 16 due to additional checks.
diff --git a/ChangeLog b/ChangeLog
index 991ba72..73bf51e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2014-12-31 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
+ Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/power7/strcpy.S (strcpy): Optimize unaligned
+ path.
+ * benchtests/bench-strcpy.c (test_main): Add more unaligned inputs.
+
2014-12-16 Florian Weimer <fweimer@redhat.com>
[BZ #17630]
diff --git a/benchtests/bench-strcpy.c b/benchtests/bench-strcpy.c
index c3ab4cf..e9445f2 100644
--- a/benchtests/bench-strcpy.c
+++ b/benchtests/bench-strcpy.c
@@ -171,6 +171,22 @@ test_main (void)
do_test (i, i, 8 << i, BIG_CHAR);
}
+ for (i = 16; i <= 512; i+=4)
+ {
+ do_test (0, 4, i, SMALL_CHAR);
+ do_test (4, 0, i, BIG_CHAR);
+ do_test (4, 4, i, SMALL_CHAR);
+ do_test (2, 2, i, BIG_CHAR);
+ do_test (2, 6, i, SMALL_CHAR);
+ do_test (6, 2, i, BIG_CHAR);
+ do_test (1, 7, i, SMALL_CHAR);
+ do_test (7, 1, i, BIG_CHAR);
+ do_test (3, 4, i, SMALL_CHAR);
+ do_test (4, 3, i, BIG_CHAR);
+ do_test (5, 7, i, SMALL_CHAR);
+ do_test (7, 5, i, SMALL_CHAR);
+ }
+
return ret;
}
diff --git a/sysdeps/powerpc/powerpc64/power7/strcpy.S b/sysdeps/powerpc/powerpc64/power7/strcpy.S
index ce71982..115f98a 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcpy.S
@@ -31,8 +31,6 @@
if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
goto aligned_doubleword_copy;
- if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
- goto aligned_word_copy;
if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
goto same_alignment;
goto unaligned;
@@ -70,9 +68,18 @@ EALIGN (FUNC_NAME, 4, 0)
#endif
or rTMP, rSRC, rRTN
clrldi. rTMP, rTMP, 61
- bne L(check_word_alignment)
+ bne L(check_alignment)
b L(aligned_doubleword_copy)
+ .align 4
+L(check_alignment):
+ rldicl rRTNAL, rRTN, 0, 61
+ rldicl rSRCAL, rSRC, 0, 61
+ cmpld cr7, rSRCAL, rRTNAL
+ beq cr7, L(same_alignment)
+ b L(unaligned)
+
+ .align 4
L(same_alignment):
/* Src and dst with same alignment: align both to doubleword. */
mr rALCNT, rRTN
@@ -180,93 +187,249 @@ L(g1):
#endif
blr
-L(check_word_alignment):
- clrldi. rTMP, rTMP, 62
- beq L(aligned_word_copy)
- rldicl rRTNAL, rRTN, 0, 61
- rldicl rSRCAL, rSRC, 0, 61
- cmpld cr7, rSRCAL, rRTNAL
- beq cr7, L(same_alignment)
- b L(unaligned)
-
-/* For word aligned memory, operate using word load and stores. */
.align 4
-L(aligned_word_copy):
- li rMASK, 0
- addi rRTN, rRTN, -4
- lwz rWORD, 0(rSRC)
- b L(g5)
+L(unaligned):
+ cmpdi rSRCAL, 0 /* Check src alignment */
+ beq L(srcaligndstunalign)
+ /* src is unaligned */
+ rlwinm r10, rSRC, 3,26,28 /* Calculate padding. */
+ clrrdi rSRC, rSRC, 3 /* Align the addr to dw boundary */
+ ld rWORD, 0(rSRC) /* Load doubleword from memory. */
+ li rTMP, 0
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ srd rALT, rWORD, r10
+#else
+ sld rALT, rWORD, r10
+#endif
+ cmpb rTMP, rALT, rTMP /* Compare each byte against null */
+ /* Discard bits not part of the string */
+#ifdef __LITTLE_ENDIAN__
+ sld rTMP, rTMP, r10
+#else
+ srd rTMP, rTMP, r10
+#endif
+ cmpdi rTMP, 0
+ bne L(bytebybyte) /* if it has null, copy byte by byte */
+ subfic r8, r9, 8
+ rlwinm r5, rRTN, 3,26,28 /* Calculate padding in bits. */
+ rldicl r9, rRTN, 0, 61 /* Calculate padding in bytes. */
+ addi rRTN, rRTN, -1
- .align 4
-L(g3): lwzu rALT, 4(rSRC)
- stwu rWORD, 4(rRTN)
- cmpb rTMP, rALT, rMASK
- cmpwi rTMP, 0
- bne L(g4)
- lwzu rWORD, 4(rSRC)
- stwu rALT, 4(rRTN)
-L(g5): cmpb rTMP, rWORD, rMASK
- cmpwi rTMP, 0 /* If rTMP is 0, no null in word. */
- beq L(g3)
-
- mr rALT, rWORD
-/* We've hit the end of the string. Do the rest byte-by-byte. */
-L(g4):
+ cmpdi r5, 0 /* check dest alignment */
+ beq L(srcunaligndstalign)
+
+ /* both src and dst unaligned */
#ifdef __LITTLE_ENDIAN__
- rlwinm. rTMP, rALT, 0, 24, 31
- stbu rALT, 4(rRTN)
- beqlr-
- rlwinm. rTMP, rALT, 24, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr-
- rlwinm. rTMP, rALT, 16, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr-
- rlwinm rTMP, rALT, 8, 24, 31
- stbu rTMP, 1(rRTN)
+ sld rWORD, rALT, r10
+ mr r11, r10
+ addi r11, r11, -8 /* Adjust byte pointer on loaded dw */
#else
- rlwinm. rTMP, rALT, 8, 24, 31
- stbu rTMP, 4(rRTN)
- beqlr
- rlwinm. rTMP, rALT, 16, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr
- rlwinm. rTMP, rALT, 24, 24, 31
- stbu rTMP, 1(rRTN)
- beqlr
- stbu rALT, 1(rRTN)
+ srd rWORD, rALT, r10
+ subfic r11, r10, 64
#endif
- blr
+ /* dst alignment is greater then src alignment? */
+ cmpd cr7, r5, r10
+ blt cr7, L(dst_align_small)
+ /* src alignment is less than dst */
-/* Oh well. In this case, we just do a byte-by-byte copy. */
- .align 4
-L(unaligned):
- lbz rWORD, 0(rSRC)
- addi rRTN, rRTN, -1
- cmpdi rWORD, 0
- beq L(u2)
-
- .align 5
-L(u0): lbzu rALT, 1(rSRC)
- stbu rWORD, 1(rRTN)
- cmpdi rALT, 0
- beq L(u1)
- lbzu rWORD, 1(rSRC)
+ /* Calculate the dst alignment differnce */
+ subfic rALT, r9, 8
+ mtctr rALT
+
+ /* Write till dst is aligned */
+ cmpdi rTMP, rALT, 4
+ blt L(storebyte1) /* less than 4, store byte by byte */
+ beq L(equal1) /* if its 4, store word */
+ addi rTMP, rALT, -4 /* greater than 4, so stb and stw */
+ mtctr rTMP
+L(storebyte1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
+ stbu rALT, 1(rRTN)
+ bdnz L(storebyte1)
+
+ subfic rALT, r9, 8 /* Check the remaining bytes */
+ cmpdi rTMP, rALT, 4
+ blt L(proceed)
+
+ .align 4
+L(equal1):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on loaded dw */
+ srd rALT, rWORD, r11
+#else
+ subfic r11, r11, 64
+ sld rALT, rWORD, r11
+ srdi rALT, rALT, 32
+#endif
+ stw rALT, 1(rRTN)
+ addi rRTN, rRTN, 4
+
+L(proceed):
+ mr rALT, rWORD
+ /* calculate the Left over bytes to be written */
+ subfic r11, r10, 64
+ subfic r5, r5, 64
+ subf r5, r5, r11 /* remaining bytes on second dw */
+ subfic r10, r5, 64 /* remaining bytes on first dw */
+ subfic r9, r9, 8
+ subf r8, r9, r8 /* recalculate padding */
+L(srcunaligndstalign):
+ addi rRTN, rRTN, 1
+ subfic r5, r10, 64 /* remaining bytes on second dw */
+ addi rSRC, rSRC, 8
+ li rTMP,0
+ b L(storedouble)
+
+ .align 4
+L(dst_align_small):
+ mtctr r8
+ /* Write till src is aligned */
+L(storebyte2):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
stbu rALT, 1(rRTN)
- cmpdi rWORD, 0
- beq L(u2)
- lbzu rALT, 1(rSRC)
- stbu rWORD, 1(rRTN)
- cmpdi rALT, 0
- beq L(u1)
- lbzu rWORD, 1(rSRC)
+ bdnz L(storebyte2)
+
+ addi rSRC, rSRC, 8 /* Increment src pointer */
+ addi rRTN, rRTN, 1 /* Increment dst pointer */
+ rldicl r8, rRTN, 0, 61 /* Recalculate padding */
+
+ /* src is aligned */
+L(srcaligndstunalign):
+ ld rWORD, 0(rSRC)
+ mr rALT, rWORD
+ li rTMP, 0 /* Check null */
+ cmpb rTMP, rWORD, rTMP
+ cmpdi rTMP, 0
+ bne L(bytebybyte) /* Do byte by byte if there is NULL */
+ rlwinm r5, rRTN, 3,26,28 /* Calculate padding */
+ addi rRTN, rRTN, -1
+ subfic r10, r8, 8
+ /* write byte by byte till aligned */
+#ifdef __LITTLE_ENDIAN__
+ li r11, -8
+#else
+ li r11, 64
+#endif
+ mtctr r10
+ cmpdi rTMP, r10, 4
+ blt L(storebyte)
+ beq L(equal)
+ addi rTMP, r10, -4
+ mtctr rTMP
+L(storebyte):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8 /* Adjust byte pointer on dw */
+#else
+ addi r11, r11, -8
+#endif
+ srd rALT, rWORD, r11
stbu rALT, 1(rRTN)
- cmpdi rWORD, 0
- bne L(u0)
-L(u2): stbu rWORD, 1(rRTN)
- blr
-L(u1): stbu rALT, 1(rRTN)
- blr
+ bdnz L(storebyte)
+
+ cmpdi rTMP, r10, 4
+ blt L(align)
+
+ .align 4
+L(equal):
+#ifdef __LITTLE_ENDIAN__
+ addi r11, r11, 8
+ srd rALT, rWORD, r11
+#else
+ subfic r11, r11, 64
+ sld rALT, rWORD, r11
+ srdi rALT, rALT, 32
+#endif
+ stw rALT, 1(rRTN)
+ addi rRTN, rRTN, 4
+L(align):
+ addi rRTN, rRTN, 1
+ addi rSRC, rSRC, 8 /* Increment src pointer */
+ subfic r10, r5, 64
+ li rTMP, 0
+ /* dst addr aligned to 8 */
+L(storedouble):
+ ld rALT, 0(rSRC) /* load next dw */
+ cmpb rTMP, rALT, rTMP
+ cmpdi rTMP, 0 /* check for null on each new dw */
+ bne L(null)
+#ifdef __LITTLE_ENDIAN__
+ srd r9, rWORD, r10 /* bytes from first dw */
+ sld r11, rALT, r5 /* bytes from second dw */
+#else
+ sld r9, rWORD, r10
+ srd r11, rALT, r5
+#endif
+ or r11, r9, r11 /* make as a single dw */
+ std r11, 0(rRTN) /* store as std on aligned addr */
+ mr rWORD, rALT /* still few bytes left to be written */
+ addi rRTN, rRTN, 8 /* increment dst addr */
+ addi rSRC, rSRC, 8 /* increment src addr */
+ b L(storedouble) /* Loop till NULL */
+
+ .align 4
+
+/* We've hit the end of the string. Do the rest byte-by-byte. */
+L(null):
+ addi rRTN, rRTN, -1
+ mr r10, r5
+ mtctr r8
+#ifdef __LITTLE_ENDIAN__
+ subfic r10, r10, 64
+ addi r10, r10, -8
+#endif
+ cmpdi rTMP, r8, 4
+ blt L(loop)
+
+ /* we can still use stw if leftover >= 4*/
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+ srd r11, rWORD, r10
+#else
+ subfic r10, r10, 64
+ sld r11, rWORD, r10
+ srdi r11, r11, 32
+#endif
+ stw r11, 1(rRTN)
+ addi rRTN, rRTN, 4
+
+ beq L(bytebybyte1)
+ addi r10, r10, 32
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, -8
+#else
+ subfic r10, r10, 64
+#endif
+ addi rTMP, r8, -4
+ mtctr rTMP
+ /* remaining byte by byte part of first dw */
+L(loop):
+#ifdef __LITTLE_ENDIAN__
+ addi r10, r10, 8
+#else
+ addi r10, r10, -8
+#endif
+ srd rTMP, rWORD, r10
+ stbu rTMP, 1(rRTN)
+ bdnz L(loop)
+
+L(bytebybyte1):
+ addi rRTN, rRTN, 1
+ /* remaining byte by byte part of second dw */
+L(bytebybyte):
+ addi rRTN, rRTN, -8
+ b L(g1)
+
END (FUNC_NAME)
#ifndef USE_AS_STPCPY
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f4f9fb08d49740d9f18918bcf9d45ca594f416ee
commit f4f9fb08d49740d9f18918bcf9d45ca594f416ee
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Dec 15 17:41:13 2014 +0100
Avoid infinite loop in nss_dns getnetbyname [BZ #17630]
diff --git a/ChangeLog b/ChangeLog
index 0462e8c..991ba72 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2014-12-16 Florian Weimer <fweimer@redhat.com>
+
+ [BZ #17630]
+ * resolv/nss_dns/dns-network.c (getanswer_r): Iterate over alias
+ names.
+
2014-12-15 Jeff Law <law@redhat.com>
[BZ #16617]
diff --git a/NEWS b/NEWS
index a46ee05..f9beb9f 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,7 @@ Version 2.20.1
* The following bugs are resolved with this release:
- 16617, 17266, 17370, 17371, 17625.
+ 16617, 17266, 17370, 17371, 17625, 17630.
* CVE-2014-7817 The wordexp function could ignore the WRDE_NOCMD flag
under certain input conditions resulting in the execution of a shell for
@@ -19,6 +19,10 @@ Version 2.20.1
* CVE-2012-3406 printf-style functions could run into a stack overflow when
processing format strings with a large number of format specifiers.
+
+* CVE-2014-9402 The nss_dns implementation of getnetbyname could run into an
infinite loop if the DNS response contained a PTR record of an unexpected
+ format.
Version 2.20
diff --git a/resolv/nss_dns/dns-network.c b/resolv/nss_dns/dns-network.c
index 0a77c8b..08cf0a6 100644
--- a/resolv/nss_dns/dns-network.c
+++ b/resolv/nss_dns/dns-network.c
@@ -398,8 +398,8 @@ getanswer_r (const querybuf *answer, int anslen, struct netent *result,
case BYNAME:
{
- char **ap = result->n_aliases++;
- while (*ap != NULL)
+ char **ap;
+ for (ap = result->n_aliases; *ap != NULL; ++ap)
{
/* Check each alias name for being of the forms:
4.3.2.1.in-addr.arpa = net 1.2.3.4
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5985c6ea868db23380977a35a2167549f9a3653b
commit 5985c6ea868db23380977a35a2167549f9a3653b
Author: Jeff Law <law@redhat.com>
Date: Mon Dec 15 10:09:32 2014 +0100
CVE-2012-3406: Stack overflow in vfprintf [BZ #16617]
A larger number of format specifiers could cause a stack overflow,
potentially allowing an attacker to bypass _FORTIFY_SOURCE format string
protection.
diff --git a/ChangeLog b/ChangeLog
index c5ced23..0462e8c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2014-12-15 Jeff Law <law@redhat.com>
+
+ [BZ #16617]
+ * stdio-common/vfprintf.c (vfprintf): Allocate large specs array
+ on the heap. (CVE-2012-3406)
+ * stdio-common/bug23-2.c, stdio-common/bug23-3.c: New file.
+ * stdio-common/bug23-4.c: New file. Test case by Joseph Myers.
+ * stdio-common/Makefile (tests): Add bug23-2, bug23-3, bug23-4.
+
2014-12-02 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
diff --git a/NEWS b/NEWS
index 20106dc..a46ee05 100644
--- a/NEWS
+++ b/NEWS
@@ -9,13 +9,16 @@ Version 2.20.1
* The following bugs are resolved with this release:
- 17266, 17370, 17371, 17625.
+ 16617, 17266, 17370, 17371, 17625.
* CVE-2014-7817 The wordexp function could ignore the WRDE_NOCMD flag
under certain input conditions resulting in the execution of a shell for
command substitution when the application did not request it. The
implementation now checks WRDE_NOCMD immediately before executing the
shell and returns the error WRDE_CMDSUB as expected.
+
+* CVE-2012-3406 printf-style functions could run into a stack overflow when
+ processing format strings with a large number of format specifiers.
Version 2.20
diff --git a/stdio-common/Makefile b/stdio-common/Makefile
index 5f8e534..24e8496 100644
--- a/stdio-common/Makefile
+++ b/stdio-common/Makefile
@@ -57,7 +57,7 @@ tests := tstscanf test_rdwr test-popen tstgetln test-fseek \
bug19 bug19a tst-popen2 scanf13 scanf14 scanf15 bug20 bug21 bug22 \
scanf16 scanf17 tst-setvbuf1 tst-grouping bug23 bug24 \
bug-vfprintf-nargs tst-long-dbl-fphex tst-fphex-wide tst-sprintf3 \
- bug25 tst-printf-round bug26
+ bug25 tst-printf-round bug23-2 bug23-3 bug23-4
test-srcs = tst-unbputc tst-printf
diff --git a/stdio-common/bug23-2.c b/stdio-common/bug23-2.c
new file mode 100644
index 0000000..9e0cfe6
--- /dev/null
+++ b/stdio-common/bug23-2.c
@@ -0,0 +1,70 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+static const char expected[] = "\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55\
+\n\
+a\n\
+abbcd55%%%%%%%%%%%%%%%%%%%%%%%%%%\n";
+
+static int
+do_test (void)
+{
+ char *buf = malloc (strlen (expected) + 1);
+ snprintf (buf, strlen (expected) + 1,
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\n",
+ "a", "b", "c", "d", 5);
+ return strcmp (buf, expected) != 0;
+}
+
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/stdio-common/bug23-3.c b/stdio-common/bug23-3.c
new file mode 100644
index 0000000..57c8cef
--- /dev/null
+++ b/stdio-common/bug23-3.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+int
+do_test (void)
+{
+ size_t instances = 16384;
+#define X0 "\n%1$s\n" "%1$s" "%2$s" "%2$s" "%3$s" "%4$s" "%5$d" "%5$d"
+ const char *item = "\na\nabbcd55";
+#define X3 X0 X0 X0 X0 X0 X0 X0 X0
+#define X6 X3 X3 X3 X3 X3 X3 X3 X3
+#define X9 X6 X6 X6 X6 X6 X6 X6 X6
+#define X12 X9 X9 X9 X9 X9 X9 X9 X9
+#define X14 X12 X12 X12 X12
+#define TRAILER "%%%%%%%%%%%%%%%%%%%%%%%%%%"
+#define TRAILER2 TRAILER TRAILER
+ size_t length = instances * strlen (item) + strlen (TRAILER) + 1;
+
+ char *buf = malloc (length + 1);
+ snprintf (buf, length + 1,
+ X14 TRAILER2 "\n",
+ "a", "b", "c", "d", 5);
+
+ const char *p = buf;
+ size_t i;
+ for (i = 0; i < instances; ++i)
+ {
+ const char *expected;
+ for (expected = item; *expected; ++expected)
+ {
+ if (*p != *expected)
+ {
+ printf ("mismatch at offset %zu (%zu): expected %d, got %d\n",
+ (size_t) (p - buf), i, *expected & 0xFF, *p & 0xFF);
+ return 1;
+ }
+ ++p;
+ }
+ }
+ if (strcmp (p, TRAILER "\n") != 0)
+ {
+ printf ("mismatch at trailer: [%s]\n", p);
+ return 1;
+ }
+ free (buf);
+ return 0;
+}
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
diff --git a/stdio-common/bug23-4.c b/stdio-common/bug23-4.c
new file mode 100644
index 0000000..a478564
--- /dev/null
+++ b/stdio-common/bug23-4.c
@@ -0,0 +1,31 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/resource.h>
+
+#define LIMIT 1000000
+
+int
+main (void)
+{
+ struct rlimit lim;
+ getrlimit (RLIMIT_STACK, &lim);
+ lim.rlim_cur = 1048576;
+ setrlimit (RLIMIT_STACK, &lim);
+ char *fmtstr = malloc (4 * LIMIT + 1);
+ if (fmtstr == NULL)
+ abort ();
+ char *output = malloc (LIMIT + 1);
+ if (output == NULL)
+ abort ();
+ for (size_t i = 0; i < LIMIT; i++)
+ memcpy (fmtstr + 4 * i, "%1$d", 4);
+ fmtstr[4 * LIMIT] = '\0';
+ int ret = snprintf (output, LIMIT + 1, fmtstr, 0);
+ if (ret != LIMIT)
+ abort ();
+ for (size_t i = 0; i < LIMIT; i++)
+ if (output[i] != '0')
+ abort ();
+ return 0;
+}
diff --git a/stdio-common/vfprintf.c b/stdio-common/vfprintf.c
index c4ff833..429a3d1 100644
--- a/stdio-common/vfprintf.c
+++ b/stdio-common/vfprintf.c
@@ -263,6 +263,12 @@ vfprintf (FILE *s, const CHAR_T *format, va_list ap)
/* For the argument descriptions, which may be allocated on the heap. */
void *args_malloced = NULL;
+ /* For positional argument handling. */
+ struct printf_spec *specs;
+
+ /* Track if we malloced the SPECS array and thus must free it. */
+ bool specs_malloced = false;
+
/* This table maps a character into a number representing a
class. In each step there is a destination label for each
class. */
@@ -1679,8 +1685,8 @@ do_positional:
size_t nspecs = 0;
/* A more or less arbitrary start value. */
size_t nspecs_size = 32 * sizeof (struct printf_spec);
- struct printf_spec *specs = alloca (nspecs_size);
+ specs = alloca (nspecs_size);
/* The number of arguments the format string requests. This will
determine the size of the array needed to store the argument
attributes. */
@@ -1721,11 +1727,39 @@ do_positional:
if (nspecs * sizeof (*specs) >= nspecs_size)
{
/* Extend the array of format specifiers. */
+ if (nspecs_size * 2 < nspecs_size)
+ {
+ __set_errno (ENOMEM);
+ done = -1;
+ goto all_done;
+ }
struct printf_spec *old = specs;
- specs = extend_alloca (specs, nspecs_size, 2 * nspecs_size);
+ if (__libc_use_alloca (2 * nspecs_size))
+ specs = extend_alloca (specs, nspecs_size, 2 * nspecs_size);
+ else
+ {
+ nspecs_size *= 2;
+ specs = malloc (nspecs_size);
+ if (specs == NULL)
+ {
+ __set_errno (ENOMEM);
+ specs = old;
+ done = -1;
+ goto all_done;
+ }
+ }
/* Copy the old array's elements to the new space. */
memmove (specs, old, nspecs * sizeof (*specs));
+
+ /* If we had previously malloc'd space for SPECS, then
+ release it after the copy is complete. */
+ if (specs_malloced)
+ free (old);
+
+ /* Now set SPECS_MALLOCED if needed. */
+ if (!__libc_use_alloca (nspecs_size))
+ specs_malloced = true;
}
/* Parse the format specifier. */
@@ -2046,6 +2080,8 @@ do_positional:
}
all_done:
+ if (specs_malloced)
+ free (specs);
if (__glibc_unlikely (args_malloced != NULL))
free (args_malloced);
if (__glibc_unlikely (workstart != NULL))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8647419a62d1d5641488ca6430bb679cf1e766e0
commit 8647419a62d1d5641488ca6430bb679cf1e766e0
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Wed Nov 19 16:27:56 2014 -0500
powerpc: Add powerpc64 strpbrk optimization
This patch makes the POWER7 optimized strpbrk generic by using
default doubleword stores to zero the hash, instead of VSX
instructions. Performance on POWER7/POWER8 does not change.
diff --git a/ChangeLog b/ChangeLog
index 89ee40b..c5ced23 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,14 @@
2014-12-02 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
-
+
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+ Remove strpbrk objects.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Remove strpbrk implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c: Remove file.
+ * sysdeps/powerpc/powerpc64/multiarch/strpbrk.c: Remove file.
+ * sysdeps/powerpc/powerpc64/power7/strpbrk.S: Remove file.
+ * sysdeps/powerpc/powerpc64/strpbrk.S: New file.
+
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
Remove strcspn objects.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 05dab25..39e441b 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -15,7 +15,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
- strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
+ strncpy-power7 strncpy-ppc64 \
stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
bcopy-ppc64
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 1a2e38d..8f1e3e1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -272,14 +272,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncat, 1,
__strncat_ppc))
- /* Support sysdeps/powerpc/powerpc64/multiarch/strpbrk.c. */
- IFUNC_IMPL (i, name, strpbrk,
- IFUNC_IMPL_ADD (array, i, strpbrk,
- hwcap & PPC_FEATURE_HAS_VSX,
- __strpbrk_power7)
- IFUNC_IMPL_ADD (array, i, strpbrk, 1,
- __strpbrk_ppc))
-
/* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
IFUNC_IMPL_ADD (array, i, strncpy,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S
deleted file mode 100644
index 663ca36..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Optimized strpbrk implementation for POWER7.
- Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#undef EALIGN
-#define EALIGN(name, alignt, words) \
- .section ".text"; \
- ENTRY_2(__strpbrk_power7) \
- .align ALIGNARG(alignt); \
- EALIGN_W_##words; \
- BODY_LABEL(__strpbrk_power7): \
- cfi_startproc; \
- LOCALENTRY(__strpbrk_power7)
-
-#undef END
-#define END(name) \
- cfi_endproc; \
- TRACEBACK(__strpbrk_power7) \
- END_2(__strpbrk_power7)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#include <sysdeps/powerpc/powerpc64/power7/strpbrk.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c
deleted file mode 100644
index 8dea70e..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <string.h>
-
-#define STRPBRK __strpbrk_ppc
-#ifdef SHARED
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strpbrk_ppc, __GI_strpbrk, __strpbrk_ppc);
-#endif
-
-extern __typeof (strpbrk) __strpbrk_ppc attribute_hidden;
-
-#include <string/strpbrk.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c b/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c
deleted file mode 100644
index 8b05536..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Multiple versions of strpbrk. PowerPC64 version.
- Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (strpbrk) __strpbrk_ppc attribute_hidden;
-extern __typeof (strpbrk) __strpbrk_power7 attribute_hidden;
-
-libc_ifunc (strpbrk,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strpbrk_power7
- : __strpbrk_ppc);
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strpbrk.S b/sysdeps/powerpc/powerpc64/strpbrk.S
similarity index 78%
rename from sysdeps/powerpc/powerpc64/power7/strpbrk.S
rename to sysdeps/powerpc/powerpc64/strpbrk.S
index d6204a7..6b2ad4d 100644
--- a/sysdeps/powerpc/powerpc64/power7/strpbrk.S
+++ b/sysdeps/powerpc/powerpc64/strpbrk.S
@@ -1,4 +1,4 @@
-/* Optimized strpbrk implementation for PowerPC64/POWER7.
+/* Optimized strpbrk implementation for PowerPC64.
Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -20,7 +20,6 @@
/* char [r3] *strpbrk(const char [r4] *s, const char [r5] *accept) */
- .machine power7
EALIGN (strpbrk, 4, 0)
CALL_MCOUNT 3
@@ -32,43 +31,31 @@ EALIGN (strpbrk, 4, 0)
for fast check if input character should be considered. For ASCII
or ISO-8859-X character sets it has 256 positions. */
- /* First the table should be cleared and to avoid unaligned accesses
- when using the VSX stores the table address is aligned to 16
- bytes. */
- xxlxor v0,v0,v0
-
- /* PPC64 ELF ABI stack is aligned to 16 bytes */
+ /* PPC64 ELF ABI stack is aligned to 16 bytes. */
addi r9,r1,-256
-
- li r5,16
- li r6,32
- li r8,48
- addi r12,r9,64
/* Clear the table with 0 values */
- stxvw4x v0,r0,r9
- addi r11,r9,128
- addi r7,r9,192
- stxvw4x v0,r9,r5
- li r0,1
- stxvw4x v0,r9,r6
- stxvw4x v0,r9,r8
- stxvw4x v0,r0,r12
- stxvw4x v0,r12,r5
- stxvw4x v0,r12,r6
- stxvw4x v0,r12,r8
- stxvw4x v0,r0,r11
- stxvw4x v0,r11,r5
- stxvw4x v0,r11,r6
- stxvw4x v0,r11,r8
- stxvw4x v0,r0,r7
- stxvw4x v0,r7,r5
- stxvw4x v0,r7,r6
- stxvw4x v0,r7,r8
+ li r6, 0
+ li r7, 4
+ mtctr r7
+ mr r8, r9
+ .align 4
+L(zerohash):
+ std r6, 0(r8)
+ std r6, 8(r8)
+ std r6, 16(r8)
+ std r6, 24(r8)
+ std r6, 32(r8)
+ std r6, 40(r8)
+ std r6, 48(r8)
+ std r6, 56(r8)
+ addi r8, r8, 64
+ bdnz L(zerohash)
/* Initialize the table as:
for (i=0; accept[i]; i++
table[accept[i]]] = 1 */
- .p2align 4,,15
+ li r0,1
+ .align 4
L(init_table):
stbx r0,r9,r10
lbzu r10,1(r4)
@@ -93,7 +80,7 @@ L(finish_table):
if (table[input[i++]] == 1)
return (s[i -1] ? s + i - 1: NULL);
} */
- .p2align 4
+ .align 4
L(unroll):
lbz r0,1(r3)
lbzx r8,r9,r0
@@ -121,7 +108,7 @@ L(mainloop):
L(end):
blr
- .p2align 4
+ .align 4
L(checkend):
cmpdi cr1,r12,0
mr r3,r7
@@ -131,14 +118,14 @@ L(nullfound):
li 3,0
blr
- .p2align 4
+ .align 4
L(checkend2):
cmpdi cr7,r0,0
mr r3,r11
beq cr7,L(nullfound)
blr
- .p2align 4
+ .align 4
L(checkend3):
cmpdi cr6,r10,0
mr r3,r5
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f9f30622819b4d3685c0d448f3a3d49032472b07
commit f9f30622819b4d3685c0d448f3a3d49032472b07
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Wed Nov 19 15:24:55 2014 -0500
powerpc: Add powerpc64 strcspn optimization
This patch makes the POWER7 optimized strcspn generic by using
default doubleword stores to zero the hash, instead of VSX
instructions. Performance on POWER7/POWER8 does not change.
diff --git a/ChangeLog b/ChangeLog
index d3b8947..89ee40b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,4 +1,13 @@
2014-12-02 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+ Remove strcspn objects.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Remove strcspn implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c: Remove file.
+ * sysdeps/powerpc/powerpc64/multiarch/strcspn.c: Remove file.
+ * sysdeps/powerpc/powerpc64/power7/strcspn.S: Remove file.
+ * sysdeps/powerpc/powerpc64/strcspn.S: New file.
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
Remove strspn objects.
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index d6de5a5..05dab25 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -15,7 +15,6 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
- strcspn-power7 strcspn-ppc64 \
strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 4a9e523..1a2e38d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -272,14 +272,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncat, 1,
__strncat_ppc))
- /* Support sysdeps/powerpc/powerpc64/multiarch/strcspn.c. */
- IFUNC_IMPL (i, name, strcspn,
- IFUNC_IMPL_ADD (array, i, strcspn,
- hwcap & PPC_FEATURE_HAS_VSX,
- __strcspn_power7)
- IFUNC_IMPL_ADD (array, i, strcspn, 1,
- __strcspn_ppc))
-
/* Support sysdeps/powerpc/powerpc64/multiarch/strpbrk.c. */
IFUNC_IMPL (i, name, strpbrk,
IFUNC_IMPL_ADD (array, i, strpbrk,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S
deleted file mode 100644
index 02ffcc8..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn-power7.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Optimized strcspn implementation for POWER7.
- Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#undef EALIGN
-#define EALIGN(name, alignt, words) \
- .section ".text"; \
- ENTRY_2(__strcspn_power7) \
- .align ALIGNARG(alignt); \
- EALIGN_W_##words; \
- BODY_LABEL(__strcspn_power7): \
- cfi_startproc; \
- LOCALENTRY(__strcspn_power7)
-
-#undef END
-#define END(name) \
- cfi_endproc; \
- TRACEBACK(__strcspn_power7) \
- END_2(__strcspn_power7)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#include <sysdeps/powerpc/powerpc64/power7/strcspn.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c
deleted file mode 100644
index 5f8b610..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn-ppc64.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <string.h>
-
-#define STRCSPN __strcspn_ppc
-#ifdef SHARED
-
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strcspn_ppc, __GI_strcspn, __strcspn_ppc);
-#endif
-
-extern __typeof (strcspn) __strcspn_ppc attribute_hidden;
-
-#include <string/strcspn.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c b/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
deleted file mode 100644
index 3609d93..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Multiple versions of strcspn. PowerPC64 version.
- Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (strcspn) __strcspn_ppc attribute_hidden;
-extern __typeof (strcspn) __strcspn_power7 attribute_hidden;
-
-libc_ifunc (strcspn,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcspn_power7
- : __strcspn_ppc);
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strcspn.S b/sysdeps/powerpc/powerpc64/strcspn.S
similarity index 80%
rename from sysdeps/powerpc/powerpc64/power7/strcspn.S
rename to sysdeps/powerpc/powerpc64/strcspn.S
index 3f6aa0a..1121930 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcspn.S
+++ b/sysdeps/powerpc/powerpc64/strcspn.S
@@ -20,54 +20,42 @@
/* size_t [r3] strcspn (const char [r4] *s, const char [r5] *reject) */
- .machine power7
EALIGN (strcspn, 4, 0)
CALL_MCOUNT 3
/* The idea to speed up the algorithm is to create a lookup table
for fast check if input character should be considered. For ASCII
or ISO-8859-X character sets it has 256 positions. */
- lbz r10,0(r4)
-
- /* First the table should be cleared and to avoid unaligned accesses
- when using the VSX stores the table address is aligned to 16
- bytes. */
- xxlxor v0,v0,v0
/* PPC64 ELF ABI stack is aligned to 16 bytes. */
addi r9,r1,-256
+ /* Clear the table with 0 values */
+ li r6, 0
+ li r8, 4
+ mtctr r8
+ mr r10, r9
+ .align 4
+L(zerohash):
+ std r6, 0(r10)
+ std r6, 8(r10)
+ std r6, 16(r10)
+ std r6, 24(r10)
+ std r6, 32(r10)
+ std r6, 40(r10)
+ std r6, 48(r10)
+ std r6, 56(r10)
+ addi r10, r10, 64
+ bdnz L(zerohash)
- li r8,48
- li r5,16
- li r6,32
+ lbz r10,0(r4)
cmpdi cr7,r10,0 /* reject[0] == '\0' ? */
- addi r12,r9,64
- /* Clear the table with 0 values */
- stxvw4x v0,r0,r9
- addi r11,r9,128
- addi r7,r9,192
- stxvw4x v0,r9,r5
- stxvw4x v0,r9,r6
- stxvw4x v0,r9,r8
- stxvw4x v0,r0,r12
- stxvw4x v0,r12,r5
- stxvw4x v0,r12,r6
- stxvw4x v0,r12,r8
- stxvw4x v0,r0,r11
- stxvw4x v0,r11,r5
- stxvw4x v0,r11,r6
- stxvw4x v0,r11,r8
- stxvw4x v0,r0,r7
- stxvw4x v0,r7,r5
- stxvw4x v0,r7,r6
- stxvw4x v0,r7,r8
li r8,1
beq cr7,L(finish_table) /* If reject[0] == '\0' skip */
/* Initialize the table as:
for (i=0; reject[i]; i++
table[reject[i]]] = 1 */
- .p2align 4,,15
+ .align 4
L(init_table):
stbx r8,r9,r10
lbzu r10,1(r4)
@@ -93,7 +81,7 @@ L(finish_table):
if (table[input[i++]] == 1)
return i - 1;
} */
- .p2align 4,,15
+ .align 4
L(unroll):
lbz r8,1(r3)
addi r10,r10,4
@@ -121,17 +109,17 @@ L(mainloop):
mr r3,r10
blr
- .p2align 4,,15
+ .align 4
L(end):
mr r3,r6
blr
- .p2align 4,,15
+ .align 4
L(end2):
mr r3,r4
blr
- .p2align 4,,15
+ .align 4
L(end3):
mr r3,r5
blr
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=97104a4e2b866aae6a6593286b6c584339ef29d3
commit 97104a4e2b866aae6a6593286b6c584339ef29d3
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Wed Nov 19 14:24:18 2014 -0500
powerpc: Add powerpc64 strspn optimization
This patch makes the POWER7 optimized strspn generic by using
default doubleword stores to zero the hash, instead of VSX
instructions. Performance on POWER7/POWER8 machines does not change.
diff --git a/ChangeLog b/ChangeLog
index f31179d..d3b8947 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2014-12-02 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+ Remove strspn objects.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Remove strspn implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S: Remove file.
+ * sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c: Likewise.
+ * sysdeps/powerpc/powerpc64/power7/strspn.S: Remove file.
+ * sysdeps/powerpc/powerpc64/strspn.S: New file.
+
2014-12-01 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/strtok.S: New file.
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index abc9d2e..d6de5a5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -15,7 +15,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
- strspn-power7 strspn-ppc64 strcspn-power7 strcspn-ppc64 \
+ strcspn-power7 strcspn-ppc64 \
strpbrk-power7 strpbrk-ppc64 strncpy-power7 strncpy-ppc64 \
stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 06d5be9..4a9e523 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -272,14 +272,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncat, 1,
__strncat_ppc))
- /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c. */
- IFUNC_IMPL (i, name, strspn,
- IFUNC_IMPL_ADD (array, i, strspn,
- hwcap & PPC_FEATURE_HAS_VSX,
- __strspn_power7)
- IFUNC_IMPL_ADD (array, i, strspn, 1,
- __strspn_ppc))
-
/* Support sysdeps/powerpc/powerpc64/multiarch/strcspn.c. */
IFUNC_IMPL (i, name, strcspn,
IFUNC_IMPL_ADD (array, i, strcspn,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S
deleted file mode 100644
index 889dfee..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn-power7.S
+++ /dev/null
@@ -1,40 +0,0 @@
-/* Optimized strspn implementation for POWER7.
- Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#undef EALIGN
-#define EALIGN(name, alignt, words) \
- .section ".text"; \
- ENTRY_2(__strspn_power7) \
- .align ALIGNARG(alignt); \
- EALIGN_W_##words; \
- BODY_LABEL(__strspn_power7): \
- cfi_startproc; \
- LOCALENTRY(__strspn_power7)
-
-#undef END
-#define END(name) \
- cfi_endproc; \
- TRACEBACK(__strspn_power7) \
- END_2(__strspn_power7)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#include <sysdeps/powerpc/powerpc64/power7/strspn.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c
deleted file mode 100644
index d543772..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <string.h>
-
-#define STRSPN __strspn_ppc
-#undef weak_alias
-#define weak_alias(name, aliasname) \
- extern __typeof (__strspn_ppc) aliasname \
- __attribute__ ((weak, alias ("__strspn_ppc")));
-#if !defined(NOT_IN_libc) && defined(SHARED)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
- __hidden_ver1(__strspn_ppc, __GI_strspn, __strspn_ppc);
-#endif
-
-extern __typeof (strspn) __strspn_ppc attribute_hidden;
-
-#include <string/strspn.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
deleted file mode 100644
index bf8c877..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/* Multiple versions of strspn. PowerPC64 version.
- Copyright (C) 2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#ifndef NOT_IN_libc
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (strspn) __strspn_ppc attribute_hidden;
-extern __typeof (strspn) __strspn_power7 attribute_hidden;
-
-libc_ifunc (strspn,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strspn_power7
- : __strspn_ppc);
-#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/strspn.S b/sysdeps/powerpc/powerpc64/strspn.S
similarity index 75%
rename from sysdeps/powerpc/powerpc64/power7/strspn.S
rename to sysdeps/powerpc/powerpc64/strspn.S
index d587a67..daf5d5d 100644
--- a/sysdeps/powerpc/powerpc64/power7/strspn.S
+++ b/sysdeps/powerpc/powerpc64/strspn.S
@@ -1,4 +1,4 @@
-/* Optimized strspn implementation for PowerPC64/POWER7.
+/* Optimized strspn implementation for PowerPC64.
Copyright (C) 2014 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -25,8 +25,6 @@
> hashing of needle.
> hashing avoids scanning of duplicate entries in needle
across the string.
- > initializing the hash table with Vector instructions
- by quadword access.
> unrolling when scanning for character in string
across hash table. */
@@ -46,55 +44,36 @@
#include <sysdep.h>
-#undef strspn
-
- .machine power7
EALIGN(strspn, 4, 0)
- CALL_MCOUNT 2
-
- lbz r10, 0(r4) /* load r10 with needle (r4) */
- addi r9, r1, -256 /* r9 is a hash of 256 bytes */
-
- li r5, 16 /* set r5 = 16 as offset */
- li r6, 32 /* set r6 = 32 as offset */
- li r8, 48 /* set r8 = 48 as offset */
-
-/*Iniatliaze hash table with Zeroes in double indexed quadword accesses */
- xxlxor v0, v0, v0 /* prepare for initializing hash */
-
- stxvd2x v0, r0, r9 /* initialize 1st quadword */
- stxvd2x v0, r9, r5
- stxvd2x v0, r9, r6
- stxvd2x v0, r9, r8 /* initialize 4th quadword */
-
- addi r11, r9, 64 /* r11 is index to hash */
-
- stxvd2x v0, r0, r11 /* initialize 5th quadword */
- stxvd2x v0, r11, r5
- stxvd2x v0, r11, r6
- stxvd2x v0, r11, r8 /* initialize 8th quadword */
-
- addi r11, r9, 128 /* r11 is index to hash */
-
- stxvd2x v0, r0, r11 /* initialize 9th quadword */
- stxvd2x v0, r11, r5
- stxvd2x v0, r11, r6
- stxvd2x v0, r11, r8 /* initialize 12th quadword */
-
- addi r11, r9, 192 /* r11 is index to hash */
-
- stxvd2x v0, r0, r11 /* initialize 13th quadword */
- stxvd2x v0, r11, r5
- stxvd2x v0, r11, r6
- stxvd2x v0, r11, r8 /* initialize 16th quadword */
-
+ CALL_MCOUNT 3
+
+ /* PPC64 ELF ABI stack is aligned to 16 bytes. */
+ addi r9,r1,-256
+ /* Clear the table with 0 values */
+ li r6, 0
+ li r8, 4
+ mtctr r8
+ mr r10, r9
+ .align 4
+L(zerohash):
+ std r6, 0(r10)
+ std r6, 8(r10)
+ std r6, 16(r10)
+ std r6, 24(r10)
+ std r6, 32(r10)
+ std r6, 40(r10)
+ std r6, 48(r10)
+ std r6, 56(r10)
+ addi r10, r10, 64
+ bdnz L(zerohash)
+
+ lbz r10,0(r4)
li r8, 1 /* r8=1, marker into hash if found in
needle */
-
cmpdi cr7, r10, 0 /* accept needle is NULL */
beq cr7, L(skipHashing) /* if needle is NULL, skip hashing */
- .p2align 4 /* align section to 16 byte boundary */
+ .align 4 /* align section to 16 byte boundary */
L(hashing):
stbx r8, r9, r10 /* update hash with marker for the pivot of
the needle */
@@ -106,7 +85,7 @@ L(skipHashing):
li r10, 0 /* load counter = 0 */
b L(beginScan)
- .p2align 4 /* align section to 16 byte boundary */
+ .align 4 /* align section to 16 byte boundary */
L(scanUnroll):
lbzx r8, r9, r8 /* load r8 with hash value at index */
cmpwi cr7, r8, 0 /* if we hit marker in hash, we have found
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d3e53c8246952898cd1fd23dfa0657b03db0e36b
commit d3e53c8246952898cd1fd23dfa0657b03db0e36b
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date: Mon Dec 1 09:03:58 2014 -0500
powerpc: strtok{_r} optimization for powerpc64
This patch optimizes strtok and strtok_r for POWERPC64.
A table of 256 characters is created and marked based on
the 'accept' argument and used to check for any occurrence in
the input string. Loop unrolling is also used to gain improvements.
diff --git a/ChangeLog b/ChangeLog
index 814486e..f31179d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2014-12-01 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/strtok.S: New file.
+ * sysdeps/powerpc/powerpc64/strtok_r.S: New file.
+
2014-11-26 Adhemerval Zanella <azanella@linux.ibm.com>
* csu/tst-atomic.c (do_test): Add atomic_exchange_and_add_{acq,rel}
diff --git a/sysdeps/powerpc/powerpc64/strtok.S b/sysdeps/powerpc/powerpc64/strtok.S
new file mode 100644
index 0000000..fa816f2
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/strtok.S
@@ -0,0 +1,226 @@
+/* Optimized strtok implementation for PowerPC64.
+
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Performance gains are grabbed through following techniques:
+
+ > hashing of needle.
+ > hashing avoids scanning of duplicate entries in needle
+ across the string.
+ > unrolling when scanning for character in string
+ across hash table. */
+
+/* Algorithm is as below:
+ 1. A empty hash table/dictionary is created comprising of
+ 256 ascii character set
+ 2. When hash entry is found in needle , the hash index
+ is initialized to 1
+ 3. The string is scanned until end and for every character,
+ its corresponding hash index is compared.
+ 4. initial length of string (count) until first hit of
+ accept needle is calculated and moved.(strspn)
+ 5. The string is again scanned until end and for every character,
+ its corresponding hash index is compared.(strpbrk)
+ 6. If hash index is set to 1 for the index of string,
+ set it to null and set the saveptr to point to the next char.
+ 7. Otherwise count is incremented and scanning continues
+ until end of string. */
+
+#include <sysdep.h>
+#ifdef USE_AS_STRTOK_R
+# define FUNC_NAME __strtok_r
+#else
+# define FUNC_NAME strtok
+#endif
+
+EALIGN(FUNC_NAME, 4, 0)
+#ifdef USE_AS_STRTOK_R
+ CALL_MCOUNT 3
+ cmpdi cr7, r3, 0 /* Is input null? */
+ bne cr7, L(inputnotNull)
+ ld r3, 0(r5) /* Load from r5 */
+#else
+ CALL_MCOUNT 2
+ addis r5, r2, .LANCHOR0@toc@ha
+ cmpdi cr7, r3, 0 /* Is r3 NULL? */
+ bne cr7, L(inputnotNull)
+ ld r3, .LANCHOR0@toc@l(r5) /* Load from saveptr */
+#endif
+L(inputnotNull):
+ mr r7, r3
+ cmpdi cr7, r3, 0
+ beq cr7, L(returnNULL)
+ lbz r8, 0(r3)
+ cmpdi cr7, r8, 0
+ beq cr7, L(returnNULL)
+
+ addi r9, r1, -256 /* r9 is a hash of 256 bytes */
+
+ /*Iniatliaze hash table with Zeroes */
+ li r6, 0
+ li r8, 4
+ mtctr r8
+ mr r10, r9
+ .align 4
+L(zerohash):
+ std r6, 0(r10)
+ std r6, 8(r10)
+ std r6, 16(r10)
+ std r6, 24(r10)
+ std r6, 32(r10)
+ std r6, 40(r10)
+ std r6, 48(r10)
+ std r6, 56(r10)
+ addi r10, r10, 64
+ bdnz L(zerohash)
+
+
+ lbz r10, 0(r4) /* load r10 with needle (r4) */
+ li r8, 1 /* r8=1, marker into hash if found in
+ needle */
+
+ cmpdi cr7, r10, 0 /* accept needle is NULL */
+ beq cr7, L(skipHashing) /* if needle is NULL, skip hashing */
+
+ .align 4 /* align section to 16 byte boundary */
+L(hashing):
+ stbx r8, r9, r10 /* update hash with marker for the pivot of
+ the needle */
+ lbzu r10, 1(r4) /* load needle into r10 and update to next */
+ cmpdi cr7, r10, 0 /* if needle is has reached NULL, continue */
+ bne cr7, L(hashing) /* loop to hash the needle */
+
+L(skipHashing):
+ b L(beginScan)
+
+ .align 4 /* align section to 16 byte boundary */
+L(scanUnroll):
+ lbzx r8, r9, r8 /* load r8 with hash value at index */
+ cmpwi cr7, r8, 0 /* check the hash value */
+ beq cr7, L(ret1stIndex) /* we have hit accept needle */
+
+ lbz r8, 1(r7) /* load string[1] into r8 */
+ lbzx r8, r9, r8 /* load r8 with hash value at index */
+ cmpwi cr7, r8, 0 /* check the hash value */
+ beq cr7, L(ret2ndIndex) /* we have hit accept needle */
+
+ lbz r8, 2(r7) /* load string[1] into r8 */
+ lbzx r8, r9, r8 /* load r8 with hash value at index */
+ cmpwi cr7, r8, 0 /* check the hash value */
+ beq cr7, L(ret3rdIndex) /* we have hit accept needle */
+
+ lbz r8, 3(r7) /* load string[1] into r8 */
+ addi r7, r7, 4
+ lbzx r8, r9, r8 /* load r8 with hash value at index */
+ cmpwi cr7, r8, 0 /* check the hash value */
+ beq cr7,L(ret4thIndex) /* we have hit accept needle */
+
+L(beginScan):
+ lbz r8, 0(r7) /* load string[0] into r8 */
+ addi r6, r7, 1
+ addi r11, r7, 2
+ addi r4, r7, 3
+ cmpdi cr7, r8, 0 /* check if its null */
+ bne cr7, L(scanUnroll) /* continue scanning */
+
+L(ret1stIndex):
+ mr r3, r7
+ b L(next)
+L(ret2ndIndex):
+ mr r3, r6
+ b L(next)
+L(ret3rdIndex):
+ mr r3, r11
+ b L(next)
+L(ret4thIndex):
+ mr r3, r4
+L(next):
+ mr r7, r3
+ lbz r8, 0(r7)
+ cmpdi cr7, r8, 0
+ beq cr7, L(returnNULL)
+ li r8, 1
+ li r10, 0 /* load counter = 0 */
+ stbx r8, r9, r10 /* update hash for NULL */
+ b L(mainloop)
+
+L(unroll):
+ lbz r8, 1(r7) /* load string[1] into r8 */
+ lbzx r8, r9, r8 /* load r8 with hash value at index */
+ cmpwi r7, r8, 1 /* check the hash */
+ beq cr7, L(foundat1st) /* we have hit accept needle */
+ lbz r8, 2(r7)
+ lbzx r8, r9, r8
+ cmpwi cr7, r8, 1
+ beq cr7, L(foundat2nd)
+ lbz r8, 3(r7)
+ addi r7, r7, 4
+ lbzx r8, r9, r8
+ cmpwi cr7, r8, 1
+ beq cr7, L(foundat3rd)
+L(mainloop):
+ lbz r8, 0(r7)
+ addi r6, r7, 1
+ addi r11, r7, 2
+ addi r4, r7, 3
+ lbzx r8, r9, r8
+ cmpwi cr7, r8, 1
+ bne cr7, L(unroll) /* continue scanning */
+
+ b L(found)
+L(foundat1st):
+ mr r7, r6
+ b L(found)
+L(foundat2nd):
+ mr r7, r11
+ b L(found)
+L(foundat3rd):
+ mr r7, r4
+L(found):
+ lbz r8, 0(r7)
+ cmpdi cr7, r8, 0
+ beq cr7, L(end)
+ li r10, 0
+ stb r10, 0(r7) /* Terminate string */
+ addi r7, r7, 1 /* Store the pointer to the next char */
+L(end):
+#ifdef USE_AS_STRTOK_R
+ std r7, 0(r5) /* Update saveptr */
+#else
+ std r7, .LANCHOR0@toc@l(r5)
+#endif
+ blr /* done */
+L(returnNULL):
+#ifndef USE_AS_STRTOK_R
+ li r7, 0
+#endif
+ li r3, 0 /* return NULL */
+ b L(end)
+END(FUNC_NAME)
+#ifdef USE_AS_STRTOK_R
+libc_hidden_builtin_def (strtok_r)
+#else
+ .section ".bss"
+ .align 3
+ .set .LANCHOR0,. + 0
+ .type olds, @object
+ .size olds, 8
+olds:
+ .zero 8
+libc_hidden_builtin_def (strtok)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/strtok_r.S b/sysdeps/powerpc/powerpc64/strtok_r.S
new file mode 100644
index 0000000..6e5d301
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/strtok_r.S
@@ -0,0 +1,24 @@
+/* Optimized strtok_r implementation for PowerPC64.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define USE_AS_STRTOK_R
+#include <sysdeps/powerpc/powerpc64/strtok.S>
+
+weak_alias (__strtok_r, strtok_r)
+libc_hidden_def (__strtok_r)
+libc_hidden_builtin_def (strtok_r)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8b063985c1a750a1947fcf60e4606a3b0d7d0f37
commit 8b063985c1a750a1947fcf60e4606a3b0d7d0f37
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue Nov 25 14:32:54 2014 -0500
powerpc: Fix missing barriers in atomic_exchange_and_add_{acq,rel}
On powerpc, atomic_exchange_and_add is implemented without any
barriers. This patch adds the missing instruction and memory barrier
for acquire and release semantics.
diff --git a/ChangeLog b/ChangeLog
index 103f1ed..814486e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+2014-11-26 Adhemerval Zanella <azanella@linux.ibm.com>
+
+ * csu/tst-atomic.c (do_test): Add atomic_exchange_and_add_{acq,rel}
+ tests.
+ * sysdeps/powerpc/bits/atomic.h
+ (__arch_atomic_exchange_and_add_32_acq): Add definition.
+ (__arch_atomic_exchange_and_add_32_rel): Likewise.
+ (atomic_exchange_and_add_acq): Likewise.
+ (atomic_exchange_and_add_rel): Likewise.
+ * sysdeps/powerpc/powerpc32/bits/atomic.h
+ (__arch_atomic_exchange_and_add_64_acq): Add definition.
+ (__arch_atomic_exchange_and_add_64_rel): Likewise.
+ * sysdeps/powerpc/powerpc64/bits/atomic.h
+ (__arch_atomic_exchange_and_add_64_acq): Add definition.
+ (__arch_atomic_exchange_and_add_64_rel): Likewise.
+
2014-11-25 Anton Blanchard <anton@samba.org>
* sysdeps/powerpc/bits/atomic.h
diff --git a/csu/tst-atomic.c b/csu/tst-atomic.c
index d16c66d..ab6db45 100644
--- a/csu/tst-atomic.c
+++ b/csu/tst-atomic.c
@@ -113,6 +113,22 @@ do_test (void)
ret = 1;
}
+ mem = 2;
+ if (atomic_exchange_and_add_acq (&mem, 11) != 2
+ || mem != 13)
+ {
+ puts ("atomic_exchange_and_add test failed");
+ ret = 1;
+ }
+
+ mem = 2;
+ if (atomic_exchange_and_add_rel (&mem, 11) != 2
+ || mem != 13)
+ {
+ puts ("atomic_exchange_and_add test failed");
+ ret = 1;
+ }
+
mem = -21;
atomic_add (&mem, 22);
if (mem != 1)
diff --git a/sysdeps/powerpc/bits/atomic.h b/sysdeps/powerpc/bits/atomic.h
index f312676..b05b0f7 100644
--- a/sysdeps/powerpc/bits/atomic.h
+++ b/sysdeps/powerpc/bits/atomic.h
@@ -152,6 +152,34 @@ typedef uintmax_t uatomic_max_t;
__val; \
})
+#define __arch_atomic_exchange_and_add_32_acq(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile ("1: lwarx %0,0,%3" MUTEX_HINT_ACQ "\n" \
+ " add %1,%0,%4\n" \
+ " stwcx. %1,0,%3\n" \
+ " bne- 1b\n" \
+ __ARCH_ACQ_INSTR \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
+#define __arch_atomic_exchange_and_add_32_rel(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile (__ARCH_REL_INSTR "\n" \
+ "1: lwarx %0,0,%3" MUTEX_HINT_REL "\n" \
+ " add %1,%0,%4\n" \
+ " stwcx. %1,0,%3\n" \
+ " bne- 1b" \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
#define __arch_atomic_increment_val_32(mem) \
({ \
__typeof (*(mem)) __val; \
@@ -252,6 +280,28 @@ typedef uintmax_t uatomic_max_t;
abort (); \
__result; \
})
+#define atomic_exchange_and_add_acq(mem, value) \
+ ({ \
+ __typeof (*(mem)) __result; \
+ if (sizeof (*mem) == 4) \
+ __result = __arch_atomic_exchange_and_add_32_acq (mem, value); \
+ else if (sizeof (*mem) == 8) \
+ __result = __arch_atomic_exchange_and_add_64_acq (mem, value); \
+ else \
+ abort (); \
+ __result; \
+ })
+#define atomic_exchange_and_add_rel(mem, value) \
+ ({ \
+ __typeof (*(mem)) __result; \
+ if (sizeof (*mem) == 4) \
+ __result = __arch_atomic_exchange_and_add_32_rel (mem, value); \
+ else if (sizeof (*mem) == 8) \
+ __result = __arch_atomic_exchange_and_add_64_rel (mem, value); \
+ else \
+ abort (); \
+ __result; \
+ })
#define atomic_increment_val(mem) \
({ \
diff --git a/sysdeps/powerpc/powerpc32/bits/atomic.h b/sysdeps/powerpc/powerpc32/bits/atomic.h
index a3dd09c..7422262 100644
--- a/sysdeps/powerpc/powerpc32/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc32/bits/atomic.h
@@ -95,6 +95,12 @@
#define __arch_atomic_exchange_and_add_64(mem, value) \
({ abort (); (*mem) = (value); })
+#define __arch_atomic_exchange_and_add_64_acq(mem, value) \
+ ({ abort (); (*mem) = (value); })
+
+#define __arch_atomic_exchange_and_add_64_rel(mem, value) \
+ ({ abort (); (*mem) = (value); })
+
#define __arch_atomic_increment_val_64(mem) \
({ abort (); (*mem)++; })
diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h
index 9cab0a2..e64cb9f 100644
--- a/sysdeps/powerpc/powerpc64/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc64/bits/atomic.h
@@ -183,6 +183,34 @@
__val; \
})
+#define __arch_atomic_exchange_and_add_64_acq(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile ("1: ldarx %0,0,%3" MUTEX_HINT_ACQ "\n" \
+ " add %1,%0,%4\n" \
+ " stdcx. %1,0,%3\n" \
+ " bne- 1b\n" \
+ __ARCH_ACQ_INSTR \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
+#define __arch_atomic_exchange_and_add_64_rel(mem, value) \
+ ({ \
+ __typeof (*mem) __val, __tmp; \
+ __asm __volatile (__ARCH_REL_INSTR "\n" \
+ "1: ldarx %0,0,%3" MUTEX_HINT_REL "\n" \
+ " add %1,%0,%4\n" \
+ " stdcx. %1,0,%3\n" \
+ " bne- 1b" \
+ : "=&b" (__val), "=&r" (__tmp), "=m" (*mem) \
+ : "b" (mem), "r" (value), "m" (*mem) \
+ : "cr0", "memory"); \
+ __val; \
+ })
+
#define __arch_atomic_increment_val_64(mem) \
({ \
__typeof (*(mem)) __val; \
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=364c58517bdcc91c5bf1fcb57b4befff8951a51b
commit 364c58517bdcc91c5bf1fcb57b4befff8951a51b
Author: Anton Blanchard <anton@samba.org>
Date: Tue Nov 25 07:26:12 2014 -0500
powerpc: Fix __arch_compare_and_exchange_bool_64_rel
Fix a typo in the inline assembly.
diff --git a/ChangeLog b/ChangeLog
index 9cd75d5..103f1ed 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2014-11-25 Anton Blanchard <anton@samba.org>
+
+ * sysdeps/powerpc/bits/atomic.h
+ (__arch_compare_and_exchange_bool_64_rel): Load from mem.
+
2014-11-19 Carlos O'Donell <carlos@redhat.com>
Florian Weimer <fweimer@redhat.com>
Joseph Myers <joseph@codesourcery.com>
diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h
index ed26b72..9cab0a2 100644
--- a/sysdeps/powerpc/powerpc64/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc64/bits/atomic.h
@@ -97,7 +97,7 @@
({ \
unsigned long __tmp; \
__asm __volatile (__ARCH_REL_INSTR "\n" \
- "1: ldarx %0,0,%2" MUTEX_HINT_REL "\n" \
+ "1: ldarx %0,0,%1" MUTEX_HINT_REL "\n" \
" subf. %0,%2,%0\n" \
" bne 2f\n" \
" stdcx. %3,0,%1\n" \
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=133a24ba079abf1e762bd4d85670e0bd8df660c4
commit 133a24ba079abf1e762bd4d85670e0bd8df660c4
Author: Carlos O'Donell <carlos@redhat.com>
Date: Wed Nov 19 11:44:12 2014 -0500
CVE-2014-7817: wordexp fails to honour WRDE_NOCMD.
The function wordexp() fails to properly handle the WRDE_NOCMD
flag when processing arithmetic inputs in the form of "$((... ``))"
where "..." can be anything valid. The backticks in the arithmetic
epxression are evaluated by in a shell even if WRDE_NOCMD forbade
command substitution. This allows an attacker to attempt to pass
dangerous commands via constructs of the above form, and bypass
the WRDE_NOCMD flag. This patch fixes this by checking for WRDE_NOCMD
in exec_comm(), the only place that can execute a shell. All other
checks for WRDE_NOCMD are superfluous and removed.
We expand the testsuite and add 3 new regression tests of roughly
the same form but with a couple of nested levels.
On top of the 3 new tests we add fork validation to the WRDE_NOCMD
testing. If any forks are detected during the execution of a wordexp()
call with WRDE_NOCMD, the test is marked as failed. This is slightly
heuristic since vfork might be used in the future, but it provides a
higher level of assurance that no shells were executed as part of
command substitution with WRDE_NOCMD in effect. In addition it doesn't
require libpthread or libdl, instead we use the public implementation
namespace function __register_atfork (already part of the public ABI
for libpthread).
Tested on x86_64 with no regressions.
diff --git a/ChangeLog b/ChangeLog
index e2239f3..9cd75d5 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,25 @@
+2014-11-19 Carlos O'Donell <carlos@redhat.com>
+ Florian Weimer <fweimer@redhat.com>
+ Joseph Myers <joseph@codesourcery.com>
+ Adam Conrad <adconrad@0c3.net>
+ Andreas Schwab <schwab@suse.de>
+ Brooks <bmoses@google.com>
+
+ [BZ #17625]
+ * wordexp-test.c (__dso_handle): Add prototype.
+ (__register_atfork): Likewise.
+ (__app_register_atfork): New function.
+ (registered_forks): New global.
+ (register_fork): New function.
+ (test_case): Add 3 new tests for WRDE_CMDSUB.
+ (main): Call __app_register_atfork.
+ (testit): If WRDE_NOCMD set registered_forks to zero, run test, and if
+ fork count is non-zero fail the test.
+ * posix/wordexp.c (exec_comm): Return WRDE_CMDSUB if WRDE_NOCMD flag
+ is set.
+ (parse_dollars): Remove check for WRDE_NOCMD.
+ (parse_dquote): Likewise.
+
2014-11-05 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Simplify
diff --git a/NEWS b/NEWS
index c555f75..20106dc 100644
--- a/NEWS
+++ b/NEWS
@@ -9,7 +9,13 @@ Version 2.20.1
* The following bugs are resolved with this release:
- 17266, 17370, 17371.
+ 17266, 17370, 17371, 17625.
+
+* CVE-2014-7817 The wordexp function could ignore the WRDE_NOCMD flag
+ under certain input conditions resulting in the execution of a shell for
+ command substitution when the application did not request it. The
+ implementation now checks WRDE_NOCMD immediately before executing the
+ shell and returns the error WRDE_CMDSUB as expected.
Version 2.20
diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
index 4957006..bdd65e4 100644
--- a/posix/wordexp-test.c
+++ b/posix/wordexp-test.c
@@ -27,6 +27,25 @@
#define IFS " \n\t"
+extern void *__dso_handle __attribute__ ((__weak__, __visibility__ ("hidden")));
+extern int __register_atfork (void (*) (void), void (*) (void), void (*) (void), void *);
+
+static int __app_register_atfork (void (*prepare) (void), void (*parent) (void), void (*child) (void))
+{
+ return __register_atfork (prepare, parent, child,
+ &__dso_handle == NULL ? NULL : __dso_handle);
+}
+
+/* Number of forks seen. */
+static int registered_forks;
+
+/* For each fork increment the fork count. */
+static void
+register_fork (void)
+{
+ registered_forks++;
+}
+
struct test_case_struct
{
int retval;
@@ -206,6 +225,12 @@ struct test_case_struct
{ WRDE_SYNTAX, NULL, "$((2+))", 0, 0, { NULL, }, IFS },
{ WRDE_SYNTAX, NULL, "`", 0, 0, { NULL, }, IFS },
{ WRDE_SYNTAX, NULL, "$((010+4+))", 0, 0, { NULL }, IFS },
+ /* Test for CVE-2014-7817. We test 3 combinations of command
+ substitution inside an arithmetic expression to make sure that
+ no commands are executed and error is returned. */
+ { WRDE_CMDSUB, NULL, "$((`echo 1`))", WRDE_NOCMD, 0, { NULL, }, IFS },
+ { WRDE_CMDSUB, NULL, "$((1+`echo 1`))", WRDE_NOCMD, 0, { NULL, }, IFS },
+ { WRDE_CMDSUB, NULL, "$((1+$((`echo 1`))))", WRDE_NOCMD, 0, { NULL, }, IFS },
{ -1, NULL, NULL, 0, 0, { NULL, }, IFS },
};
@@ -258,6 +283,15 @@ main (int argc, char *argv[])
return -1;
}
+ /* If we are not allowed to do command substitution, we install
+ fork handlers to verify that no forks happened. No forks should
+ happen at all if command substitution is disabled. */
+ if (__app_register_atfork (register_fork, NULL, NULL) != 0)
+ {
+ printf ("Failed to register fork handler.\n");
+ return -1;
+ }
+
for (test = 0; test_case[test].retval != -1; test++)
if (testit (&test_case[test]))
++fail;
@@ -367,6 +401,9 @@ testit (struct test_case_struct *tc)
printf ("Test %d (%s): ", ++tests, tc->words);
+ if (tc->flags & WRDE_NOCMD)
+ registered_forks = 0;
+
if (tc->flags & WRDE_APPEND)
{
/* initial wordexp() call, to be appended to */
@@ -378,6 +415,13 @@ testit (struct test_case_struct *tc)
}
retval = wordexp (tc->words, &we, tc->flags);
+ if ((tc->flags & WRDE_NOCMD)
+ && (registered_forks > 0))
+ {
+ printf ("FAILED fork called for WRDE_NOCMD\n");
+ return 1;
+ }
+
if (tc->flags & WRDE_DOOFFS)
start_offs = sav_we.we_offs;
diff --git a/posix/wordexp.c b/posix/wordexp.c
index b6b65dd..26f3a26 100644
--- a/posix/wordexp.c
+++ b/posix/wordexp.c
@@ -893,6 +893,10 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
pid_t pid;
int noexec = 0;
+ /* Do nothing if command substitution should not succeed. */
+ if (flags & WRDE_NOCMD)
+ return WRDE_CMDSUB;
+
/* Don't fork() unless necessary */
if (!comm || !*comm)
return 0;
@@ -2082,9 +2086,6 @@ parse_dollars (char **word, size_t *word_length, size_t *max_length,
}
}
- if (flags & WRDE_NOCMD)
- return WRDE_CMDSUB;
-
(*offset) += 2;
return parse_comm (word, word_length, max_length, words, offset, flags,
quoted? NULL : pwordexp, ifs, ifs_white);
@@ -2196,9 +2197,6 @@ parse_dquote (char **word, size_t *word_length, size_t *max_length,
break;
case '`':
- if (flags & WRDE_NOCMD)
- return WRDE_CMDSUB;
-
++(*offset);
error = parse_backtick (word, word_length, max_length, words,
offset, flags, NULL, NULL, NULL);
@@ -2357,12 +2355,6 @@ wordexp (const char *words, wordexp_t *pwordexp, int flags)
break;
case '`':
- if (flags & WRDE_NOCMD)
- {
- error = WRDE_CMDSUB;
- goto do_error;
- }
-
++words_offset;
error = parse_backtick (&word, &word_length, &max_length, words,
&words_offset, flags, pwordexp, ifs,
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f8fbd413672816a429adc6b6c191ec8ea73421e8
commit f8fbd413672816a429adc6b6c191ec8ea73421e8
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Wed Nov 5 08:01:09 2014 -0500
powerpc: Simplify encoding of POWER8 instruction
diff --git a/ChangeLog b/ChangeLog
index e4de9d8..e2239f3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2014-11-05 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Simplify
+ definition.
+ * sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S (MFVSRD_R3_V1):
+ Likwise.
+ * sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S (MFVSRD_R3_V1):
+ Likewise.
+ * sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S (MFVSRD_R3_V1):
+ Likewise.
+ * sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S (MFVSRD_R3_V1):
+ Likewise.
+ * sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S (MFVSRD_R3_V1):
+ Likewise.
+
2014-11-03 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Encode
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
index 2b27e7b..3e98126 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_finite.S
@@ -17,14 +17,9 @@
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include <endian.h>
#include <math_ldbl_opt.h>
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1 .byte 0x66,0x00,0x23,0x7c /* mfvsrd r3,vs1 */
-#else
-#define MFVSRD_R3_V1 .byte 0x7c,0x23,0x00,0x66 /* mfvsrd r3,vs1 */
-#endif
+#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
/* int [r3] __finite ([fp1] x) */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
index d09b7fc..125de39 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isinf.S
@@ -17,14 +17,9 @@
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include <endian.h>
#include <math_ldbl_opt.h>
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1 .byte 0x66,0x00,0x23,0x7c /* mfvsrd r3,vs1 */
-#else
-#define MFVSRD_R3_V1 .byte 0x7c,0x23,0x00,0x66 /* mfvsrd r3,vs1 */
-#endif
+#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
/* int [r3] __isinf([fp1] x) */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
index cf119e5..2c7b2d1 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_isnan.S
@@ -17,14 +17,9 @@
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include <endian.h>
#include <math_ldbl_opt.h>
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1 .byte 0x66,0x00,0x23,0x7c /* mfvsrd r3,vs1 */
-#else
-#define MFVSRD_R3_V1 .byte 0x7c,0x23,0x00,0x66 /* mfvsrd r3,vs1 */
-#endif
+#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
/* int [r3] __isnan([f1] x) */
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
index 9a55d93..ce48d4e 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llrint.S
@@ -17,14 +17,9 @@
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include <endian.h>
#include <math_ldbl_opt.h>
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1 .byte 0x66,0x00,0x23,0x7c /* mfvsrd r3,vs1 */
-#else
-#define MFVSRD_R3_V1 .byte 0x7c,0x23,0x00,0x66 /* mfvsrd r3,vs1 */
-#endif
+#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
/* long long int[r3] __llrint (double x[fp1]) */
ENTRY (__llrint)
diff --git a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
index f10c06a..17cf30e 100644
--- a/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
+++ b/sysdeps/powerpc/powerpc64/power8/fpu/s_llround.S
@@ -20,11 +20,7 @@
#include <endian.h>
#include <math_ldbl_opt.h>
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MFVSRD_R3_V1 .byte 0x66,0x00,0x23,0x7c /* mfvsrd r3,vs1 */
-#else
-#define MFVSRD_R3_V1 .byte 0x7c,0x23,0x00,0x66 /* mfvsrd r3,vs1 */
-#endif
+#define MFVSRD_R3_V1 .long 0x7c230066 /* mfvsrd r3,vs1 */
/* long long [r3] llround (float x [fp1]) */
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
index cebcbdf..d7324dc 100644
--- a/sysdeps/powerpc/powerpc64/power8/memset.S
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -17,13 +17,8 @@
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
-#include <endian.h>
-#if __BYTE_ORDER == __LITTLE_ENDIAN
-#define MTVSRD_V1_R4 .byte 0x66,0x01,0x24,0x7c /* mtvsrd v1,r4 */
-#else
-#define MTVSRD_V1_R4 .byte 0x7c,0x24,0x01,0x66
-#endif
+#define MTVSRD_V1_R4 .long 0x7c240166 /* mtvsrd v1,r4 */
/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
Returns 's'. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e154589132de915ef165a1e26f89ba6997170c2b
commit e154589132de915ef165a1e26f89ba6997170c2b
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Mon Nov 3 07:26:33 2014 -0500
powerpc: Fix encoding of POWER8 instruction
This patch adds a binary encoding for 'mtvsrd' instruction to avoid
build failures when assembler does not support POWER8.
diff --git a/ChangeLog b/ChangeLog
index bcc7072..e4de9d8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2014-11-03 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/power8/memset.S (MTVSRD_V1_R4): Encode
+ mtvsrd instruction in binary form.
+
2014-10-31 Torvald Riegel <triegel@redhat.com>
* sysdeps/powerpc/bits/atomic.h (atomic_write_barrier): Remove and...
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
index 191a4df..cebcbdf 100644
--- a/sysdeps/powerpc/powerpc64/power8/memset.S
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -17,6 +17,13 @@
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
+#include <endian.h>
+
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define MTVSRD_V1_R4 .byte 0x66,0x01,0x24,0x7c /* mtvsrd v1,r4 */
+#else
+#define MTVSRD_V1_R4 .byte 0x7c,0x24,0x01,0x66
+#endif
/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
Returns 's'. */
@@ -142,7 +149,7 @@ L(tail_bytes):
vector instruction to achieve best throughput. */
L(huge_vector):
/* Replicate set byte to quadword in VMX register. */
- mtvsrd v1,r4
+ MTVSRD_V1_R4
xxpermdi 32,v0,v1,0
vspltb v2,v0,15
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=46f58099960f7a2603c37c540d2644e392f0fdc7
commit 46f58099960f7a2603c37c540d2644e392f0fdc7
Author: Torvald Riegel <triegel@redhat.com>
Date: Sat Oct 18 01:01:58 2014 +0200
powerpc: Change atomic_write_barrier to have release semantics.
diff --git a/ChangeLog b/ChangeLog
index e67dd7c..bcc7072 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+2014-10-31 Torvald Riegel <triegel@redhat.com>
+
+ * sysdeps/powerpc/bits/atomic.h (atomic_write_barrier): Remove and...
+ * sysdeps/powerpc/powerpc32/bits/atomic.h (atomic_write_barrier):
+ ... add here and use lwsync or sync ...
+ * sysdeps/powerpc/powerpc64/bits/atomic.h (atomic_write_barrier):
+ ... and add here using lwsync.
+
2014-09-10 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* benchtests/bench-memset.c (test_main): Add more test from size
diff --git a/sysdeps/powerpc/bits/atomic.h b/sysdeps/powerpc/bits/atomic.h
index 2ffba48..f312676 100644
--- a/sysdeps/powerpc/bits/atomic.h
+++ b/sysdeps/powerpc/bits/atomic.h
@@ -77,7 +77,6 @@ typedef uintmax_t uatomic_max_t;
#endif
#define atomic_full_barrier() __asm ("sync" ::: "memory")
-#define atomic_write_barrier() __asm ("eieio" ::: "memory")
#define __arch_compare_and_exchange_val_32_acq(mem, newval, oldval) \
({ \
diff --git a/sysdeps/powerpc/powerpc32/bits/atomic.h b/sysdeps/powerpc/powerpc32/bits/atomic.h
index 7613bdc..a3dd09c 100644
--- a/sysdeps/powerpc/powerpc32/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc32/bits/atomic.h
@@ -117,6 +117,7 @@
# ifndef UP
# define __ARCH_REL_INSTR "lwsync"
# endif
+# define atomic_write_barrier() __asm ("lwsync" ::: "memory")
#else
/*
* Older powerpc32 processors don't support the new "light weight"
@@ -124,6 +125,7 @@
* for all powerpc32 applications.
*/
# define atomic_read_barrier() __asm ("sync" ::: "memory")
+# define atomic_write_barrier() __asm ("sync" ::: "memory")
#endif
/*
diff --git a/sysdeps/powerpc/powerpc64/bits/atomic.h b/sysdeps/powerpc/powerpc64/bits/atomic.h
index 527fe7c..ed26b72 100644
--- a/sysdeps/powerpc/powerpc64/bits/atomic.h
+++ b/sysdeps/powerpc/powerpc64/bits/atomic.h
@@ -234,6 +234,7 @@
#ifndef UP
# define __ARCH_REL_INSTR "lwsync"
#endif
+#define atomic_write_barrier() __asm ("lwsync" ::: "memory")
/*
* Include the rest of the atomic ops macros which are common to both
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5f892cacbdf50322bc3ee2e131c105c71b495086
commit 5f892cacbdf50322bc3ee2e131c105c71b495086
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue Jul 15 12:19:09 2014 -0400
PowerPC: memset optimization for POWER8/PPC64
This patch adds an optimized memset implementation for POWER8. For
sizes from 0 to 255 bytes, a word/doubleword algorithm similar to
POWER7 optimized one is used.
For sizes larger than 255 bytes, two strategies are used:
1. If the constant is different from 0, the memory is written with
Altivec vector instructions;
2. If the constant is 0, dcbz instructions are used. The loop is unrolled
to clear 512 bytes at a time.
Using vector instructions increases throughput considerably, doubling
performance for sizes larger than 1024 bytes. The dcbz loop unrolling
also shows a performance improvement, doubling throughput for sizes
larger than 8192 bytes.
diff --git a/ChangeLog b/ChangeLog
index 85024b2..e67dd7c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
2014-09-10 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+ * benchtests/bench-memset.c (test_main): Add more test from size
+ from 32 to 512 bytes.
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+ Add POWER8 memset object.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add POWER8 memset and bzero implementations.
+ * sysdeps/powerpc/powerpc64/multiarch/bzero.c (__bzero): Add POWER8
+ implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/memset.c (__libc_memset):
+ Likewise.
+ * sysdeps/powerpc/powerpc64/multiarch/memset-power8.S: New file:
+ multiarch POWER8 memset optimization.
+ * sysdeps/powerpc/powerpc64/power8/memset.S: New file: optimized
+ POWER8 memset optimization.
+
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
Remove bzero multiarch objects.
* sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S: Remove file.
diff --git a/benchtests/bench-memset.c b/benchtests/bench-memset.c
index 5304113..2026593 100644
--- a/benchtests/bench-memset.c
+++ b/benchtests/bench-memset.c
@@ -150,6 +150,11 @@ test_main (void)
if (i & (i - 1))
do_test (0, c, i);
}
+ for (i = 32; i < 512; i+=32)
+ {
+ do_test (0, c, i);
+ do_test (i, c, i);
+ }
do_test (1, c, 14);
do_test (3, c, 1024);
do_test (4, c, 64);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 0de3804..abc9d2e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -2,7 +2,7 @@ ifeq ($(subdir),string)
sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
memcmp-ppc64 memset-power7 memset-power6 memset-power4 \
- memset-ppc64 \
+ memset-ppc64 memset-power8 \
mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index ed83541..298cf00 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -26,14 +26,17 @@ extern __typeof (bzero) __bzero_ppc attribute_hidden;
extern __typeof (bzero) __bzero_power4 attribute_hidden;
extern __typeof (bzero) __bzero_power6 attribute_hidden;
extern __typeof (bzero) __bzero_power7 attribute_hidden;
+extern __typeof (bzero) __bzero_power8 attribute_hidden;
libc_ifunc (__bzero,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __bzero_power7 :
- (hwcap & PPC_FEATURE_ARCH_2_05)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __bzero_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __bzero_power7 :
+ (hwcap & PPC_FEATURE_ARCH_2_05)
? __bzero_power6 :
(hwcap & PPC_FEATURE_POWER4)
- ? __bzero_power4
+ ? __bzero_power4
: __bzero_ppc);
weak_alias (__bzero, bzero)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index a574487..06d5be9 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -34,6 +34,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
size_t i = 0;
unsigned long int hwcap = GLRO(dl_hwcap);
+ unsigned long int hwcap2 = GLRO(dl_hwcap2);
+
/* hwcap contains only the latest supported ISA, the code checks which is
and fills the previous supported ones. */
if (hwcap & PPC_FEATURE_ARCH_2_06)
@@ -69,6 +71,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memset.c. */
IFUNC_IMPL (i, name, memset,
+ IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __memset_power8)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
__memset_power7)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
@@ -134,6 +138,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c. */
IFUNC_IMPL (i, name, bzero,
+ IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __bzero_power8)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
__bzero_power7)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
new file mode 100644
index 0000000..e8a604b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power8.S
@@ -0,0 +1,43 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__memset_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__memset_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__memset_power8)
+
+#undef END_GEN_TB
+#define END_GEN_TB(name, mask) \
+ cfi_endproc; \
+ TRACEBACK_MASK(__memset_power8,mask) \
+ END_2(__memset_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bzero
+#define __bzero __bzero_power8
+
+#include <sysdeps/powerpc/powerpc64/power8/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index aa2ae70..9c7ed10 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -32,16 +32,19 @@ extern __typeof (__redirect_memset) __memset_ppc attribute_hidden;
extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__libc_memset,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __memset_power7 :
- (hwcap & PPC_FEATURE_ARCH_2_05)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __memset_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memset_power7 :
+ (hwcap & PPC_FEATURE_ARCH_2_05)
? __memset_power6 :
(hwcap & PPC_FEATURE_POWER4)
- ? __memset_power4
+ ? __memset_power4
: __memset_ppc);
#undef memset
diff --git a/sysdeps/powerpc/powerpc64/power8/memset.S b/sysdeps/powerpc/powerpc64/power8/memset.S
new file mode 100644
index 0000000..191a4df
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/memset.S
@@ -0,0 +1,449 @@
+/* Optimized memset implementation for PowerPC64/POWER8.
+ Copyright (C) 2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]));
+ Returns 's'. */
+
+ .machine power8
+EALIGN (memset, 5, 0)
+ CALL_MCOUNT 3
+
+L(_memset):
+ cmpldi cr7,r5,31
+ neg r0,r3
+ mr r10,r3
+
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32 /* Replicate byte to word. */
+ ble cr7,L(write_LT_32)
+
+ andi. r11,r10,15 /* Check alignment of DST. */
+ insrdi r4,r4,32,0 /* Replicate word to double word. */
+
+ beq L(big_aligned)
+
+ mtocrf 0x01,r0
+ clrldi r0,r0,60
+
+ /* Get DST aligned to 16 bytes. */
+1: bf 31,2f
+ stb r4,0(r10)
+ addi r10,r10,1
+
+2: bf 30,4f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+4: bf 29,8f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+8: bf 28,16f
+ std r4,0(r10)
+ addi r10,r10,8
+
+16: subf r5,r0,r5
+
+ .align 4
+L(big_aligned):
+ /* For sizes larger than 255 two possible paths:
+ - if constant is '0', zero full cache lines with dcbz
+ - otherwise uses vector instructions. */
+ cmpldi cr5,r5,255
+ dcbtst 0,r10
+ cmpldi cr6,r4,0
+ crand 27,26,21
+ bt 27,L(huge_dcbz)
+ bge cr5,L(huge_vector)
+
+
+ /* Size between 32 and 255 bytes with constant different than 0, use
+ doubleword store instruction to achieve best throughput. */
+ srdi r8,r5,5
+ clrldi r11,r5,59
+ cmpldi cr6,r11,0
+ cmpdi r8,0
+ beq L(tail_bytes)
+ mtctr r8
+
+ /* Main aligned write loop, writes 32-bytes at a time. */
+ .align 4
+L(big_loop):
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+ bdz L(tail_bytes)
+
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,10,32
+ bdnz L(big_loop)
+
+ b L(tail_bytes)
+
+ /* Write remaining 1~31 bytes. */
+ .align 4
+L(tail_bytes):
+ beqlr cr6
+
+ srdi r7,r11,4
+ clrldi r8,r11,60
+ mtocrf 0x01,r7
+
+ .align 4
+ bf 31,8f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+8: mtocrf 0x1,r8
+ bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+ stw 4,0(10)
+ addi 10,10,4
+
+ .align 4
+2: bf 30,1f
+ sth 4,0(10)
+ addi 10,10,2
+
+ .align 4
+1: bflr 31
+ stb 4,0(10)
+ blr
+
+ /* Size larger than 255 bytes with constant different than 0, use
+ vector instruction to achieve best throughput. */
+L(huge_vector):
+ /* Replicate set byte to quadword in VMX register. */
+ mtvsrd v1,r4
+ xxpermdi 32,v0,v1,0
+ vspltb v2,v0,15
+
+ /* Main aligned write loop: 128 bytes at a time. */
+ li r6,16
+ li r7,32
+ li r8,48
+ mtocrf 0x02,r5
+ srdi r12,r5,7
+ cmpdi r12,0
+ beq L(aligned_tail)
+ mtctr r12
+ b L(aligned_128loop)
+
+ .align 4
+L(aligned_128loop):
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+ bdnz L(aligned_128loop)
+
+ /* Write remaining 1~127 bytes. */
+L(aligned_tail):
+ mtocrf 0x01,r5
+ bf 25,32f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ stvx v2,r10,r7
+ stvx v2,r10,r8
+ addi r10,r10,64
+
+32: bf 26,16f
+ stvx v2,0,r10
+ stvx v2,r10,r6
+ addi r10,r10,32
+
+16: bf 27,8f
+ stvx v2,0,r10
+ addi r10,r10,16
+
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ /* Copies 4~7 bytes. */
+4: bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ /* Return original DST pointer. */
+ blr
+
+ /* Special case when value is 0 and we have a long length to deal
+ with. Use dcbz to zero out a full cacheline of 128 bytes at a time.
+ Before using dcbz though, we need to get the destination 128-byte
+ aligned. */
+ .align 4
+L(huge_dcbz):
+ andi. r11,r10,127
+ neg r0,r10
+ beq L(huge_dcbz_aligned)
+
+ clrldi r0,r0,57
+ subf r5,r0,r5
+ srdi r0,r0,3
+ mtocrf 0x01,r0
+
+ /* Write 1~128 bytes until DST is aligned to 128 bytes. */
+8: bf 28,4f
+
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 29,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 30,1f
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+
+ .align 4
+1: bf 31,L(huge_dcbz_aligned)
+ std r4,0(r10)
+ addi r10,r10,8
+
+L(huge_dcbz_aligned):
+ /* Setup dcbz unroll offsets and count numbers. */
+ srdi r8,r5,9
+ clrldi r11,r5,55
+ cmpldi cr6,r11,0
+ li r9,128
+ cmpdi r8,0
+ beq L(huge_tail)
+ li r7,256
+ li r6,384
+ mtctr r8
+
+ .align 4
+L(huge_loop):
+ /* Sets 512 bytes to zero in each iteration, the loop unrolling shows
+ a throughput boost for large sizes (2048 bytes or higher). */
+ dcbz 0,r10
+ dcbz r9,r10
+ dcbz r7,r10
+ dcbz r6,r10
+ addi r10,r10,512
+ bdnz L(huge_loop)
+
+ beqlr cr6
+
+L(huge_tail):
+ srdi r6,r11,8
+ srdi r7,r11,4
+ clrldi r8,r11,4
+ cmpldi cr6,r8,0
+ mtocrf 0x01,r6
+
+ beq cr6,L(tail)
+
+ /* We have 1~511 bytes remaining. */
+ .align 4
+32: bf 31,16f
+ dcbz 0,r10
+ dcbz r9,r10
+ addi r10,r10,256
+
+ .align 4
+16: mtocrf 0x01,r7
+ bf 28,8f
+ dcbz 0,r10
+ addi r10,r10,128
+
+ .align 4
+8: bf 29,4f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ std r4,32(r10)
+ std r4,40(r10)
+ std r4,48(r10)
+ std r4,56(r10)
+ addi r10,r10,64
+
+ .align 4
+4: bf 30,2f
+ std r4,0(r10)
+ std r4,8(r10)
+ std r4,16(r10)
+ std r4,24(r10)
+ addi r10,r10,32
+
+ .align 4
+2: bf 31,L(tail)
+ std r4,0(r10)
+ std r4,8(r10)
+ addi r10,r10,16
+ .align 4
+
+ /* Remaining 1~15 bytes. */
+L(tail):
+ mtocrf 0x01,r8
+
+ .align
+8: bf 28,4f
+ std r4,0(r10)
+ addi r10,r10,8
+
+ .align 4
+4: bf 29,2f
+ stw r4,0(r10)
+ addi r10,r10,4
+
+ .align 4
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+ /* Handle short copies of 0~31 bytes. Best throughput is achieved
+ by just unrolling all operations. */
+ .align 4
+L(write_LT_32):
+ cmpldi cr6,5,8
+ mtocrf 0x01,r5
+ ble cr6,L(write_LE_8)
+
+ /* At least 9 bytes to go. */
+ neg r8,r4
+ andi. r0,r8,3
+ cmpldi cr1,r5,16
+ beq L(write_LT_32_aligned)
+
+ /* Force 4-byte alignment for SRC. */
+ mtocrf 0x01,r0
+ subf r5,r0,r5
+
+2: bf 30,1f
+ sth r4,0(r10)
+ addi r10,r10,2
+
+1: bf 31,L(end_4bytes_alignment)
+ stb r4,0(r10)
+ addi r10,r10,1
+
+ .align 4
+L(end_4bytes_alignment):
+ cmpldi cr1,r5,16
+ mtocrf 0x01,r5
+
+L(write_LT_32_aligned):
+ blt cr1,8f
+
+ stw r4,0(r10)
+ stw r4,4(r10)
+ stw r4,8(r10)
+ stw r4,12(r10)
+ addi r10,r10,16
+
+8: bf 28,L(tail4)
+ stw r4,0(r10)
+ stw r4,4(r10)
+ addi r10,r10,8
+
+ .align 4
+ /* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ stw r4,0(r10)
+ bf 30,L(tail5)
+ sth r4,4(r10)
+ bflr 31
+ stb r4,6(r10)
+ blr
+
+ .align 4
+ /* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ sth r4,0(r10)
+ bflr 31
+ stb r4,2(r10)
+ blr
+
+ .align 4
+L(tail5):
+ bflr 31
+ stb r4,4(r10)
+ blr
+
+ .align 4
+1: bflr 31
+ stb r4,0(r10)
+ blr
+
+ /* Handles copies of 0~8 bytes. */
+ .align 4
+L(write_LE_8):
+ bne cr6,L(tail4)
+
+ stw r4,0(r10)
+ stw r4,4(r10)
+ blr
+END_GEN_TB (memset,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY (__bzero)
+ CALL_MCOUNT 3
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e6bb56b6914e6435e251814a3a0ccd7fb65a7e36
commit e6bb56b6914e6435e251814a3a0ccd7fb65a7e36
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue Jul 15 16:54:46 2014 -0400
PowerPC: multiarch bzero cleanup for PPC64
This patch cleans up the multiarch bzero for powerpc64 by removing
the multiarch objects and instead using the embedded memset
implementation present in each multiarch optimization. The
generated code is essentially the same, except for the TB_TOCLESS
(which is not essential).
Conflicts:
ChangeLog
This is backport of 3b473fecdf4c52989cd915b649bb6d26c042d048.
diff --git a/ChangeLog b/ChangeLog
index 890c3c6..85024b2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2014-09-10 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+ Remove bzero multiarch objects.
+ * sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S: Remove file.
+ * sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S: Likewise.
+ * sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S: Likewise.
+ * sysdeps/powerpc/powerpc64/multiarch/memset-power4.S [NO_BZERO_IMPL]:
+ Remove define.
+ [__bzero]: Redefine to specific name.
+ * sysdeps/powerpc/powerpc64/multiarch/memset-power6.S: Likewise.
+ * sysdeps/powerpc/powerpc64/multiarch/memset-power7.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power4/memset.S [NO_BZERO_IMPL]: Remove
+ define.
+ * sysdeps/powerpc/powerpc64/power6/memset.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power7/memset.S: Likewise.
+
2014-09-16 Siddhesh Poyarekar <siddhesh@redhat.com>
[BZ #17370]
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 82722fb..0de3804 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -2,7 +2,7 @@ ifeq ($(subdir),string)
sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
memcpy-power4 memcpy-ppc64 memcmp-power7 memcmp-power4 \
memcmp-ppc64 memset-power7 memset-power6 memset-power4 \
- memset-ppc64 bzero-power4 bzero-power6 bzero-power7 \
+ memset-ppc64 \
mempcpy-power7 mempcpy-ppc64 memchr-power7 memchr-ppc64 \
memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S b/sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S
deleted file mode 100644
index 72b75ac..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power4.S
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Optimized bzero implementation for PowerPC64/POWER4.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-ENTRY (__bzero_power4)
- CALL_MCOUNT 3
- mr r5,r4
- li r4,0
- b __memset_power4
-END_GEN_TB (__bzero_power4,TB_TOCLESS)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S b/sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S
deleted file mode 100644
index d0917c5..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power6.S
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Optimized bzero implementation for PowerPC64/POWER6.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-ENTRY (__bzero_power6)
- CALL_MCOUNT 3
- mr r5,r4
- li r4,0
- b __memset_power6
-END_GEN_TB (__bzero_power6,TB_TOCLESS)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S b/sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S
deleted file mode 100644
index 0ec285a..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero-power7.S
+++ /dev/null
@@ -1,26 +0,0 @@
-/* Optimized bzero implementation for PowerPC64/POWER7.
- Copyright (C) 2013-2014 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-ENTRY (__bzero_power7)
- CALL_MCOUNT 3
- mr r5,r4
- li r4,0
- b __memset_power7
-END_GEN_TB (__bzero_power7,TB_TOCLESS)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S
index 968dc24..1291fb7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power4.S
@@ -37,5 +37,7 @@
#undef libc_hidden_builtin_def
#define libc_hidden_builtin_def(name)
-#define NO_BZERO_IMPL
+#undef __bzero
+#define __bzero __bzero_power4
+
#include <sysdeps/powerpc/powerpc64/power4/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S
index 65519b9..3dc199c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power6.S
@@ -37,5 +37,7 @@
#undef libc_hidden_builtin_def
#define libc_hidden_builtin_def(name)
-#define NO_BZERO_IMPL
+#undef __bzero
+#define __bzero __bzero_power6
+
#include <sysdeps/powerpc/powerpc64/power6/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S
index 86765e7..fb1a342 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power7.S
@@ -37,5 +37,6 @@
#undef libc_hidden_builtin_def
#define libc_hidden_builtin_def(name)
-#define NO_BZERO_IMPL
+#undef __bzero
+#define __bzero __bzero_power7
#include <sysdeps/powerpc/powerpc64/power7/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/power4/memset.S b/sysdeps/powerpc/powerpc64/power4/memset.S
index 3a1e9dc..b433d49 100644
--- a/sysdeps/powerpc/powerpc64/power4/memset.S
+++ b/sysdeps/powerpc/powerpc64/power4/memset.S
@@ -235,7 +235,6 @@ L(medium_28t):
END_GEN_TB (memset,TB_TOCLESS)
libc_hidden_builtin_def (memset)
-#ifndef NO_BZERO_IMPL
/* Copied from bzero.S to prevent the linker from inserting a stub
between bzero and memset. */
ENTRY (__bzero)
@@ -243,7 +242,7 @@ ENTRY (__bzero)
mr r5,r4
li r4,0
b L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
weak_alias (__bzero, bzero)
#endif
diff --git a/sysdeps/powerpc/powerpc64/power6/memset.S b/sysdeps/powerpc/powerpc64/power6/memset.S
index b5115a7..6fffe0e 100644
--- a/sysdeps/powerpc/powerpc64/power6/memset.S
+++ b/sysdeps/powerpc/powerpc64/power6/memset.S
@@ -379,7 +379,6 @@ L(medium_28t):
END_GEN_TB (memset,TB_TOCLESS)
libc_hidden_builtin_def (memset)
-#ifndef NO_BZERO_IMPL
/* Copied from bzero.S to prevent the linker from inserting a stub
between bzero and memset. */
ENTRY (__bzero)
@@ -387,7 +386,7 @@ ENTRY (__bzero)
mr r5,r4
li r4,0
b L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
weak_alias (__bzero, bzero)
#endif
diff --git a/sysdeps/powerpc/powerpc64/power7/memset.S b/sysdeps/powerpc/powerpc64/power7/memset.S
index 6b8999d..14df042 100644
--- a/sysdeps/powerpc/powerpc64/power7/memset.S
+++ b/sysdeps/powerpc/powerpc64/power7/memset.S
@@ -383,7 +383,6 @@ L(small):
END_GEN_TB (memset,TB_TOCLESS)
libc_hidden_builtin_def (memset)
-#ifndef NO_BZERO_IMPL
/* Copied from bzero.S to prevent the linker from inserting a stub
between bzero and memset. */
ENTRY (__bzero)
@@ -391,7 +390,7 @@ ENTRY (__bzero)
mr r5,r4
li r4,0
b L(_memset)
-END_GEN_TB (__bzero,TB_TOCLESS)
-
+END (__bzero)
+#ifndef __bzero
weak_alias (__bzero, bzero)
#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=10f5f4c8edc35b4c3912456ffee820975e20a50b
commit 10f5f4c8edc35b4c3912456ffee820975e20a50b
Author: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Date: Fri Nov 15 07:44:20 2013 -0600
Partially revert commit 2663b74f8103a2a8a46b4896439b7a452480fc7c
This change is necessary in order to avoid the issue documented at
http://sourceware.org/ml/libc-alpha/2013-05/msg00350.html.
diff --git a/localedata/locales/bo_CN b/localedata/locales/bo_CN
index d813c10..c573d3f 100644
--- a/localedata/locales/bo_CN
+++ b/localedata/locales/bo_CN
@@ -145,8 +145,7 @@ END LC_MEASUREMENT
LC_NAME
% FIXME
-
-name_fmt ""
+name_fmt "FIXME"
% name_gen "FIXME"
% name_miss "FIXME"
% name_mr "FIXME"
diff --git a/localedata/locales/bo_IN b/localedata/locales/bo_IN
index 8ab793c..a1a6280 100644
--- a/localedata/locales/bo_IN
+++ b/localedata/locales/bo_IN
@@ -71,7 +71,7 @@ END LC_MEASUREMENT
LC_NAME
% FIXME
-name_fmt ""
+name_fmt "FIXME"
% name_gen "FIXME"
% name_miss "FIXME"
% name_mr "FIXME"
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e6f905009b29769bd27077389ce4379d5de80df2
commit e6f905009b29769bd27077389ce4379d5de80df2
Author: Ryan S. Arnold <rsa@linux.vnet.ibm.com>
Date: Fri Nov 15 07:42:33 2013 -0600
Remove assert() if DT_RUNPATH and DT_RPATH flags are found in ld.so.
diff --git a/elf/get-dynamic-info.h b/elf/get-dynamic-info.h
index 20ccf30..7f51d90 100644
--- a/elf/get-dynamic-info.h
+++ b/elf/get-dynamic-info.h
@@ -130,8 +130,8 @@ elf_get_dynamic_info (struct link_map *l, ElfW(Dyn) *temp)
assert (info[DT_FLAGS] == NULL
|| (info[DT_FLAGS]->d_un.d_val & ~DF_BIND_NOW) == 0);
/* Flags must not be set for ld.so. */
- assert (info[DT_RUNPATH] == NULL);
- assert (info[DT_RPATH] == NULL);
+ info[DT_RUNPATH] == NULL;
+ info[DT_RPATH] == NULL;
#else
if (info[DT_FLAGS] != NULL)
{
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources