This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
- From: azanella@sourceware.org
- To: glibc-cvs@sourceware.org
- Date: 13 Jan 2015 19:38:01 -0000
- Subject: GNU C Library master sources branch master updated. glibc-2.20-539-gce6615c
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via ce6615c9c686acd34672a9f4eba9bcf5553496f6 (commit)
via d3b00f468bec441596877a685a19f43dee88657f (commit)
via 72607db038df1a1a7987af814aad8d2ed466c45c (commit)
from 54dc546139a42f77bff190be1e44bfd4479a8367 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=ce6615c9c686acd34672a9f4eba9bcf5553496f6
commit ce6615c9c686acd34672a9f4eba9bcf5553496f6
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Sun Jan 11 19:33:17 2015 -0600
powerpc: Fix POWER7/PPC64 performance regression on LE
This patch fixes a performance regression in the POWER7/PPC64 memcmp
port for Little Endian. The LE code uses the 'ldbrx' instruction to
read memory in byte-reversed form; however, ISA 2.06 provides only the
indexed form, which takes the index from a register instead of a fixed
value encoded in the instruction.

The LE port strategy therefore uses r0 as the index and updates the
address on each compare-loop iteration. For large compare sizes this
adds eight extra instructions, plus more depending on the trailing
size. This patch fixes it by pre-calculating the indexes, removing the
address updates in the loops and in the trailing-size handling.

For large sizes this shows a considerable gain, doubling performance
to match BE.
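To make the change concrete, here is a hypothetical C sketch (not from the
patch; the function names cmp_bump and cmp_indexed are invented for
illustration) contrasting the two strategies: bumping both pointers after
every doubleword load, versus keeping fixed offsets in registers (the role
of the new rOFF8/rOFF16/rOFF24/rOFF32 values) and updating the pointers
once per 32-byte block. The sketch models only the indexing, not memcmp's
unsigned byte ordering.

#include <stdint.h>
#include <stddef.h>

/* Old LE strategy: update both pointers after every 8-byte load,
   costing extra adds on every iteration.  */
static int
cmp_bump (const uint64_t *s1, const uint64_t *s2, size_t words)
{
  while (words--)
    {
      if (*s1 != *s2)
        return *s1 < *s2 ? -1 : 1;
      s1++;
      s2++;
    }
  return 0;
}

/* New strategy: fixed offsets (the rOFF8..rOFF32 registers) and a
   single pointer update per 32-byte block.  */
static int
cmp_indexed (const uint64_t *s1, const uint64_t *s2, size_t blocks)
{
  while (blocks--)
    {
      for (int i = 0; i < 4; i++)
        if (s1[i] != s2[i])
          return s1[i] < s2[i] ? -1 : 1;
      s1 += 4;
      s2 += 4;
    }
  return 0;
}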
diff --git a/ChangeLog b/ChangeLog
index c1e8055..2b65e02 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+ * sysdeps/powerpc/powerpc64/power7/memcmp.S (memcmp): Fix performance
+ regression on LE.
+
* sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S: New file.
* sysdeps/powerpc/powerpc64/power8/strncmp.S: New file.
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
diff --git a/sysdeps/powerpc/powerpc64/power7/memcmp.S b/sysdeps/powerpc/powerpc64/power7/memcmp.S
index b2c1c95..d60dfda 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcmp.S
@@ -26,18 +26,48 @@
EALIGN (memcmp, 4, 0)
CALL_MCOUNT 3
-#define rRTN r3
-#define rSTR1 r3 /* first string arg */
-#define rSTR2 r4 /* second string arg */
-#define rN r5 /* max string length */
-#define rWORD1 r6 /* current word in s1 */
-#define rWORD2 r7 /* current word in s2 */
-#define rWORD3 r8 /* next word in s1 */
-#define rWORD4 r9 /* next word in s2 */
-#define rWORD5 r10 /* next word in s1 */
-#define rWORD6 r11 /* next word in s2 */
-#define rWORD7 r30 /* next word in s1 */
-#define rWORD8 r31 /* next word in s2 */
+#define rRTN r3
+#define rSTR1 r3 /* first string arg */
+#define rSTR2 r4 /* second string arg */
+#define rN r5 /* max string length */
+#define rWORD1 r6 /* current word in s1 */
+#define rWORD2 r7 /* current word in s2 */
+#define rWORD3 r8 /* next word in s1 */
+#define rWORD4 r9 /* next word in s2 */
+#define rWORD5 r10 /* next word in s1 */
+#define rWORD6 r11 /* next word in s2 */
+
+#define rOFF8 r20 /* 8 bytes offset. */
+#define rOFF16 r21 /* 16 bytes offset. */
+#define rOFF24 r22 /* 24 bytes offset. */
+#define rOFF32 r23 /* 32 bytes offset. */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rSHR r28 /* Unaligned shift right count. */
+#define rSHL r29 /* Unaligned shift left count. */
+#define rWORD7 r30 /* next word in s1 */
+#define rWORD8 r31 /* next word in s2 */
+
+#define rWORD8SAVE (-8)
+#define rWORD7SAVE (-16)
+#define rOFF8SAVE (-24)
+#define rOFF16SAVE (-32)
+#define rOFF24SAVE (-40)
+#define rOFF32SAVE (-48)
+#define rSHRSAVE (-56)
+#define rSHLSAVE (-64)
+#define rWORD8SHIFTSAVE (-72)
+#define rWORD2SHIFTSAVE (-80)
+#define rWORD4SHIFTSAVE (-88)
+#define rWORD6SHIFTSAVE (-96)
+
+#ifdef __LITTLE_ENDIAN__
+# define LD ldbrx
+#else
+# define LD ldx
+#endif
xor r0, rSTR2, rSTR1
cmpldi cr6, rN, 0
@@ -51,10 +81,24 @@ EALIGN (memcmp, 4, 0)
/* If less than 8 bytes or not aligned, use the unaligned
byte loop. */
blt cr1, L(bytealigned)
- std rWORD8, -8(r1)
- cfi_offset(rWORD8, -8)
- std rWORD7, -16(r1)
- cfi_offset(rWORD7, -16)
+ std rWORD8, rWORD8SAVE(r1)
+ cfi_offset(rWORD8, rWORD8SAVE)
+ std rWORD7, rWORD7SAVE(r1)
+ cfi_offset(rWORD7, rWORD7SAVE)
+ std rOFF8, rOFF8SAVE(r1)
+ cfi_offset(rWORD7, rOFF8SAVE)
+ std rOFF16, rOFF16SAVE(r1)
+ cfi_offset(rWORD7, rOFF16SAVE)
+ std rOFF24, rOFF24SAVE(r1)
+ cfi_offset(rWORD7, rOFF24SAVE)
+ std rOFF32, rOFF32SAVE(r1)
+ cfi_offset(rWORD7, rOFF32SAVE)
+
+ li rOFF8,8
+ li rOFF16,16
+ li rOFF24,24
+ li rOFF32,32
+
bne L(unaligned)
/* At this point we know both strings have the same alignment and the
compare length is at least 8 bytes. r12 contains the low order
@@ -79,15 +123,8 @@ L(samealignment):
sldi rWORD6, r12, 3
srdi r0, rN, 5 /* Divide by 32 */
andi. r12, rN, 24 /* Get the DW remainder */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
clrldi rN, rN, 61
@@ -104,15 +141,8 @@ L(dsP1):
cmpld cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
b L(dP1e)
/* Remainder is 16 */
@@ -123,15 +153,8 @@ L(dPs2):
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
b L(dP2e)
/* Remainder is 24 */
@@ -173,72 +196,43 @@ L(dP1):
change any on the early exit path. The key here is the non-early
exit path only cares about the condition code (cr5), not about which
register pair was used. */
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 0(rSTR1)
- ld rWORD6, 0(rSTR2)
-#endif
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
cmpld cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
L(dP1e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(dLcr5x)
bne cr7, L(dLcr7x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
bne cr1, L(dLcr1)
cmpld cr5, rWORD7, rWORD8
bdnz L(dLoop)
bne cr6, L(dLcr6)
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
.align 3
L(dP1x):
sldi. r12, rN, 3
bne cr5, L(dLcr5x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -246,79 +240,41 @@ L(dP1x):
.align 4
L(dP2):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 0(rSTR1)
- ld rWORD6, 0(rSTR2)
-#endif
+ LD rWORD5, 0, rSTR1
+ LD rWORD6, 0, rSTR2
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
L(dP2e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 24(rSTR1)
- ld rWORD4, 24(rSTR2)
-#endif
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr6, L(dLcr6)
bne cr5, L(dLcr5)
b L(dLoop2)
-/* Again we are on a early exit path (16-23 byte compare), we want to
- only use volatile registers and avoid restoring non-volatile
- registers. */
.align 4
L(dP2x):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
sldi. r12, rN, 3
bne cr6, L(dLcr6x)
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr1, L(dLcr1x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -326,52 +282,22 @@ L(dP2x):
.align 4
L(dP3):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 0(rSTR1)
- ld rWORD4, 0(rSTR2)
-#endif
+ LD rWORD3, 0, rSTR1
+ LD rWORD4, 0, rSTR2
cmpld cr1, rWORD3, rWORD4
L(dP3e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 8(rSTR1)
- ld rWORD6, 8(rSTR2)
-#endif
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
cmpld cr6, rWORD5, rWORD6
blt cr7, L(dP3x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 16(rSTR1)
- ld rWORD8, 16(rSTR2)
-#endif
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
cmpld cr5, rWORD7, rWORD8
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 24(rSTR1)
- ld rWORD2, 24(rSTR2)
-#endif
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
cmpld cr7, rWORD1, rWORD2
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
bne cr1, L(dLcr1)
bne cr6, L(dLcr6)
b L(dLoop1)
@@ -380,26 +306,21 @@ L(dP3e):
registers. */
.align 4
L(dP3x):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
sldi. r12, rN, 3
bne cr1, L(dLcr1x)
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
bne cr6, L(dLcr6x)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
bne cr7, L(dLcr7x)
bne L(d00)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
@@ -407,46 +328,20 @@ L(dP3x):
.align 4
L(dP4):
mtctr r0
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpld cr7, rWORD1, rWORD2
L(dP4e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 16(rSTR1)
- ld rWORD6, 16(rSTR2)
-#endif
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 24(rSTR1)
- ldu rWORD8, 24(rSTR2)
-#endif
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
cmpld cr5, rWORD7, rWORD8
bne cr7, L(dLcr7)
bne cr1, L(dLcr1)
@@ -454,51 +349,25 @@ L(dP4e):
/* This is the primary loop */
.align 4
L(dLoop):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr6, L(dLcr6)
L(dLoop1):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(dLcr5)
L(dLoop2):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr7, L(dLcr7)
L(dLoop3):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
bne cr1, L(dLcr1)
cmpld cr7, rWORD1, rWORD2
bdnz L(dLoop)
@@ -519,62 +388,75 @@ L(d14):
sldi. r12, rN, 3
bne cr5, L(dLcr5)
L(d04):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
subfic rN, r12, 64 /* Shift count is 64 - (rN * 8). */
- beq L(zeroLength)
+ beq L(duzeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare. Since
we are aligned it is safe to load the whole double word, and use
shift right double to eliminate bits beyond the compare length. */
L(d00):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
srd rWORD1, rWORD1, rN
srd rWORD2, rWORD2, rN
cmpld cr7, rWORD1, rWORD2
bne cr7, L(dLcr7x)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
.align 4
L(dLcr7):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr7x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr7
li rRTN, -1
blr
.align 4
L(dLcr1):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr1x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr1
li rRTN, -1
blr
.align 4
L(dLcr6):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr6x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr6
li rRTN, -1
blr
.align 4
L(dLcr5):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dLcr5x):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 1
bgtlr cr5
li rRTN, -1
@@ -583,10 +465,6 @@ L(dLcr5x):
.align 4
L(bytealigned):
mtctr rN
-#if 0
-/* Huh? We've already branched on cr6! */
- beq cr6, L(zeroLength)
-#endif
/* We need to prime this loop. This loop is swing modulo scheduled
to avoid pipe delays. The dependent instruction latencies (load to
@@ -685,6 +563,7 @@ L(b11):
L(bx12):
sub rRTN, rWORD1, rWORD2
blr
+
.align 4
L(zeroLength):
li rRTN, 0
@@ -705,42 +584,36 @@ L(zeroLength):
we need to adjust the length (rN) and special case the loop
versioning for the first DW. This ensures that the loop count is
correct and the first DW (shifted) is in the expected register pair. */
-#define rSHL r29 /* Unaligned shift left count. */
-#define rSHR r28 /* Unaligned shift right count. */
-#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
-#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
-#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
-#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
L(unaligned):
- std rSHL, -24(r1)
- cfi_offset(rSHL, -24)
+ std rSHL, rSHLSAVE(r1)
+ cfi_offset(rSHL, rSHLSAVE)
clrldi rSHL, rSTR2, 61
beq cr6, L(duzeroLength)
- std rSHR, -32(r1)
- cfi_offset(rSHR, -32)
+ std rSHR, rSHRSAVE(r1)
+ cfi_offset(rSHR, rSHRSAVE)
beq cr5, L(DWunaligned)
- std rWORD8_SHIFT, -40(r1)
- cfi_offset(rWORD8_SHIFT, -40)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
/* Adjust the logical start of rSTR2 to compensate for the extra bits
in the 1st rSTR1 DW. */
sub rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the DW before that DW that contains
the actual start of rSTR2. */
clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, -48(r1)
- cfi_offset(rWORD2_SHIFT, -48)
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
/* Compute the left/right shift counts for the unaligned rSTR2,
compensating for the logical (DW aligned) start of rSTR1. */
clrldi rSHL, rWORD8_SHIFT, 61
clrrdi rSTR1, rSTR1, 3
- std rWORD4_SHIFT, -56(r1)
- cfi_offset(rWORD4_SHIFT, -56)
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
sldi rSHL, rSHL, 3
cmpld cr5, rWORD8_SHIFT, rSTR2
add rN, rN, r12
sldi rWORD6, r12, 3
- std rWORD6_SHIFT, -64(r1)
- cfi_offset(rWORD6_SHIFT, -64)
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
subfic rSHR, rSHL, 64
srdi r0, rN, 5 /* Divide by 32 */
andi. r12, rN, 24 /* Get the DW remainder */
@@ -750,25 +623,13 @@ L(unaligned):
this may cross a page boundary and cause a page fault. */
li rWORD8, 0
blt cr5, L(dus0)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD8, 0, rSTR2
+ LD rWORD8, 0, rSTR2
addi rSTR2, rSTR2, 8
-#else
- ld rWORD8, 0(rSTR2)
- addi rSTR2, rSTR2, 8
-#endif
sld rWORD8, rWORD8, rSHL
L(dus0):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 0(rSTR1)
- ld rWORD2, 0(rSTR2)
-#endif
+ LD rWORD1, 0, rSTR1
+ LD rWORD2, 0, rSTR2
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
srd r12, rWORD2, rSHR
@@ -796,12 +657,7 @@ L(dusP1):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 16 */
@@ -832,27 +688,21 @@ L(duPs4):
compare length is at least 8 bytes. */
.align 4
L(DWunaligned):
- std rWORD8_SHIFT, -40(r1)
- cfi_offset(rWORD8_SHIFT, -40)
+ std rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ cfi_offset(rWORD8_SHIFT, rWORD8SHIFTSAVE)
clrrdi rSTR2, rSTR2, 3
- std rWORD2_SHIFT, -48(r1)
- cfi_offset(rWORD2_SHIFT, -48)
+ std rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ cfi_offset(rWORD2_SHIFT, rWORD2SHIFTSAVE)
srdi r0, rN, 5 /* Divide by 32 */
- std rWORD4_SHIFT, -56(r1)
- cfi_offset(rWORD4_SHIFT, -56)
+ std rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
+ cfi_offset(rWORD4_SHIFT, rWORD4SHIFTSAVE)
andi. r12, rN, 24 /* Get the DW remainder */
- std rWORD6_SHIFT, -64(r1)
- cfi_offset(rWORD6_SHIFT, -64)
+ std rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ cfi_offset(rWORD6_SHIFT, rWORD6SHIFTSAVE)
sldi rSHL, rSHL, 3
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD6, 0, rSTR2
+ LD rWORD6, 0, rSTR2
+ LD rWORD8, rOFF8, rSTR2
addi rSTR2, rSTR2, 8
- ldbrx rWORD8, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD6, 0(rSTR2)
- ldu rWORD8, 8(rSTR2)
-#endif
cmpldi cr1, r12, 16
cmpldi cr7, rN, 32
clrldi rN, rN, 61
@@ -867,52 +717,26 @@ L(DWunaligned):
.align 4
L(duP1):
srd r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD7, 0(rSTR1)
-#endif
+ LD rWORD7, 0, rSTR1
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP1x)
L(duP1e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr5, rWORD7, rWORD8
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr7, rWORD1, rWORD2
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
bne cr5, L(duLcr5)
or rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr1, rWORD3, rWORD4
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
@@ -932,82 +756,47 @@ L(duP1x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 16 */
.align 4
L(duP2):
srd r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD5, 0(rSTR1)
-#endif
+ LD rWORD5, 0, rSTR1
or rWORD6, r0, rWORD6_SHIFT
sld rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 8(rSTR1)
- ld rWORD8, 8(rSTR2)
-#endif
+ LD rWORD7, rOFF8, rSTR1
+ LD rWORD8, rOFF8, rSTR2
cmpld cr6, rWORD5, rWORD6
srd r12, rWORD8, rSHR
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP2x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 16(rSTR1)
- ld rWORD2, 16(rSTR2)
-#endif
+ LD rWORD1, rOFF16, rSTR1
+ LD rWORD2, rOFF16, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 24(rSTR1)
- ld rWORD4, 24(rSTR2)
-#endif
+ LD rWORD3, rOFF24, rSTR1
+ LD rWORD4, rOFF24, rSTR2
cmpld cr7, rWORD1, rWORD2
bne cr5, L(duLcr5)
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
cmpld cr1, rWORD3, rWORD4
b L(duLoop2)
.align 4
L(duP2x):
cmpld cr5, rWORD7, rWORD8
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
-#endif
bne cr6, L(duLcr6)
sldi. rN, rN, 3
bne cr5, L(duLcr5)
@@ -1015,12 +804,7 @@ L(duP2x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
@@ -1028,73 +812,39 @@ L(duP2x):
.align 4
L(duP3):
srd r12, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD3, 0(rSTR1)
-#endif
+ LD rWORD3, 0, rSTR1
sld rWORD4_SHIFT, rWORD8, rSHL
or rWORD4, r12, rWORD6_SHIFT
L(duP3e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 8(rSTR1)
- ld rWORD6, 8(rSTR2)
-#endif
+ LD rWORD5, rOFF8, rSTR1
+ LD rWORD6, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD7, 16(rSTR1)
- ld rWORD8, 16(rSTR2)
-#endif
+ LD rWORD7, rOFF16, rSTR1
+ LD rWORD8, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
sld rWORD8_SHIFT, rWORD8, rSHL
or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP3x)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 24(rSTR1)
- ld rWORD2, 24(rSTR2)
-#endif
+ LD rWORD1, rOFF24, rSTR1
+ LD rWORD2, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
cmpld cr7, rWORD1, rWORD2
b L(duLoop1)
.align 4
L(duP3x):
-#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 16
addi rSTR2, rSTR2, 16
-#endif
-#if 0
-/* Huh? We've already branched on cr1! */
- bne cr1, L(duLcr1)
-#endif
cmpld cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
sldi. rN, rN, 3
@@ -1103,12 +853,7 @@ L(duP3x):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
b L(dutrim)
@@ -1117,51 +862,27 @@ L(duP3x):
L(duP4):
mtctr r0
srd r0, rWORD8, rSHR
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- addi rSTR1, rSTR1, 8
-#else
- ld rWORD1, 0(rSTR1)
-#endif
+ LD rWORD1, 0, rSTR1
sld rWORD2_SHIFT, rWORD8, rSHL
or rWORD2, r0, rWORD6_SHIFT
L(duP4e):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 8(rSTR1)
- ld rWORD4, 8(rSTR2)
-#endif
+ LD rWORD3, rOFF8, rSTR1
+ LD rWORD4, rOFF8, rSTR2
cmpld cr7, rWORD1, rWORD2
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 16(rSTR1)
- ld rWORD6, 16(rSTR2)
-#endif
+ LD rWORD5, rOFF16, rSTR1
+ LD rWORD6, rOFF16, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr7, L(duLcr7)
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 24(rSTR1)
- ldu rWORD8, 24(rSTR2)
-#endif
+ LD rWORD7, rOFF24, rSTR1
+ LD rWORD8, rOFF24, rSTR2
+ addi rSTR1, rSTR1, 24
+ addi rSTR2, rSTR2, 24
cmpld cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
@@ -1172,60 +893,34 @@ L(duP4e):
/* This is the primary loop */
.align 4
L(duLoop):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
- ldbrx rWORD2, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD1, 8(rSTR1)
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD1, rOFF8, rSTR1
+ LD rWORD2, rOFF8, rSTR2
cmpld cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
srd r0, rWORD2, rSHR
sld rWORD2_SHIFT, rWORD2, rSHL
or rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD3, 0, rSTR1
- ldbrx rWORD4, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD3, 16(rSTR1)
- ld rWORD4, 16(rSTR2)
-#endif
+ LD rWORD3, rOFF16, rSTR1
+ LD rWORD4, rOFF16, rSTR2
cmpld cr6, rWORD5, rWORD6
bne cr5, L(duLcr5)
srd r12, rWORD4, rSHR
sld rWORD4_SHIFT, rWORD4, rSHL
or rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD5, 0, rSTR1
- ldbrx rWORD6, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD5, 24(rSTR1)
- ld rWORD6, 24(rSTR2)
-#endif
+ LD rWORD5, rOFF24, rSTR1
+ LD rWORD6, rOFF24, rSTR2
cmpld cr5, rWORD7, rWORD8
bne cr7, L(duLcr7)
srd r0, rWORD6, rSHR
sld rWORD6_SHIFT, rWORD6, rSHL
or rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD7, 0, rSTR1
- ldbrx rWORD8, 0, rSTR2
- addi rSTR1, rSTR1, 8
- addi rSTR2, rSTR2, 8
-#else
- ldu rWORD7, 32(rSTR1)
- ldu rWORD8, 32(rSTR2)
-#endif
+ LD rWORD7, rOFF32, rSTR1
+ LD rWORD8, rOFF32, rSTR2
+ addi rSTR1, rSTR1, 32
+ addi rSTR2, rSTR2, 32
cmpld cr7, rWORD1, rWORD2
bne cr1, L(duLcr1)
srd r12, rWORD8, rSHR
@@ -1234,10 +929,6 @@ L(duLoop3):
bdnz L(duLoop)
L(duL4):
-#if 0
-/* Huh? We've already branched on cr1! */
- bne cr1, L(duLcr1)
-#endif
cmpld cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
cmpld cr6, rWORD5, rWORD6
@@ -1264,99 +955,102 @@ L(du14):
beq L(duZeroReturn)
li r0, 0
ble cr7, L(dutrim)
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD2, 0, rSTR2
- addi rSTR2, rSTR2, 8
-#else
- ld rWORD2, 8(rSTR2)
-#endif
+ LD rWORD2, rOFF8, rSTR2
srd r0, rWORD2, rSHR
.align 4
L(dutrim):
-#ifdef __LITTLE_ENDIAN__
- ldbrx rWORD1, 0, rSTR1
-#else
- ld rWORD1, 8(rSTR1)
-#endif
+ LD rWORD1, rOFF8, rSTR1
ld rWORD8, -8(r1)
subfic rN, rN, 64 /* Shift count is 64 - (rN * 8). */
or rWORD2, r0, rWORD8_SHIFT
- ld rWORD7, -16(r1)
- ld rSHL, -24(r1)
+ ld rWORD7, rWORD7SAVE(r1)
+ ld rSHL, rSHLSAVE(r1)
srd rWORD1, rWORD1, rN
srd rWORD2, rWORD2, rN
- ld rSHR, -32(r1)
- ld rWORD8_SHIFT, -40(r1)
+ ld rSHR, rSHRSAVE(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
li rRTN, 0
cmpld cr7, rWORD1, rWORD2
- ld rWORD2_SHIFT, -48(r1)
- ld rWORD4_SHIFT, -56(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
beq cr7, L(dureturn24)
li rRTN, 1
- ld rWORD6_SHIFT, -64(r1)
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
bgtlr cr7
li rRTN, -1
blr
.align 4
L(duLcr7):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr7, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr1):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr1, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr6):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr6, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr5):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
li rRTN, 1
bgt cr5, L(dureturn29)
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
li rRTN, -1
b L(dureturn27)
+
.align 3
L(duZeroReturn):
li rRTN, 0
.align 4
L(dureturn):
- ld rWORD8, -8(r1)
- ld rWORD7, -16(r1)
+ ld rWORD8, rWORD8SAVE(r1)
+ ld rWORD7, rWORD7SAVE(r1)
L(dureturn29):
- ld rSHL, -24(r1)
- ld rSHR, -32(r1)
+ ld rSHL, rSHLSAVE(r1)
+ ld rSHR, rSHRSAVE(r1)
L(dureturn27):
- ld rWORD8_SHIFT, -40(r1)
-L(dureturn26):
- ld rWORD2_SHIFT, -48(r1)
-L(dureturn25):
- ld rWORD4_SHIFT, -56(r1)
+ ld rWORD8_SHIFT, rWORD8SHIFTSAVE(r1)
+ ld rWORD2_SHIFT, rWORD2SHIFTSAVE(r1)
+ ld rWORD4_SHIFT, rWORD4SHIFTSAVE(r1)
L(dureturn24):
- ld rWORD6_SHIFT, -64(r1)
+ ld rWORD6_SHIFT, rWORD6SHIFTSAVE(r1)
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
blr
+
L(duzeroLength):
+ ld rOFF8, rOFF8SAVE(r1)
+ ld rOFF16, rOFF16SAVE(r1)
+ ld rOFF24, rOFF24SAVE(r1)
+ ld rOFF32, rOFF32SAVE(r1)
li rRTN, 0
blr
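A note on the LD macro introduced above: it collapses the per-load #ifdef
blocks by expanding to the plain indexed load ldx on big-endian and to the
byte-reversed indexed load ldbrx on little-endian, so both variants share
the same offset-register code. A rough C model of why the byte reversal
matters (an illustration using GCC builtins, not glibc code):

#include <stdint.h>
#include <string.h>

/* Load 8 bytes in memory (big-endian) order.  On LE this costs one
   byte swap after the load, which is what a single ldbrx provides.  */
static inline uint64_t
load_bytes_be (const unsigned char *p)
{
  uint64_t v;
  memcpy (&v, p, 8);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  v = __builtin_bswap64 (v);
#endif
  return v;
}

With both words held in memory-byte order, one unsigned doubleword compare
(cmpld in the assembly) ranks the operands exactly as comparing the eight
bytes left to right, which is what lets the loops branch on a single
compare per doubleword.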
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d3b00f468bec441596877a685a19f43dee88657f
commit d3b00f468bec441596877a685a19f43dee88657f
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Fri Jan 9 16:04:26 2015 -0500
powerpc: Optimized strncmp for POWER8/PPC64
This patch adds an optimized POWER8 strncmp. The implementation
focuses on speeding up unaligned cases, following the ideas of the
POWER8 strcmp. The algorithm first checks the initial 16 bytes, then
aligns the first source and uses unaligned loads on the second
argument only. Additional checks for page boundaries are done for the
unaligned cases (where the sources' alignments differ).
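The implementation leans heavily on the cmpb instruction, which compares
all eight bytes of two doublewords at once. Here is a hedged C model of
the pattern repeated in the loops below, with cmpb emulated by a helper;
the names cmpb_emul and needs_tail_check are illustrative only:

#include <stdint.h>

/* Emulate POWER's cmpb: byte i of the result is 0xff where the
   corresponding bytes of a and b are equal, 0x00 otherwise.  */
static uint64_t
cmpb_emul (uint64_t a, uint64_t b)
{
  uint64_t r = 0;
  for (int i = 0; i < 8; i++)
    {
      uint64_t m = 0xffULL << (8 * i);
      if ((a & m) == (b & m))
        r |= m;
    }
  return r;
}

/* One iteration of the doubleword loop: w1 and w2 are 8 bytes from
   each string.  The 'orc.' in the assembly computes nul | ~eq, which
   is nonzero exactly when a NUL or a mismatch was seen and the slow
   path must run.  */
static int
needs_tail_check (uint64_t w1, uint64_t w2)
{
  uint64_t nul = cmpb_emul (w1, 0);    /* 0xff where w1 has a NUL byte */
  uint64_t eq = cmpb_emul (w1, w2);    /* 0xff where the bytes match */
  return (nul | ~eq) != 0;
}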
diff --git a/ChangeLog b/ChangeLog
index 79e971e..c1e8055 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S: New file.
+ * sysdeps/powerpc/powerpc64/power8/strncmp.S: New file.
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+ strncmp-power8 object.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add __strncmp_power8 implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strncmp.c (strncmp): Likewise.
+ * NEWS: Update.
+
2015-01-13 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Adhemerval Zanella <azanella@linux.vnet.ibm.com>
diff --git a/NEWS b/NEWS
index e9f5034..3bdc96a 100644
--- a/NEWS
+++ b/NEWS
@@ -19,8 +19,8 @@ Version 2.21
17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
17791, 17793, 17796, 17797, 17803, 17806, 17834
-* Optimized strcpy, stpcpy, strncpy, stpncpy, and strcmp implementations for
- powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, stpncpy, strcmp, and strncmp
+ implementations for powerpc64/powerpc64le.
Implemented by Adhemerval Zanella (IBM).
* Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index ec4fca7..b7ea284 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -7,8 +7,9 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
memrchr-power7 memrchr-ppc64 rawmemchr-power7 \
rawmemchr-ppc64 strlen-power7 strlen-ppc64 strnlen-power7 \
strnlen-ppc64 strcasecmp-power7 strcasecmp_l-power7 \
- strncase-power7 strncase_l-power7 strncmp-power7 \
- strncmp-power4 strncmp-ppc64 strchr-power7 strchr-ppc64 \
+ strncase-power7 strncase_l-power7 \
+ strncmp-power8 strncmp-power7 strncmp-power4 strncmp-ppc64 \
+ strchr-power7 strchr-ppc64 \
strchrnul-power7 strchrnul-ppc64 wcschr-power7 \
wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \
wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 2c03060..f5fdea5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -108,6 +108,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c. */
IFUNC_IMPL (i, name, strncmp,
+ IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strncmp_power8)
IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_HAS_VSX,
__strncmp_power7)
IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_POWER4,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
similarity index 50%
copy from sysdeps/powerpc/powerpc64/multiarch/strncmp.c
copy to sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
index eb02aac..8d7223d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp-power8.S
@@ -1,5 +1,4 @@
-/* Multiple versions of strncmp.
- Copyright (C) 2013-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,22 +15,26 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-/* Define multiple versions only for definition in libc. */
-#if defined SHARED && IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (strncmp) __strncmp_ppc attribute_hidden;
-extern __typeof (strncmp) __strncmp_power4 attribute_hidden;
-extern __typeof (strncmp) __strncmp_power7 attribute_hidden;
-
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
- ifunc symbol properly. */
-libc_ifunc (strncmp,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strncmp_power7 :
- (hwcap & PPC_FEATURE_POWER4)
- ? __strncmp_power4
- : __strncmp_ppc);
-#endif
+#include <sysdep.h>
+
+#undef EALIGN
+#define EALIGN(name,alignt,words) \
+ .section ".text"; \
+ ENTRY_2(__strncmp_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strncmp_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strncmp_power8)
+
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strncmp_power8) \
+ END_2(__strncmp_power8)
+
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
index eb02aac..9b6a659 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
@@ -25,13 +25,16 @@
extern __typeof (strncmp) __strncmp_ppc attribute_hidden;
extern __typeof (strncmp) __strncmp_power4 attribute_hidden;
extern __typeof (strncmp) __strncmp_power7 attribute_hidden;
+extern __typeof (strncmp) __strncmp_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (strncmp,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strncmp_power7 :
- (hwcap & PPC_FEATURE_POWER4)
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strncmp_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strncmp_power7 :
+ (hwcap & PPC_FEATURE_POWER4)
? __strncmp_power4
: __strncmp_ppc);
#endif
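The dispatch order in the updated strncmp.c reads newest-feature-first. A
minimal C model of that selection (illustrative; the real choice is made
once by the dynamic loader through libc_ifunc, and the boolean parameters
below stand in for the hwcap/hwcap2 bit tests):

#include <stddef.h>

typedef int (*strncmp_fn) (const char *, const char *, size_t);

/* Mirrors the nested conditional in libc_ifunc: POWER8 if the core
   reports ISA 2.07, else POWER7 on VSX, else POWER4, else generic.  */
static strncmp_fn
select_strncmp (int has_arch_2_07, int has_vsx, int has_power4,
                strncmp_fn power8, strncmp_fn power7,
                strncmp_fn power4, strncmp_fn generic)
{
  return has_arch_2_07 ? power8
         : has_vsx ? power7
         : has_power4 ? power4
         : generic;
}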
diff --git a/sysdeps/powerpc/powerpc64/power8/strncmp.S b/sysdeps/powerpc/powerpc64/power8/strncmp.S
new file mode 100644
index 0000000..56c814b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncmp.S
@@ -0,0 +1,323 @@
+/* Optimized strncmp implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+ int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
+
+ The implementation uses unaligned doubleword accesses to avoid specialized
+ code paths depending on data alignment. Although recent powerpc64 systems
+ use 64K pages by default, the page-cross handling assumes a minimum page
+ size of 4K. */
+
+ .machine power7
+EALIGN (strncmp, 4, 0)
+ /* Check if size is 0. */
+ mr. r10,r5
+ beq cr0,L(ret0)
+
+ /* Check if [s1]+16 or [s2]+16 will cross a 4K page boundary using
+ the code:
+
+ (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+ with PAGE_SIZE being 4096 and ITER_SIZE being 16. */
+ rldicl r8,r3,0,52
+ cmpldi cr7,r8,4096-16
+ bgt cr7,L(pagecross)
+ rldicl r9,r4,0,52
+ cmpldi cr7,r9,4096-16
+ bgt cr7,L(pagecross)
+
+ /* For short strings up to 16 bytes, load both s1 and s2 using
+ unaligned dwords and compare. */
+ ld r7,0(r3)
+ ld r9,0(r4)
+ li r8,0
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ /* If the strings compared are equal but size is less than or equal
+ to 8, return 0. */
+ cmpldi cr7,r10,8
+ li r9,0
+ ble cr7,L(ret1)
+ addi r5,r10,-8
+
+ ld r7,8(r3)
+ ld r9,8(r4)
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different0)
+
+ cmpldi cr7,r5,8
+ mr r9,r8
+ ble cr7,L(ret1)
+
+ /* Update pointers and size. */
+ addi r10,r10,-16
+ addi r3,r3,16
+ addi r4,r4,16
+
+ /* Now that the first 16 bytes have been checked, align source1 to a
+ doubleword and adjust the source2 address. */
+L(align_8b):
+ rldicl r5,r3,0,61
+ rldicr r3,r3,0,60
+ subf r4,r5,r4
+ add r10,r10,r5
+
+ /* At this point, source1 alignment is 0 and source2 alignment is
+ between 0 and 7. Check if source2 alignment is 0, meaning both
+ sources have the same alignment. */
+ andi. r8,r4,0x7
+ beq cr0,L(loop_eq_align_0)
+
+ li r5,0
+ b L(loop_ne_align_1)
+
+ /* If source2 is unaligned to doubleword, the code needs to check
+ on each iteration if the unaligned doubleword access will cross
+ a 4K page boundary. */
+ .align 4
+L(loop_ne_align_0):
+ ld r7,0(r3)
+ ld r9,0(r4)
+ cmpb r8,r7,r5
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ cmpldi cr7,r10,8
+ ble cr7,L(ret0)
+ addi r10,r10,-8
+ addi r3,r3,8
+ addi r4,r4,8
+L(loop_ne_align_1):
+ rldicl r9,r4,0,52
+ cmpldi r7,r9,4088
+ ble cr7,L(loop_ne_align_0)
+ cmpdi cr7,r10,0
+ beq cr7,L(ret0)
+
+ lbz r9,0(r3)
+ lbz r8,0(r4)
+ cmplw cr7,r9,r8
+ bne cr7,L(byte_ne_4)
+ cmpdi cr7,r9,0
+ beq cr7,L(size_reached_0)
+
+ li r9,r7
+ addi r8,r3,1
+ mtctr r9
+ addi r4,r4,1
+ addi r10,r10,-1
+ addi r3,r3,8
+
+ /* The unaligned read of source2 will cross a 4K page boundary,
+ and the differing byte or NUL may be in the remaining page
+ bytes. Since the unaligned load cannot be used, the algorithm
+ reads and compares 8 bytes to keep source1 doubleword aligned. */
+ .align 4
+L(loop_ne_align_byte):
+ cmpdi cr7,r10,0
+ addi r10,r10,-1
+ beq cr7,L(ret0)
+ lbz r9,0(r8)
+ lbz r7,0(r4)
+ addi r8,r8,1
+ addi r4,r4,1
+ cmplw cr7,r9,r7
+ cmpdi cr5,r9,0
+ bne cr7,L(size_reached_2)
+ beq cr5,L(size_reached_0)
+ bdnz L(loop_ne_align_byte)
+
+ cmpdi cr7,r10,0
+ bne+ cr7,L(loop_ne_align_0)
+
+ .align 4
+L(ret0):
+ li r9,0
+L(ret1):
+ mr r3,r9
+ blr
+
+ /* The code now checks if r8 and r10 are different by issuing a
+ cmpb and shifting the result based on its output:
+
+ #ifdef __LITTLE_ENDIAN__
+ leadzero = (__builtin_ffsl (z1) - 1);
+ leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+ r1 = (r1 >> leadzero) & 0xFFUL;
+ r2 = (r2 >> leadzero) & 0xFFUL;
+ #else
+ leadzero = __builtin_clzl (z1);
+ leadzero = leadzero > (n-1)*8 ? (n-1)*8 : leadzero;
+ r1 = (r1 >> (56 - leadzero)) & 0xFFUL;
+ r2 = (r2 >> (56 - leadzero)) & 0xFFUL;
+ #endif
+ return r1 - r2; */
+
+ .align 4
+L(different0):
+ mr r10,r5
+#ifdef __LITTLE_ENDIAN__
+L(different1):
+ neg r11,r8
+ sldi r10,r10,3
+ and r8,r11,r8
+ addi r10,r10,-8
+ cntlzd r8,r8
+ subfic r8,r8,63
+ extsw r8,r8
+ cmpld cr7,r8,r10
+ ble cr7,L(different2)
+ mr r8,r10
+L(different2):
+ extsw r8,r8
+#else
+L(different1):
+ addi r10,r10,-1
+ cntlzd r8,r8
+ sldi r10,r10,3
+ cmpld cr7,r8,r10
+ blt cr7,L(different2)
+ mr r8,r10
+L(different2):
+ subfic r8,r8,56
+#endif
+ srd r7,r7,r8
+ srd r9,r9,r8
+ rldicl r3,r7,0,56
+ rldicl r9,r9,0,56
+ subf r9,r9,3
+ extsw r9,r9
+ mr r3,r9
+ blr
+
+ /* If an unaligned 16-byte read crosses a 4K page boundary, it uses
+ a simple byte-by-byte comparison until the page alignment for s1
+ is reached. */
+ .align 4
+L(pagecross):
+ lbz r7,0(r3)
+ lbz r9,0(r4)
+ subfic r8,r8,4095
+ cmplw cr7,r9,r7
+ bne cr7,L(byte_ne_3)
+ cmpdi cr7,r9,0
+ beq cr7,L(byte_ne_0)
+ addi r10,r10,-1
+ subf r7,r8,r10
+ subf r9,r7,r10
+ addi r9,r9,1
+ mtctr r9
+ b L(pagecross_loop1)
+
+ .align 4
+L(pagecross_loop0):
+ beq cr7,L(ret0)
+ lbz r9,0(r3)
+ lbz r8,0(r4)
+ addi r10,r10,-1
+ cmplw cr7,r9,r8
+ cmpdi cr5,r9,0
+ bne r7,L(byte_ne_2)
+ beq r5,L(byte_ne_0)
+L(pagecross_loop1):
+ cmpdi cr7,r10,0
+ addi r3,r3,1
+ addi r4,r4,1
+ bdnz L(pagecross_loop0)
+ cmpdi cr7,r7,0
+ li r9,0
+ bne+ cr7,L(align_8b)
+ b L(ret1)
+
+ /* If both source1 and source2 are doubleword aligned, there is no
+ need for page boundary cross checks. */
+ .align 4
+L(loop_eq_align_0):
+ ld r7,0(r3)
+ ld r9,0(r4)
+ cmpb r8,r7,r8
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ bne cr0,L(different1)
+
+ cmpldi cr7,r10,8
+ ble cr7,L(ret0)
+ addi r9,r10,-9
+
+ li r5,0
+ srdi r9,r9,3
+ addi r9,r9,1
+ mtctr r9
+ b L(loop_eq_align_2)
+
+ .align 4
+L(loop_eq_align_1):
+ bdz L(ret0)
+L(loop_eq_align_2):
+ ldu r7,8(r3)
+ addi r10,r10,-8
+ ldu r9,8(r4)
+ cmpb r8,r7,r5
+ cmpb r6,r7,r9
+ orc. r8,r8,r6
+ beq cr0,L(loop_eq_align_1)
+ b L(different1)
+
+ .align 4
+L(byte_ne_0):
+ li r7,0
+L(byte_ne_1):
+ subf r9,r9,r7
+ extsw r9,r9
+ b L(ret1)
+
+ .align 4
+L(byte_ne_2):
+ extsw r7,r9
+ mr r9,r8
+ b L(byte_ne_1)
+L(size_reached_0):
+ li r10,0
+L(size_reached_1):
+ subf r9,r9,r10
+ extsw r9,r9
+ b L(ret1)
+L(size_reached_2):
+ extsw r10,r9
+ mr r9,r7
+ b L(size_reached_1)
+L(byte_ne_3):
+ extsw r7,r7
+ b L(byte_ne_1)
+L(byte_ne_4):
+ extsw r10,r9
+ mr r9,r8
+ b L(size_reached_1)
+END(strncmp)
+libc_hidden_builtin_def(strncmp)
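Before moving on, the page-cross guard at the top of the new function is
worth restating: a 16-byte unaligned access starting at s is safe when
s mod 4096 is at most 4096 - 16, given the assumed minimum page size of
4 KiB. A minimal C sketch of that test (illustrative only):

#include <stddef.h>

#define PAGE_SIZE 4096
#define ITER_SIZE 16

/* Nonzero if a 16-byte load at S could touch the next 4 KiB page;
   this matches the rldicl/cmpldi pair at the function entry.  */
static int
may_cross_page (const char *s)
{
  return ((size_t) s % PAGE_SIZE) > (PAGE_SIZE - ITER_SIZE);
}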
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=72607db038df1a1a7987af814aad8d2ed466c45c
commit 72607db038df1a1a7987af814aad8d2ed466c45c
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date: Fri Jan 9 11:56:35 2015 -0500
powerpc: Optimize POWER7 strcmp trailing checks
This patch optimizes the POWER7 strcmp trailing check by avoiding byte
read operations, instead using bitwise operations on the doubleword
already read.
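A hypothetical little-endian C model of the technique follows (a sketch
assuming the NUL's byte index within the current doubleword has already
been located via cmpb; this is not the glibc source): pad both doublewords
with 0xff beyond the terminator, after which one chunk comparison decides
the result without re-reading individual bytes.

#include <stdint.h>

/* k is the byte index (0 = lowest address) of s1's NUL within the
   current doubleword; w1 and w2 are the 8-byte chunks of s1 and s2.  */
static int
tail_compare (uint64_t w1, uint64_t w2, int k)
{
  /* Force every byte past the terminator to 0xff in both words so
     those positions always compare equal.  */
  uint64_t pad = (k < 7) ? (~0ULL << (8 * (k + 1))) : 0;
  w1 |= pad;
  w2 |= pad;
  /* The assembly performs this scan with cmpb and cntlzd; a plain
     byte loop stands in for it here.  */
  for (int i = 0; i < 8; i++)
    {
      unsigned a = (w1 >> (8 * i)) & 0xff;
      unsigned b = (w2 >> (8 * i)) & 0xff;
      if (a != b)
        return (int) a - (int) b;
    }
  return 0;
}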
diff --git a/ChangeLog b/ChangeLog
index b2b1163..79e971e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2015-01-13 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
+ Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/power7/strcmp.S (strcmp): Optimize
+ trailing byte check.
+
2015-01-13 David S. Miller <davem@davemloft.net>
* include/signal.h (__sigreturn): Guard with __USE_MISC.
diff --git a/sysdeps/powerpc/powerpc64/power7/strcmp.S b/sysdeps/powerpc/powerpc64/power7/strcmp.S
index b81080f..6af0e7d 100644
--- a/sysdeps/powerpc/powerpc64/power7/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/power7/strcmp.S
@@ -25,122 +25,96 @@
/* int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]) */
+ .machine power7
EALIGN (strcmp, 4, 0)
CALL_MCOUNT 2
or r9, r3, r4
rldicl. r10, r9, 0, 61 /* are s1 and s2 8 byte aligned..? */
bne cr0, L(process_unaligned_bytes)
+ li r5, 0
+ .align 4
/* process input parameters on double word aligned boundary */
- ld r9, 0(r4) /* load s2 at offset=0 */
- li r10, 0 /* load mask=0 */
- cmpb r10, r9, r10 /* compare bytes at s2 with mask */
- cmpdi cr7, r10, 0 /* is NULL found ..? is end of string HIT */
- bne cr7, L(process_unaligned_bytes) /* process byte by byte */
-
- ld r10, 0(r3) /* load s1 at offset=0 */
- li r8, 0 /* load mask=0 */
- cmpb r8, r10, r8 /* compare bytes at s1 with mask */
- cmpdi cr7, r8, 0 /* is NULL found ..? is end of string HIT */
- bne cr7, L(process_unaligned_bytes) /* process byte by byte */
-
-/*s1 and s2 does not contain NULL now , so compare all 8 bytes in a GO */
- cmpb r9, r10, r9 /* compare s1 and s2 */
- cmpdi cr7, r9, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */
- bne cr7, L(process_unaligned_bytes) /* s1,s2 mismatch found */
-
- addi r5, r3, 8 /* save next offset of s2 */
- addi r11, r4, 8 /* save next offset of s1 */
- ld r8, 8(r4) /* load s2 at offset=8 */
- li r9, 0 /* load mask=0 */
- cmpb r9, r8, r9 /* compare bytes at s2 with mask */
- cmpdi cr7, r9, 0 /* NULL found ..? */
- bne cr7, L(processBytes)/* update input and process bytes one by one */
-
- mr r9, r4 /* save s2 */
- li r10, 0 /* load mask=0 */
-
- ld r7, 8(r3) /* load s1 at offset=8 */
- cmpb r6, r7, r10 /* compare bytes at s1 with mask */
- cmpdi cr7, r6, 0 /* is NULL found */
- bne cr7, L(processBytes)/* mismatch, so process one by one */
-
L(unrollDword):
- cmpb r8, r7, r8 /* compare s1 and s2 */
- cmpdi cr7, r8, -1 /* compare result with 0xFFFFFFFFFFFFFFFF */
- bne cr7, L(processBytes)/* mismatch with s1 and s2 */
-
- addi r5, r3, 16 /* save offset=16 of s1 */
- addi r4, r9, 16 /* save offset=16 of s2 */
- ld r8, 16(r9) /* load s2 at offset=16 */
- cmpb r7, r8, r10 /* compare bytes at s2 with mask */
- cmpdi cr7, r7, 0 /* NULL found ..? */
- bne cr7, L(update2processBytes)
-
- ld r7, 16(r3) /* load s1 at offset=16 */
- cmpb r6, r7, r10 /* check s1 for end of string */
- cmpdi cr7, r6, 0 /* end of s1 ?,then handle byte by byte */
- bne 7,L(update2processBytes)
-
- cmpb r8, r7, r8 /* compare s1 and s2 double words */
- cmpdi cr7, r8, -1 /* compare results with 0xFFFFFFFFFFFFFFFF */
- bne cr7,L(update2processBytes)
-
- addi r5, r3, 24 /* update s1 to offset=24 */
- addi r4, r9, 24 /* update s2 to offset=24 */
-
- ld r8, 24(r9) /* load s2 */
- cmpb r7, r8, r10 /* compare s2 for NULL */
- cmpdi cr7, r7, 0 /* verify if s2 is ending now */
- bne cr7,L(update2processBytes)
-
- ld r7, 24(r3) /* load s1 at offset=24 */
- cmpb r6, r7, r10 /* verify for NULL */
- cmpdi cr7, r6, 0 /* is NULL found */
- bne cr7, L(update2processBytes)
-
- cmpb r8, r7, r8 /* compare s1 and s2 */
- cmpdi cr7, r8, -1 /* are s1 and s2 same ..? */
- bne cr7, L(update2processBytes)
-
- addi r7, r9, 32 /* update s2 to next double word */
- addi r3, r3, 32 /* update s1 to next double word */
-
- ld r8, 32(r9) /* load s2 */
- mr r4, r7 /* save s2 */
- cmpb r6, r8, r10 /* compare s2 with NULL */
- cmpdi cr7, r6, 0 /* end of s2 ..? */
- bne cr7, L(process_unaligned_bytes)
-
- ld r6, 0(r3) /* load and compare s1 for NULL */
- cmpb r5, r6, r10
- cmpdi cr7, r5, 0
- bne cr7, L(process_unaligned_bytes)
-
- cmpb r8, r6, r8 /* compare s1 and s2 */
- cmpdi cr7, r8, -1
- bne cr7, L(process_unaligned_bytes)
-
- addi r5, r3, 8 /* increment s1 and d2 here */
- addi r11, r9, 40
-
- ld r8, 40(r9) /* process s2 now */
- cmpb r9, r8, r10
- cmpdi cr7, r9, 0
- bne cr7, L(processBytes)
-
- mr r9, r7
- ld r7, 8(r3) /* process s1 now */
- cmpb r6, r7, r10
- cmpdi cr7, r6, 0
- beq cr7, L(unrollDword) /* unroll to compare s1 and s2 */
-
-L(processBytes):
- mr r4, r11 /* update input params */
- mr r3, r5
-
- .p2align 4
+ ld r8,0(r3)
+ ld r10,0(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,8(r3)
+ ld r10,8(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,16(r3)
+ ld r10,16(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ ld r8,24(r3)
+ ld r10,24(r4)
+ cmpb r7,r8,r5
+ cmpdi cr7,r7,0
+ mr r9,r7
+ bne cr7,L(null_found)
+ cmpld cr7,r8,r10
+ bne cr7,L(different)
+
+ addi r3, r3, 32
+ addi r4, r4, 32
+ beq cr7, L(unrollDword)
+
+ .align 4
+L(null_found):
+#ifdef __LITTLE_ENDIAN__
+ neg r7,r9
+ and r9,r9,r7
+ li r7,-1
+ cntlzd r9,r9
+ subfic r9,r9,71
+ sld r9,r7,r9
+#else
+ cntlzd r9,r9
+ li r7,-1
+ addi r9,r9,8
+ srd r9,r7,r9
+#endif
+ or r8,r8,r9
+ or r10,r10,r9
+
+L(different):
+ cmpb r9,r8,r10
+#ifdef __LITTLE_ENDIAN__
+ addi r7,r9,1
+ andc r9,r7,r9
+ cntlzd r9,r9
+ subfic r9,r9,63
+#else
+ not r9,r9
+ cntlzd r9,r9
+ subfic r9,r9,56
+#endif
+ srd r3,r8,r9
+ srd r10,r10,r9
+ rldicl r10,r10,0,56
+ rldicl r3,r3,0,56
+ subf r3,r10,r3
+ blr
+
+ .align 4
L(process_unaligned_bytes):
lbz r9, 0(r3) /* load byte from s1 */
lbz r10, 0(r4) /* load byte from s2 */
@@ -172,24 +146,19 @@ L(process_unaligned_bytes):
addi r4, r4, 4 /* increment s2 by unroll factor */
beq cr6, L(process_unaligned_bytes) /* unroll byte processing */
- .p2align 4
+ .align 4
L(ComputeDiff):
extsw r9, r9
subf r10, r10, r9 /* compute s1 - s2 */
extsw r3, r10
blr /* return */
- .p2align 4
+ .align 4
L(diffOfNULL):
li r9, 0
subf r10, r10, r9 /* compute s1 - s2 */
extsw r3, r10 /* sign extend result */
blr /* return */
- .p2align 4
-L(update2processBytes):
- mr r3, r5 /* update and proceed */
- b L(process_unaligned_bytes)
-
END (strcmp)
libc_hidden_builtin_def (strcmp)
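The L(different) sequence above is the heart of the change: cmpb marks the
equal bytes, the leading-zero count of its complement locates the first
mismatching byte, and a shift brings that byte pair into the low-order
position so a plain subtraction produces the return value. A big-endian C
restatement (illustrative; it assumes at least one byte differs, as the
label's precondition guarantees):

#include <stdint.h>

/* w1 and w2 are doublewords in memory order that differ in at least
   one byte (possibly after 0xff padding of the terminator).  */
static int
first_diff_be (uint64_t w1, uint64_t w2)
{
  uint64_t eq = 0;   /* cmpb result: 0xff where bytes match */
  for (int i = 0; i < 8; i++)
    {
      uint64_t m = 0xffULL << (8 * i);
      if ((w1 & m) == (w2 & m))
        eq |= m;
    }
  int lz = __builtin_clzll (~eq);   /* bits before the first mismatch */
  int sh = 56 - lz;                 /* shift that byte to the bottom */
  unsigned a = (w1 >> sh) & 0xff;
  unsigned b = (w2 >> sh) & 0xff;
  return (int) a - (int) b;
}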
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 20 +
NEWS | 4 +-
sysdeps/powerpc/powerpc64/multiarch/Makefile | 5 +-
.../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 2 +
.../{strncmp-power7.S => strncmp-power8.S} | 14 +-
sysdeps/powerpc/powerpc64/multiarch/strncmp.c | 9 +-
sysdeps/powerpc/powerpc64/power7/memcmp.S | 870 +++++++-------------
sysdeps/powerpc/powerpc64/power7/strcmp.S | 197 ++---
sysdeps/powerpc/powerpc64/power8/strncmp.S | 323 ++++++++
9 files changed, 728 insertions(+), 716 deletions(-)
copy sysdeps/powerpc/powerpc64/multiarch/{strncmp-power7.S => strncmp-power8.S} (79%)
create mode 100644 sysdeps/powerpc/powerpc64/power8/strncmp.S
hooks/post-receive
--
GNU C Library master sources