This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
Hi all, Following Alan Modra suggestion, it is a stpcpy optimization patch for PPC64. This patch optimizes the default PPC64 by adding doubleword stores/loads increasing aligned throughput for large sizes. For POWER7 version it also removed unneeded branch prediction and use cmpb instructions instead of the bitwise operation to find string's end. This saved some cycles for both aligned and unaligned cases. Tested on PPC64 power4/power7 and I'm attaching the benchtests output for each case (default master, default optimized, power7 master, and power7 optimized). --- 2013-09-10 Adhemerval Zanella <azanella@linux.vnet.ibm.com> * sysdeps/powerpc/powerpc64/stpcpy.S (__stpcpy): Add doubleword read and write to provide a boost for large inputs. * sysdeps/powerpc/powerpc64/power7/stpcpy.S: New file. -- diff --git a/sysdeps/powerpc/powerpc64/power7/stpcpy.S b/sysdeps/powerpc/powerpc64/power7/stpcpy.S new file mode 100644 index 0000000..3b44df2 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/stpcpy.S @@ -0,0 +1,156 @@ +/* Optimized stpcpy implementation for PowerPC64/POWER7. + Copyright (C) 2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* See strlen.s for comments on how the end-of-string testing works. */ + +/* char * [r3] stpcpy (char *dest [r3], const char *src [r4]) */ + + .machine power7 +EALIGN (__stpcpy, 4, 0) + CALL_MCOUNT 2 + +#define rTMP r0 +#define rRTN r3 /* pointer to previous word/doubleword in dest */ +#define rSRC r4 /* pointer to previous word/doubleword in src */ +#define rMASK r5 /* mark 0xffffffff | 0xffffffffffffffff */ +#define rWORD r6 /* current word from src */ +#define rALT r7 /* alternate word from src */ + + or rTMP, rSRC, rRTN + clrldi. rTMP, rTMP, 61 + bne L(check_word_alignment) + +/* For doubleword aligned memory, operate using doubleword load and stores. */ + li rMASK, 0 + addi rRTN, rRTN, -8 + ld rWORD, 0(rSRC) + b L(g2) + + .align 4 +L(g0): ldu rALT, 8(rSRC) + stdu rWORD, 8(rRTN) + cmpb rTMP, rALT, rMASK + cmpdi rTMP, 0 + bne L(g1) + ldu rWORD, 8(rSRC) + stdu rALT, 8(rRTN) +L(g2): cmpb rTMP, rWORD, rMASK + cmpdi rTMP, 0 /* If rTMP is 0, no null's have been found. */ + beq L(g0) + + mr rALT, rWORD +/* We've hit the end of the string. Do the rest byte-by-byte. */ +L(g1): + extrdi. rTMP, rALT, 8, 0 + stbu rTMP, 8(rRTN) + beqlr + extrdi. rTMP, rALT, 8, 8 + stbu rTMP, 1(rRTN) + beqlr + extrdi. rTMP, rALT, 8, 16 + stbu rTMP, 1(rRTN) + beqlr + extrdi. rTMP, rALT, 8, 24 + stbu rTMP, 1(rRTN) + beqlr + extrdi. rTMP, rALT, 8, 32 + stbu rTMP, 1(rRTN) + beqlr + extrdi. rTMP, rALT, 8, 40 + stbu rTMP, 1(rRTN) + beqlr + extrdi. rTMP, rALT, 8, 48 + stbu rTMP, 1(rRTN) + beqlr + stbu rALT, 1(rRTN) + blr + +L(check_word_alignment): + clrldi. rTMP, rTMP, 62 + bne L(unaligned) + +/* For word aligned memory, operate using word load and + stores. */ + li rMASK, 0 + addi rRTN, rRTN, -4 + lwz rWORD, 0(rSRC) + b L(g5) + + .align 4 +L(g3): lwzu rALT, 4(rSRC) + stwu rWORD, 4(rRTN) + cmpb rTMP, rALT, rMASK + cmpwi rTMP, 0 + bne L(g4) + lwzu rWORD, 4(rSRC) + stwu rALT, 4(rRTN) +L(g5): cmpb rTMP, rWORD, rMASK + cmpwi rTMP, 0 /* If rTMP is 0, no null in word. */ + beq L(g3) + + mr rALT, rWORD +/* We've hit the end of the string. Do the rest byte-by-byte. */ +L(g4): + rlwinm. rTMP, rALT, 8, 24, 31 + stbu rTMP, 4(rRTN) + beqlr + rlwinm. rTMP, rALT, 16, 24, 31 + stbu rTMP, 1(rRTN) + beqlr + rlwinm. rTMP, rALT, 24, 24, 31 + stbu rTMP, 1(rRTN) + beqlr + stbu rALT, 1(rRTN) + blr + +/* Oh well. In this case, we just do a byte-by-byte copy. */ + .align 4 +L(unaligned): + lbz rWORD, 0(rSRC) + addi rRTN, rRTN, -1 + cmpdi rWORD, 0 + beq L(u2) + + .align 5 +L(u0): lbzu rALT, 1(rSRC) + stbu rWORD, 1(rRTN) + cmpdi rALT, 0 + beq L(u1) + lbzu rWORD, 1(rSRC) + stbu rALT, 1(rRTN) + cmpdi rWORD, 0 + beq L(u2) + lbzu rALT, 1(rSRC) + stbu rWORD, 1(rRTN) + cmpdi rALT, 0 + beq L(u1) + lbzu rWORD, 1(rSRC) + stbu rALT, 1(rRTN) + cmpdi rWORD, 0 + bne L(u0) +L(u2): stbu rWORD, 1(rRTN) + blr +L(u1): stbu rALT, 1(rRTN) + blr +END (__stpcpy) + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/sysdeps/powerpc/powerpc64/stpcpy.S b/sysdeps/powerpc/powerpc64/stpcpy.S index 070cd46..d3db070 100644 --- a/sysdeps/powerpc/powerpc64/stpcpy.S +++ b/sysdeps/powerpc/powerpc64/stpcpy.S @@ -26,35 +26,40 @@ EALIGN (__stpcpy, 4, 0) CALL_MCOUNT 2 #define rTMP r0 -#define rRTN r3 -#define rDEST r3 /* pointer to previous word in dest */ -#define rSRC r4 /* pointer to previous word in src */ -#define rWORD r6 /* current word from src */ -#define rFEFE r7 /* 0xfefefeff */ -#define r7F7F r8 /* 0x7f7f7f7f */ -#define rNEG r9 /* ~(word in src | 0x7f7f7f7f) */ -#define rALT r10 /* alternate word from src */ - - or rTMP, rSRC, rDEST - clrldi. rTMP, rTMP, 62 - addi rDEST, rDEST, -4 - bne L(unaligned) +#define rRTN r3 /* pointer to previous word/doubleword in dest */ +#define rSRC r4 /* pointer to previous word/doubleword in src */ +#define rWORD r6 /* current word from src */ +#define rFEFE r7 /* constant 0xfefefeff | 0xfefefefefefefeff */ +#define r7F7F r8 /* constant 0x7f7f7f7f | 0x7f7f7f7f7f7f7f7f */ +#define rNEG r9 /* ~(word in s1 | r7F7F) */ +#define rALT r10 /* alternate word from src */ + + or rTMP, rSRC, rRTN + clrldi. rTMP, rTMP, 61 + bne L(check_word_alignment) + +/* For doubleword aligned memory, operate using doubleword load and + stores. */ + addi rRTN, rRTN, -8 lis rFEFE, -0x101 lis r7F7F, 0x7f7f - lwz rWORD, 0(rSRC) + ld rWORD, 0(rSRC) addi rFEFE, rFEFE, -0x101 addi r7F7F, r7F7F, 0x7f7f + sldi rTMP, rFEFE, 32 + insrdi r7F7F, r7F7F, 32, 0 + add rFEFE, rFEFE, rTMP b L(g2) -L(g0): lwzu rALT, 4(rSRC) - stwu rWORD, 4(rDEST) +L(g0): ldu rALT, 8(rSRC) + stdu rWORD, 8(rRTN) add rTMP, rFEFE, rALT nor rNEG, r7F7F, rALT and. rTMP, rTMP, rNEG bne- L(g1) - lwzu rWORD, 4(rSRC) - stwu rALT, 4(rDEST) + ldu rWORD, 8(rSRC) + stdu rALT, 8(rRTN) L(g2): add rTMP, rFEFE, rWORD nor rNEG, r7F7F, rWORD and. rTMP, rTMP, rNEG @@ -62,39 +67,95 @@ L(g2): add rTMP, rFEFE, rWORD mr rALT, rWORD /* We've hit the end of the string. Do the rest byte-by-byte. */ -L(g1): rlwinm. rTMP, rALT, 8, 24, 31 - stbu rTMP, 4(rDEST) +L(g1): + extrdi. rTMP, rALT, 8, 0 + stbu rTMP, 8(rRTN) + beqlr- + extrdi. rTMP, rALT, 8, 8 + stbu rTMP, 1(rRTN) + beqlr- + extrdi. rTMP, rALT, 8, 16 + stbu rTMP, 1(rRTN) beqlr- - rlwinm. rTMP, rALT, 16, 24, 31 - stbu rTMP, 1(rDEST) + extrdi. rTMP, rALT, 8, 24 + stbu rTMP, 1(rRTN) beqlr- - rlwinm. rTMP, rALT, 24, 24, 31 - stbu rTMP, 1(rDEST) + extrdi. rTMP, rALT, 8, 32 + stbu rTMP, 1(rRTN) + beqlr + extrdi. rTMP, rALT, 8, 40 + stbu rTMP, 1(rRTN) beqlr- - stbu rALT, 1(rDEST) + extrdi. rTMP, rALT, 8, 48 + stbu rTMP, 1(rRTN) + beqlr- + stbu rALT, 1(rRTN) blr +L(check_word_alignment): + clrldi. rTMP, rTMP, 62 + bne L(unaligned) + +/* For word aligned memory, operate using word load and + stores. */ + addi rRTN, rRTN, -4 + + lis rFEFE, -0x101 + lis r7F7F, 0x7f7f + lwz rWORD, 0(rSRC) + addi rFEFE, rFEFE, -0x101 + addi r7F7F, r7F7F, 0x7f7f + b L(g5) + +L(g3): lwzu rALT, 4(rSRC) + stwu rWORD, 4(rRTN) + add rTMP, rFEFE, rALT + nor rNEG, r7F7F, rALT + and. rTMP, rTMP, rNEG + bne- L(g4) + lwzu rWORD, 4(rSRC) + stwu rALT, 4(rRTN) +L(g5): add rTMP, rFEFE, rWORD + nor rNEG, r7F7F, rWORD + and. rTMP, rTMP, rNEG + beq+ L(g3) + + mr rALT, rWORD +/* We've hit the end of the string. Do the rest byte-by-byte. */ +L(g4): + rlwinm. rTMP, rALT, 8, 24, 31 + stbu rTMP, 4(rRTN) + beqlr- + rlwinm. rTMP, rALT, 16, 24, 31 + stbu rTMP, 1(rRTN) + beqlr- + rlwinm. rTMP, rALT, 24, 24, 31 + stbu rTMP, 1(rRTN) + beqlr- + stbu rALT, 1(rRTN) + blr + /* Oh well. In this case, we just do a byte-by-byte copy. */ .align 4 nop L(unaligned): lbz rWORD, 0(rSRC) - addi rDEST, rDEST, 3 + addi rRTN, rRTN, -1 cmpwi rWORD, 0 beq- L(u2) L(u0): lbzu rALT, 1(rSRC) - stbu rWORD, 1(rDEST) + stbu rWORD, 1(rRTN) cmpwi rALT, 0 beq- L(u1) nop /* Let 601 load start of loop. */ lbzu rWORD, 1(rSRC) - stbu rALT, 1(rDEST) + stbu rALT, 1(rRTN) cmpwi rWORD, 0 bne+ L(u0) -L(u2): stbu rWORD, 1(rDEST) +L(u2): stbu rWORD, 1(rRTN) blr -L(u1): stbu rALT, 1(rDEST) +L(u1): stbu rALT, 1(rRTN) blr END (__stpcpy)
Attachment:
bench-stpcpy-master-power4.out
Description: Text document
Attachment:
bench-stpcpy-master-power7.out
Description: Text document
Attachment:
bench-stpcpy-patch-power4.out
Description: Text document
Attachment:
bench-stpcpy-patch-power7.out
Description: Text document
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |