This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
>From 2784371b29cf426603438fb5cac2b2a1f41940bd Mon Sep 17 00:00:00 2001 From: Vidya Ranganathan <vidya@linux.vnet.ibm.com> Date: Thu, 27 Feb 2014 09:34:10 -0500 Subject: [PATCH] Optimization for strpbrk() on ppc64 and ppc64le. I have attached the benchtest output to show the performance improvement. The optimization is achieved by the following techniques: 1. aligned memory access 2. loop unrolling, from which POWER7 gains performance 3. CPU pre-fetch to avoid cache misses ChangeLog: 2014-02-27 Vidya Ranganathan <vidya@linux.vnet.ibm.com> * sysdeps/powerpc/powerpc64/power7/strpbrk.S: New file: Optimization. * sysdeps/powerpc/powerpc64/multiarch/strpbrk.c: New file: multiarch strpbrk for PPC64. * sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c: New file. * sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S: New file. * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Add strpbrk multiarch implementations. * sysdeps/powerpc/powerpc64/multiarch/Makefile: Add strpbrk multiarch optimizations. * string/strpbrk.c (strpbrk): Use STRPBRK macro so the symbol name can be redefined. 
Signed-off-by: Vidya Ranganathan <vidya@linux.vnet.ibm.com> --- string/strpbrk.c | 6 +- sysdeps/powerpc/powerpc64/multiarch/Makefile | 3 +- .../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 8 ++ .../powerpc/powerpc64/multiarch/strpbrk-power7.S | 40 ++++++ .../powerpc/powerpc64/multiarch/strpbrk-ppc64.c | 30 +++++ sysdeps/powerpc/powerpc64/multiarch/strpbrk.c | 31 +++++ sysdeps/powerpc/powerpc64/power7/strpbrk.S | 144 +++++++++++++++++++++ 7 files changed, 260 insertions(+), 2 deletions(-) create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strpbrk.c create mode 100644 sysdeps/powerpc/powerpc64/power7/strpbrk.S diff --git a/string/strpbrk.c b/string/strpbrk.c index ce33b68..2955644 100644 --- a/string/strpbrk.c +++ b/string/strpbrk.c @@ -25,9 +25,13 @@ #undef strpbrk +#ifndef STRPBRK +#define STRPBRK strpbrk +#endif + /* Find the first occurrence in S of any character in ACCEPT. 
*/ char * -strpbrk (s, accept) +STRPBRK (s, accept) const char *s; const char *accept; { diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile index 3c47316..5fe8a0c 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile @@ -13,7 +13,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \ wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \ wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \ wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \ - strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 + strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \ + strpbrk-power7 strpbrk-ppc64 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c index 6bbdd4e..96ec23b 100644 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c @@ -238,5 +238,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, wcscpy, 1, __wcscpy_ppc)) + /* Support sysdeps/powerpc/powerpc64/multiarch/strpbrk.c. */ + IFUNC_IMPL (i, name, strpbrk, + IFUNC_IMPL_ADD (array, i, strpbrk, + hwcap & PPC_FEATURE_HAS_VSX, + __strpbrk_power7) + IFUNC_IMPL_ADD (array, i, strpbrk, 1, + __strpbrk_ppc)) + return i; } diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S new file mode 100644 index 0000000..1f5eb99 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strpbrk-power7.S @@ -0,0 +1,40 @@ +/* Optimized strpbrk implementation for POWER7. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +#undef EALIGN +#define EALIGN(name, alignt, words) \ + .section ".text"; \ + ENTRY_2(__strpbrk_power7) \ + .align ALIGNARG(alignt); \ + EALIGN_W_##words; \ + BODY_LABEL(__strpbrk_power7): \ + cfi_startproc; \ + LOCALENTRY(__strpbrk_power7) + +#undef END +#define END(name) \ + cfi_endproc; \ + TRACEBACK(__strpbrk_power7) \ + END_2(__strpbrk_power7) + +#undef libc_hidden_builtin_def +#define libc_hidden_builtin_def(name) + +#include <sysdeps/powerpc/powerpc64/power7/strpbrk.S> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c b/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c new file mode 100644 index 0000000..b15cdca --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strpbrk-ppc64.c @@ -0,0 +1,30 @@ +/* Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/ >. */ + +#include <string.h> + +#define STRPBRK __strpbrk_ppc +#ifdef SHARED + +# undef libc_hidden_builtin_def +# define libc_hidden_builtin_def(name) \ + __hidden_ver1 (__strpbrk_ppc, __GI_strpbrk, __strpbrk_ppc); +#endif + +extern __typeof (strpbrk) __strpbrk_ppc attribute_hidden; + +#include <string/strpbrk.c> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c b/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c new file mode 100644 index 0000000..778a5fb --- /dev/null +++ b/sysdeps/powerpc/powerpc64/multiarch/strpbrk.c @@ -0,0 +1,31 @@ +/* Multiple versions of strpbrk. PowerPC64 version. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef NOT_IN_libc +# include <string.h> +# include <shlib-compat.h> +# include "init-arch.h" + +extern __typeof (strpbrk) __strpbrk_ppc attribute_hidden; +extern __typeof (strpbrk) __strpbrk_power7 attribute_hidden; + +libc_ifunc (strpbrk, + (hwcap & PPC_FEATURE_HAS_VSX) + ? 
__strpbrk_power7 + : __strpbrk_ppc); +#endif diff --git a/sysdeps/powerpc/powerpc64/power7/strpbrk.S b/sysdeps/powerpc/powerpc64/power7/strpbrk.S new file mode 100644 index 0000000..ec3b09d --- /dev/null +++ b/sysdeps/powerpc/powerpc64/power7/strpbrk.S @@ -0,0 +1,144 @@ +/* Optimized strpbrk implementation for PowerPC64/POWER7. + + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* char* [r3] strpbrk (const char *str [r3], + const char *accept [r4]) */ + +/* Performance gains are obtained through the following techniques: + CPU pre-fetch to avoid cache misses + loop unrolling, from which POWER7 gains performance + aligned memory access */ + +#include <sysdep.h> + +#undef strpbrk + + .machine power7 +EALIGN(strpbrk, 4, 0) + CALL_MCOUNT 2 + + mflr r0 /* load link register LR to r0 */ + +/* We shall use r14, r15 and r16; save these callee-saved registers + in the GPR save area. 
*/ + std r14,-24(r1) /* save caller's r14 */ + std r15,-16(r1) /* save caller's r15 */ + std r16,-8(r1) /* save caller's r16 */ + + std r0,16(r1) /* store the link register */ + stdu r1,-144(r1) /* create stack frame */ + +/* improve performance with CPU pre-fetch */ + dcbt 0, r3 /* pre-fetch str to avoid cache miss */ + dcbt 0, r4 /* pre-fetch accept to avoid cache miss */ + + mr r14,r3 /* move str[r3] to r14 */ + mr r16,r4 /* move accept[r4] to r16 */ + bl strlen /* compute length of str */ + nop /* placeholder after the call: the linker may replace it with a TOC restore */ + + cmpdi cr7,r3,0 /* compare length of str with zero */ + beq cr7,L(retNULL) /* if length of str=0, branch to return NULL */ + + mr r15,r3 /* r15 = length of str */ + mr r3,r16 /* now, r3 is accept */ + bl strlen /* compute length of accept */ + nop /* placeholder after the call: the linker may replace it with a TOC restore */ + + add r15,r14,r15 /* r15 = str + strlen(str), the end address of str */ + cmpdi cr6,r3,0 /* check whether len(accept) is zero */ + beq cr6,L(retNULL) /* if length of accept=0, branch to return NULL */ + +L(innerLoop): + lbz r10,0(r14) /* load r10 with the current byte of str */ + li r9,0 /* initialize index */ + b L(checkUnroll) + +/* unrolling is a small loop with very few instructions; + align it to 16 bytes. */ + .p2align 4 + +/* POWER7 gains performance with loop unrolling; + so, unroll here. 
*/ +L(unrolling): + cmpw cr7,r7,r10 /* compare str[curr] with accept[index] */ + addi r9,r9,3 /* increment index by 3 for better performance */ + beq cr7,L(update) + + lbz r7,1(r6) /* load r7 with next indexed value */ + cmpw cr7,r7,r10 /* compare str[curr] with accept[index] */ + beq cr7,L(update) + + lbz r7,2(r6) /* load r7 with next to next indexed value */ + cmpw cr7,r7,r10 /* compare str[curr] with accept[index] */ + beq cr7,L(update) + + lbzx r7,r16,r9 /* load byte zero indexed at r9 */ + cmpw cr7,r7,r10 /* compare str[curr] with accept[index] */ + beq cr7,L(update) + +L(loopNow): + addi r9,r9,1 /* increment index[r9] by 1 */ + cmpld cr7,r3,r9 /* compare index[r9] with strlen(accept)[r3] */ + nop + ble cr7,L(outerLoop) + +L(checkUnroll): + subf r7,r9,r3 /* left to process[r7]=strlen(accept)[r3] - index[r9] */ + add r6,r16,r9 /* increment accept by index and store in r6 */ + cmpldi cr7,r7,3 /* left to process = strlen(accept) - index + if length to process[r7] is > 3 ; + perform loop unrolling */ + lbzx r7,r16,r9 + nop + bgt cr7,L(unrolling) + + /* if unrolling is not applied, process byte-by-byte */ + cmpw cr7,r7,r10 /* compare accept[index] with str[curr] */ + nop + bne cr7,L(loopNow) + +L(update): + mr r3,r14 /* update return value */ + +/* we are done, return now */ +L(done): + addi r1,r1,144 /* restore stack pointer*/ + ld r0,16(r1) /* read the saved link register */ + ld r14,-24(r1) /* restore callers save register, r14 */ + ld r15,-16(r1) /* restore callers save register, r15 */ + ld r16,-8(r1) /* restore callers save register, r16 */ + mtlr r0 /* restore link register */ + blr /* branch to link register */ + +/* outerLoop is a small loop with very few instructions align it to 4-bytes. 
*/ + .p2align 2 + +L(outerLoop): + addi r14,r14,1 /* increment str(curr)[r14] */ + cmpld cr7,r14,r15 /* verify end of str */ + bne cr7,L(innerLoop) /* continue to process further, else return NULL */ + +L(retNULL): + li r3,0 /* return r3 = NULL */ + b L(done) /* branch to return */ + +END(strpbrk) + +libc_hidden_builtin_def (strpbrk) -- 1.8.3.1
Attachment:
bench-strpbrk.ppc64.out
Description: Text document
Attachment:
bench-strpbrk.ppc64le.out
Description: Text document
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |