This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch master updated. glibc-2.20-534-g8bedcb5
- From: azanella at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 13 Jan 2015 16:52:54 -0000
- Subject: GNU C Library master sources branch master updated. glibc-2.20-534-g8bedcb5
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 8bedcb5f03c62bf6001396dafdd82fbd4da7c2db (commit)
via f06a4faf8a2b4d046eb40e94b47948cc47d79902 (commit)
via 9f2f36e5a91c2ce6edba5415e176155eb1008ae1 (commit)
via 94c9680945369d63ef9ed266a29f28ebaaaeb5ce (commit)
via 96d6fd6c4060d739abb1822e7ad633af749532b2 (commit)
from 0f9e585480edcdf1e30dc3d79e24b84aeee516fa (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=8bedcb5f03c62bf6001396dafdd82fbd4da7c2db
commit 8bedcb5f03c62bf6001396dafdd82fbd4da7c2db
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Wed Jan 7 07:18:30 2015 -0500
powerpc: Optimized strcmp for POWER8/PPC64
This patch adds an optimized POWER8 strcmp using unaligned accesses.
The algorithm first checks the initial 16 bytes, then aligns the first
function source and uses unaligned loads on the second argument only.
Additional checks for page boundaries are done for unaligned cases.
diff --git a/ChangeLog b/ChangeLog
index 20aded4..3fa5e3b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,16 @@
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+ Add strcmp-power8 object.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add __strcmp_power8 implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S: New file.
+ * sysdeps/powerpc/powerpc64/multiarch/strcmp.c (strcmp): Add
+ __strcmp_power8 implementation.
+ * sysdeps/powerpc/powerpc64/power8/strcmp.S: New file.
+ * NEWS: Update.
+
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
Add strncpy-power8 and stpncpy-power8 objects.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __strncpy_power8 and stpncpy_power8
diff --git a/NEWS b/NEWS
index 08b3daa..e9f5034 100644
--- a/NEWS
+++ b/NEWS
@@ -19,8 +19,9 @@ Version 2.21
17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
17791, 17793, 17796, 17797, 17803, 17806, 17834
-* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for
+* Optimized strcpy, stpcpy, strncpy, stpncpy, and strcmp implementations for
powerpc64/powerpc64le.
+ Implemented by Adhemerval Zanella (IBM).
* Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
and powerpc64le. This may improve lock scaling of existing programs on
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 18d3378..ec4fca7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -18,7 +18,7 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
- strcmp-power7 strcmp-ppc64 \
+ strcmp-power8 strcmp-power7 strcmp-ppc64 \
strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
memmove-ppc64 bcopy-ppc64 strncpy-power8
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 132cb13..2c03060 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strcmp_power8)
+ IFUNC_IMPL_ADD (array, i, strcmp,
hwcap & PPC_FEATURE_HAS_VSX,
__strcmp_power7)
IFUNC_IMPL_ADD (array, i, strcmp, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcmp.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
index 9b2922f..dc4bfac 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcmp. PowerPC64 version.
- Copyright (C) 2014-2015 Free Software Foundation, Inc.
+/* Optimized strcmp implementation for POWER8/PPC64.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if defined SHARED && IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
-extern __typeof (strcmp) __strcmp_ppc attribute_hidden;
-extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__strcmp_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strcmp_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strcmp_power8)
-libc_ifunc (strcmp,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcmp_power7
- : __strcmp_ppc);
-#endif
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strcmp_power8) \
+ END_2(__strcmp_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strcmp.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index 9b2922f..b45ba1f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -23,9 +23,12 @@
extern __typeof (strcmp) __strcmp_ppc attribute_hidden;
extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
+extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
libc_ifunc (strcmp,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcmp_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strcmp_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strcmp_power7
: __strcmp_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/power8/strcmp.S b/sysdeps/powerpc/powerpc64/power8/strcmp.S
new file mode 100644
index 0000000..223d891
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcmp.S
@@ -0,0 +1,257 @@
+/* Optimized strcmp implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* Implements the function
+
+ int [r3] strcmp (const char *s1 [r3], const char *s2 [r4])
+
+ The implementation uses unaligned doubleword access to avoid specialized
+ code paths depending on data alignment. Although recent powerpc64 uses
+ 64K as default, the page cross handling assumes minimum page size of
+ 4k. */
+
+EALIGN (strcmp, 4, 0)
+ li r0,0 /* r0 = 0; every cmpb below compares s1 bytes against it. */
+
+ /* Check if [s1]+32 or [s2]+32 will cross a 4K page boundary using
+ the code:
+
+ (((size_t) s1) % PAGE_SIZE > (PAGE_SIZE - ITER_SIZE))
+
+ with PAGE_SIZE being 4096 and ITER_SIZE being 32. */
+
+ rldicl r7,r3,0,52 /* r7 = s1 % 4096. */
+ rldicl r9,r4,0,52 /* r9 = s2 % 4096. */
+ cmpldi cr7,r7,4096-32
+ bgt cr7,L(pagecross_check)
+ cmpldi cr5,r9,4096-32
+ bgt cr5,L(pagecross_check)
+
+ /* For short string up to 32 bytes, load both s1 and s2 using
+ unaligned dwords and compare. */
+ ld r8,0(r3)
+ ld r10,0(r4)
+ cmpb r12,r8,r0 /* r12: 0xff in each byte of the s1 dword that is zero. */
+ cmpb r11,r8,r10 /* r11: 0xff in each byte where s1 and s2 are equal. */
+ orc. r9,r12,r11 /* r9 = r12 | ~r11: nonzero on a null or a mismatch. */
+ bne cr0,L(different_nocmpb)
+
+ ld r8,8(r3)
+ ld r10,8(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,16(r3)
+ ld r10,16(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,24(r3)
+ ld r10,24(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ addi r7,r3,32 /* r7 becomes the running s1 pointer. */
+ addi r4,r4,32
+
+L(align_8b):
+ /* Now it has checked for first 32 bytes, align source1 to doubleword
+ and adjust source2 address. */
+ rldicl r9,r7,0,61 /* source1 alignment to doubleword */
+ subf r4,r9,r4 /* Adjust source2 address based on source1
+ alignment. */
+ rldicr r7,r7,0,60 /* Align source1 to doubleword. */
+
+ /* At this point, source1 alignment is 0 and source2 alignment is
+ between 0 and 7. Check if source2 alignment is 0, meaning both
+ sources have the same alignment. */
+ andi. r9,r4,0x7
+ bne cr0,L(loop_diff_align)
+
+ /* If both source1 and source2 are doubleword aligned, there is no
+ need for page boundary cross checks. */
+
+ ld r8,0(r7)
+ ld r10,0(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ .align 4
+L(loop_equal_align):
+ ld r8,8(r7) /* Offsets 8, 16 and 24: three doublewords per iteration. */
+ ld r10,8(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ld r8,16(r7)
+ ld r10,16(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ ldu r8,24(r7) /* Load with update: r7 advances by 24. */
+ ldu r10,24(r4) /* Load with update: r4 advances by 24. */
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+
+ b L(loop_equal_align)
+
+ /* A zero byte was found in r8 (s1 dword), r9 contains the cmpb
+ result and r10 the dword from s2. The code isolates the bytes
+ up to end (including the '\0'), masking with 0xFF the remaining
+ ones:
+
+ #if __LITTLE_ENDIAN__
+ (__builtin_ffsl (x) - 1) = counting trailing zero bits
+ r9 = (__builtin_ffsl (r9) - 1) + 8;
+ r9 = -1UL << r9
+ #else
+ r9 = __builtin_clzl (r9) + 8;
+ r9 = -1UL >> r9
+ #endif
+ r8 = r8 | r9
+ r10 = r10 | r9 */
+
+#ifdef __LITTLE_ENDIAN__
+ nor r9,r9,r9 /* NOTE(review): follows an unconditional branch and has no label, so it looks unreachable - confirm. */
+L(different_nocmpb):
+ neg r3,r9
+ and r9,r9,r3 /* Isolate the lowest set bit of r9. */
+ cntlzd r9,r9
+ subfic r9,r9,63 /* r9 = 63 - leading zeros: shift amount for the srd below. */
+#else
+ not r9,r9 /* NOTE(review): follows an unconditional branch and has no label, so it looks unreachable - confirm. */
+L(different_nocmpb):
+ cntlzd r9,r9
+ subfic r9,r9,56 /* r9 = 56 - leading zeros: shift amount for the srd below. */
+#endif
+ srd r3,r8,r9 /* Shift the s1 dword right by r9. */
+ srd r10,r10,r9 /* Shift the s2 dword right by r9. */
+ rldicl r10,r10,0,56 /* Keep only the low byte of the s2 value. */
+ rldicl r3,r3,0,56 /* Keep only the low byte of the s1 value. */
+ subf r3,r10,r3 /* Result = s1 byte - s2 byte. */
+ extsw r3,r3
+ blr
+
+ .align 4
+L(pagecross_check):
+ subfic r9,r9,4096 /* r9 = 4096 - (s2 % 4096). */
+ subfic r7,r7,4096 /* r7 = 4096 - (s1 % 4096). */
+ cmpld cr7,r7,r9
+ bge cr7,L(pagecross)
+ mr r7,r9 /* r7 = the larger of the two distances. */
+
+ /* If the unaligned reads would cross a 4K page boundary, it uses
+ a simple byte-by-byte comparison until the page alignment for s1
+ is reached. */
+L(pagecross):
+ add r7,r3,r7 /* r7 = s1 + count: s1 position once the byte loop ends. */
+ subf r9,r3,r7 /* r9 = r7 - r3, i.e. the byte count again. */
+ mtctr r9 /* CTR = number of bytes to compare one at a time. */
+
+ .align 4
+L(pagecross_loop):
+ /* Loads a byte from s1 and s2, compare if *s1 is equal to *s2
+ and if *s1 is '\0'. */
+ lbz r9,0(r3)
+ lbz r10,0(r4)
+ addi r3,r3,1
+ addi r4,r4,1
+ cmplw cr7,r9,r10
+ cmpdi cr5,r9,0 /* Immediate operand: compare *s1 against 0. */
+ bne cr7,L(pagecross_ne)
+ beq cr5,L(pagecross_nullfound)
+ bdnz L(pagecross_loop)
+ b L(align_8b)
+
+ .align 4
+ /* The unaligned read of source2 will cross a 4K page boundary,
+ and the different byte or null byte may be in the remaining page
+ bytes. Since it can not use the unaligned load, the algorithm
+ reads and compares 8 bytes to keep source1 doubleword aligned. */
+L(check_source2_byte):
+ li r9,8
+ mtctr r9 /* CTR = 8 byte comparisons. */
+
+ .align 4
+L(check_source2_byte_loop):
+ lbz r9,0(r7)
+ lbz r10,0(r4)
+ addi r7,r7,1
+ addi r4,r4,1
+ cmplw cr7,r9,r10 /* Compare the s1 byte with the s2 byte. */
+ cmpdi cr5,r9,0 /* Check the s1 byte for '\0'. */
+ bne cr7,L(pagecross_ne)
+ beq cr5,L(pagecross_nullfound)
+ bdnz L(check_source2_byte_loop)
+
+ /* If source2 is unaligned to doubleword, the code needs to check
+ on each iteration if the unaligned doubleword access will cross
+ a 4k page boundary. */
+ .align 5
+L(loop_unaligned):
+ ld r8,0(r7)
+ ld r10,0(r4)
+ cmpb r12,r8,r0
+ cmpb r11,r8,r10
+ orc. r9,r12,r11
+ bne cr0,L(different_nocmpb)
+ addi r7,r7,8
+ addi r4,r4,8
+
+L(loop_diff_align):
+ /* Check if [src2]+8 cross a 4k page boundary:
+
+ srcin2 % PAGE_SIZE > (PAGE_SIZE - 8)
+
+ with PAGE_SIZE being 4096. */
+ rldicl r9,r4,0,52 /* r9 = source2 % 4096. */
+ cmpldi cr7,r9,4088
+ ble cr7,L(loop_unaligned)
+ b L(check_source2_byte)
+
+ .align 4
+L(pagecross_ne):
+ extsw r3,r9 /* r3 = byte loaded from s1. */
+ mr r9,r10 /* r9 = byte loaded from s2. */
+L(pagecross_retdiff):
+ subf r9,r9,r3 /* r9 = r3 - r9. */
+ extsw r3,r9
+ blr
+
+ .align 4
+L(pagecross_nullfound):
+ li r3,0 /* Reached only when the s1 byte in r9 is 0, so r3 - r9 is 0. */
+ b L(pagecross_retdiff)
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f06a4faf8a2b4d046eb40e94b47948cc47d79902
commit f06a4faf8a2b4d046eb40e94b47948cc47d79902
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Wed Dec 31 11:47:41 2014 -0500
powerpc: Optimized st{r,p}ncpy for POWER8/PPC64
This patch adds an optimized POWER8 st{r,p}ncpy using unaligned accesses.
It shows 10%-80% improvement over the optimized POWER7 one that uses
only aligned accesses, especially on unaligned inputs.
The algorithm first reads and checks 16 bytes (if inputs do not cross a 4K
page size). Then it realigns source to 16 bytes and issues a 16-byte read
and compare loop to speed up null byte checks for large strings. Also,
different from POWER7 optimization, the null pad is done inline in the
implementation using possible unaligned accesses, instead of relying on
a memset call. Special case is added for page cross reads.
diff --git a/ChangeLog b/ChangeLog
index 16199e3..20aded4 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,20 @@
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]:
+ Add strncpy-power8 and stpncpy-power8 objects.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add __strncpy_power8 and stpncpy_power8
+ implementations.
+ * sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S: New file.
+ * sysdeps/powerpc/powerpc64/multiarch/stpncpy.c (__stpncpy): Add
+ __stpncpy_power8 implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S: New file.
+ * sysdeps/powerpc/powerpc64/multiarch/strncpy.c (strncpy): Add
+ __strncpy_power8 implementation.
+ * sysdeps/powerpc/powerpc64/power8/stpncpy.S: New file.
+ * sysdeps/powerpc/powerpc64/power8/strncpy.S: New file.
+ * NEWS: Update.
+
* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
* sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
* sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
diff --git a/NEWS b/NEWS
index e020918..08b3daa 100644
--- a/NEWS
+++ b/NEWS
@@ -19,7 +19,8 @@ Version 2.21
17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
17791, 17793, 17796, 17797, 17803, 17806, 17834
-* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+* Optimized strcpy, stpcpy, strncpy, stpncpy implementations for
+ powerpc64/powerpc64le.
* Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
and powerpc64le. This may improve lock scaling of existing programs on
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 74b2daa..18d3378 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -17,9 +17,10 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
stpcpy-power7 stpcpy-ppc64 \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
- stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
+ stpncpy-power8 stpncpy-power7 stpncpy-ppc64 \
+ strcmp-power7 strcmp-ppc64 \
strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
- memmove-ppc64 bcopy-ppc64
+ memmove-ppc64 bcopy-ppc64 strncpy-power8
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index dbb21fd..132cb13 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -279,6 +279,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
IFUNC_IMPL_ADD (array, i, strncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strncpy_power8)
+ IFUNC_IMPL_ADD (array, i, strncpy,
hwcap & PPC_FEATURE_HAS_VSX,
__strncpy_power7)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
@@ -287,6 +290,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
IFUNC_IMPL_ADD (array, i, stpncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __stpncpy_power8)
+ IFUNC_IMPL_ADD (array, i, stpncpy,
hwcap & PPC_FEATURE_HAS_VSX,
__stpncpy_power7)
IFUNC_IMPL_ADD (array, i, stpncpy, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
similarity index 55%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
index 9e5a270..d5d835d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
- Copyright (C) 2014-2015 Free Software Foundation, Inc.
+/* Optimized stpncpy implementation for POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,24 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+#define USE_AS_STPNCPY
-libc_ifunc (__stpncpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __stpncpy_power7
- : __stpncpy_ppc);
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__stpncpy_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__stpncpy_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__stpncpy_power8)
-weak_alias (__stpncpy, stpncpy)
-#endif
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__stpncpy_power8) \
+ END_2(__stpncpy_power8)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 9e5a270..0f4072f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -23,10 +23,13 @@
extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
libc_ifunc (__stpncpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __stpncpy_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __stpncpy_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __stpncpy_power7
: __stpncpy_ppc);
weak_alias (__stpncpy, stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
index 9e5a270..ed906a4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
- Copyright (C) 2014-2015 Free Software Foundation, Inc.
+/* Optimized strncpy implementation for POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,25 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__strncpy_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strncpy_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strncpy_power8)
-libc_ifunc (__stpncpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __stpncpy_power7
- : __stpncpy_ppc);
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strncpy_power8) \
+ END_2(__strncpy_power8)
-weak_alias (__stpncpy, stpncpy)
-#endif
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index ae4e97a..ffb0f23 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -24,12 +24,15 @@
extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
+extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (strncpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strncpy_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strncpy_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strncpy_power7
: __strncpy_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
similarity index 59%
copy from sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
copy to sysdeps/powerpc/powerpc64/power8/stpncpy.S
index 9e5a270..76a1466 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/power8/stpncpy.S
@@ -1,5 +1,5 @@
-/* Multiple versions of stpncpy. PowerPC64 version.
- Copyright (C) 2014-2015 Free Software Foundation, Inc.
+/* Optimized stpncpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,18 +16,5 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
-
-extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
-extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
-
-libc_ifunc (__stpncpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __stpncpy_power7
- : __stpncpy_ppc);
-
-weak_alias (__stpncpy, stpncpy)
-#endif
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/power8/strncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/power8/strncpy.S b/sysdeps/powerpc/powerpc64/power8/strncpy.S
new file mode 100644
index 0000000..5fda953
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strncpy.S
@@ -0,0 +1,424 @@
+/* Optimized strncpy/stpncpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPNCPY
+# define FUNC_NAME __stpncpy
+#else
+# define FUNC_NAME strncpy
+#endif
+
+/* Implements the function
+
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ if USE_AS_STPNCPY is defined.
+
+ The implementation uses unaligned doubleword access to avoid specialized
+ code paths depending on data alignment. Although recent powerpc64 uses
+ 64K as default, the page cross handling assumes minimum page size of
+ 4k. */
+
+ .machine power7
+EALIGN (FUNC_NAME, 4, 0)
+
+ /* Check if the [src]+15 will cross a 4K page by checking if the bit
+ indicating the page size changes. Basically:
+
+ uint64_t srcin = (uint64_t)src;
+ uint64_t ob = srcin & 4096UL;
+ uint64_t nb = (srcin+15UL) & 4096UL;
+ if (ob ^ nb)
+ goto pagecross; */
+
+ addi r10,r4,16 /* r10 = src + 16. NOTE(review): the pseudo-code above uses +15 - confirm which is intended. */
+ rlwinm r9,r4,0,19,19 /* r9 = src & 4096. */
+
+ /* Since it is a leaf function, save some non-volatile registers on the
+ protected/red zone. */
+ std r26,-48(r1)
+ std r27,-40(r1)
+
+ rlwinm r8,r10,0,19,19 /* r8 = (src + 16) & 4096. */
+
+ std r28,-32(r1)
+ std r29,-24(r1)
+
+ cmpld cr7,r9,r8 /* Same 4096 bit on both means no page cross. */
+
+ std r30,-16(r1)
+ std r31,-8(r1)
+
+ beq cr7,L(unaligned_lt_16)
+ rldicl r9,r4,0,61 /* r9 = src % 8. */
+ subfic r8,r9,8 /* r8 = 8 - (src % 8). */
+ cmpld cr7,r5,r8
+ bgt cr7,L(pagecross)
+
+ /* At this point there are 1 to 15 bytes to check and write. Since it could
+ be either from first unaligned 16 bytes access or from bulk copy, the code
+ uses an unrolled byte read/write instead of trying to analyze the cmpb
+ results. */
+L(short_path):
+ mr r9,r3 /* r9 = destination write pointer. */
+L(short_path_1):
+ cmpdi cr7,r5,0 /* Nothing left to copy? */
+ beq cr7,L(short_path_loop_end_1)
+L(short_path_2):
+ lbz r10,0(r4)
+ cmpdi cr7,r10,0
+ stb r10,0(r9)
+ beq cr7,L(zero_pad_start_1)
+ cmpdi cr0,r5,1
+ addi r8,r9,1
+ addi r6,r5,-1
+ beq cr0,L(short_path_loop_end_0)
+ lbz r10,1(r4)
+ cmpdi cr7,r10,0
+ stb r10,1(r9)
+ beq cr7,L(zero_pad_start_prepare_1)
+ addi r10,r5,-3
+ b L(short_path_loop_1)
+
+ .align 4
+L(short_path_loop):
+ lbz r8,0(r4)
+ addi r7,r10,-2
+ cmpdi cr5,r8,0
+ stb r8,0(r9)
+ beq cr5,L(zero_pad_start_1)
+ beq cr7,L(short_path_loop_end_0) /* cr7 was set by the cmpdi in short_path_loop_1. */
+ lbz r8,1(r4)
+ cmpdi cr7,r8,0
+ stb r8,1(r9)
+ beq cr7,L(zero_pad_start)
+ mr r10,r7
+L(short_path_loop_1):
+ addic. r5,r5,-2
+ addi r9,r9,2
+ cmpdi cr7,r10,0
+ addi r4,r4,2
+ addi r6,r9,1
+ bne cr0,L(short_path_loop)
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+ b L(short_path_loop_end)
+#endif
+
+L(short_path_loop_end_0):
+#ifdef USE_AS_STPNCPY
+ addi r3,r9,1
+ b L(short_path_loop_end)
+#endif
+L(short_path_loop_end_1):
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+#endif
+L(short_path_loop_end):
+ /* Restore non-volatile registers. */
+ ld r26,-48(r1)
+ ld r27,-40(r1)
+ ld r28,-32(r1)
+ ld r29,-24(r1)
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+ /* This code pads the remainder of dest with null bytes. The algorithm
+ calculates the remaining size and issues a doubleword unrolled
+ loop followed by a byte-by-byte set. */
+ .align 4
+L(zero_pad_start):
+ mr r5,r10
+ mr r9,r6
+L(zero_pad_start_1):
+ srdi. r8,r5,3 /* r8 = r5 / 8: number of whole doublewords to clear. */
+ mr r10,r9
+#ifdef USE_AS_STPNCPY
+ mr r3,r9
+#endif
+ beq- cr0,L(zero_pad_loop_b_start)
+ cmpldi cr7,r8,1
+ li r7,0 /* r7 = 0, the doubleword value stored below. */
+ std r7,0(r9)
+ beq cr7,L(zero_pad_loop_b_prepare)
+ addic. r8,r8,-2
+ addi r10,r9,16 /* r10 = r9 + 16. */
+ std r7,8(r9)
+ beq cr0,L(zero_pad_loop_dw_2)
+ std r7,16(r9)
+ li r9,0
+ b L(zero_pad_loop_dw_1)
+
+ .align 4
+L(zero_pad_loop_dw):
+ addi r10,r10,16
+ std r9,-8(r10)
+ beq cr0,L(zero_pad_loop_dw_2)
+ std r9,0(r10)
+L(zero_pad_loop_dw_1):
+ cmpldi cr7,r8,1
+ std r9,0(r10)
+ addic. r8,r8,-2
+ bne cr7,L(zero_pad_loop_dw)
+ addi r10,r10,8
+L(zero_pad_loop_dw_2):
+ rldicl r5,r5,0,61 /* r5 = r5 % 8: bytes left after the doublewords. */
+L(zero_pad_loop_b_start):
+ cmpdi cr7,r5,0
+ addi r5,r5,-1
+ addi r9,r10,-1
+ add r10,r10,r5 /* Register operand: r10 = r10 + r5. */
+ subf r10,r9,r10
+ li r8,0
+ beq- cr7,L(short_path_loop_end)
+
+ /* Write remaining 1-8 bytes. */
+ .align 4
+ addi r9,r9,1
+ mtocrf 0x1,r10 /* Copy the low bits of the count into CR bits tested by bf. */
+ bf 29,4f
+ stw r8,0(r9)
+ addi r9,r9,4
+
+ .align 4
+4: bf 30,2f
+ sth r8,0(r9)
+ addi r9,r9,2
+
+ .align 4
+2: bf 31,1f
+ stb r8,0(r9)
+
+ /* Restore non-volatile registers. */
+1: ld r26,-48(r1)
+ ld r27,-40(r1)
+ ld r28,-32(r1)
+ ld r29,-24(r1)
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+ /* The common case where [src]+16 will not cross a 4K page boundary.
+ In this case the code fast-checks the first 16 bytes by using doubleword
+ read/compares and updates the destination if neither the total size is
+ reached nor a null byte is found in the source. */
+ .align 4
+L(unaligned_lt_16):
+ cmpldi cr7,r5,7
+ ble cr7,L(short_path)
+ ld r7,0(r4)
+ li r8,0
+ cmpb r8,r7,r8 /* r8: 0xff in each byte of the first source dword that is zero. */
+ cmpdi cr7,r8,0
+ bne cr7,L(short_path_prepare_2)
+ addi r6,r5,-8
+ std r7,0(r3)
+ addi r9,r3,8 /* Immediate operand: r9 = dest + 8. */
+ cmpldi cr7,r6,7
+ addi r7,r4,8
+ ble cr7,L(short_path_prepare_1_1)
+ ld r4,8(r4)
+ cmpb r8,r4,r8 /* r8 is still 0 here, so this is again a null-byte check. */
+ cmpdi cr7,r8,0
+ bne cr7,L(short_path_prepare_2_1)
+ std r4,8(r3)
+ addi r29,r3,16
+ addi r5,r5,-16
+ /* Neither the null byte was found nor the total length was reached,
+ align to 16 bytes and issue a bulk copy/compare. */
+ b L(align_to_16b)
+
+ /* In the case of 4k page boundary cross, the algorithm first aligns
+ the address to a doubleword, calculates a mask based on alignment
+ to ignore the bytes and continues using doublewords. */
+ .align 4
+L(pagecross):
+ rldicr r11,r4,0,59 /* Align the address down. NOTE(review): mask 59 clears 4 low bits (16 bytes), not 8 - confirm. */
+ li r6,-1 /* MASK = 0xffffffffffffffffUL. */
+ sldi r9,r9,3 /* Calculate padding. */
+ ld r7,0(r11) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r9,r6,r9 /* MASK = MASK << padding. */
+#else
+ srd r9,r6,r9 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r7,r9 /* Mask bits that are not part of the
+ string. */
+ li r7,0
+ cmpb r9,r9,r7 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ subf r8,r8,r5 /* Adjust total length. */
+ cmpldi cr7,r8,8 /* Check if length was reached. */
+ ble cr7,L(short_path_prepare_2)
+
+ /* For next checks we have aligned address, so we check for more
+ three doublewords to make sure we can read 16 unaligned bytes
+ to start the bulk copy with 16 aligned addresses. */
+ ld r7,8(r11)
+ cmpb r9,r7,r9 /* r9 is 0 here (previous bne not taken): null-byte check. */
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ addi r7,r8,-8
+ cmpldi cr7,r7,8
+ ble cr7,L(short_path_prepare_2)
+ ld r7,16(r11)
+ cmpb r9,r7,r9
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+ addi r8,r8,-16
+ cmpldi cr7,r8,8
+ ble cr7,L(short_path_prepare_2)
+ ld r8,24(r11)
+ cmpb r9,r8,r9
+ cmpdi cr7,r9,0
+ bne cr7,L(short_path_prepare_2)
+
+ /* No null byte found in the 32 bytes read and length not reached,
+ read source again using unaligned loads and store them. */
+ ld r9,0(r4)
+ addi r29,r3,16
+ addi r5,r5,-16
+ std r9,0(r3)
+ ld r9,8(r4)
+ std r9,8(r3)
+
+ /* Align source to 16 bytes and adjust destination and size. */
+L(align_to_16b):
+ rldicl r9,r10,0,60 /* r9 = r10 % 16. */
+ rldicr r28,r10,0,59 /* r28 = r10 with the low 4 bits cleared. */
+ add r12,r5,r9
+ subf r29,r9,r29
+
+ /* The bulk read/compare/copy loads two doublewords, compare and merge
+ in a single register for speed. This is an attempt to speed up the
+ null-checking process for bigger strings. */
+
+ cmpldi cr7,r12,15
+ ble cr7,L(short_path_prepare_1_2)
+
+ /* Main loop for large sizes, unrolled 2 times to get better use of
+ pipeline. */
+ ld r8,0(r28)
+ ld r10,8(r28)
+ li r9,0
+ cmpb r7,r8,r9
+ cmpb r9,r10,r9
+ or. r6,r9,r7 /* Nonzero if either doubleword holds a null byte. */
+ bne cr0,L(short_path_prepare_2_3)
+ addi r5,r12,-16
+ addi r4,r28,16
+ std r8,0(r29)
+ std r10,8(r29)
+ cmpldi cr7,r5,15
+ addi r9,r29,16
+ ble cr7,L(short_path_1)
+ mr r11,r28
+ mr r6,r29
+ li r30,0
+ subfic r26,r4,48
+ subfic r27,r9,48
+
+ b L(loop_16b)
+
+ .align 4
+L(loop_start):
+ ld r31,0(r11)
+ ld r10,8(r11)
+ cmpb r0,r31,r7
+ cmpb r8,r10,r7
+ or. r7,r0,r8
+ addi r5,r5,-32
+ cmpldi cr7,r5,15
+ add r4,r4,r26
+ add r9,r9,r27
+ bne cr0,L(short_path_prepare_2_2)
+ add r4,r28,r4
+ std r31,0(r6)
+ add r9,r29,r9
+ std r10,8(r6)
+ ble cr7,L(short_path_1)
+
+L(loop_16b):
+ ld r10,16(r11)
+ ld r0,24(r11)
+ cmpb r8,r10,r30 /* r30 was set to 0 before the loop was entered. */
+ cmpb r7,r0,r30
+ or. r7,r8,r7
+ addi r12,r12,-32
+ cmpldi cr7,r12,15 /* Result is consumed by the bgt cr7 below. */
+ addi r11,r11,32
+ bne cr0,L(short_path_2)
+ std r10,16(r6)
+ addi r6,r6,32
+ std r0,-8(r6)
+ bgt cr7,L(loop_start)
+
+ mr r5,r12
+ mr r4,r11
+ mr r9,r6
+ b L(short_path_1)
+
+ .align 4
+L(short_path_prepare_1_1):
+ mr r5,r6 /* short_path_1 reads the remaining count from r5 ... */
+ mr r4,r7 /* ... and the source pointer from r4. */
+ b L(short_path_1)
+L(short_path_prepare_1_2):
+ mr r5,r12
+ mr r4,r28
+ mr r9,r29 /* short_path_1 writes through r9. */
+ b L(short_path_1)
+L(short_path_prepare_2):
+ mr r9,r3 /* Start writing at the original dest. */
+ b L(short_path_2)
+L(short_path_prepare_2_1):
+ mr r5,r6
+ mr r4,r7
+ b L(short_path_2)
+L(short_path_prepare_2_2):
+ mr r5,r12
+ mr r4,r11
+ mr r9,r6
+ b L(short_path_2)
+L(short_path_prepare_2_3):
+ mr r5,r12
+ mr r4,r28
+ mr r9,r29
+ b L(short_path_2)
+L(zero_pad_loop_b_prepare):
+ addi r10,r9,8 /* Skip the one doubleword already cleared at 0(r9). */
+ rldicl r5,r5,0,61 /* r5 = r5 % 8. */
+ b L(zero_pad_loop_b_start)
+L(zero_pad_start_prepare_1):
+ mr r5,r6
+ mr r9,r8
+ b L(zero_pad_start_1)
+END (FUNC_NAME)
+
+#ifdef USE_AS_STPNCPY
+libc_hidden_def (__stpncpy)
+#else
+libc_hidden_builtin_def (strncpy)
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=9f2f36e5a91c2ce6edba5415e176155eb1008ae1
commit 9f2f36e5a91c2ce6edba5415e176155eb1008ae1
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue Dec 23 13:39:23 2014 -0500
powerpc: Optimized strncat for POWER7/PPC64
With 3eb38795dbbbd816 (Simplify strncat) the generic algorithm uses
strlen, strnlen, and memcpy. This is faster than the current POWER7
implementation, especially for unaligned strings (where POWER7 code
uses byte-by-byte operations).
This patch removes the assembly implementation and uses a multiarch
specialization based on default algorithm calling optimized POWER7
symbols.
diff --git a/ChangeLog b/ChangeLog
index 744632a..16199e3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,9 @@
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+ * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c: New file.
+ * sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S: Remove file.
+ * sysdeps/powerpc/powerpc64/power7/strncat.S: Likewise.
+
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
strncat-power8 object.
* sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
deleted file mode 100644
index 6216284..0000000
--- a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
+++ /dev/null
@@ -1,42 +0,0 @@
-/* Optimized strncat implementation for POWER7.
- Copyright (C) 2014-2015 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#undef EALIGN
-#define EALIGN(name, alignt, words) \
- .section ".text"; \
- ENTRY_2(__strncat_power7) \
- .align ALIGNARG(alignt); \
- EALIGN_W_##words; \
- BODY_LABEL(__strncat_power7): \
- cfi_startproc; \
- LOCALENTRY(__strncat_power7)
-
-#undef END
-#define END(name) \
- cfi_endproc; \
- TRACEBACK(__strncat_power7) \
- END_2(__strncat_power7)
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-
-#define STRLEN __strlen_power7
-
-#include <sysdeps/powerpc/powerpc64/power7/strncat.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
new file mode 100644
index 0000000..39b1aeb
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
@@ -0,0 +1,31 @@
+/* Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/ >. */
+
+#include <string.h>
+
+#define STRNCAT __strncat_power7
+
+extern __typeof (strncat) __strncat_power7 attribute_hidden;
+extern __typeof (strlen) __strlen_power7 attribute_hidden;
+extern __typeof (strnlen) __strnlen_power7 attribute_hidden;
+extern __typeof (memcpy) __memcpy_power7 attribute_hidden;
+
+#define strlen __strlen_power7
+#define __strnlen __strnlen_power7
+#define memcpy __memcpy_power7
+
+#include <string/strncat.c>
diff --git a/sysdeps/powerpc/powerpc64/power7/strncat.S b/sysdeps/powerpc/powerpc64/power7/strncat.S
deleted file mode 100644
index 05502ac..0000000
--- a/sysdeps/powerpc/powerpc64/power7/strncat.S
+++ /dev/null
@@ -1,228 +0,0 @@
-/* Optimized strncat implementation for PowerPC64/POWER7.
-
- Copyright (C) 2014-2015 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* The algorithm is as follows for aligned memory access :
-
- if address of s2 is divisible by 0x7UL,
- perform aligned doubleword catenation
- else
- perform unaligned catenation
-
- The aligned comparison are made using cmpb instructions. */
-
-/* char* [r3] strncat (const char *s1 [r3],
- const char *s2 [r4],
- size_t size [r5]) */
-
-#include <sysdep.h>
-
-#ifndef STRNCAT
-# undef strncat
-# define STRNCAT strncat
-#endif
-
-#ifndef STRLEN
-/* For builds with no IFUNC support, local calls should be made to internal
- GLIBC symbol (created by libc_hidden_builtin_def). */
-# ifdef SHARED
-# define STRLEN __GI_strlen
-# else
-# define STRLEN strlen
-# endif
-#endif
-
-#define FRAMESIZE (FRAME_MIN_SIZE+32)
-
- .machine power7
-EALIGN(STRNCAT, 4, 0)
- CALL_MCOUNT 3
-
- mflr r0 /* Load link register LR to r0. */
-
-/* We shall use r29, r30 and r31 non volatile register for retention.
- Save all the callee registers in the GPR save area. */
- std r29, -24(r1) /* Save callers register r29. */
- std r30, -16(r1) /* Save callers register r30. */
- std r31, -8(r1) /* Save callers register r31. */
-
- std r0, 16(r1) /* Store the link register. */
- stdu r1, -FRAMESIZE(r1) /* Create the stack frame. */
-
-/* Improve performance with CPU pre-fetch. */
- dcbt 0, r3 /* Pre-fetch str to avoid cache
- miss. */
- dcbt 0, r4 /* Pre-fetch accept to avoid cache
- miss. */
-
- mr. r29, r5 /* Save "n" in r29. */
- mr r30, r3 /* Save "s1" in r30 from r3. */
- beq cr0,L(done)
-
- mr r31, r4 /* Save "s2" in r31 from r4. */
- bl STRLEN /* Call optimized strlen on s1; goto
- end of s1. */
- nop
- cmpldi cr7, r29, 7 /* If s2 is <=7 process
- byte-by-byte. */
- add r3, r30, r3 /* Grab the last character of s1. */
- bgt cr7,L(alignment) /* Process by aligned strings. */
-
- cmpldi cr7, r29, 3 /* If n is >= 4, we can
- byte-unroll. */
- addi r9, r3, -1 /* Make "s1" point before next
- character, increment when read. */
- bgt cr7, L(bytes_unroll) /* Process each byte. */
-
-L(byte_by_byte):
- lbz r10, 0(r31)
- addi r8, r9, 1
- cmpdi cr7, r10, 0 /* Check for NULL in "s2". */
- stb r10, 1(r9)
- beq cr7, L(done)
- add r9, r9, r29
- subf r9, r8, r9
- addi r9, r9, 1
- mtctr r9
- b L(branch2)
- .p2align 4
-L(branch1):
- lbzu r10, 1(r31)
- cmpdi cr7, r10, 0
- stbu r10, 1(r8)
- beq cr7,L(done)
-L(branch2):
- mr r9, r8
- bdnz L(branch1)
- beq cr7,L(done)
-L(nullTerminate):
- li r10, 0 /* Load NULL for termination. */
- stb r10, 1(r9) /* Append or terminate s1 with
- NULL. */
- .p2align 4 /* A small section here. */
-L(done): /* We return now. */
- addi r1, r1, FRAMESIZE /* Restore stack pointer. */
- mr r3, r30 /* Set the return value length of
- string. */
- ld r0, 16(r1) /* Read the saved link register. */
- ld r29, -24(r1) /* Restore save register r29. */
- ld r30, -16(r1) /* Restore save register r30. */
- ld r31, -8(r1) /* Restore save register r31. */
- mtlr r0 /* Restore link register. */
- blr /* Branch to link register. */
-
- .p2align 4
-L(alignment):
- rldicl. r9, r31, 0, 61 /* Check if s2 is 8byte aligned */
- beq cr0,L(dwordAligned)
-
- .p2align 4
-/* Unaligned bytes in string, so process byte by byte.
- POWER7 has performance gains over loop unroll. */
-L(bytes_unroll):
- addi r9, r3, -1
- srdi r10, r29, 2
- mtctr r10
- b L(L10)
- .p2align 4
-L(L44):
- lbz r10, 1(r31) /* Load byte. */
- cmpdi cr7, r10, 0 /* Compare ; if byte not zero,
- continue. */
- stb r10, 2(r9) /* Store byte */
- beq cr7, L(done)
- addi r31, r31, 4
-
- lbz r10, -2(r31) /* Perform loop unroll here on byte
- load and store. */
- cmpdi cr7, r10, 0
- stb r10, 3(r9)
- beq cr7, L(done)
-
- lbz r10, -1(r31) /* Loop unroll here. */
- cmpdi cr7, r10, 0
- stbu r10, 4(r9)
- beq cr7, L(done)
-
- bdz L(leftNbytes)
-
-L(L10):
- lbz r10, 0(r31) /* Loop unroll here. */
- cmpdi cr7, r10, 0
- stb r10, 1(r9)
- bne cr7,L(L44)
- b L(done)
- .p2align 4
-/* If s2 is double word aligned, we load and store double word. */
-L(dwordAligned):
-/* read, write 8 bytes at a time */
- srdi r8, r29, 3 /* Compute count for CTR to loop;
- count = n/8. */
- li r7, 0 /* Load r7 with NULL. */
- li r10, 0 /* Load r10 with MASK '0'. */
-
- mtctr r8 /* Move count to CTR. */
-L(loop8):
- ld r9, 0(r31) /* Read double word from s2. */
- cmpb r6, r9, r10 /* Compare bytes in s2 we read
- just now. */
- cmpdi r6, 0 /* If cmpb returned NULL,
- we continue. */
- bne+ L(a8)
- std r9, 0(r3) /* Append double word from s2
- with s1. */
- addi r3, r3, 8 /* Increment s1. */
- addi r31, r31, 8 /* Increment s2. */
- subi r29, r29, 8 /* Decrement count by 8. */
- bdnz L(loop8) /* Continue until "count" is
- non zero. */
-
-L(a8):
- cmpdi r29, 0 /* If "n" is already zero, we skip. */
- beq+ L(align8align)
-
- mtctr r29 /* Process left over bytes in "n". */
-L(unaligned0):
- lbz r9, 0(r31) /* Read a byte from s2. */
- cmpw r9, r7 /* If byte is NULL, we stop here . */
- beq+ L(align8align) /* Skip processing further if NULL. */
- stb r9, 0(r3) /* If not NULL, store byte into s1. */
- addi r3, r3, 1 /* Increment s1 by 1. */
- addi r31, r31, 1 /* Increment s2 by 1. */
- bdnz L(unaligned0) /* Decrement counter "n" and loop
- until non zero. */
-L(align8align):
- stb r7, 0(r3) /* Terminate s1 with NULL. */
-
- addi r1, r1, FRAMESIZE /* Restore stack pointer. */
- mr r3, r30 /* Set the return value, length of
- string. */
- ld r0, 16(r1) /* Read the saved link register. */
- ld r29, -24(r1) /* Restore save register r29. */
- ld r30, -16(r1) /* Restore save register r30. */
- ld r31, -8(r1) /* Restore save register r31. */
- mtlr r0 /* Restore link register. */
- blr /* Branch to link register */
-
- .p2align 4
-L(leftNbytes):
- rldicl. r29, r29, 0, 62 /* Check if n>0 and n < 4 bytes. */
- bne cr0,L(byte_by_byte) /* Process bytes one by one. */
- b L(nullTerminate) /* Now, finish catenation with
- NULL termination. */
-END(STRNCAT)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=94c9680945369d63ef9ed266a29f28ebaaaeb5ce
commit 94c9680945369d63ef9ed266a29f28ebaaaeb5ce
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue Dec 23 13:36:34 2014 -0500
powerpc: Optimized strcat for POWER8/PPC64
With the new optimized strcpy for POWER8, this patch adds an optimized
strcat which uses it along with the default implementation at strings/.
diff --git a/ChangeLog b/ChangeLog
index 7204573..744632a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,15 @@
2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
* sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+ strncat-power8 object.
+ * sysdeps/powerpc/powerpc64/multiarch/strcat.c (strcat): Add
+ __strcat_power8 implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add __strcat_power8 implementation.
+ * sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c: New file:
+ optimized strcat for power8.
+
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
strcpy-power8 and stpcpy-power8 objects.
* sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add __strcpy_power8 and __stpcpy_power8
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index f170551..74b2daa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -18,8 +18,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
- strcat-power7 strcat-ppc64 memmove-power7 memmove-ppc64 \
- bcopy-ppc64
+ strcat-power8 strcat-power7 strcat-ppc64 memmove-power7 \
+ memmove-ppc64 bcopy-ppc64
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 88c8234..dbb21fd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -303,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c. */
IFUNC_IMPL (i, name, strcat,
IFUNC_IMPL_ADD (array, i, strcat,
+ hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strcat_power8)
+ IFUNC_IMPL_ADD (array, i, strcat,
hwcap & PPC_FEATURE_HAS_VSX,
__strcat_power7)
IFUNC_IMPL_ADD (array, i, strcat, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcat.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
index af188d3..6c7544c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
@@ -1,5 +1,4 @@
-/* Multiple versions of strcat. PowerPC64 version.
- Copyright (C) 2014-2015 Free Software Foundation, Inc.
+/* Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -9,23 +8,23 @@
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
+ <http://www.gnu.org/licenses/ >. */
-#if IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <string.h>
-extern __typeof (strcat) __strcat_ppc attribute_hidden;
-extern __typeof (strcat) __strcat_power7 attribute_hidden;
+#define STRCAT __strcat_power8
-libc_ifunc (strcat,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcat_power7
- : __strcat_ppc);
-#endif
+#undef libc_hidden_def
+#define libc_hidden_def(name)
+
+extern typeof (strcpy) __strcpy_power8;
+extern typeof (strlen) __strlen_power7;
+
+#define strcpy __strcpy_power8
+#define strlen __strlen_power7
+#include <sysdeps/powerpc/strcat.c>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
index af188d3..4708a9a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
@@ -23,9 +23,12 @@
extern __typeof (strcat) __strcat_ppc attribute_hidden;
extern __typeof (strcat) __strcat_power7 attribute_hidden;
+extern __typeof (strcat) __strcat_power8 attribute_hidden;
libc_ifunc (strcat,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcat_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strcat_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strcat_power7
: __strcat_ppc);
#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=96d6fd6c4060d739abb1822e7ad633af749532b2
commit 96d6fd6c4060d739abb1822e7ad633af749532b2
Author: Adhemerval Zanella <azanella@linux.vnet.ibm.com>
Date: Tue Dec 23 05:59:44 2014 -0600
powerpc: Optimized st{r,p}cpy for POWER8/PPC64
This patch adds an optimized POWER8 strcpy using unaligned accesses.
For strings up to 16 bytes the implementation first calculates the
string size, like strlen, and issues a memcpy. For larger strings, the
source is first aligned to 16 bytes and then tested in a loop that
reads 16 bytes and combines the cmpb results for speedup. A special case
is added for page-crossing reads.
It shows 30%-60% improvement over the optimized POWER7 one that uses
only aligned accesses.
diff --git a/ChangeLog b/ChangeLog
index 09f1a80..7204573 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,22 @@
+2015-01-13 Adhemerval Zanella <azanella@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/multiarch/Makefile [sysdep_routines]: Add
+ strcpy-power8 and stpcpy-power8 objects.
+ * sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add __strcpy_power8 and __stpcpy_power8
+ implementations.
+ * sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S: New file:
+ multiarch stpcpy implementation for POWER8.
+ * sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S: New file;
+ multiarch strcpy implementation for POWER8.
+ * sysdeps/powerpc/powerpc64/multiarch/strcpy.c (strcpy): Add
+ __strcpy_power8 function.
+ * sysdeps/powerpc/powerpc64/power8/stpcpy.S: New file: optimized
+ stpcpy for POWER8.
+ * sysdeps/powerpc/powerpc64/power8/strcpy.S: New file: optimized
+ strcpy for POWER8.
+ * NEWS: Update.
+
2015-01-13 Leonhard Holz <leonhard.holz@web.de>
[BZ #16009]
diff --git a/NEWS b/NEWS
index fbf133e..e020918 100644
--- a/NEWS
+++ b/NEWS
@@ -19,6 +19,8 @@ Version 2.21
17744, 17745, 17746, 17747, 17748, 17775, 17777, 17780, 17781, 17782,
17791, 17793, 17796, 17797, 17803, 17806, 17834
+* Optimized strcpy and stpcpy implementations for powerpc64/powerpc64le.
+
* Added support for TSX lock elision of pthread mutexes on powerpc32, powerpc64
and powerpc64le. This may improve lock scaling of existing programs on
HTM capable systems. The lock elision code is only enabled with
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 39e441b..f170551 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -13,7 +13,8 @@ sysdep_routines += memcpy-power7 memcpy-a2 memcpy-power6 memcpy-cell \
wcschr-power6 wcschr-ppc64 wcsrchr-power7 wcsrchr-power6 \
wcsrchr-ppc64 wcscpy-power7 wcscpy-power6 wcscpy-ppc64 \
wordcopy-power7 wordcopy-power6 wordcopy-ppc64 \
- strcpy-power7 strcpy-ppc64 stpcpy-power7 stpcpy-ppc64 \
+ strcpy-power8 strcpy-power7 strcpy-ppc64 stpcpy-power8 \
+ stpcpy-power7 stpcpy-ppc64 \
strrchr-power7 strrchr-ppc64 strncat-power7 strncat-ppc64 \
strncpy-power7 strncpy-ppc64 \
stpncpy-power7 stpncpy-ppc64 strcmp-power7 strcmp-ppc64 \
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 47e3398..88c8234 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -83,6 +83,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c. */
IFUNC_IMPL (i, name, strcpy,
+ IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __strcpy_power8)
IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
__strcpy_power7)
IFUNC_IMPL_ADD (array, i, strcpy, 1,
@@ -90,6 +92,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
+ IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
+ __stpcpy_power8)
IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_HAS_VSX,
__stpcpy_power7)
IFUNC_IMPL_ADD (array, i, stpcpy, 1,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
index cd47bf6..66e6f70 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
- Copyright (C) 2013-2015 Free Software Foundation, Inc.
+/* Optimized stpcpy implementation for POWER8/PPC64.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if defined SHARED && IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__stpcpy_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__stpcpy_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__stpcpy_power8)
-libc_ifunc (strcpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcpy_power7
- : __strcpy_ppc);
-#endif
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__stpcpy_power8) \
+ END_2(__stpcpy_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/stpcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
similarity index 53%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
index cd47bf6..64cbc16 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
- Copyright (C) 2013-2015 Free Software Foundation, Inc.
+/* Optimized strcpy implementation for POWER8/PPC64.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,25 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if defined SHARED && IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#include <sysdep.h>
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+#undef EALIGN
+#define EALIGN(name, alignt, words) \
+ .section ".text"; \
+ ENTRY_2(__strcpy_power8) \
+ .align ALIGNARG(alignt); \
+ EALIGN_W_##words; \
+ BODY_LABEL(__strcpy_power8): \
+ cfi_startproc; \
+ LOCALENTRY(__strcpy_power8)
-libc_ifunc (strcpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcpy_power7
- : __strcpy_ppc);
-#endif
+#undef END
+#define END(name) \
+ cfi_endproc; \
+ TRACEBACK(__strcpy_power8) \
+ END_2(__strcpy_power8)
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
index cd47bf6..fd0afd4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
@@ -23,9 +23,12 @@
extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
+extern __typeof (strcpy) __strcpy_power8 attribute_hidden;
libc_ifunc (strcpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcpy_power7
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07)
+ ? __strcpy_power8 :
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strcpy_power7
: __strcpy_ppc);
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
similarity index 61%
copy from sysdeps/powerpc/powerpc64/multiarch/strcpy.c
copy to sysdeps/powerpc/powerpc64/power8/stpcpy.S
index cd47bf6..bf72065 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
+++ b/sysdeps/powerpc/powerpc64/power8/stpcpy.S
@@ -1,5 +1,5 @@
-/* Multiple versions of strcpy. PowerPC64 version.
- Copyright (C) 2013-2015 Free Software Foundation, Inc.
+/* Optimized stpcpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,16 +16,9 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#if defined SHARED && IS_IN (libc)
-# include <string.h>
-# include <shlib-compat.h>
-# include "init-arch.h"
+#define USE_AS_STPCPY
+#include <sysdeps/powerpc/powerpc64/power8/strcpy.S>
-extern __typeof (strcpy) __strcpy_ppc attribute_hidden;
-extern __typeof (strcpy) __strcpy_power7 attribute_hidden;
-
-libc_ifunc (strcpy,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __strcpy_power7
- : __strcpy_ppc);
-#endif
+weak_alias (__stpcpy, stpcpy)
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
new file mode 100644
index 0000000..d3e9a10
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -0,0 +1,262 @@
+/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER8.
+ Copyright (C) 2015 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef USE_AS_STPCPY
+# define FUNC_NAME __stpcpy
+#else
+# define FUNC_NAME strcpy
+#endif
+
+/* Implements the function
+
+ char * [r3] strcpy (char *dest [r3], const char *src [r4])
+
+ or
+
+ char * [r3] stpcpy (char *dest [r3], const char *src [r4])
+
+ if USE_AS_STPCPY is defined.
+
+ The implementation uses unaligned doubleword access to avoid specialized
+ code paths depending on data alignment. Although recent powerpc64 uses
+ 64K pages by default, the page cross handling assumes a minimum page
+ size of 4K. */
+
+ .machine power7
+EALIGN (FUNC_NAME, 4, 0)
+ li r0,0 /* Doubleword with null chars to use
+ with cmpb. */
+
+ /* Check if the [src]+15 will cross a 4K page by checking if the bit
+ indicating the page size changes. Basically:
+
+ uint64_t srcin = (uint64_t)src;
+ uint64_t ob = srcin & 4096UL;
+ uint64_t nb = (srcin+15UL) & 4096UL;
+ if (ob ^ nb)
+ goto pagecross; */
+
+ addi r9,r4,15
+ xor r9,r9,r4
+ rlwinm. r9,r9,0,19,19
+ bne L(pagecross)
+
+ /* For short strings (less than 16 bytes), just calculate the size as
+ strlen does and issue a memcpy if a null is found. */
+ mr r7,r4
+ ld r12,0(r7) /* Load doubleword from memory. */
+ cmpb r10,r12,r0 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
+ bne cr7,L(done)
+
+ ldu r8,8(r7)
+ cmpb r10,r8,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ b L(loop_before)
+
+ .align 4
+L(pagecross):
+ clrrdi r7,r4,3 /* Align the address to doubleword boundary. */
+ rlwinm r6,r4,3,26,28 /* Calculate padding. */
+ li r5,-1 /* MASK = 0xffffffffffffffff. */
+ ld r12,0(r7) /* Load doubleword from memory. */
+#ifdef __LITTLE_ENDIAN__
+ sld r5,r5,r6
+#else
+ srd r5,r5,r6 /* MASK = MASK >> padding. */
+#endif
+ orc r9,r12,r5 /* Mask bits that are not part of the string. */
+ cmpb r10,r9,r0 /* Check for null bytes in DWORD1. */
+ cmpdi cr7,r10,0 /* If r10 == 0, no null's have been found. */
+ bne cr7,L(done)
+
+ ldu r6,8(r7)
+ cmpb r10,r6,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ ld r12,0(r7)
+ cmpb r10,r12,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ ldu r6,8(r7)
+ cmpb r10,r6,r0
+ cmpdi cr7,r10,0
+ bne cr7,L(done)
+
+ /* We checked for 24 - x bytes, with x being the source alignment
+ (0 <= x <= 7), and no zero has been found. Start the loop
+ copy with doubleword aligned address. */
+ mr r7,r4
+ ld r12, 0(r7)
+ ldu r8, 8(r7)
+
+L(loop_before):
+ /* Save the two doublewords read from source and align the source
+ to 16 bytes for the loop. */
+ mr r11,r3
+ std r12,0(r11)
+ std r8,8(r11)
+ addi r11,r11,16
+ rldicl r9,r4,0,60
+ subf r7,r9,r7
+ subf r11,r9,r11
+ b L(loop_start)
+
+ .align 5
+L(loop):
+ std r12, 0(r11)
+ std r6, 8(r11)
+ addi r11,r11,16
+L(loop_start):
+ /* Load two doublewords, compare and merge in a
+ single register for speed. This is an attempt
+ to speed up the null-checking process for bigger strings. */
+
+ ld r12, 8(r7)
+ ldu r6, 16(r7)
+ cmpb r10,r12,r0
+ cmpb r9,r6,r0
+ or r8,r9,r10 /* Merge everything in one doubleword. */
+ cmpdi cr7,r8,0
+ beq cr7,L(loop)
+
+
+ /* OK, one (or both) of the doublewords contains a null byte. Check
+ the first doubleword and decrement the address in case the first
+ doubleword really contains a null byte. */
+
+ addi r4,r7,-8
+ cmpdi cr6,r10,0
+ addi r7,r7,-8
+ bne cr6,L(done2)
+
+ /* The null byte must be in the second doubleword. Adjust the address
+ again and move the result of cmpb to r10 so we can calculate the
+ length. */
+
+ mr r10,r9
+ addi r7,r7,8
+ b L(done2)
+
+ /* r10 has the output of the cmpb instruction, that is, it contains
+ 0xff in the same position as the null byte in the original
+ doubleword from the string. Use that to calculate the length. */
+L(done):
+ mr r11,r3
+L(done2):
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10, -1 /* Form a mask from trailing zeros. */
+ andc r9, r9, r10
+ popcntd r6, r9 /* Count the bits in the mask. */
+#else
+ cntlzd r6,r10 /* Count leading zeros before the match. */
+#endif
+ subf r5,r4,r7
+ srdi r6,r6,3 /* Convert leading/trailing zeros to bytes. */
+ add r8,r5,r6 /* Compute final length. */
+#ifdef USE_AS_STPCPY
+ /* stpcpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
+ addi r8,r8,1 /* Final '\0'. */
+
+ cmpldi cr6,r8,8
+ mtocrf 0x01,r8
+ ble cr6,L(copy_LE_8)
+
+ cmpldi cr1,r8,16
+ blt cr1,8f
+
+ /* Handle copies of 0~31 bytes. */
+ .align 4
+L(copy_LT_32):
+ /* At least 6 bytes to go. */
+ blt cr1,8f
+
+ /* Copy 16 bytes. */
+ ld r6,0(r4)
+ ld r8,8(r4)
+ addi r4,r4,16
+ std r6,0(r11)
+ std r8,8(r11)
+ addi r11,r11,16
+8: /* Copy 8 bytes. */
+ bf 28,L(tail4)
+ ld r6,0(r4)
+ addi r4,r4,8
+ std r6,0(r11)
+ addi r11,r11,8
+
+ .align 4
+/* Copies 4~7 bytes. */
+L(tail4):
+ bf 29,L(tail2)
+ lwz r6,0(r4)
+ stw r6,0(r11)
+ bf 30,L(tail5)
+ lhz r7,4(r4)
+ sth r7,4(r11)
+ bflr 31
+ lbz r8,6(r4)
+ stb r8,6(r11)
+ blr
+
+ .align 4
+/* Copies 2~3 bytes. */
+L(tail2):
+ bf 30,1f
+ lhz r6,0(r4)
+ sth r6,0(r11)
+ bflr 31
+ lbz r7,2(r4)
+ stb r7,2(r11)
+ blr
+
+ .align 4
+L(tail5):
+ bf 31,1f
+ lbz r6,4(r4)
+ stb r6,4(r11)
+ blr
+
+ .align 4
+1:
+ bflr 31
+ lbz r6,0(r4)
+ stb r6,0(r11)
+ blr
+
+/* Handles copies of 0~8 bytes. */
+ .align 4
+L(copy_LE_8):
+ bne cr6,L(tail4)
+ ld r6,0(r4)
+ std r6,0(r11)
+ blr
+END (FUNC_NAME)
+
+#ifndef USE_AS_STPCPY
+libc_hidden_builtin_def (strcpy)
+#endif
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 57 +++
NEWS | 4 +
sysdeps/powerpc/powerpc64/multiarch/Makefile | 10 +-
.../powerpc/powerpc64/multiarch/ifunc-impl-list.c | 16 +
.../powerpc/powerpc64/multiarch/stpcpy-power8.S | 40 ++
.../powerpc/powerpc64/multiarch/stpncpy-power8.S | 39 ++
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 +-
.../powerpc/powerpc64/multiarch/strcat-power8.c | 30 ++
sysdeps/powerpc/powerpc64/multiarch/strcat.c | 7 +-
.../powerpc/powerpc64/multiarch/strcmp-power8.S | 40 ++
sysdeps/powerpc/powerpc64/multiarch/strcmp.c | 7 +-
.../powerpc/powerpc64/multiarch/strcpy-power8.S | 40 ++
sysdeps/powerpc/powerpc64/multiarch/strcpy.c | 7 +-
.../powerpc/powerpc64/multiarch/strncat-power7.S | 42 --
.../powerpc/powerpc64/multiarch/strncat-power7.c | 31 ++
.../powerpc/powerpc64/multiarch/strncpy-power8.S | 40 ++
sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 +-
sysdeps/powerpc/powerpc64/power7/strncat.S | 228 -----------
sysdeps/powerpc/powerpc64/power8/stpcpy.S | 24 ++
sysdeps/powerpc/powerpc64/power8/stpncpy.S | 20 +
sysdeps/powerpc/powerpc64/power8/strcmp.S | 257 ++++++++++++
sysdeps/powerpc/powerpc64/power8/strcpy.S | 262 ++++++++++++
sysdeps/powerpc/powerpc64/power8/strncpy.S | 424 ++++++++++++++++++++
23 files changed, 1355 insertions(+), 284 deletions(-)
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpcpy-power8.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power8.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcat-power8.c
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcmp-power8.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strcpy-power8.S
delete mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncat-power7.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncat-power7.c
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power8.S
delete mode 100644 sysdeps/powerpc/powerpc64/power7/strncat.S
create mode 100644 sysdeps/powerpc/powerpc64/power8/stpcpy.S
create mode 100644 sysdeps/powerpc/powerpc64/power8/stpncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/power8/strcmp.S
create mode 100644 sysdeps/powerpc/powerpc64/power8/strcpy.S
create mode 100644 sysdeps/powerpc/powerpc64/power8/strncpy.S
hooks/post-receive
--
GNU C Library master sources