This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch master updated. glibc-2.26.9000-981-g1e36806
- From: raji at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 15 Dec 2017 05:29:36 -0000
- Subject: GNU C Library master sources branch master updated. glibc-2.26.9000-981-g1e36806
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 1e36806fb8589050350ececfade454c13f75e5aa (commit)
from 5f1603c331d9e2194170762b7e5e80a5571e4b4e (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1e36806fb8589050350ececfade454c13f75e5aa
commit 1e36806fb8589050350ececfade454c13f75e5aa
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date: Fri Dec 15 10:55:58 2017 +0530
powerpc: st{r,p}cpy optimization for aligned strings
This patch makes use of vectors for aligned inputs. Improvements
upto 30% seen for larger aligned inputs.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
diff --git a/ChangeLog b/ChangeLog
index 99792bf..a993af3 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2017-12-15 Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
+
+ * sysdeps/powerpc/powerpc64/power8/strcpy.S: Use vectors
+ for aligned inputs.
+
2017-12-14 Siddhesh Poyarekar <siddhesh@sourceware.org>
* benchtests/bench-strcmp.c: Print output in JSON format.
diff --git a/sysdeps/powerpc/powerpc64/power8/strcpy.S b/sysdeps/powerpc/powerpc64/power8/strcpy.S
index 13e7a0f..a1683f9 100644
--- a/sysdeps/powerpc/powerpc64/power8/strcpy.S
+++ b/sysdeps/powerpc/powerpc64/power8/strcpy.S
@@ -47,7 +47,7 @@
64K as default, the page cross handling assumes minimum page size of
4k. */
- .machine power7
+ .machine power8
ENTRY_TOCLESS (FUNC_NAME, 4)
li r0,0 /* Doubleword with null chars to use
with cmpb. */
@@ -120,7 +120,7 @@ L(pagecross):
ldu r8, 8(r7)
L(loop_before):
- /* Save the two doublewords readed from source and align the source
+ /* Save the two doublewords read from source and align the source
to 16 bytes for the loop. */
mr r11,r3
std r12,0(r11)
@@ -129,7 +129,150 @@ L(loop_before):
rldicl r9,r4,0,60
subf r7,r9,r7
subf r11,r9,r11
- b L(loop_start)
+ /* Source is adjusted to 16B alignment and destination r11 is
+ also moved based on that adjustment. Now check if r11 is
+ also 16B aligned to move to vectorized loop. */
+ andi. r6, r11, 0xF
+ bne L(loop_start)
+
+ /* Prepare for the loop. */
+ subf r4, r9, r4 /* Adjust r4 based on alignment. */
+ li r7, 16 /* Load required offsets. */
+ li r8, 32
+ li r9, 48
+ vspltisb v0, 0
+ addi r4, r4, 16
+ /* Are we 64-byte aligned? If so, jump to the vectorized loop.
+ Else copy 16B till r4 is 64B aligned. */
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4 /* Load 16 bytes from memory. */
+ vcmpequb. v5, v0, v6 /* Check for null. */
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11 /* Store 16 bytes. */
+ addi r4, r4, 16 /* Increment the address. */
+ addi r11, r11, 16
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4
+ vcmpequb. v5, v0, v6
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11
+ addi r4, r4, 16
+ addi r11, r11, 16
+ andi. r6, r4, 63
+ beq L(qw_loop)
+
+ lvx v6, 0, r4
+ vcmpequb. v5, v0, v6
+ bne cr6, L(qw_done)
+ stvx v6, 0, r11
+ addi r4, r4, 16
+ addi r11, r11, 16
+
+ .align 4
+L(qw_loop):
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+
+ lvx v1, r4, r0 /* Load 4 quadwords. */
+ lvx v2, r4, r7
+ lvx v3, r4, r8
+ lvx v4, r4, r9
+ vminub v5, v1, v2 /* Compare and merge into one VR for speed. */
+ vminub v8, v3, v4
+ vminub v7, v5, v8
+ vcmpequb. v7, v7, v0 /* Check for NULLs. */
+ bne cr6, L(qw_loop_done)
+ stvx v1, r11, r0 /* Store 4 quadwords. */
+ stvx v2, r11, r7
+ stvx v3, r11, r8
+ stvx v4, r11, r9
+ addi r4, r4, 64 /* Adjust address for the next iteration. */
+ addi r11, r11, 64 /* Adjust address for the next iteration. */
+ b L(qw_loop)
+
+ .align 4
+L(qw_loop_done):
+ /* Null found in one of the 4 loads. */
+ vcmpequb. v7, v1, v0
+ vor v6, v1, v1
+ bne cr6, L(qw_done)
+ /* Not on the first 16B, So store it. */
+ stvx v1, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vcmpequb. v7, v2, v0
+ vor v6, v2, v2
+ bne cr6, L(qw_done)
+ /* Not on the second 16B, So store it. */
+ stvx v2, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vcmpequb. v7, v3, v0
+ vor v6, v3, v3
+ bne cr6, L(qw_done)
+ /* Not on the third 16B, So store it. */
+ stvx v6, r11, r0
+ addi r4, r4, 16
+ addi r11, r11, 16
+ vor v6, v4, v4
+
+ .align 4
+L(qw_done):
+ mr r7, r4
+ /* Move the result to GPR. */
+#ifdef __LITTLE_ENDIAN__
+ vsldoi v4, v6, v0, 8
+ mfvrd r12, v4
+#else
+ mfvrd r12, v6
+#endif
+ /* Check for null in the first 8 bytes. */
+ cmpb r10, r12, r0
+ cmpdi cr6, r10, 0
+ bne cr6, L(done2)
+ /* Null found in second doubleword. */
+#ifdef __LITTLE_ENDIAN__
+ mfvrd r6, v6
+#else
+ vsldoi v6, v6, v0, 8
+ mfvrd r6, v6
+#endif
+ cmpb r10, r6, r0
+ addi r7, r7, 8
+ b L(done2)
.align 5
L(loop):
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 5 +
sysdeps/powerpc/powerpc64/power8/strcpy.S | 149 ++++++++++++++++++++++++++++-
2 files changed, 151 insertions(+), 3 deletions(-)
hooks/post-receive
--
GNU C Library master sources