This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH][AArch64] Tune memcpy


ping


-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy

This patch further tunes memcpy: it avoids one branch for sizes 1-3, adds a prefetch, and improves small copies whose size is an exact power of 2.
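(For reference, the new 0..3 byte path is branchless apart from the count == 0 check: it computes count/2 and issues three byte loads and three byte stores that may overlap. A minimal C sketch of the idea, purely illustrative and not part of the patch; copy_0_3 is a hypothetical helper name:)

  #include <stddef.h>

  /* Illustration of the branchless 0..3 byte copy: for n == 1 the same
     byte is stored 3 times, for n == 2 the 2nd byte is stored twice.
     The overlapping stores leave the correct final contents either way.
     The n == 0 check mirrors the cbz in the assembly.  */
  static inline void
  copy_0_3 (unsigned char *dst, const unsigned char *src, size_t n)
  {
    if (n == 0)
      return;
    size_t mid = n >> 1;      /* 0 if n == 1; 1 if n == 2 or n == 3.  */
    unsigned char first = src[0];
    unsigned char middle = src[mid];
    unsigned char last = src[n - 1];
    dst[0] = first;
    dst[mid] = middle;
    dst[n - 1] = last;
  }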

OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )

ChangeLog:
2015-11-19  Wilco Dijkstra  <wdijkstr@arm.com>

        * sysdeps/aarch64/memcpy.S (memcpy):
        Further tuning for performance.


---
 sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
 #define A_h    x7
 #define A_hw   w7
 #define B_l    x8
+#define B_lw   w8
 #define B_h    x9
 #define C_l    x10
 #define C_h    x11
@@ -70,21 +71,40 @@ END (memmove)
 libc_hidden_builtin_def (memmove)
 ENTRY (memcpy)

+       prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
+       cmp     count, 16
+       b.ls    L(copy16)
        cmp     count, 96
        b.hi    L(copy_long)
-       cmp     count, 16
-       b.hs    L(copy_medium)

+       /* Medium copies: 17..96 bytes.  */
+       sub     tmp1, count, 1
+       ldp     A_l, A_h, [src]
+       tbnz    tmp1, 6, L(copy96)
+       ldp     D_l, D_h, [srcend, -16]
+       tbz     tmp1, 5, 1f
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+1:
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
        /* Small copies: 0..16 bytes.  */
 L(copy16):
-       tbz     count, 3, 1f
+       cmp     count, 8
+       b.lo    1f
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
+       .p2align 4
 1:
        tbz     count, 2, 1f
        ldr     A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
        str     A_lw, [dstin]
        str     A_hw, [dstend, -4]
        ret
-       .p2align 4
+
+       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 1:
        cbz     count, 2f
+       lsr     tmp1, count, 1
        ldrb    A_lw, [src]
-       tbz     count, 1, 1f
-       ldrh    A_hw, [srcend, -2]
-       strh    A_hw, [dstend, -2]
-1:     strb    A_lw, [dstin]
+       ldrb    A_hw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    A_hw, [dstend, -1]
 2:     ret

        .p2align 4
-       /* Medium copies: 17..96 bytes.  */
-L(copy_medium):
-       ldp     A_l, A_h, [src]
-       tbnz    count, 6, L(copy96)
-       ldp     D_l, D_h, [srcend, -16]
-       tbz     count, 5, 1f
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
-1:
-       stp     A_l, A_h, [dstin]
-       stp     D_l, D_h, [dstend, -16]
-       ret
-
-       .p2align 4
        /* Copy 64..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end.  */
 L(copy96):
--
1.9.1
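(The 4..7, 8..16 and medium paths above all rely on the same trick: one access at the start of the buffer and one ending exactly at its last byte, so a single sequence handles a whole size range and the accesses simply overlap when the size is not a power of 2. A rough C equivalent of the 8..16 byte case, a sketch only; the fixed-size memcpy calls stand in for the single unaligned ldr/str instructions that AArch64 makes cheap, and copy_8_16 is a hypothetical name:)

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  /* Copy n bytes, 8 <= n <= 16, with two possibly-overlapping 8-byte
     accesses: one at offset 0 and one ending at offset n.  Both loads
     are issued before the stores, matching the ldp/stp ordering in the
     assembly.  Each fixed 8-byte memcpy compiles down to one unaligned
     load or store.  */
  static inline void
  copy_8_16 (unsigned char *dst, const unsigned char *src, size_t n)
  {
    uint64_t head, tail;
    memcpy (&head, src, 8);
    memcpy (&tail, src + n - 8, 8);
    memcpy (dst, &head, 8);
    memcpy (dst + n - 8, &tail, 8);
  }

(The new prfm PLDL1KEEP at entry is roughly __builtin_prefetch (src) in GCC terms: it starts pulling the first source cache line into L1 while the size checks execute.)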

