This is the mail archive of the
libc-ports@sources.redhat.com
mailing list for the libc-ports project.
[PATCH 1/1] ARM: NEON optimized implementation of memcpy.
- From: Siarhei Siamashka <siarhei dot siamashka at nokia dot com>
- To: libc-ports at sourceware dot org
- Date: Sat, 4 Jul 2009 00:04:34 +0300
- Subject: [PATCH 1/1] ARM: NEON optimized implementation of memcpy.
NEON optimizations provide a ~1.5x speedup when copying memory blocks
that are much larger than the L2 cache size. The performance improvement
varies for other block sizes.
In order to get NEON code enabled, ASFLAGS needs to be defined as
something like "-mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon"
when building glibc.
---
sysdeps/arm/memcpy.S | 108 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 108 insertions(+), 0 deletions(-)
diff --git a/sysdeps/arm/memcpy.S b/sysdeps/arm/memcpy.S
index 61cf33c..4edf261 100644
--- a/sysdeps/arm/memcpy.S
+++ b/sysdeps/arm/memcpy.S
@@ -2,6 +2,7 @@
This file is part of the GNU C Library.
Contributed by MontaVista Software, Inc. (written by Nicolas Pitre)
+ NEON code contributed by Nokia Corporation (written by Siarhei Siamashka)
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -20,6 +21,111 @@
#include <sysdep.h>
+#ifdef __ARM_NEON__
+ .text
+ .fpu neon
+/*
+ * Unaligned memory accesses are avoided, even though they
+ * would be a bit faster. This is done in order to avoid any
+ * potential problems if SCTLR.A bit is set or strongly
+ * ordered/device memory is accessed.
+ */
+ENTRY(memcpy) /* r0 = dest, r1 = src, r2 = byte count; r0 is returned unchanged */
+ cmp r2, #16
+ bge 3f /* 16 bytes or more (signed compare): use the NEON path at 3: */
+
+ /* Do small memory copies (up to 15 bytes) using ARM */
+ push {r0, lr} /* keep dest for the return value; lr doubles as a scratch register below */
+ subs r2, r2, #2 /* r2 = count - 2 */
+ blt 2f /* fewer than two bytes: skip the pair loop */
+1: ldrb r3, [r1], #1 /* copy two bytes per iteration */
+ ldrb lr, [r1], #1
+ subs r2, r2, #2 /* flags set here are consumed by the bge after the stores */
+ strb r3, [r0], #1
+ strb lr, [r0], #1
+ bge 1b /* loop while at least two more bytes remain */
+2: cmp r2, #-1 /* r2 is (remaining - 2), so -1 means one odd byte is left */
+ ldreqb r3, [r1], #1
+ streqb r3, [r0], #1
+ pop {r0, pc} /* restore the original dest into r0 and return */
+3:
+ /* Do bigger memory copies using NEON instructions */
+ mov ip, r0 /* ip = running dest pointer, so r0 stays intact as the return value */
+ tst r0, #1 /* head: copy 1, 2, 4 and 8 bytes as needed until ip is 16-byte aligned */
+ beq 1f
+ vld1.8 {d0[0]}, [r1]! /* one byte; source loads never carry an alignment qualifier */
+ vst1.8 {d0[0]}, [ip]!
+ sub r2, r2, #1
+1:
+ tst ip, #2
+ beq 1f
+ vld2.8 {d0[0], d1[0]}, [r1]! /* two bytes, one per register lane */
+ vst2.8 {d0[0], d1[0]}, [ip]!
+ sub r2, r2, #2
+1:
+ tst ip, #4
+ beq 1f
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! /* four bytes, one per register lane */
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]! /* ip is 4-byte aligned at this point */
+ sub r2, r2, #4
+1:
+ tst ip, #8
+ beq 1f
+ vld1.8 {d0}, [r1]! /* eight bytes */
+ vst1.8 {d0}, [ip, :64]! /* ip is 8-byte aligned at this point */
+ sub r2, r2, #8
+1:
+ subs r2, r2, #32 /* from here on r2 holds (bytes remaining - 32) */
+ blt 3f /* fewer than 32 bytes left: go straight to the tail */
+ mov r3, #32 /* r3 = prefetch offset in bytes, ramped up inside the loop */
+1:
+ vld1.8 {d0-d3}, [r1]! /* main loop: 32 bytes per iteration, with prefetch */
+ cmp r3, #(320 - 32) /* flags for the addle below; pld in between leaves them alone */
+ pld [r1, r3] /* prefetch r3 bytes past the already-advanced source pointer */
+ addle r3, r3, #32 /* grow the prefetch offset by 32 per iteration until it reaches 320 */
+ sub r2, r2, #32
+ vst1.8 {d0-d3}, [ip, :128]! /* dest is 16-byte aligned after the head */
+ cmp r2, r3
+ bge 1b /* stay in the prefetching loop while r2 >= r3 */
+ cmp r2, #0
+ blt 3f /* fewer than 32 bytes left: go to the tail */
+1:
+ vld1.8 {d0-d3}, [r1]! /* finish the remaining 32-byte blocks without prefetching */
+ subs r2, r2, #32
+ vst1.8 {d0-d3}, [ip, :128]!
+ bge 1b
+3: /* tail: r2 is in [-32, -1], so its low five bits equal the bytes still to copy */
+ tst r2, #16
+ beq 1f
+ vld1.8 {d0, d1}, [r1]! /* sixteen bytes */
+ vst1.8 {d0, d1}, [ip, :128]!
+1:
+ tst r2, #8
+ beq 1f
+ vld1.8 {d0}, [r1]! /* eight bytes */
+ vst1.8 {d0}, [ip, :64]!
+1:
+ tst r2, #4
+ beq 1f
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! /* four bytes */
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [ip, :32]!
+1:
+ tst r2, #2
+ beq 1f
+ vld2.8 {d0[0], d1[0]}, [r1]! /* two bytes */
+ vst2.8 {d0[0], d1[0]}, [ip]!
+1:
+ tst r2, #1
+ beq 1f
+ vld1.8 {d0[0]}, [r1]! /* last byte */
+ vst1.8 {d0[0]}, [ip]!
+1:
+ bx lr /* return; r0 was never modified on the NEON path */
+END(memcpy)
+libc_hidden_builtin_def (memcpy)
+
+#else
+
/*
* Data preload for architectures that support it (ARM V5TE and above)
*/
@@ -225,3 +331,5 @@ ENTRY(memcpy)
END(memcpy)
libc_hidden_builtin_def (memcpy)
+
+#endif
--
1.5.6.5