This is the mail archive of the
newlib@sourceware.org
mailing list for the newlib project.
Re: [PATCH v2, ARM] Integrate optimized Cortex A15 memcpy.
- From: Will Newton <will dot newton at linaro dot org>
- To: newlib at sourceware dot org
- Date: Tue, 2 Apr 2013 09:37:51 +0100
- Subject: Re: [PATCH v2, ARM] Integrate optimized Cortex A15 memcpy.
- References: <5154570E dot 3010407 at linaro dot org>
On 28 March 2013 14:43, Will Newton <will.newton@linaro.org> wrote:
> 2013-03-28 Will Newton <will.newton@linaro.org>
>
> * libc/machine/arm/memcpy-stub.c: Use generic memcpy if unaligned
> access is not enabled or if building for big-endian.
> * libc/machine/arm/memcpy.S: Faster memcpy implementation for
> Cortex A15 cores using NEON and VFP if available.
> ---
> newlib/libc/machine/arm/memcpy-stub.c | 3 +-
> newlib/libc/machine/arm/memcpy.S | 1006 ++++++++++++++++++++-------------
> 2 files changed, 607 insertions(+), 402 deletions(-)
>
> Changes in v2:
> - Compile fix and cleanup of whitespace. Please disregard v1 which was sent in error!
Below are some benchmark numbers for this code versus the existing
newlib assembly implementation on a Pandaboard (OMAP4, Cortex-A9) for
copies between 32 and 160 bytes with alignments of 1, 2, 4 or 8 bytes.
As you can see, it is usually faster, and quite often substantially so.
The code and benchmarks can be found as part of the cortex-strings
package on Launchpad:
https://launchpad.net/cortex-strings
this:memcpy:32:100000000:1:4.033051: took 4.033051 s for 100000000
calls to memcpy of 32 bytes. ~973.264 MB/s corrected.
newlib:memcpy:32:100000000:1:6.136169: took 6.136169 s for 100000000
calls to memcpy of 32 bytes. ~582.540 MB/s corrected.
this:memcpy:32:100000000:2:3.679077: took 3.679077 s for 100000000
calls to memcpy of 32 bytes. ~1097.116 MB/s corrected.
newlib:memcpy:32:100000000:2:6.276703: took 6.276703 s for 100000000
calls to memcpy of 32 bytes. ~567.321 MB/s corrected.
this:memcpy:32:100000000:4:3.683685: took 3.683685 s for 100000000
calls to memcpy of 32 bytes. ~1095.302 MB/s corrected.
newlib:memcpy:32:100000000:4:5.600708: took 5.600708 s for 100000000
calls to memcpy of 32 bytes. ~648.862 MB/s corrected.
this:memcpy:32:100000000:8:2.711365: took 2.711365 s for 100000000
calls to memcpy of 32 bytes. ~1682.425 MB/s corrected.
newlib:memcpy:32:100000000:8:5.602295: took 5.602295 s for 100000000
calls to memcpy of 32 bytes. ~648.643 MB/s corrected.
this:memcpy:63:100000000:1:5.859314: took 5.859314 s for 100000000
calls to memcpy of 63 bytes. ~1210.868 MB/s corrected.
newlib:memcpy:63:100000000:1:5.462769: took 5.462769 s for 100000000
calls to memcpy of 63 bytes. ~1316.044 MB/s corrected.
this:memcpy:63:100000000:2:6.081696: took 6.081696 s for 100000000
calls to memcpy of 63 bytes. ~1158.927 MB/s corrected.
newlib:memcpy:63:100000000:2:5.548157: took 5.548157 s for 100000000
calls to memcpy of 63 bytes. ~1291.881 MB/s corrected.
this:memcpy:63:100000000:4:5.775055: took 5.775055 s for 100000000
calls to memcpy of 63 bytes. ~1231.785 MB/s corrected.
newlib:memcpy:63:100000000:4:4.597931: took 4.597931 s for 100000000
calls to memcpy of 63 bytes. ~1623.617 MB/s corrected.
this:memcpy:63:100000000:8:4.101227: took 4.101227 s for 100000000
calls to memcpy of 63 bytes. ~1875.339 MB/s corrected.
newlib:memcpy:63:100000000:8:4.602753: took 4.602753 s for 100000000
calls to memcpy of 63 bytes. ~1621.504 MB/s corrected.
this:memcpy:64:100000000:1:4.352570: took 4.352570 s for 100000000
calls to memcpy of 64 bytes. ~1766.519 MB/s corrected.
newlib:memcpy:64:100000000:1:5.355713: took 5.355713 s for 100000000
calls to memcpy of 64 bytes. ~1369.038 MB/s corrected.
this:memcpy:64:100000000:2:4.442871: took 4.442871 s for 100000000
calls to memcpy of 64 bytes. ~1721.526 MB/s corrected.
newlib:memcpy:64:100000000:2:5.564880: took 5.564880 s for 100000000
calls to memcpy of 64 bytes. ~1307.685 MB/s corrected.
this:memcpy:64:100000000:4:4.772339: took 4.772339 s for 100000000
calls to memcpy of 64 bytes. ~1575.150 MB/s corrected.
newlib:memcpy:64:100000000:4:5.024628: took 5.024628 s for 100000000
calls to memcpy of 64 bytes. ~1478.863 MB/s corrected.
this:memcpy:64:100000000:8:3.264404: took 3.264404 s for 100000000
calls to memcpy of 64 bytes. ~2578.648 MB/s corrected.
newlib:memcpy:64:100000000:8:4.183289: took 4.183289 s for 100000000
calls to memcpy of 64 bytes. ~1857.527 MB/s corrected.
this:memcpy:72:100000000:1:4.851227: took 4.851227 s for 100000000
calls to memcpy of 72 bytes. ~1736.687 MB/s corrected.
newlib:memcpy:72:100000000:1:5.897339: took 5.897339 s for 100000000
calls to memcpy of 72 bytes. ~1373.324 MB/s corrected.
this:memcpy:72:100000000:2:4.603149: took 4.603149 s for 100000000
calls to memcpy of 72 bytes. ~1852.949 MB/s corrected.
newlib:memcpy:72:100000000:2:6.141449: took 6.141449 s for 100000000
calls to memcpy of 72 bytes. ~1309.395 MB/s corrected.
this:memcpy:72:100000000:4:5.019592: took 5.019592 s for 100000000
calls to memcpy of 72 bytes. ~1665.753 MB/s corrected.
newlib:memcpy:72:100000000:4:5.530426: took 5.530426 s for 100000000
calls to memcpy of 72 bytes. ~1482.086 MB/s corrected.
this:memcpy:72:100000000:8:3.971161: took 3.971161 s for 100000000
calls to memcpy of 72 bytes. ~2233.937 MB/s corrected.
newlib:memcpy:72:100000000:8:4.518860: took 4.518860 s for 100000000
calls to memcpy of 72 bytes. ~1896.078 MB/s corrected.
this:memcpy:96:100000000:1:4.939087: took 4.939087 s for 100000000
calls to memcpy of 96 bytes. ~2265.245 MB/s corrected.
newlib:memcpy:96:100000000:1:9.204254: took 9.204254 s for 100000000
calls to memcpy of 96 bytes. ~1102.143 MB/s corrected.
this:memcpy:96:100000000:2:5.021393: took 5.021393 s for 100000000
calls to memcpy of 96 bytes. ~2220.035 MB/s corrected.
newlib:memcpy:96:100000000:2:9.454041: took 9.454041 s for 100000000
calls to memcpy of 96 bytes. ~1069.969 MB/s corrected.
this:memcpy:96:100000000:4:5.274933: took 5.274933 s for 100000000
calls to memcpy of 96 bytes. ~2091.452 MB/s corrected.
newlib:memcpy:96:100000000:4:6.463531: took 6.463531 s for 100000000
calls to memcpy of 96 bytes. ~1644.836 MB/s corrected.
this:memcpy:96:100000000:8:4.519592: took 4.519592 s for 100000000
calls to memcpy of 96 bytes. ~2527.592 MB/s corrected.
newlib:memcpy:96:100000000:8:8.443573: took 8.443573 s for 100000000
calls to memcpy of 96 bytes. ~1213.244 MB/s corrected.
this:memcpy:128:100000000:1:5.774780: took 5.774780 s for 100000000
calls to memcpy of 128 bytes. ~2502.815 MB/s corrected.
newlib:memcpy:128:100000000:1:8.236755: took 8.236755 s for 100000000
calls to memcpy of 128 bytes. ~1663.243 MB/s corrected.
this:memcpy:128:100000000:2:5.685516: took 5.685516 s for 100000000
calls to memcpy of 128 bytes. ~2549.475 MB/s corrected.
newlib:memcpy:128:100000000:2:8.489899: took 8.489899 s for 100000000
calls to memcpy of 128 bytes. ~1607.788 MB/s corrected.
this:memcpy:128:100000000:4:6.190735: took 6.190735 s for 100000000
calls to memcpy of 128 bytes. ~2306.140 MB/s corrected.
newlib:memcpy:128:100000000:4:7.871613: took 7.871613 s for 100000000
calls to memcpy of 128 bytes. ~1750.325 MB/s corrected.
this:memcpy:128:100000000:8:4.352386: took 4.352386 s for 100000000
calls to memcpy of 128 bytes. ~3533.225 MB/s corrected.
newlib:memcpy:128:100000000:8:6.947327: took 6.947327 s for 100000000
calls to memcpy of 128 bytes. ~2017.736 MB/s corrected.
this:memcpy:160:100000000:1:6.361786: took 6.361786 s for 100000000
calls to memcpy of 160 bytes. ~2792.438 MB/s corrected.
newlib:memcpy:160:100000000:1:9.669708: took 9.669708 s for 100000000
calls to memcpy of 160 bytes. ~1739.439 MB/s corrected.
this:memcpy:160:100000000:2:6.437195: took 6.437195 s for 100000000
calls to memcpy of 160 bytes. ~2754.426 MB/s corrected.
newlib:memcpy:160:100000000:2:9.913208: took 9.913208 s for 100000000
calls to memcpy of 160 bytes. ~1692.460 MB/s corrected.
this:memcpy:160:100000000:4:6.688354: took 6.688354 s for 100000000
calls to memcpy of 160 bytes. ~2634.962 MB/s corrected.
newlib:memcpy:160:100000000:4:10.134827: took 10.134827 s for
100000000 calls to memcpy of 160 bytes. ~1651.855 MB/s corrected.
this:memcpy:160:100000000:8:5.942505: took 5.942505 s for 100000000
calls to memcpy of 160 bytes. ~3024.510 MB/s corrected.
newlib:memcpy:160:100000000:8:8.276428: took 8.276428 s for 100000000
calls to memcpy of 160 bytes. ~2067.876 MB/s corrected.
> diff --git a/newlib/libc/machine/arm/memcpy-stub.c b/newlib/libc/machine/arm/memcpy-stub.c
> index 536b869..8a09b5c 100644
> --- a/newlib/libc/machine/arm/memcpy-stub.c
> +++ b/newlib/libc/machine/arm/memcpy-stub.c
> @@ -29,7 +29,8 @@
> /* The sole purpose of this file is to include the plain memcpy provided in newlib.
> An optimized version of memcpy is provided in the assembly file memcpy.S in this directory. */
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
> - (!(defined (__ARM_ARCH_7A__))))
> + (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
> + defined (__ARMEL__))))
>
> #include "../../string/memcpy.c"
>
> diff --git a/newlib/libc/machine/arm/memcpy.S b/newlib/libc/machine/arm/memcpy.S
> index e408ed0..2eeeca8 100644
> --- a/newlib/libc/machine/arm/memcpy.S
> +++ b/newlib/libc/machine/arm/memcpy.S
> @@ -1,423 +1,627 @@
> -/*
> - * Copyright (c) 2011 ARM Ltd
> - * All rights reserved.
> - *
> - * Redistribution and use in source and binary forms, with or without
> - * modification, are permitted provided that the following conditions
> - * are met:
> - * 1. Redistributions of source code must retain the above copyright
> - * notice, this list of conditions and the following disclaimer.
> - * 2. Redistributions in binary form must reproduce the above copyright
> - * notice, this list of conditions and the following disclaimer in the
> - * documentation and/or other materials provided with the distribution.
> - * 3. The name of the company may not be used to endorse or promote
> - * products derived from this software without specific prior written
> - * permission.
> - *
> - * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
> - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
> - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
> - * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
> - * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
> - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
> - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
> - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +/* Copyright (c) 2013, Linaro Limited
> + All rights reserved.
> +
> + Redistribution and use in source and binary forms, with or without
> + modification, are permitted provided that the following conditions
> + are met:
> +
> + * Redistributions of source code must retain the above copyright
> + notice, this list of conditions and the following disclaimer.
> +
> + * Redistributions in binary form must reproduce the above copyright
> + notice, this list of conditions and the following disclaimer in the
> + documentation and/or other materials provided with the distribution.
> +
> + * Neither the name of Linaro Limited nor the names of its
> + contributors may be used to endorse or promote products derived
> + from this software without specific prior written permission.
> +
> + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> +
> + This memcpy routine is optimised for Cortex-A cores and takes advantage
> + of VFP or NEON when built with the appropriate flags.
> +
> + Assumptions:
> +
> + ARMv6 (ARMv7-a if using Neon)
> + ARM state
> + Unaligned accesses
> + LDRD/STRD support unaligned word accesses
> + Not tested on big-endian
> +
> */
>
> #if (defined (__OPTIMIZE_SIZE__) || defined (PREFER_SIZE_OVER_SPEED) || \
> - (!(defined (__ARM_ARCH_7A__))))
> + (!(defined (__ARM_ARCH_7A__) && defined (__ARM_FEATURE_UNALIGNED) && \
> + defined (__ARMEL__))))
>
> /* Do nothing here. See memcpy-stub.c in the same directory. */
>
> #else
> - /* Prototype: void *memcpy (void *dst, const void *src, size_t count). */
>
> - /* Use the version of memcpy implemented using LDRD and STRD.
> - This version is tuned for Cortex-A15.
> - This might not be the best for other ARMv7-A CPUs,
> - but there is no predefine to distinguish between
> - different CPUs in the same architecture,
> - and this version is better than the plain memcpy provided in newlib.
> + .syntax unified
> + /* This implementation requires ARM state. */
> + .arm
> +
> +#ifdef __ARM_NEON__
> +
> + .fpu neon
> + .arch armv7-a
> +# define FRAME_SIZE 4
> +# define USE_VFP
> +# define USE_NEON
> +
> +#elif !defined (__SOFTFP__)
> +
> + .arch armv6
> + .fpu vfpv2
> +# define FRAME_SIZE 32
> +# define USE_VFP
> +
> +#else
> + .arch armv6
> +# define FRAME_SIZE 32
> +
> +#endif
> +
> +/* Old versions of GAS incorrectly implement the NEON align semantics. */
> +#ifdef BROKEN_ASM_NEON_ALIGN
> +#define ALIGN(addr, align) addr,:align
> +#else
> +#define ALIGN(addr, align) addr:align
> +#endif
>
> - Therefore, we use this version for all ARMv7-A CPUS. */
> +#define PC_OFFSET 8 /* PC pipeline compensation. */
> +#define INSN_SIZE 4
> +
> +/* Call parameters. */
> +#define dstin r0
> +#define src r1
> +#define count r2
> +
> +/* Locals. */
> +#define tmp1 r3
> +#define dst ip
> +#define tmp2 r10
> +
> +#ifndef USE_NEON
> +/* For bulk copies using GP registers. */
> +#define A_l r2 /* Call-clobbered. */
> +#define A_h r3 /* Call-clobbered. */
> +#define B_l r4
> +#define B_h r5
> +#define C_l r6
> +#define C_h r7
> +#define D_l r8
> +#define D_h r9
> +#endif
>
> - /* To make the same code compile for both ARM and Thumb instruction
> - sets, switch to unified syntax at the beginning of this function.
> - However, by using the same code, we may be missing optimization
> - opportunities. For instance, in LDRD/STRD instructions, the first
> - destination register must be even and the second consecutive in
> - ARM state, but not in Thumb state. */
> +/* Number of lines ahead to pre-fetch data. If you change this the code
> + below will need adjustment to compensate. */
> +
> +#define prefetch_lines 5
> +
> +#ifdef USE_VFP
> + .macro cpy_line_vfp vreg, base
> + vstr \vreg, [dst, #\base]
> + vldr \vreg, [src, #\base]
> + vstr d0, [dst, #\base + 8]
> + vldr d0, [src, #\base + 8]
> + vstr d1, [dst, #\base + 16]
> + vldr d1, [src, #\base + 16]
> + vstr d2, [dst, #\base + 24]
> + vldr d2, [src, #\base + 24]
> + vstr \vreg, [dst, #\base + 32]
> + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
> + vstr d0, [dst, #\base + 40]
> + vldr d0, [src, #\base + 40]
> + vstr d1, [dst, #\base + 48]
> + vldr d1, [src, #\base + 48]
> + vstr d2, [dst, #\base + 56]
> + vldr d2, [src, #\base + 56]
> + .endm
> +
> + .macro cpy_tail_vfp vreg, base
> + vstr \vreg, [dst, #\base]
> + vldr \vreg, [src, #\base]
> + vstr d0, [dst, #\base + 8]
> + vldr d0, [src, #\base + 8]
> + vstr d1, [dst, #\base + 16]
> + vldr d1, [src, #\base + 16]
> + vstr d2, [dst, #\base + 24]
> + vldr d2, [src, #\base + 24]
> + vstr \vreg, [dst, #\base + 32]
> + vstr d0, [dst, #\base + 40]
> + vldr d0, [src, #\base + 40]
> + vstr d1, [dst, #\base + 48]
> + vldr d1, [src, #\base + 48]
> + vstr d2, [dst, #\base + 56]
> + vldr d2, [src, #\base + 56]
> + .endm
> +#endif
>
> - .syntax unified
> + .macro def_fn f p2align=0
> + .text
> + .p2align \p2align
> + .global \f
> + .type \f, %function
> +\f:
> + .endm
> +
> +def_fn memcpy p2align=6
> +
> + mov dst, dstin /* Preserve dstin, we need to return it. */
> + cmp count, #64
> + bge .Lcpy_not_short
> + /* Deal with small copies quickly by dropping straight into the
> + exit block. */
> +
> +.Ltail63unaligned:
> +#ifdef USE_NEON
> + and tmp1, count, #0x38
> + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> + add pc, pc, tmp1
> + vld1.8 {d0}, [src]! /* 14 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 12 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 10 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 8 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 6 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 4 words to go. */
> + vst1.8 {d0}, [dst]!
> + vld1.8 {d0}, [src]! /* 2 words to go. */
> + vst1.8 {d0}, [dst]!
> +
> + tst count, #4
> + ldrne tmp1, [src], #4
> + strne tmp1, [dst], #4
> +#else
> + /* Copy up to 15 full words of data. May not be aligned. */
> + /* Cannot use VFP for unaligned data. */
> + and tmp1, count, #0x3c
> + add dst, dst, tmp1
> + add src, src, tmp1
> + rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
> + /* Jump directly into the sequence below at the correct offset. */
> + add pc, pc, tmp1, lsl #1
> +
> + ldr tmp1, [src, #-60] /* 15 words to go. */
> + str tmp1, [dst, #-60]
> +
> + ldr tmp1, [src, #-56] /* 14 words to go. */
> + str tmp1, [dst, #-56]
> + ldr tmp1, [src, #-52]
> + str tmp1, [dst, #-52]
> +
> + ldr tmp1, [src, #-48] /* 12 words to go. */
> + str tmp1, [dst, #-48]
> + ldr tmp1, [src, #-44]
> + str tmp1, [dst, #-44]
> +
> + ldr tmp1, [src, #-40] /* 10 words to go. */
> + str tmp1, [dst, #-40]
> + ldr tmp1, [src, #-36]
> + str tmp1, [dst, #-36]
> +
> + ldr tmp1, [src, #-32] /* 8 words to go. */
> + str tmp1, [dst, #-32]
> + ldr tmp1, [src, #-28]
> + str tmp1, [dst, #-28]
> +
> + ldr tmp1, [src, #-24] /* 6 words to go. */
> + str tmp1, [dst, #-24]
> + ldr tmp1, [src, #-20]
> + str tmp1, [dst, #-20]
> +
> + ldr tmp1, [src, #-16] /* 4 words to go. */
> + str tmp1, [dst, #-16]
> + ldr tmp1, [src, #-12]
> + str tmp1, [dst, #-12]
> +
> + ldr tmp1, [src, #-8] /* 2 words to go. */
> + str tmp1, [dst, #-8]
> + ldr tmp1, [src, #-4]
> + str tmp1, [dst, #-4]
> +#endif
>
> -#if defined (__thumb__)
> - .thumb
> - .thumb_func
> + lsls count, count, #31
> + ldrhcs tmp1, [src], #2
> + ldrbne src, [src] /* Src is dead, use as a scratch. */
> + strhcs tmp1, [dst], #2
> + strbne src, [dst]
> + bx lr
> +
> +.Lcpy_not_short:
> + /* At least 64 bytes to copy, but don't know the alignment yet. */
> + str tmp2, [sp, #-FRAME_SIZE]!
> + and tmp2, src, #3
> + and tmp1, dst, #3
> + cmp tmp1, tmp2
> + bne .Lcpy_notaligned
> +
> +#ifdef USE_VFP
> + /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
> + that the FP pipeline is much better at streaming loads and
> + stores. This is outside the critical loop. */
> + vmov.f32 s0, s0
> #endif
>
> - .global memcpy
> - .type memcpy, %function
> -memcpy:
> -
> - /* Assumes that n >= 0, and dst, src are valid pointers.
> - If there is at least 8 bytes to copy, use LDRD/STRD.
> - If src and dst are misaligned with different offsets,
> - first copy byte by byte until dst is aligned,
> - and then copy using LDRD/STRD and shift if needed.
> - When less than 8 left, copy a word and then byte by byte. */
> -
> - /* Save registers (r0 holds the return value):
> - optimized push {r0, r4, r5, lr}.
> - To try and improve performance, stack layout changed,
> - i.e., not keeping the stack looking like users expect
> - (highest numbered register at highest address). */
> - push {r0, lr}
> - strd r4, r5, [sp, #-8]!
> -
> - /* TODO: Add debug frame directives.
> - We don't need exception unwind directives, because the code below
> - does not throw any exceptions and does not call any other functions.
> - Generally, newlib functions like this lack debug information for
> - assembler source. */
> -
> - /* Get copying of tiny blocks out of the way first. */
> - /* Is there at least 4 bytes to copy? */
> - subs r2, r2, #4
> - blt copy_less_than_4 /* If n < 4. */
> -
> - /* Check word alignment. */
> - ands ip, r0, #3 /* ip = last 2 bits of dst. */
> - bne dst_not_word_aligned /* If dst is not word-aligned. */
> -
> - /* Get here if dst is word-aligned. */
> - ands ip, r1, #3 /* ip = last 2 bits of src. */
> - bne src_not_word_aligned /* If src is not word-aligned. */
> -word_aligned:
> - /* Get here if source and dst both are word-aligned.
> - The number of bytes remaining to copy is r2+4. */
> -
> - /* Is there is at least 64 bytes to copy? */
> - subs r2, r2, #60
> - blt copy_less_than_64 /* If r2 + 4 < 64. */
> -
> - /* First, align the destination buffer to 8-bytes,
> - to make sure double loads and stores don't cross cache line boundary,
> - as they are then more expensive even if the data is in the cache
> - (require two load/store issue cycles instead of one).
> - If only one of the buffers is not 8-bytes aligned,
> - then it's more important to align dst than src,
> - because there is more penalty for stores
> - than loads that cross cacheline boundary.
> - This check and realignment are only worth doing
> - if there is a lot to copy. */
> -
> - /* Get here if dst is word aligned,
> - i.e., the 2 least significant bits are 0.
> - If dst is not 2w aligned (i.e., the 3rd bit is not set in dst),
> - then copy 1 word (4 bytes). */
> - ands r3, r0, #4
> - beq 11f /* If dst already two-word aligned. */
> - ldr r3, [r1], #4
> - str r3, [r0], #4
> - subs r2, r2, #4
> - blt copy_less_than_64
> -
> -11:
> - /* TODO: Align to cacheline (useful for PLD optimization). */
> -
> - /* Every loop iteration copies 64 bytes. */
> + /* SRC and DST have the same mutual 32-bit alignment, but we may
> + still need to pre-copy some bytes to get to natural alignment.
> + We bring DST into full 64-bit alignment. */
> + lsls tmp2, dst, #29
> + beq 1f
> + rsbs tmp2, tmp2, #0
> + sub count, count, tmp2, lsr #29
> + ldrmi tmp1, [src], #4
> + strmi tmp1, [dst], #4
> + lsls tmp2, tmp2, #2
> + ldrhcs tmp1, [src], #2
> + ldrbne tmp2, [src], #1
> + strhcs tmp1, [dst], #2
> + strbne tmp2, [dst], #1
> +
> 1:
> - .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
> - ldrd r4, r5, [r1, \offset]
> - strd r4, r5, [r0, \offset]
> - .endr
> + subs tmp2, count, #64 /* Use tmp2 for count. */
> + blt .Ltail63aligned
> +
> + cmp tmp2, #512
> + bge .Lcpy_body_long
>
> - add r0, r0, #64
> - add r1, r1, #64
> - subs r2, r2, #64
> - bge 1b /* If there is more to copy. */
> +.Lcpy_body_medium: /* Count in tmp2. */
> +#ifdef USE_VFP
> +1:
> + vldr d0, [src, #0]
> + subs tmp2, tmp2, #64
> + vldr d1, [src, #8]
> + vstr d0, [dst, #0]
> + vldr d0, [src, #16]
> + vstr d1, [dst, #8]
> + vldr d1, [src, #24]
> + vstr d0, [dst, #16]
> + vldr d0, [src, #32]
> + vstr d1, [dst, #24]
> + vldr d1, [src, #40]
> + vstr d0, [dst, #32]
> + vldr d0, [src, #48]
> + vstr d1, [dst, #40]
> + vldr d1, [src, #56]
> + vstr d0, [dst, #48]
> + add src, src, #64
> + vstr d1, [dst, #56]
> + add dst, dst, #64
> + bge 1b
> + tst tmp2, #0x3f
> + beq .Ldone
> +
> +.Ltail63aligned: /* Count in tmp2. */
> + and tmp1, tmp2, #0x38
> + add dst, dst, tmp1
> + add src, src, tmp1
> + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> + add pc, pc, tmp1
> +
> + vldr d0, [src, #-56] /* 14 words to go. */
> + vstr d0, [dst, #-56]
> + vldr d0, [src, #-48] /* 12 words to go. */
> + vstr d0, [dst, #-48]
> + vldr d0, [src, #-40] /* 10 words to go. */
> + vstr d0, [dst, #-40]
> + vldr d0, [src, #-32] /* 8 words to go. */
> + vstr d0, [dst, #-32]
> + vldr d0, [src, #-24] /* 6 words to go. */
> + vstr d0, [dst, #-24]
> + vldr d0, [src, #-16] /* 4 words to go. */
> + vstr d0, [dst, #-16]
> + vldr d0, [src, #-8] /* 2 words to go. */
> + vstr d0, [dst, #-8]
> +#else
> + sub src, src, #8
> + sub dst, dst, #8
> +1:
> + ldrd A_l, A_h, [src, #8]
> + strd A_l, A_h, [dst, #8]
> + ldrd A_l, A_h, [src, #16]
> + strd A_l, A_h, [dst, #16]
> + ldrd A_l, A_h, [src, #24]
> + strd A_l, A_h, [dst, #24]
> + ldrd A_l, A_h, [src, #32]
> + strd A_l, A_h, [dst, #32]
> + ldrd A_l, A_h, [src, #40]
> + strd A_l, A_h, [dst, #40]
> + ldrd A_l, A_h, [src, #48]
> + strd A_l, A_h, [dst, #48]
> + ldrd A_l, A_h, [src, #56]
> + strd A_l, A_h, [dst, #56]
> + ldrd A_l, A_h, [src, #64]!
> + strd A_l, A_h, [dst, #64]!
> + subs tmp2, tmp2, #64
> + bge 1b
> + tst tmp2, #0x3f
> + bne 1f
> + ldr tmp2,[sp], #FRAME_SIZE
> + bx lr
> +1:
> + add src, src, #8
> + add dst, dst, #8
> +
> +.Ltail63aligned: /* Count in tmp2. */
> + /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
> + we know that the src and dest are 32-bit aligned so we can use
> + LDRD/STRD to improve efficiency. */
> + /* TMP2 is now negative, but we don't care about that. The bottom
> + six bits still tell us how many bytes are left to copy. */
> +
> + and tmp1, tmp2, #0x38
> + add dst, dst, tmp1
> + add src, src, tmp1
> + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
> + add pc, pc, tmp1
> + ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
> + strd A_l, A_h, [dst, #-56]
> + ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
> + strd A_l, A_h, [dst, #-48]
> + ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
> + strd A_l, A_h, [dst, #-40]
> + ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
> + strd A_l, A_h, [dst, #-32]
> + ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
> + strd A_l, A_h, [dst, #-24]
> + ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
> + strd A_l, A_h, [dst, #-16]
> + ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
> + strd A_l, A_h, [dst, #-8]
>
> -copy_less_than_64:
> +#endif
> + tst tmp2, #4
> + ldrne tmp1, [src], #4
> + strne tmp1, [dst], #4
> + lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
> + ldrhcs tmp1, [src], #2
> + ldrbne tmp2, [src]
> + strhcs tmp1, [dst], #2
> + strbne tmp2, [dst]
> +
> +.Ldone:
> + ldr tmp2, [sp], #FRAME_SIZE
> + bx lr
> +
> +.Lcpy_body_long: /* Count in tmp2. */
> +
> + /* Long copy. We know that there's at least (prefetch_lines * 64)
> + bytes to go. */
> +#ifdef USE_VFP
> + /* Don't use PLD. Instead, read some data in advance of the current
> + copy position into a register. This should act like a PLD
> + operation but we won't have to repeat the transfer. */
> +
> + vldr d3, [src, #0]
> + vldr d4, [src, #64]
> + vldr d5, [src, #128]
> + vldr d6, [src, #192]
> + vldr d7, [src, #256]
> +
> + vldr d0, [src, #8]
> + vldr d1, [src, #16]
> + vldr d2, [src, #24]
> + add src, src, #32
> +
> + subs tmp2, tmp2, #prefetch_lines * 64 * 2
> + blt 2f
> +1:
> + cpy_line_vfp d3, 0
> + cpy_line_vfp d4, 64
> + cpy_line_vfp d5, 128
> + add dst, dst, #3 * 64
> + add src, src, #3 * 64
> + cpy_line_vfp d6, 0
> + cpy_line_vfp d7, 64
> + add dst, dst, #2 * 64
> + add src, src, #2 * 64
> + subs tmp2, tmp2, #prefetch_lines * 64
> + bge 1b
>
> - /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
> - Restore the count if there is more than 7 bytes to copy. */
> - adds r2, r2, #56
> - blt copy_less_than_8
> +2:
> + cpy_tail_vfp d3, 0
> + cpy_tail_vfp d4, 64
> + cpy_tail_vfp d5, 128
> + add src, src, #3 * 64
> + add dst, dst, #3 * 64
> + cpy_tail_vfp d6, 0
> + vstr d7, [dst, #64]
> + vldr d7, [src, #64]
> + vstr d0, [dst, #64 + 8]
> + vldr d0, [src, #64 + 8]
> + vstr d1, [dst, #64 + 16]
> + vldr d1, [src, #64 + 16]
> + vstr d2, [dst, #64 + 24]
> + vldr d2, [src, #64 + 24]
> + vstr d7, [dst, #64 + 32]
> + add src, src, #96
> + vstr d0, [dst, #64 + 40]
> + vstr d1, [dst, #64 + 48]
> + vstr d2, [dst, #64 + 56]
> + add dst, dst, #128
> + add tmp2, tmp2, #prefetch_lines * 64
> + b .Lcpy_body_medium
> +#else
> + /* Long copy. Use an SMS style loop to maximize the I/O
> + bandwidth of the core. We don't have enough spare registers
> + to synthesise prefetching, so use PLD operations. */
> + /* Pre-bias src and dst. */
> + sub src, src, #8
> + sub dst, dst, #8
> + pld [src, #8]
> + pld [src, #72]
> + subs tmp2, tmp2, #64
> + pld [src, #136]
> + ldrd A_l, A_h, [src, #8]
> + strd B_l, B_h, [sp, #8]
> + ldrd B_l, B_h, [src, #16]
> + strd C_l, C_h, [sp, #16]
> + ldrd C_l, C_h, [src, #24]
> + strd D_l, D_h, [sp, #24]
> + pld [src, #200]
> + ldrd D_l, D_h, [src, #32]!
> + b 1f
> + .p2align 6
> +2:
> + pld [src, #232]
> + strd A_l, A_h, [dst, #40]
> + ldrd A_l, A_h, [src, #40]
> + strd B_l, B_h, [dst, #48]
> + ldrd B_l, B_h, [src, #48]
> + strd C_l, C_h, [dst, #56]
> + ldrd C_l, C_h, [src, #56]
> + strd D_l, D_h, [dst, #64]!
> + ldrd D_l, D_h, [src, #64]!
> + subs tmp2, tmp2, #64
> +1:
> + strd A_l, A_h, [dst, #8]
> + ldrd A_l, A_h, [src, #8]
> + strd B_l, B_h, [dst, #16]
> + ldrd B_l, B_h, [src, #16]
> + strd C_l, C_h, [dst, #24]
> + ldrd C_l, C_h, [src, #24]
> + strd D_l, D_h, [dst, #32]
> + ldrd D_l, D_h, [src, #32]
> + bcs 2b
> + /* Save the remaining bytes and restore the callee-saved regs. */
> + strd A_l, A_h, [dst, #40]
> + add src, src, #40
> + strd B_l, B_h, [dst, #48]
> + ldrd B_l, B_h, [sp, #8]
> + strd C_l, C_h, [dst, #56]
> + ldrd C_l, C_h, [sp, #16]
> + strd D_l, D_h, [dst, #64]
> + ldrd D_l, D_h, [sp, #24]
> + add dst, dst, #72
> + tst tmp2, #0x3f
> + bne .Ltail63aligned
> + ldr tmp2, [sp], #FRAME_SIZE
> + bx lr
> +#endif
>
> - /* Copy 8 bytes at a time. */
> +.Lcpy_notaligned:
> + pld [src]
> + pld [src, #64]
> + /* There's at least 64 bytes to copy, but there is no mutual
> + alignment. */
> + /* Bring DST to 64-bit alignment. */
> + lsls tmp2, dst, #29
> + pld [src, #(2 * 64)]
> + beq 1f
> + rsbs tmp2, tmp2, #0
> + sub count, count, tmp2, lsr #29
> + ldrmi tmp1, [src], #4
> + strmi tmp1, [dst], #4
> + lsls tmp2, tmp2, #2
> + ldrbne tmp1, [src], #1
> + ldrhcs tmp2, [src], #2
> + strbne tmp1, [dst], #1
> + strhcs tmp2, [dst], #2
> +1:
> + pld [src, #(3 * 64)]
> + subs count, count, #64
> + ldrmi tmp2, [sp], #FRAME_SIZE
> + bmi .Ltail63unaligned
> + pld [src, #(4 * 64)]
> +
> +#ifdef USE_NEON
> + vld1.8 {d0-d3}, [src]!
> + vld1.8 {d4-d7}, [src]!
> + subs count, count, #64
> + bmi 2f
> +1:
> + pld [src, #(4 * 64)]
> + vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
> + vld1.8 {d0-d3}, [src]!
> + vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
> + vld1.8 {d4-d7}, [src]!
> + subs count, count, #64
> + bpl 1b
> 2:
> - ldrd r4, r5, [r1], #8
> - strd r4, r5, [r0], #8
> - subs r2, r2, #8
> - bge 2b /* If there is more to copy. */
> -
> -copy_less_than_8:
> -
> - /* Get here if less than 8 bytes to copy, -8 <= r2 < 0.
> - Check if there is more to copy. */
> - cmn r2, #8
> - beq return /* If r2 + 8 == 0. */
> -
> - /* Restore the count if there is more than 3 bytes to copy. */
> - adds r2, r2, #4
> - blt copy_less_than_4
> -
> - /* Copy 4 bytes. */
> - ldr r3, [r1], #4
> - str r3, [r0], #4
> -
> -copy_less_than_4:
> - /* Get here if less than 4 bytes to copy, -4 <= r2 < 0. */
> -
> - /* Restore the count, check if there is more to copy. */
> - adds r2, r2, #4
> - beq return /* If r2 == 0. */
> -
> - /* Get here with r2 is in {1,2,3}={01,10,11}. */
> - /* Logical shift left r2, insert 0s, update flags. */
> - lsls r2, r2, #31
> -
> - /* Copy byte by byte.
> - Condition ne means the last bit of r2 is 0.
> - Condition cs means the second to last bit of r2 is set,
> - i.e., r2 is 1 or 3. */
> - itt ne
> - ldrbne r3, [r1], #1
> - strbne r3, [r0], #1
> -
> - itttt cs
> - ldrbcs r4, [r1], #1
> - ldrbcs r5, [r1]
> - strbcs r4, [r0], #1
> - strbcs r5, [r0]
> -
> -return:
> - /* Restore registers: optimized pop {r0, r4, r5, pc} */
> - ldrd r4, r5, [sp], #8
> - pop {r0, pc} /* This is the only return point of memcpy. */
> -
> -#ifndef __ARM_FEATURE_UNALIGNED
> -
> - /* The following assembly macro implements misaligned copy in software.
> - Assumes that dst is word aligned, src is at offset "pull" bits from
> - word, push = 32 - pull, and the number of bytes that remain to copy
> - is r2 + 4, r2 >= 0. */
> -
> - /* In the code below, r2 is the number of bytes that remain to be
> - written. The number of bytes read is always larger, because we have
> - partial words in the shift queue. */
> -
> - .macro miscopy pull push shiftleft shiftright
> -
> - /* Align src to the previous word boundary. */
> - bic r1, r1, #3
> -
> - /* Initialize the shift queue. */
> - ldr r5, [r1], #4 /* Load a word from source. */
> -
> - subs r2, r2, #4
> - blt 6f /* Go to misaligned copy of less than 8 bytes. */
> -
> - /* Get here if there is more than 8 bytes to copy.
> - The number of bytes to copy is r2+8, r2 >= 0. */
> -
> - /* Save registers: push { r6, r7 }.
> - We need additional registers for LDRD and STRD, because in ARM state
> - the first destination register must be even and the second
> - consecutive. */
> - strd r6, r7, [sp, #-8]!
> -
> - subs r2, r2, #56
> - blt 4f /* Go to misaligned copy of less than 64 bytes. */
> -
> -3:
> - /* Get here if there is more than 64 bytes to copy.
> - The number of bytes to copy is r2+64, r2 >= 0. */
> -
> - /* Copy 64 bytes in every iteration.
> - Use a partial word from the shift queue. */
> - .irp offset, #0, #8, #16, #24, #32, #40, #48, #56
> - mov r6, r5, \shiftleft #\pull
> - ldrd r4, r5, [r1, \offset]
> - orr r6, r6, r4, \shiftright #\push
> - mov r7, r4, \shiftleft #\pull
> - orr r7, r7, r5, \shiftright #\push
> - strd r6, r7, [r0, \offset]
> - .endr
> -
> - add r1, r1, #64
> - add r0, r0, #64
> - subs r2, r2, #64
> - bge 3b
> -
> -4:
> - /* Get here if there is less than 64 bytes to copy (-64 <= r2 < 0)
> - and they are misaligned. */
> -
> - /* Restore the count if there is more than 7 bytes to copy. */
> - adds r2, r2, #56
> -
> - /* If less than 8 bytes to copy,
> - restore registers saved for this loop: optimized poplt { r6, r7 }. */
> - itt lt
> - ldrdlt r6, r7, [sp], #8
> - blt 6f /* Go to misaligned copy of less than 8 bytes. */
> -
> -5:
> - /* Copy 8 bytes at a time.
> - Use a partial word from the shift queue. */
> - mov r6, r5, \shiftleft #\pull
> - ldrd r4, r5, [r1], #8
> - orr r6, r6, r4, \shiftright #\push
> - mov r7, r4, \shiftleft #\pull
> - orr r7, r7, r5, \shiftright #\push
> - strd r6, r7, [r0], #8
> -
> - subs r2, r2, #8
> - bge 5b /* If there is more to copy. */
> -
> - /* Restore registers saved for this loop: optimized pop { r6, r7 }. */
> - ldrd r6, r7, [sp], #8
> -
> -6:
> -  /* Get here if there is less than 8 bytes to copy (-8 <= r2 < 0)
> - and they are misaligned. */
> -
> - /* Check if there is more to copy. */
> - cmn r2, #8
> - beq return
> -
> - /* Check if there is less than 4 bytes to copy. */
> - cmn r2, #4
> -
> - itt lt
> - /* Restore src offset from word-align. */
> - sublt r1, r1, #(\push / 8)
> - blt copy_less_than_4
> -
> - /* Use a partial word from the shift queue. */
> - mov r3, r5, \shiftleft #\pull
> - /* Load a word from src, but without writeback
> - (this word is not fully written to dst). */
> - ldr r5, [r1]
> -
> - /* Restore src offset from word-align. */
> - add r1, r1, #(\pull / 8)
> -
> - /* Shift bytes to create one dst word and store it. */
> - orr r3, r3, r5, \shiftright #\push
> - str r3, [r0], #4
> -
> - /* Use single byte copying of the remaining bytes. */
> - b copy_less_than_4
> -
> - .endm
> -
> -#endif /* not __ARM_FEATURE_UNALIGNED */
> -
> -dst_not_word_aligned:
> -
> - /* Get here when dst is not aligned and ip has the last 2 bits of dst,
> - i.e., ip is the offset of dst from word.
> - The number of bytes that remains to copy is r2 + 4,
> - i.e., there are at least 4 bytes to copy.
> - Write a partial word (0 to 3 bytes), such that dst becomes
> - word-aligned. */
> -
> - /* If dst is at ip bytes offset from a word (with 0 < ip < 4),
> - then there are (4 - ip) bytes to fill up to align dst to the next
> - word. */
> - rsb ip, ip, #4 /* ip = #4 - ip. */
> - cmp ip, #2
> -
> - /* Copy byte by byte with conditionals. */
> - itt gt
> - ldrbgt r3, [r1], #1
> - strbgt r3, [r0], #1
> -
> - itt ge
> - ldrbge r4, [r1], #1
> - strbge r4, [r0], #1
> -
> - ldrb lr, [r1], #1
> - strb lr, [r0], #1
> -
> - /* Update the count.
> - ip holds the number of bytes we have just copied. */
> - subs r2, r2, ip /* r2 = r2 - ip. */
> - blt copy_less_than_4 /* If r2 < ip. */
> -
> - /* Get here if there are more than 4 bytes to copy.
> - Check if src is aligned. If beforehand src and dst were not word
> - aligned but congruent (same offset), then now they are both
> - word-aligned, and we can copy the rest efficiently (without
> - shifting). */
> - ands ip, r1, #3 /* ip = last 2 bits of src. */
> - beq word_aligned /* If r1 is word-aligned. */
> -
> -src_not_word_aligned:
> - /* Get here when src is not word-aligned, but dst is word-aligned.
> - The number of bytes that remains to copy is r2+4. */
> -
> -#ifdef __ARM_FEATURE_UNALIGNED
> - /* Copy word by word using LDR when alignment can be done in hardware,
> - i.e., SCTLR.A is set, supporting unaligned access in LDR and STR. */
> - subs r2, r2, #60
> - blt 8f
> -
> -7:
> - /* Copy 64 bytes in every loop iteration. */
> - .irp offset, #0, #4, #8, #12, #16, #20, #24, #28, #32, #36, #40, #44, #48, #52, #56, #60
> - ldr r3, [r1, \offset]
> - str r3, [r0, \offset]
> - .endr
> -
> - add r0, r0, #64
> - add r1, r1, #64
> - subs r2, r2, #64
> - bge 7b
> -
> -8:
> - /* Get here if less than 64 bytes to copy, -64 <= r2 < 0.
> - Check if there is more than 3 bytes to copy. */
> - adds r2, r2, #60
> - blt copy_less_than_4
> -
> -9:
> - /* Get here if there is less than 64 but at least 4 bytes to copy,
> - where the number of bytes to copy is r2+4. */
> - ldr r3, [r1], #4
> - str r3, [r0], #4
> - subs r2, r2, #4
> - bge 9b
> -
> - b copy_less_than_4
> -
> -#else /* not __ARM_FEATURE_UNALIGNED */
> -
> - /* ip has last 2 bits of src,
> - i.e., ip is the offset of src from word, and ip > 0.
> - Compute shifts needed to copy from src to dst. */
> - cmp ip, #2
> - beq miscopy_16_16 /* If ip == 2. */
> - bge miscopy_24_8 /* If ip == 3. */
> -
> - /* Get here if ip == 1. */
> -
> - /* Endian independent macros for shifting bytes within registers. */
> -
> -#ifndef __ARMEB__
> -miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsr shiftright=lsl
> -miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsr shiftright=lsl
> -miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsr shiftright=lsl
> -#else /* not __ARMEB__ */
> -miscopy_8_24: miscopy pull=8 push=24 shiftleft=lsl shiftright=lsr
> -miscopy_16_16: miscopy pull=16 push=16 shiftleft=lsl shiftright=lsr
> -miscopy_24_8: miscopy pull=24 push=8 shiftleft=lsl shiftright=lsr
> -#endif /* not __ARMEB__ */
> -
> -#endif /* not __ARM_FEATURE_UNALIGNED */
> + vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
> + vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
> + ands count, count, #0x3f
> +#else
> + /* Use an SMS style loop to maximize the I/O bandwidth. */
> + sub src, src, #4
> + sub dst, dst, #8
> + subs tmp2, count, #64 /* Use tmp2 for count. */
> + ldr A_l, [src, #4]
> + ldr A_h, [src, #8]
> + strd B_l, B_h, [sp, #8]
> + ldr B_l, [src, #12]
> + ldr B_h, [src, #16]
> + strd C_l, C_h, [sp, #16]
> + ldr C_l, [src, #20]
> + ldr C_h, [src, #24]
> + strd D_l, D_h, [sp, #24]
> + ldr D_l, [src, #28]
> + ldr D_h, [src, #32]!
> + b 1f
> + .p2align 6
> +2:
> + pld [src, #(5 * 64) - (32 - 4)]
> + strd A_l, A_h, [dst, #40]
> + ldr A_l, [src, #36]
> + ldr A_h, [src, #40]
> + strd B_l, B_h, [dst, #48]
> + ldr B_l, [src, #44]
> + ldr B_h, [src, #48]
> + strd C_l, C_h, [dst, #56]
> + ldr C_l, [src, #52]
> + ldr C_h, [src, #56]
> + strd D_l, D_h, [dst, #64]!
> + ldr D_l, [src, #60]
> + ldr D_h, [src, #64]!
> + subs tmp2, tmp2, #64
> +1:
> + strd A_l, A_h, [dst, #8]
> + ldr A_l, [src, #4]
> + ldr A_h, [src, #8]
> + strd B_l, B_h, [dst, #16]
> + ldr B_l, [src, #12]
> + ldr B_h, [src, #16]
> + strd C_l, C_h, [dst, #24]
> + ldr C_l, [src, #20]
> + ldr C_h, [src, #24]
> + strd D_l, D_h, [dst, #32]
> + ldr D_l, [src, #28]
> + ldr D_h, [src, #32]
> + bcs 2b
> +
> + /* Save the remaining bytes and restore the callee-saved regs. */
> + strd A_l, A_h, [dst, #40]
> + add src, src, #36
> + strd B_l, B_h, [dst, #48]
> + ldrd B_l, B_h, [sp, #8]
> + strd C_l, C_h, [dst, #56]
> + ldrd C_l, C_h, [sp, #16]
> + strd D_l, D_h, [dst, #64]
> + ldrd D_l, D_h, [sp, #24]
> + add dst, dst, #72
> + ands count, tmp2, #0x3f
> +#endif
> + ldr tmp2, [sp], #FRAME_SIZE
> + bne .Ltail63unaligned
> + bx lr
> +
> + .size memcpy, . - memcpy
>
> #endif /* memcpy */
> --
> 1.8.1.4
>
--
Will Newton
Toolchain Working Group, Linaro