This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.
Re: [PATCH, AArch64]: Optimize strlen
- From: Richard Earnshaw <rearnsha at arm dot com>
- To: Wilco Dijkstra <wdijkstr at arm dot com>, "newlib at sourceware dot org" <newlib at sourceware dot org>
- Date: Fri, 16 Jan 2015 16:56:42 +0000
- Subject: Re: [PATCH, AArch64]: Optimize strlen
- Authentication-results: sourceware.org; auth=none
- References: <002201d031a9$a8f29a90$fad7cfb0$ at com>
On 16/01/15 16:29, Wilco Dijkstra wrote:
> Optimize the strlen implementation by using a page cross check and a fast check
> for NUL bytes, reverting to a separate loop when a non-ASCII character is
> encountered. The speedup on the cortex-strings test is ~10%; long ASCII strings
> are processed ~60% faster, and random tests show ~80% improvement.
>
> OK for commit?
>
>
Please can you re-send with the patch in unified diff format, i.e. diff -u.
You also need a ChangeLog entry.
R.
> strlen.txt
>
>
> Index: strlen.S
> ===================================================================
> RCS file: /cvs/src/src/newlib/libc/machine/aarch64/strlen.S,v
> retrieving revision 1.1
> diff -r1.1 strlen.S
> 1c1
> < /* Copyright (c) 2013, Linaro Limited
> ---
>> /* Copyright (c) 2013-2015, Linaro Limited
> 7c7
> < notice, this list of conditions and the following disclaimer.
> ---
>> notice, this list of conditions and the following disclaimer.
> 9,10c9,10
> < notice, this list of conditions and the following disclaimer in the
> < documentation and/or other materials provided with the distribution.
> ---
>> notice, this list of conditions and the following disclaimer in the
>> documentation and/or other materials provided with the distribution.
> 12,13c12,13
> < names of its contributors may be used to endorse or promote products
> < derived from this software without specific prior written permission.
> ---
>> names of its contributors may be used to endorse or promote products
>> derived from this software without specific prior written permission.
> 33c33
> < * ARMv8-a, AArch64
> ---
>> * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> 35a36,39
>> /* To test the page crossing code path more thoroughly, compile with
>> -DTEST_PAGE_CROSS - this will force all calls through the slower
>> entry path. This option is not intended for production use. */
>>
> 44,52c48,56
> < #define data2a x4
> < #define has_nul1 x5
> < #define has_nul2 x6
> < #define tmp1 x7
> < #define tmp2 x8
> < #define tmp3 x9
> < #define tmp4 x10
> < #define zeroones x11
> < #define pos x12
> ---
>> #define has_nul1 x4
>> #define has_nul2 x5
>> #define tmp1 x4
>> #define tmp2 x5
>> #define tmp3 x6
>> #define tmp4 x7
>> #define zeroones x8
>>
>> #define L(l) .L ## l
> 61a66,71
>> /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
>> (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
>> can be done in parallel across the entire word. A faster check
>> (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
>> false hits for characters 129..255. */
>>
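For illustration, the two checks in that comment translate to C roughly as
follows. This sketch is not part of the patch; it just demonstrates the bit
trick on a few sample words (the REP8_* constants match the #defines already
in strlen.S):

    #include <stdint.h>
    #include <stdio.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL
    #define REP8_80 0x8080808080808080ULL

    /* Accurate check: non-zero iff some byte of x is 0x00.  Borrow can
       set extra bits above the first NUL byte, but never below it. */
    static uint64_t has_nul(uint64_t x)
    {
        return (x - REP8_01) & ~(x | REP8_7f);
    }

    /* Fast check: zero for words containing only non-NUL ASCII bytes,
       but gives false hits for bytes 0x81..0xff. */
    static uint64_t maybe_nul(uint64_t x)
    {
        return (x - REP8_01) & REP8_80;
    }

    int main(void)
    {
        uint64_t with_nul = 0x0041424344454647ULL; /* top byte is 0x00 */
        uint64_t ascii    = 0x4142434445464748ULL; /* ASCII only, no NUL */
        uint64_t high     = 0x41424344454647ffULL; /* 0xff byte, no NUL */

        printf("has_nul:   %d %d %d\n", has_nul(with_nul) != 0,
               has_nul(ascii) != 0, has_nul(high) != 0);     /* 1 0 0 */
        printf("maybe_nul: %d %d %d\n", maybe_nul(with_nul) != 0,
               maybe_nul(ascii) != 0, maybe_nul(high) != 0); /* 1 0 1 */
        return 0;
    }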
> 66c76,106
> < /* Start of critial section -- keep to one 64Byte cache line. */
> ---
>> #ifdef TEST_PAGE_CROSS
>> # define MIN_PAGE_SIZE 15
>> #else
>> # define MIN_PAGE_SIZE 4096
>> #endif
>>
>> /* Since strings are short on average, we check the first 16 bytes
>> of the string for a NUL character. In order to do an unaligned ldp
>> safely we have to do a page cross check first. If there is a NUL
>> byte we calculate the length from the 2 8-byte words using
>> conditional select to reduce branch mispredictions (it is unlikely
>> strlen will be repeatedly called on strings with the same length).
>>
>> If the string is longer than 16 bytes, we align src so we don't
>> need further page cross checks, and process 32 bytes per iteration
>> using the fast NUL check. If we encounter non-ASCII characters,
>> we fall back to a second loop using the full NUL check.
>>
>> If the page cross check fails, we read 16 bytes from an aligned
>> address, remove any characters before the string, and continue
>> in the main loop using aligned loads. Since strings crossing a
>> page in the first 16 bytes are rare (probability of
>> 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
>>
>> AArch64 systems have a minimum page size of 4k. We don't bother
>> checking for larger page sizes - the cost of setting up the correct
>> page size is just not worth the extra gain from a small reduction in
>> the cases taking the slow path. Note that we only care about
>> whether the first fetch, which may be misaligned, crosses a page
>> boundary. */
>>
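Read as little-endian C, the entry path described above amounts to something
like the sketch below. The names and helpers are illustrative, not part of
the patch, and the over-read of up to 16 bytes is exactly what the page-cross
check makes safe:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    #define MIN_PAGE_SIZE 4096   /* 15 when built with -DTEST_PAGE_CROSS */
    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Byte position of the first NUL in a little-endian word, or 8.
       Spurious borrow bits can only appear above the first NUL, so
       taking the lowest set bit is always correct. */
    static unsigned nul_pos(uint64_t x)
    {
        uint64_t m = (x - REP8_01) & ~(x | REP8_7f);
        return m ? (unsigned)__builtin_ctzll(m) / 8 : 8;
    }

    static size_t strlen_entry_model(const char *s)
    {
        /* Slow path only if an unaligned 16-byte load would cross a page. */
        if (((uintptr_t)s & (MIN_PAGE_SIZE - 1)) <= MIN_PAGE_SIZE - 16) {
            uint64_t d1, d2;
            memcpy(&d1, s, 8);       /* models: ldp data1, data2, [srcin] */
            memcpy(&d2, s + 8, 8);
            unsigned p1 = nul_pos(d1);
            if (p1 < 8)
                return p1;
            unsigned p2 = nul_pos(d2);
            if (p2 < 8)
                return 8 + p2;
            /* No NUL in the first 16 bytes: enter the aligned main loop. */
        }
        size_t n = 0;                /* byte loop stands in for both loops */
        while (s[n])
            n++;
        return n;
    }

The real code replaces the two p1/p2 branches with a conditional select,
since a branch on where the NUL falls would be badly predicted when string
lengths vary.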
> 68,81c108,120
> < mov zeroones, #REP8_01
> < bic src, srcin, #15
> < ands tmp1, srcin, #15
> < b.ne .Lmisaligned
> < /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> < (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> < can be done in parallel across the entire word. */
> < /* The inner loop deals with two Dwords at a time. This has a
> < slightly higher start-up cost, but we should win quite quickly,
> < especially on cores with a high number of issue slots per
> < cycle, as we get much better parallelism out of the operations. */
> < .Lloop:
> < ldp data1, data2, [src], #16
> < .Lrealigned:
> ---
>> and tmp1, srcin, MIN_PAGE_SIZE - 1
>> mov zeroones, REP8_01
>> cmp tmp1, MIN_PAGE_SIZE - 16
>> b.gt L(page_cross)
>> ldp data1, data2, [srcin]
>> #ifdef __AARCH64EB__
>> /* For big-endian, carry propagation (if the final byte in the
>> string is 0x01) means we cannot use has_nul1/2 directly.
>> Since we expect strings to be small and early-exit,
>> byte-swap the data now so has_nul1/2 will be correct. */
>> rev data1, data1
>> rev data2, data2
>> #endif
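A tiny C example of the failure mode that comment describes (values
illustrative): with the word held in big-endian order, the borrow out of the
NUL byte also flags a preceding 0x01 byte.

    #include <stdint.h>
    #include <stdio.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    int main(void)
    {
        /* "A\x01" + NUL + padding as it sits in a big-endian register:
           the first character occupies the most significant byte. */
        uint64_t be = 0x4101002020202020ULL;
        uint64_t m  = (be - REP8_01) & ~(be | REP8_7f);
        printf("%016llx\n", (unsigned long long)m);
        /* Prints 0080800000000000: the 0x01 byte just before the NUL is
           flagged too, so clz would report length 1 instead of 2.
           Byte-swapping the data first (rev) avoids the problem. */
        return 0;
    }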
> 83c122
> < orr tmp2, data1, #REP8_7f
> ---
>> orr tmp2, data1, REP8_7f
> 85,90c124,137
> < orr tmp4, data2, #REP8_7f
> < bic has_nul1, tmp1, tmp2
> < bics has_nul2, tmp3, tmp4
> < ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
> < b.eq .Lloop
> < /* End of critical section -- keep to one 64Byte cache line. */
> ---
>> orr tmp4, data2, REP8_7f
>> bics has_nul1, tmp1, tmp2
>> bic has_nul2, tmp3, tmp4
>> ccmp has_nul2, 0, 0, eq
>> beq L(main_loop_entry)
>>
>> /* Enter with C = has_nul1 == 0. */
>> csel has_nul1, has_nul1, has_nul2, cc
>> mov len, 8
>> rev has_nul1, has_nul1
>> clz tmp1, has_nul1
>> csel len, xzr, len, cc
>> add len, len, tmp1, lsr 3
>> ret
> 92,99c139,171
> < sub len, src, srcin
> < cbz has_nul1, .Lnul_in_data2
> < #ifdef __AARCH64EB__
> < mov data2, data1
> < #endif
> < sub len, len, #8
> < mov has_nul2, has_nul1
> < .Lnul_in_data2:
> ---
>> /* The inner loop processes 32 bytes per iteration and uses the fast
>> NUL check. If we encounter non-ASCII characters, use a second
>> loop with the accurate NUL check. */
>> .p2align 4
>> L(main_loop_entry):
>> bic src, srcin, 15
>> sub src, src, 16
>> L(main_loop):
>> ldp data1, data2, [src, 32]!
>> L(page_cross_entry):
>> sub tmp1, data1, zeroones
>> sub tmp3, data2, zeroones
>> orr tmp2, tmp1, tmp3
>> tst tmp2, zeroones, lsl 7
>> bne 1f
>> ldp data1, data2, [src, 16]
>> sub tmp1, data1, zeroones
>> sub tmp3, data2, zeroones
>> orr tmp2, tmp1, tmp3
>> tst tmp2, zeroones, lsl 7
>> beq L(main_loop)
>> add src, src, 16
>> 1:
>> /* The fast check failed, so do the slower, accurate NUL check. */
>> orr tmp2, data1, REP8_7f
>> orr tmp4, data2, REP8_7f
>> bics has_nul1, tmp1, tmp2
>> bic has_nul2, tmp3, tmp4
>> ccmp has_nul2, 0, 0, eq
>> beq L(nonascii_loop)
>>
>> /* Enter with C = has_nul1 == 0. */
>> L(tail):
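In C terms, the loop just shown unrolls like this (a little-endian skeleton
with hypothetical names; only the cheap test runs inside the loop):

    #include <stdint.h>
    #include <string.h>

    #define REP8_01 0x0101010101010101ULL
    #define REP8_80 0x8080808080808080ULL

    /* Fast test only: may also fire on bytes 0x81..0xff. */
    static int fast_hit(uint64_t d1, uint64_t d2)
    {
        return (((d1 - REP8_01) | (d2 - REP8_01)) & REP8_80) != 0;
    }

    /* Skeleton of L(main_loop): 32 bytes per iteration, checking the
       second 16-byte half separately so a hit there costs one branch. */
    static const char *main_loop_model(const char *src)
    {
        uint64_t d1, d2;
        for (;;) {
            src += 32;
            memcpy(&d1, src, 8);
            memcpy(&d2, src + 8, 8);
            if (fast_hit(d1, d2))
                break;
            memcpy(&d1, src + 16, 8);
            memcpy(&d2, src + 24, 8);
            if (fast_hit(d1, d2)) {
                src += 16;
                break;
            }
        }
        /* The accurate check on the 16 bytes at src then selects
           between the tail code and L(nonascii_loop). */
        return src;
    }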
> 102c174
> < string is 0x01) means we cannot use has_nul directly. The
> ---
>> string is 0x01) means we cannot use has_nul1/2 directly. The
> 105,108c177,183
> < rev data2, data2
> < sub tmp1, data2, zeroones
> < orr tmp2, data2, #REP8_7f
> < bic has_nul2, tmp1, tmp2
> ---
>> csel data1, data1, data2, cc
>> rev data1, data1
>> sub tmp1, data1, zeroones
>> orr tmp2, data1, REP8_7f
>> bic has_nul1, tmp1, tmp2
>> #else
>> csel has_nul1, has_nul1, has_nul2, cc
> 110,113c185,190
> < sub len, len, #8
> < rev has_nul2, has_nul2
> < clz pos, has_nul2
> < add len, len, pos, lsr #3 /* Bits to bytes. */
> ---
>> sub len, src, srcin
>> rev has_nul1, has_nul1
>> add tmp2, len, 8
>> clz tmp1, has_nul1
>> csel len, len, tmp2, cc
>> add len, len, tmp1, lsr 3
> 116,121c193,221
> < .Lmisaligned:
> < cmp tmp1, #8
> < neg tmp1, tmp1
> < ldp data1, data2, [src], #16
> < lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
> < mov tmp2, #~0
> ---
>> L(nonascii_loop):
>> ldp data1, data2, [src, 16]!
>> sub tmp1, data1, zeroones
>> orr tmp2, data1, REP8_7f
>> sub tmp3, data2, zeroones
>> orr tmp4, data2, REP8_7f
>> bics has_nul1, tmp1, tmp2
>> bic has_nul2, tmp3, tmp4
>> ccmp has_nul2, 0, 0, eq
>> bne L(tail)
>> ldp data1, data2, [src, 16]!
>> sub tmp1, data1, zeroones
>> orr tmp2, data1, REP8_7f
>> sub tmp3, data2, zeroones
>> orr tmp4, data2, REP8_7f
>> bics has_nul1, tmp1, tmp2
>> bic has_nul2, tmp3, tmp4
>> ccmp has_nul2, 0, 0, eq
>> beq L(nonascii_loop)
>> b L(tail)
>>
>> /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
>> srcin to 0x7f, so we ignore any NUL bytes before the string.
>> Then continue in the aligned loop. */
>> L(page_cross):
>> bic src, srcin, 15
>> ldp data1, data2, [src]
>> lsl tmp1, srcin, 3
>> mov tmp4, -1
> 123,124c223,224
> < /* Big-endian. Early bytes are at MSB. */
> < lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> ---
>> /* Big-endian. Early bytes are at MSB. */
>> lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> 127c227
> < lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> ---
>> lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> 129,133c229,235
> < orr data1, data1, tmp2
> < orr data2a, data2, tmp2
> < csinv data1, data1, xzr, le
> < csel data2, data2, data2a, le
> < b .Lrealigned
> ---
>> orr tmp1, tmp1, REP8_80
>> orn data1, data1, tmp1
>> orn tmp2, data2, tmp1
>> tst srcin, 8
>> csel data1, data1, tmp4, eq
>> csel data2, data2, tmp2, eq
>> b L(page_cross_entry)
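In C, the masking described in that last comment looks roughly like this
(little-endian sketch; the helper name page_cross_load is hypothetical):

    #include <stdint.h>
    #include <string.h>

    #define REP8_7f 0x7f7f7f7f7f7f7f7fULL

    /* Load 16 bytes from the aligned block containing s and force every
       byte that precedes s to look like a harmless 0x7f. */
    static void page_cross_load(const char *s, uint64_t *d1, uint64_t *d2)
    {
        const char *aligned = (const char *)((uintptr_t)s & ~(uintptr_t)15);
        uint64_t w1, w2;
        memcpy(&w1, aligned, 8);
        memcpy(&w2, aligned + 8, 8);

        /* 0x7f in each byte below s within its 8-byte word. */
        unsigned off = (unsigned)((uintptr_t)s & 7);
        uint64_t pre = ~(~0ULL << (8 * off)) & REP8_7f;

        if ((uintptr_t)s & 8) {     /* string starts in the second word */
            *d1 = ~0ULL;            /* all 0xff: contains no NUL */
            *d2 = w2 | pre;
        } else {
            *d1 = w1 | pre;
            *d2 = w2;               /* entirely inside the string */
        }
    }

Forcing the leading bytes to 0x7f (or 0xff) means they can never look like a
NUL, so the main loop's checks need no special case for the page-cross entry.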