This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
RE: [PATCH][AArch64] Optimize strlen
- From: "Wilco Dijkstra" <wdijkstr at arm dot com>
- To: <libc-alpha at sourceware dot org>
- Date: Wed, 15 Apr 2015 13:34:06 +0100
- Subject: RE: [PATCH][AArch64] Optimize strlen
- Authentication-results: sourceware.org; auth=none
- References:
ping
> > -----Original Message-----
> > From: Wilco Dijkstra [mailto:wdijkstr@arm.com]
> > Sent: 16 January 2015 17:39
> > To: libc-alpha@sourceware.org
> > Subject: [PATCH][AArch64] Optimize strlen
> >
> > Optimize the strlen implementation by using a page cross check and a fast check
> > for nul bytes which reverts to separate loop when a non-ASCII char is encountered.
> > Speedup on test-strlen is ~10%, long ASCII strings are processed ~60% faster,
> > and on random tests it is ~80% better.
> >
> > ChangeLog:
> >
> > 2015-01-16 Wilco Dijkstra <wdijkstr@arm.com>
> >
> > * sysdeps/aarch64/strlen.S (strlen): Optimize strlen.
> >
> > ---
> > sysdeps/aarch64/strlen.S | 230 +++++++++++++++++++++++++++++++++--------------
> > 1 file changed, 165 insertions(+), 65 deletions(-)
> >
> > diff --git a/sysdeps/aarch64/strlen.S b/sysdeps/aarch64/strlen.S
> > index 54271b9..7ca47d9 100644
> > --- a/sysdeps/aarch64/strlen.S
> > +++ b/sysdeps/aarch64/strlen.S
> > @@ -20,9 +20,13 @@
> >
> > /* Assumptions:
> > *
> > - * ARMv8-a, AArch64
> > + * ARMv8-a, AArch64, unaligned accesses, min page size 4k.
> > */
> >
> > +/* To test the page crossing code path more thoroughly, compile with
> > + -DTEST_PAGE_CROSS - this will force all calls through the slower
> > + entry path. This option is not intended for production use. */
> > +
> > /* Arguments and results. */
> > #define srcin x0
> > #define len x0
> > @@ -31,87 +35,183 @@
> > #define src x1
> > #define data1 x2
> > #define data2 x3
> > -#define data2a x4
> > -#define has_nul1 x5
> > -#define has_nul2 x6
> > -#define tmp1 x7
> > -#define tmp2 x8
> > -#define tmp3 x9
> > -#define tmp4 x10
> > -#define zeroones x11
> > -#define pos x12
> > +#define has_nul1 x4
> > +#define has_nul2 x5
> > +#define tmp1 x4
> > +#define tmp2 x5
> > +#define tmp3 x6
> > +#define tmp4 x7
> > +#define zeroones x8
> > +
> > + /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> > + (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> > + can be done in parallel across the entire word. A faster check
> > + (X - 1) & 0x80 is zero for non-NUL ASCII characters, but gives
> > + false hits for characters 129..255. */
> >
> > #define REP8_01 0x0101010101010101
> > #define REP8_7f 0x7f7f7f7f7f7f7f7f
> > #define REP8_80 0x8080808080808080
> >
> > - /* Start of critial section -- keep to one 64Byte cache line. */
> > +#ifdef TEST_PAGE_CROSS
> > +# define MIN_PAGE_SIZE 15
> > +#else
> > +# define MIN_PAGE_SIZE 4096
> > +#endif
> > +
> > + /* Since strings are short on average, we check the first 16 bytes
> > + of the string for a NUL character. In order to do an unaligned ldp
> > + safely we have to do a page cross check first. If there is a NUL
> > + byte we calculate the length from the 2 8-byte words using
> > + conditional select to reduce branch mispredictions (it is unlikely
> > + strlen will be repeatedly called on strings with the same length).
> > +
> > + If the string is longer than 16 bytes, we align src so don't need
> > + further page cross checks, and process 32 bytes per iteration
> > + using the fast NUL check. If we encounter non-ASCII characters,
> > + fallback to a second loop using the full NUL check.
> > +
> > + If the page cross check fails, we read 16 bytes from an aligned
> > + address, remove any characters before the string, and continue
> > + in the main loop using aligned loads. Since strings crossing a
> > + page in the first 16 bytes are rare (probability of
> > + 16/MIN_PAGE_SIZE ~= 0.4%), this case does not need to be optimized.
> > +
> > + AArch64 systems have a minimum page size of 4k. We don't bother
> > + checking for larger page sizes - the cost of setting up the correct
> > + page size is just not worth the extra gain from a small reduction in
> > + the cases taking the slow path. Note that we only care about
> > + whether the first fetch, which may be misaligned, crosses a page
> > + boundary. */
> > +
> > ENTRY_ALIGN (strlen, 6)
> > - mov zeroones, #REP8_01
> > - bic src, srcin, #15
> > - ands tmp1, srcin, #15
> > - b.ne L(misaligned)
> > - /* NUL detection works on the principle that (X - 1) & (~X) & 0x80
> > - (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
> > - can be done in parallel across the entire word. */
> > - /* The inner loop deals with two Dwords at a time. This has a
> > - slightly higher start-up cost, but we should win quite quickly,
> > - especially on cores with a high number of issue slots per
> > - cycle, as we get much better parallelism out of the operations. */
> > -L(loop):
> > - ldp data1, data2, [src], #16
> > -L(realigned):
> > + and tmp1, srcin, MIN_PAGE_SIZE - 1
> > + mov zeroones, REP8_01
> > + cmp tmp1, MIN_PAGE_SIZE - 16
> > + b.gt L(page_cross)
> > + ldp data1, data2, [srcin]
> > +#ifdef __AARCH64EB__
> > + /* For big-endian, carry propagation (if the final byte in the
> > + string is 0x01) means we cannot use has_nul1/2 directly.
> > + Since we expect strings to be small and early-exit,
> > + byte-swap the data now so has_null1/2 will be correct. */
> > + rev data1, data1
> > + rev data2, data2
> > +#endif
> > sub tmp1, data1, zeroones
> > - orr tmp2, data1, #REP8_7f
> > + orr tmp2, data1, REP8_7f
> > sub tmp3, data2, zeroones
> > - orr tmp4, data2, #REP8_7f
> > - bic has_nul1, tmp1, tmp2
> > - bics has_nul2, tmp3, tmp4
> > - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
> > - b.eq L(loop)
> > - /* End of critical section -- keep to one 64Byte cache line. */
> > + orr tmp4, data2, REP8_7f
> > + bics has_nul1, tmp1, tmp2
> > + bic has_nul2, tmp3, tmp4
> > + ccmp has_nul2, 0, 0, eq
> > + beq L(main_loop_entry)
> >
> > - sub len, src, srcin
> > - cbz has_nul1, L(nul_in_data2)
> > -#ifdef __AARCH64EB__
> > - mov data2, data1
> > -#endif
> > - sub len, len, #8
> > - mov has_nul2, has_nul1
> > -L(nul_in_data2):
> > + /* Enter with C = has_nul1 == 0. */
> > + csel has_nul1, has_nul1, has_nul2, cc
> > + mov len, 8
> > + rev has_nul1, has_nul1
> > + clz tmp1, has_nul1
> > + csel len, xzr, len, cc
> > + add len, len, tmp1, lsr 3
> > + ret
> > +
> > + /* The inner loop processes 32 bytes per iteration and uses the fast
> > + NUL check. If we encounter non-ASCII characters, use a second
> > + loop with the accurate NUL check. */
> > + .p2align 4
> > +L(main_loop_entry):
> > + bic src, srcin, 15
> > + sub src, src, 16
> > +L(main_loop):
> > + ldp data1, data2, [src, 32]!
> > +.Lpage_cross_entry:
> > + sub tmp1, data1, zeroones
> > + sub tmp3, data2, zeroones
> > + orr tmp2, tmp1, tmp3
> > + tst tmp2, zeroones, lsl 7
> > + bne 1f
> > + ldp data1, data2, [src, 16]
> > + sub tmp1, data1, zeroones
> > + sub tmp3, data2, zeroones
> > + orr tmp2, tmp1, tmp3
> > + tst tmp2, zeroones, lsl 7
> > + beq L(main_loop)
> > + add src, src, 16
> > +1:
> > + /* The fast check failed, so do the slower, accurate NUL check. */
> > + orr tmp2, data1, REP8_7f
> > + orr tmp4, data2, REP8_7f
> > + bics has_nul1, tmp1, tmp2
> > + bic has_nul2, tmp3, tmp4
> > + ccmp has_nul2, 0, 0, eq
> > + beq L(nonascii_loop)
> > +
> > + /* Enter with C = has_nul1 == 0. */
> > +L(tail):
> > #ifdef __AARCH64EB__
> > /* For big-endian, carry propagation (if the final byte in the
> > - string is 0x01) means we cannot use has_nul directly. The
> > + string is 0x01) means we cannot use has_nul1/2 directly. The
> > easiest way to get the correct byte is to byte-swap the data
> > and calculate the syndrome a second time. */
> > - rev data2, data2
> > - sub tmp1, data2, zeroones
> > - orr tmp2, data2, #REP8_7f
> > - bic has_nul2, tmp1, tmp2
> > + csel data1, data1, data2, cc
> > + rev data1, data1
> > + sub tmp1, data1, zeroones
> > + orr tmp2, data1, REP8_7f
> > + bic has_nul1, tmp1, tmp2
> > +#else
> > + csel has_nul1, has_nul1, has_nul2, cc
> > #endif
> > - sub len, len, #8
> > - rev has_nul2, has_nul2
> > - clz pos, has_nul2
> > - add len, len, pos, lsr #3 /* Bits to bytes. */
> > - RET
> > -
> > -L(misaligned):
> > - cmp tmp1, #8
> > - neg tmp1, tmp1
> > - ldp data1, data2, [src], #16
> > - lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
> > - mov tmp2, #~0
> > + sub len, src, srcin
> > + rev has_nul1, has_nul1
> > + add tmp2, len, 8
> > + clz tmp1, has_nul1
> > + csel len, len, tmp2, cc
> > + add len, len, tmp1, lsr 3
> > + ret
> > +
> > +L(nonascii_loop):
> > + ldp data1, data2, [src, 16]!
> > + sub tmp1, data1, zeroones
> > + orr tmp2, data1, REP8_7f
> > + sub tmp3, data2, zeroones
> > + orr tmp4, data2, REP8_7f
> > + bics has_nul1, tmp1, tmp2
> > + bic has_nul2, tmp3, tmp4
> > + ccmp has_nul2, 0, 0, eq
> > + bne L(tail)
> > + ldp data1, data2, [src, 16]!
> > + sub tmp1, data1, zeroones
> > + orr tmp2, data1, REP8_7f
> > + sub tmp3, data2, zeroones
> > + orr tmp4, data2, REP8_7f
> > + bics has_nul1, tmp1, tmp2
> > + bic has_nul2, tmp3, tmp4
> > + ccmp has_nul2, 0, 0, eq
> > + beq L(nonascii_loop)
> > + b L(tail)
> > +
> > + /* Load 16 bytes from [srcin & ~15] and force the bytes that precede
> > + srcin to 0x7f, so we ignore any NUL bytes before the string.
> > + Then continue in the aligned loop. */
> > +L(page_cross):
> > + bic src, srcin, 15
> > + ldp data1, data2, [src]
> > + lsl tmp1, srcin, 3
> > + mov tmp4, -1
> > #ifdef __AARCH64EB__
> > - /* Big-endian. Early bytes are at MSB. */
> > - lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> > + /* Big-endian. Early bytes are at MSB. */
> > + lsr tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> > #else
> > /* Little-endian. Early bytes are at LSB. */
> > - lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
> > + lsl tmp1, tmp4, tmp1 /* Shift (tmp1 & 63). */
> > #endif
> > - orr data1, data1, tmp2
> > - orr data2a, data2, tmp2
> > - csinv data1, data1, xzr, le
> > - csel data2, data2, data2a, le
> > - b L(realigned)
> > + orr tmp1, tmp1, REP8_80
> > + orn data1, data1, tmp1
> > + orn tmp2, data2, tmp1
> > + tst srcin, 8
> > + csel data1, data1, tmp4, eq
> > + csel data2, data2, tmp2, eq
> > + b L(page_cross_entry)
> > END (strlen)
> > libc_hidden_builtin_def (strlen)
> > --
> > 1.9.1