This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[PING][PATCH] Improve memcmp with unaligned loads.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 10 Feb 2014 18:18:48 +0100
- Subject: [PING][PATCH] Improve memcmp with unaligned loads.
- Authentication-results: sourceware.org; auth=none
- References: <20130918215808 dot GA31232 at domone>
ping
On Wed, Sep 18, 2013 at 11:58:08PM +0200, Ondřej Bílka wrote:
> Hi,
>
> The sse4_1 version of memcmp was already optimized relatively well, and
> it took me quite some time to come up with an improvement. When memcmp
> does its checks we need to integrate two ways of ending: ending because
> of a mismatch and ending because the size was exceeded. I tried several
> approaches, but they were all slower than the current memcmp.
>
> The solution came from a somewhat unexpected direction: when I tried to
> implement strcmp I noticed that the same idea applies to memcmp as
> well, and it gains us 10% in real workloads.
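>
> To make that concrete, here is a minimal sketch (an illustration only,
> not code from this patch) of the trick that folds the size limit into
> the mismatch bitmask; the function name first_n_diff and the diff_bits
> parameter are hypothetical:
>
> /* Illustration only: combine "bytes differ" and "compared n bytes"
>    into a single test, for 1 <= n <= 64.  diff_bits is assumed to have
>    bit i set when byte i of the two buffers differs.  */
> static unsigned long
> first_n_diff (unsigned long diff_bits, unsigned long n)
> {
>   /* Bits 0 .. n-1 set; for n == 64 the unsigned shift wraps to 0 and
>      the subtraction yields all ones.  */
>   unsigned long size_mask = (2UL << (n - 1)) - 1;
>   return diff_bits & size_mask;  /* Nonzero iff a mismatch is within n.  */
> }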
>
> There are a few optimizations that I left for future patches.
>
> The first is wmemcmp support, which is straightforward to add, but I
> did not have time to do so.
>
> The second is whether doing a first check of whether the first byte
> matches helps us (strcmp is the same case). For that I would need to
> collect more data, as this depends on the profile.
>
> The code is generated from the following C skeleton; there is some
> room for improvement by having better control flow.
>
> int memcmp (unsigned char *a, unsigned char *b, unsigned long n)
> {
>   unsigned long i;
> finish:;  /* Re-entered from the main loop for the final or differing
>              64-byte block.  */
>   if (!n)
>     return 0;
>   /* if (a[0] != b[0]) return a[0] - b[0]; */
>   int or1 = ((unsigned long) a) & 4095;
>   int or2 = ((unsigned long) b) & 4095;
>   if ((or1 <= 4096 - 64) & (or2 <= 4096 - 64))
>     {
>       /* Bits 0 .. n-1; unsigned so that n == 64 wraps to all ones.  */
>       unsigned long xm = (2UL << (n - 1)) - 1;
>
>       /* Check the first 16 bytes; get a bitmask with 1 on positions
>          where the bytes differ.  */
>       unsigned long ma = get_diff16 (a, b);
>       if (ma & xm)
>         {
>           int f = ffs (ma & xm) - 1;  /* ffs is 1-based.  */
>           return a[f] - b[f];
>         }
>       if (n <= 16)  /* Profiling has shown that sizes less than 16 that
>                        compare equal are relatively common, and this
>                        check improves performance by around 10%.  */
>         return 0;
>       ma = ma | get_diff48 (a + 16, b + 16);
>       if (ma & xm)
>         {
>           int f = ffs (ma & xm) - 1;
>           return a[f] - b[f];
>         }
>       if (n <= 64)
>         return 0;
>       if (ma)
>         {
>           int f = ffs (ma) - 1;
>           return a[f] - b[f];
>         }
>     }
>   else
>     {
>       for (i = 0; i <= 64; i++)
>         {
>           if (i >= n)
>             return 0;
>           if (a[i] != b[i])
>             return a[i] - b[i];
>         }
>     }
>   unsigned char *a_al = ALIGN_DOWN (a + 64, 64);
>   int shift = a_al - a;
>   a += shift;
>   b += shift;
>   n -= shift;  /* Possible to replace with a counter.  */
>   while (1)
>     {
>       if (n <= 64)
>         goto finish;
>       if (get_diff_loop64 (a, b))  /* Nonzero if the 64 bytes differ.  */
>         goto finish;
>       a += 64;
>       b += 64;
>       n -= 64;
>     }
> }
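>
> The helpers get_diff16, get_diff48, get_diff_loop64 and ALIGN_DOWN are
> left undefined in the skeleton.  As a rough illustration only, not part
> of the patch, get_diff16 could be written with SSE2 intrinsics roughly
> as below, mirroring the pcmpeqb/pmovmskb sequence in the assembly:
>
> #include <emmintrin.h>
>
> /* Illustration only: bitmask with bit i set when a[i] != b[i], for the
>    first 16 bytes, using unaligned loads.  */
> static unsigned long
> get_diff16 (const unsigned char *a, const unsigned char *b)
> {
>   __m128i va = _mm_loadu_si128 ((const __m128i *) a);
>   __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
>   __m128i eq = _mm_cmpeq_epi8 (va, vb);     /* 0xff where bytes are equal.  */
>   return ~_mm_movemask_epi8 (eq) & 0xffff;  /* Set bits where they differ.  */
> }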
>
> Passes the test suite.  OK to commit?
>
> * sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S: New file.
> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
> sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
> * sysdeps/x86_64/multiarch/memcmp.S: Add __memcmp_sse2_unaligned
> ifunc selection.
> * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
> __memcmp_sse2_unaligned.
>
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 +-
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 +
> sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S | 190 +++++++++++++++++++++++
> sysdeps/x86_64/multiarch/memcmp.S | 4 +-
> 4 files changed, 194 insertions(+), 3 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 5ab950a..44976c7 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -8,7 +8,7 @@ ifeq ($(subdir),string)
>
> sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
> strcmp-sse2-unaligned strncmp-ssse3 \
> - strend-sse4 memcmp-sse4 memcpy-ssse3 \
> + strend-sse4 memcmp-sse2-unaligned memcpy-ssse3 \
> memcpy-sse2-unaligned mempcpy-ssse3 \
> memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
> memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 1a65ac0..a6d0c11 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -42,6 +42,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
> __memcmp_sse4_1)
> IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
> + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>
> /* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
> diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
> new file mode 100644
> index 0000000..8d11b47
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
> @@ -0,0 +1,190 @@
> +/* memcmp with SSE2 and unaligned loads
> + Copyright (C) 2013 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#ifndef NOT_IN_libc
> +
> +# include <sysdep.h>
> +
> +# ifndef MEMCMP
> +# define MEMCMP __memcmp_sse2_unaligned
> +# endif
> +
> +# ifndef ALIGN
> +# define ALIGN(n) .p2align n
> +# endif
> +
> +ENTRY (MEMCMP)
> + testq %rdx, %rdx
> + je L(return_zero)
> + pxor %xmm4, %xmm4
> +L(handle_end):
> + movl %esi, %eax
> + andl $4095, %eax
> + cmpl $4032, %eax
> + jg L(cross_page)
> + movl %edi, %eax
> + andl $4095, %eax
> + cmpl $4032, %eax
> + jg L(cross_page)
> + movdqu (%rdi), %xmm0
> + lea -1(%edx), %ecx
> + movl $2, %eax
> + movdqu (%rsi), %xmm1
> + salq %cl, %rax
> + leaq -1(%rax), %rcx
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm4, %xmm0
> + pmovmskb %xmm0, %r8d
> + movq %r8, %rax
> + andq %rcx, %rax
> + jne L(different)
> + cmpq $16, %rdx
> + jbe L(return_zero)
> + movdqu 16(%rdi), %xmm2
> + movdqu 16(%rsi), %xmm6
> + movdqu 32(%rdi), %xmm1
> + pcmpeqb %xmm6, %xmm2
> + movdqu 32(%rsi), %xmm5
> + pcmpeqb %xmm4, %xmm2
> + pcmpeqb %xmm5, %xmm1
> + movdqu 48(%rdi), %xmm0
> + pmovmskb %xmm2, %eax
> + movdqu 48(%rsi), %xmm3
> + pcmpeqb %xmm4, %xmm1
> + pmovmskb %xmm1, %r9d
> + salq $16, %rax
> + pcmpeqb %xmm3, %xmm0
> + salq $32, %r9
> + pcmpeqb %xmm4, %xmm0
> + orq %r9, %rax
> + orq %r8, %rax
> + pmovmskb %xmm0, %r8d
> + salq $48, %r8
> + orq %r8, %rax
> + andq %rax, %rcx
> + jne L(different2)
> + cmpq $64, %rdx
> + jbe L(return_zero)
> + testq %rax, %rax
> + jne L(different)
> +L(align_loop):
> + leaq 64(%rdi), %rax
> + andq $-64, %rax
> + subq %rdi, %rax
> + subq %rax, %rdx
> + addq %rax, %rdi
> + addq %rax, %rsi
> + cmpq $64, %rdx
> + ja L(loop_start)
> + testq %rdx, %rdx
> + jne L(handle_end)
> + xorl %eax, %eax
> + ret
> +
> +
> + ALIGN (4)
> +L(different):
> + bsfq %rax, %rdx
> + movzbl (%rdi,%rdx), %eax
> + movzbl (%rsi,%rdx), %edx
> + subl %edx, %eax
> + ret
> +L(different2):
> + bsfq %rcx, %rdx
> + movzbl (%rdi,%rdx), %eax
> + movzbl (%rsi,%rdx), %edx
> + subl %edx, %eax
> + ret
> +
> + ALIGN (4)
> +L(loop):
> + subq $64, %rdx
> + addq $64, %rdi
> + addq $64, %rsi
> + cmpq $64, %rdx
> + jbe L(less_64_bytes)
> +L(loop_start):
> + movdqu (%rsi), %xmm0
> + movdqu 16(%rsi), %xmm3
> + pcmpeqb (%rdi), %xmm0
> + movdqu 32(%rsi), %xmm2
> + pcmpeqb 16(%rdi), %xmm3
> + movdqu 48(%rsi), %xmm1
> + pcmpeqb 32(%rdi), %xmm2
> + pminub %xmm3, %xmm0
> + pcmpeqb 48(%rdi), %xmm1
> + pminub %xmm2, %xmm0
> + pminub %xmm1, %xmm0
> + pcmpeqb %xmm4, %xmm0
> + pmovmskb %xmm0, %eax
> + testl %eax, %eax
> + je L(loop)
> + jmp L(handle_end)
> +
> + ALIGN (4)
> +L(less_64_bytes):
> + testq %rdx, %rdx
> + jne L(handle_end)
> + xorl %eax, %eax
> + ret
> +
> +
> +
> + /* Byte by byte loop; this should be fast enough, as
> +    we expect a mismatch within the first 5 bytes.
> +
> + for (i=0;i<64;i++)
> + {
> + if (i==n)
> + return 0;
> + if (a[i]!=b[i])
> + return a[i]-b[i];
> + }
> + */
> + ALIGN (4)
> +L(cross_page):
> + testq %rdx, %rdx
> + je L(return_zero)
> + movzbl (%rdi), %eax
> + movzbl (%rsi), %ecx
> + cmpb %cl, %al
> + jne L(cross_page_different)
> + movl $1, %r8d
> + jmp L(cross_page_loop_start)
> +
> + ALIGN (4)
> +L(cross_page_loop):
> + movzbl (%rdi,%r8), %eax
> + movzbl (%rsi,%r8), %ecx
> + cmpb %cl, %al
> + jne L(cross_page_different)
> + addq $1, %r8
> + cmpq $65, %r8
> + je L(align_loop)
> +L(cross_page_loop_start):
> + cmpq %rdx, %r8
> + jne L(cross_page_loop)
> +L(return_zero):
> + xorl %eax, %eax
> + ret
> +L(cross_page_different):
> + subl %ecx, %eax
> + ret
> +
> +END (MEMCMP)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
> index da88af2..9095f00 100644
> --- a/sysdeps/x86_64/multiarch/memcmp.S
> +++ b/sysdeps/x86_64/multiarch/memcmp.S
> @@ -35,9 +35,9 @@ ENTRY(memcmp)
> leaq __memcmp_sse2(%rip), %rax
> ret
>
> -2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
> +2: testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
> jz 3f
> - leaq __memcmp_sse4_1(%rip), %rax
> + leaq __memcmp_sse2_unaligned(%rip), %rax
> ret
>
> 3: leaq __memcmp_ssse3(%rip), %rax
> --
> 1.8.3.2
--
You must've hit the wrong any key.