[PATCH] Improve memcmp with unaligned loads.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Wed, 18 Sep 2013 23:58:08 +0200
- Subject: [PATCH] Improve memcmp with unaligned loads.
Hi,

The sse4_1 version of memcmp was already optimized fairly well, and it took me
quite some time to come up with an improvement. When memcmp does its checks we
need to integrate exits caused by byte mismatches with the exit caused by
exceeding the size. I tried several approaches that turned out slower than the
current memcmp. The solution came from a somewhat unexpected direction: when I
tried to implement strcmp I noticed that the same idea applies to memcmp as
well, and it gains us around 10% in real workloads.
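To make the idea concrete: from n we build a mask of the valid byte positions
and AND it with the bitmask of differing bytes, so one test covers both exit
conditions at once. A rough sketch (illustration only; check_prefix and
diff_mask are just names used here, not part of the patch):

/* diff_mask is assumed to have bit i set iff a[i] != b[i].  */
static int
check_prefix (unsigned char *a, unsigned char *b, unsigned long n,
              unsigned long diff_mask)
{
  /* Bits 0 .. n-1 set; 2UL << 63 wraps to 0, so n == 64 still gives
     an all-ones mask.  */
  unsigned long size_mask = (2UL << (n - 1)) - 1;
  unsigned long masked = diff_mask & size_mask;

  if (masked)
    {
      int f = __builtin_ctzl (masked);  /* Position of first mismatch.  */
      return a[f] - b[f];
    }
  return 0;  /* No mismatch before the end of the buffer.  */
}

The C skeleton below computes diff_mask with get_diff16/get_diff48 and uses
exactly this kind of masked check.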
There are a few optimizations that I left for future patches. The first is
wmemcmp support, which is straightforward to add, but I did not have time to
do it. The second is whether an initial check of just the first byte would
help (strcmp is the same case); for that I would need to collect more data,
as it depends on the profile.
The code is generated from the following C skeleton; there is some room for
improvement by having better control flow.
int memcmp (unsigned char *a, unsigned char *b, unsigned long n)
{
  unsigned long i;
finish:;
  if (!n)
    return 0;
  /* if (a[0] != b[0]) return a[0] - b[0]; */
  int or1 = ((unsigned long) a) & 4095;
  int or2 = ((unsigned long) b) & 4095;
  if ((or1 <= 4096 - 64) & (or2 <= 4096 - 64))
    {
      unsigned long xm = (2UL << (n - 1)) - 1; /* Wraps to all ones to handle
                                                  n == 64.  */
      /* Check first 16 bytes and return a bitmask with 1 on positions where
         bytes differ.  */
      unsigned long ma = get_diff16 (a, b);
      if (ma & xm)
        {
          int f = ffsl (ma & xm) - 1;
          return a[f] - b[f];
        }
      if (n <= 16) /* Profiling has shown that sizes of at most 16 bytes with
                      equal contents are relatively common, and this early
                      return improves performance by around 10%.  */
        return 0;
      ma = ma | get_diff48 (a + 16, b + 16);
      if (ma & xm)
        {
          int f = ffsl (ma & xm) - 1;
          return a[f] - b[f];
        }
      if (n <= 64)
        return 0;
      if (ma)
        {
          int f = ffsl (ma) - 1;
          return a[f] - b[f];
        }
    }
  else
    {
      for (i = 0; i <= 64; i++)
        {
          if (i >= n)
            return 0;
          if (a[i] != b[i])
            return a[i] - b[i];
        }
    }
  unsigned char *a_al = ALIGN_DOWN (a + 64, 64);
  int shift = a_al - a;
  a += shift;
  b += shift;
  n -= shift; /* Possible to replace with a counter.  */
  while (1)
    {
      if (n <= 64)
        goto finish;
      if (get_diff_loop64 (a, b)) /* Nonzero if any of the 64 bytes differ.  */
        goto finish;
      a += 64;
      b += 64;
      n -= 64;
    }
}
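For reference, the skeleton leaves get_diff16, get_diff48, get_diff_loop64 and
ALIGN_DOWN undefined. Roughly, they correspond to the movdqu/pcmpeqb/pmovmskb/
pminub sequences in the assembly below; an intrinsics sketch (illustration
only, not part of the patch) could look like this:

#include <emmintrin.h>

/* Bit i set iff a[i] != b[i], for i in 0..15.  */
static unsigned long
get_diff16 (unsigned char *a, unsigned char *b)
{
  __m128i va = _mm_loadu_si128 ((__m128i *) a);
  __m128i vb = _mm_loadu_si128 ((__m128i *) b);
  __m128i eq = _mm_cmpeq_epi8 (va, vb);         /* 0xff where bytes match.  */
  return (unsigned long) (_mm_movemask_epi8 (eq) ^ 0xffff);
}

/* Differences of the 48 bytes at a/b, placed in bits 16..63 so they can
   be ORed with the mask of the first 16 bytes.  */
static unsigned long
get_diff48 (unsigned char *a, unsigned char *b)
{
  return (get_diff16 (a, b) << 16)
         | (get_diff16 (a + 16, b + 16) << 32)
         | (get_diff16 (a + 32, b + 32) << 48);
}

/* Nonzero if any of the 64 bytes differ; pminub folds the four pcmpeqb
   results so a single pmovmskb/test decides whether to leave the loop.  */
static int
get_diff_loop64 (unsigned char *a, unsigned char *b)
{
  __m128i e0 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((__m128i *) a),
                               _mm_loadu_si128 ((__m128i *) b));
  __m128i e1 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((__m128i *) (a + 16)),
                               _mm_loadu_si128 ((__m128i *) (b + 16)));
  __m128i e2 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((__m128i *) (a + 32)),
                               _mm_loadu_si128 ((__m128i *) (b + 32)));
  __m128i e3 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((__m128i *) (a + 48)),
                               _mm_loadu_si128 ((__m128i *) (b + 48)));
  __m128i m = _mm_min_epu8 (_mm_min_epu8 (e0, e1), _mm_min_epu8 (e2, e3));
  return _mm_movemask_epi8 (m) != 0xffff;
}

/* ALIGN_DOWN (p, 64) simply rounds the pointer down to a 64-byte boundary,
   e.g. (unsigned char *) ((unsigned long) (p) & -64UL).  */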
Passes the testsuite. OK to commit?
* sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S: New file.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S.
* sysdeps/x86_64/multiarch/memcmp.S: Add __memcmp_sse2_unaligned
ifunc selection.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
__memcmp_sse2_unaligned.
---
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 +
sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S | 190 +++++++++++++++++++++++
sysdeps/x86_64/multiarch/memcmp.S | 4 +-
4 files changed, 194 insertions(+), 3 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5ab950a..44976c7 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,7 +8,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
- strend-sse4 memcmp-sse4 memcpy-ssse3 \
+ strend-sse4 memcmp-sse2-unaligned memcpy-ssse3 \
memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1a65ac0..a6d0c11 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -42,6 +42,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
__memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
/* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
new file mode 100644
index 0000000..8d11b47
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
@@ -0,0 +1,190 @@
+/* memcmp with SSE2 and unaligned loads
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse2_unaligned
+# endif
+
+# ifndef ALIGN
+# define ALIGN(n) .p2align n
+# endif
+
+ENTRY (MEMCMP)
+ testq %rdx, %rdx
+ je L(return_zero)
+ pxor %xmm4, %xmm4
+L(handle_end):
+ movl %esi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+ movl %edi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+ movdqu (%rdi), %xmm0
+ lea -1(%edx), %ecx
+ movl $2, %eax
+ movdqu (%rsi), %xmm1
+ salq %cl, %rax
+ leaq -1(%rax), %rcx
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %r8d
+ movq %r8, %rax
+ andq %rcx, %rax
+ jne L(different)
+ cmpq $16, %rdx
+ jbe L(return_zero)
+ movdqu 16(%rdi), %xmm2
+ movdqu 16(%rsi), %xmm6
+ movdqu 32(%rdi), %xmm1
+ pcmpeqb %xmm6, %xmm2
+ movdqu 32(%rsi), %xmm5
+ pcmpeqb %xmm4, %xmm2
+ pcmpeqb %xmm5, %xmm1
+ movdqu 48(%rdi), %xmm0
+ pmovmskb %xmm2, %eax
+ movdqu 48(%rsi), %xmm3
+ pcmpeqb %xmm4, %xmm1
+ pmovmskb %xmm1, %r9d
+ salq $16, %rax
+ pcmpeqb %xmm3, %xmm0
+ salq $32, %r9
+ pcmpeqb %xmm4, %xmm0
+ orq %r9, %rax
+ orq %r8, %rax
+ pmovmskb %xmm0, %r8d
+ salq $48, %r8
+ orq %r8, %rax
+ andq %rax, %rcx
+ jne L(different2)
+ cmpq $64, %rdx
+ jbe L(return_zero)
+ testq %rax, %rax
+ jne L(different)
+L(align_loop):
+ leaq 64(%rdi), %rax
+ andq $-64, %rax
+ subq %rdi, %rax
+ subq %rax, %rdx
+ addq %rax, %rdi
+ addq %rax, %rsi
+ cmpq $64, %rdx
+ ja L(loop_start)
+ testq %rdx, %rdx
+ jne L(handle_end)
+ xorl %eax, %eax
+ ret
+
+
+ ALIGN (4)
+L(different):
+ bsfq %rax, %rdx
+ movzbl (%rdi,%rdx), %eax
+ movzbl (%rsi,%rdx), %edx
+ subl %edx, %eax
+ ret
+L(different2):
+ bsfq %rcx, %rdx
+ movzbl (%rdi,%rdx), %eax
+ movzbl (%rsi,%rdx), %edx
+ subl %edx, %eax
+ ret
+
+ ALIGN (4)
+L(loop):
+ subq $64, %rdx
+ addq $64, %rdi
+ addq $64, %rsi
+ cmpq $64, %rdx
+ jbe L(less_64_bytes)
+L(loop_start):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm3
+ pcmpeqb (%rdi), %xmm0
+ movdqu 32(%rsi), %xmm2
+ pcmpeqb 16(%rdi), %xmm3
+ movdqu 48(%rsi), %xmm1
+ pcmpeqb 32(%rdi), %xmm2
+ pminub %xmm3, %xmm0
+ pcmpeqb 48(%rdi), %xmm1
+ pminub %xmm2, %xmm0
+ pminub %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ testl %eax, %eax
+ je L(loop)
+ jmp L(handle_end)
+
+ ALIGN (4)
+L(less_64_bytes):
+ testq %rdx, %rdx
+ jne L(handle_end)
+ xorl %eax, %eax
+ ret
+
+
+
+ /* Byte by byte loop, this should be fast enough as
+ we expect mismatch in first 5 bytes.
+
+ for (i=0;i<64;i++)
+ {
+ if (i==n)
+ return 0;
+ if (a[i]!=b[i])
+ return a[i]-b[i];
+ }
+ */
+ ALIGN (4)
+L(cross_page):
+ testq %rdx, %rdx
+ je L(return_zero)
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ cmpb %cl, %al
+ jne L(cross_page_different)
+ movl $1, %r8d
+ jmp L(cross_page_loop_start)
+
+ ALIGN (4)
+L(cross_page_loop):
+ movzbl (%rdi,%r8), %eax
+ movzbl (%rsi,%r8), %ecx
+ cmpb %cl, %al
+ jne L(cross_page_different)
+ addq $1, %r8
+ cmpq $65, %r8
+ je L(align_loop)
+L(cross_page_loop_start):
+ cmpq %rdx, %r8
+ jne L(cross_page_loop)
+L(return_zero):
+ xorl %eax, %eax
+ ret
+L(cross_page_different):
+ subl %ecx, %eax
+ ret
+
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index da88af2..9095f00 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -35,9 +35,9 @@ ENTRY(memcmp)
leaq __memcmp_sse2(%rip), %rax
ret
-2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+2: testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
jz 3f
- leaq __memcmp_sse4_1(%rip), %rax
+ leaq __memcmp_sse2_unaligned(%rip), %rax
ret
3: leaq __memcmp_ssse3(%rip), %rax
--
1.8.3.2