[PATCH] Improve memcmp with unaligned loads.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Wed, 18 Sep 2013 23:58:08 +0200
- Subject: [PATCH] Improve memcmp with unaligned loads.
Hi,

The sse4_1 version of memcmp was already optimized fairly well, and it took me
quite some time to come up with an improvement. When memcmp does its checks we
need to integrate exits caused by byte mismatches with the exit caused by
exceeding the size. I tried several approaches that turned out slower than the
current memcmp. The solution came from a somewhat unexpected direction: when I
tried to implement strcmp I noticed that the same idea applies to memcmp as
well, and it gains us around 10% in real workloads.
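To make the idea concrete: from n we build a mask of the valid byte positions
and AND it with the bitmask of differing bytes, so one test covers both exit
conditions at once. A rough sketch (illustration only; check_prefix and
diff_mask are just names used here, not part of the patch):

/* diff_mask is assumed to have bit i set iff a[i] != b[i].  */
static int
check_prefix (unsigned char *a, unsigned char *b, unsigned long n,
              unsigned long diff_mask)
{
  /* Bits 0 .. n-1 set; 2UL << 63 wraps to 0, so n == 64 still gives
     an all-ones mask.  */
  unsigned long size_mask = (2UL << (n - 1)) - 1;
  unsigned long masked = diff_mask & size_mask;

  if (masked)
    {
      int f = __builtin_ctzl (masked);  /* Position of first mismatch.  */
      return a[f] - b[f];
    }
  return 0;  /* No mismatch before the end of the buffer.  */
}

The C skeleton below computes diff_mask with get_diff16/get_diff48 and uses
exactly this kind of masked check.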
There are a few optimizations that I left for future patches. The first is
wmemcmp support, which is straightforward to add, but I did not have time to
do it. The second is whether an initial check of just the first byte would
help (strcmp is the same case); for that I would need to collect more data,
as it depends on the profile.
The code is generated from the following C skeleton; there is some room for
improvement by having better control flow.
int memcmp (unsigned char *a, unsigned char *b, unsigned long n)
{
  unsigned long i;
finish:;
  if (!n)
    return 0;
  /* if (a[0] != b[0]) return a[0] - b[0]; */
  int or1 = ((unsigned long) a) & 4095;
  int or2 = ((unsigned long) b) & 4095;
  if ((or1 <= 4096 - 64) & (or2 <= 4096 - 64))
    {
      unsigned long xm = (2UL << (n - 1)) - 1; /* Wraps to all ones to handle
                                                  n == 64.  */
      /* Check first 16 bytes and return a bitmask with 1 on positions where
         bytes differ.  */
      unsigned long ma = get_diff16 (a, b);
      if (ma & xm)
        {
          int f = ffsl (ma & xm) - 1;
          return a[f] - b[f];
        }
      if (n <= 16) /* Profiling has shown that sizes of at most 16 bytes with
                      equal contents are relatively common, and this early
                      return improves performance by around 10%.  */
        return 0;
      ma = ma | get_diff48 (a + 16, b + 16);
      if (ma & xm)
        {
          int f = ffsl (ma & xm) - 1;
          return a[f] - b[f];
        }
      if (n <= 64)
        return 0;
      if (ma)
        {
          int f = ffsl (ma) - 1;
          return a[f] - b[f];
        }
    }
  else
    {
      for (i = 0; i <= 64; i++)
        {
          if (i >= n)
            return 0;
          if (a[i] != b[i])
            return a[i] - b[i];
        }
    }
  unsigned char *a_al = ALIGN_DOWN (a + 64, 64);
  int shift = a_al - a;
  a += shift;
  b += shift;
  n -= shift; /* Possible to replace with a counter.  */
  while (1)
    {
      if (n <= 64)
        goto finish;
      if (get_diff_loop64 (a, b)) /* Nonzero if any of the 64 bytes differ.  */
        goto finish;
      a += 64;
      b += 64;
      n -= 64;
    }
}
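For reference, the skeleton leaves get_diff16, get_diff48, get_diff_loop64 and
ALIGN_DOWN undefined. Roughly, they correspond to the movdqu/pcmpeqb/pmovmskb/
pminub sequences in the assembly below; an intrinsics sketch (illustration
only, not part of the patch) could look like this:

#include <emmintrin.h>

/* Bit i set iff a[i] != b[i], for i in 0..15.  */
static unsigned long
get_diff16 (unsigned char *a, unsigned char *b)
{
  __m128i va = _mm_loadu_si128 ((__m128i *) a);
  __m128i vb = _mm_loadu_si128 ((__m128i *) b);
  __m128i eq = _mm_cmpeq_epi8 (va, vb);         /* 0xff where bytes match.  */
  return (unsigned long) (_mm_movemask_epi8 (eq) ^ 0xffff);
}

/* Differences of the 48 bytes at a/b, placed in bits 16..63 so they can
   be ORed with the mask of the first 16 bytes.  */
static unsigned long
get_diff48 (unsigned char *a, unsigned char *b)
{
  return (get_diff16 (a, b) << 16)
         | (get_diff16 (a + 16, b + 16) << 32)
         | (get_diff16 (a + 32, b + 32) << 48);
}

/* Nonzero if any of the 64 bytes differ; pminub folds the four pcmpeqb
   results so a single pmovmskb/test decides whether to leave the loop.  */
static int
get_diff_loop64 (unsigned char *a, unsigned char *b)
{
  __m128i e0 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((__m128i *) a),
                               _mm_loadu_si128 ((__m128i *) b));
  __m128i e1 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((__m128i *) (a + 16)),
                               _mm_loadu_si128 ((__m128i *) (b + 16)));
  __m128i e2 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((__m128i *) (a + 32)),
                               _mm_loadu_si128 ((__m128i *) (b + 32)));
  __m128i e3 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((__m128i *) (a + 48)),
                               _mm_loadu_si128 ((__m128i *) (b + 48)));
  __m128i m = _mm_min_epu8 (_mm_min_epu8 (e0, e1), _mm_min_epu8 (e2, e3));
  return _mm_movemask_epi8 (m) != 0xffff;
}

/* ALIGN_DOWN (p, 64) simply rounds the pointer down to a 64-byte boundary,
   e.g. (unsigned char *) ((unsigned long) (p) & -64UL).  */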
Passes the testsuite. OK to commit?
* sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S: New file.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S.
* sysdeps/x86_64/multiarch/memcmp.S: Add __memcmp_sse2_unaligned
ifunc selection.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
__memcmp_sse2_unaligned.
---
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 1 +
sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S | 190 +++++++++++++++++++++++
sysdeps/x86_64/multiarch/memcmp.S | 4 +-
4 files changed, 194 insertions(+), 3 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5ab950a..44976c7 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,7 +8,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
- strend-sse4 memcmp-sse4 memcpy-ssse3 \
+ strend-sse4 memcmp-sse2-unaligned memcpy-ssse3 \
memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1a65ac0..a6d0c11 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -42,6 +42,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
__memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
/* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
new file mode 100644
index 0000000..8d11b47
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-sse2-unaligned.S
@@ -0,0 +1,190 @@
+/* memcmp with SSE2 and unaligned loads
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse2_unaligned
+# endif
+
+# ifndef ALIGN
+# define ALIGN(n) .p2align n
+# endif
+
+ENTRY (MEMCMP)
+ testq %rdx, %rdx
+ je L(return_zero)
+ pxor %xmm4, %xmm4
+L(handle_end):
+ movl %esi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+ movl %edi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+ movdqu (%rdi), %xmm0
+ lea -1(%edx), %ecx
+ movl $2, %eax
+ movdqu (%rsi), %xmm1
+ salq %cl, %rax
+ leaq -1(%rax), %rcx
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %r8d
+ movq %r8, %rax
+ andq %rcx, %rax
+ jne L(different)
+ cmpq $16, %rdx
+ jbe L(return_zero)
+ movdqu 16(%rdi), %xmm2
+ movdqu 16(%rsi), %xmm6
+ movdqu 32(%rdi), %xmm1
+ pcmpeqb %xmm6, %xmm2
+ movdqu 32(%rsi), %xmm5
+ pcmpeqb %xmm4, %xmm2
+ pcmpeqb %xmm5, %xmm1
+ movdqu 48(%rdi), %xmm0
+ pmovmskb %xmm2, %eax
+ movdqu 48(%rsi), %xmm3
+ pcmpeqb %xmm4, %xmm1
+ pmovmskb %xmm1, %r9d
+ salq $16, %rax
+ pcmpeqb %xmm3, %xmm0
+ salq $32, %r9
+ pcmpeqb %xmm4, %xmm0
+ orq %r9, %rax
+ orq %r8, %rax
+ pmovmskb %xmm0, %r8d
+ salq $48, %r8
+ orq %r8, %rax
+ andq %rax, %rcx
+ jne L(different2)
+ cmpq $64, %rdx
+ jbe L(return_zero)
+ testq %rax, %rax
+ jne L(different)
+L(align_loop):
+ leaq 64(%rdi), %rax
+ andq $-64, %rax
+ subq %rdi, %rax
+ subq %rax, %rdx
+ addq %rax, %rdi
+ addq %rax, %rsi
+ cmpq $64, %rdx
+ ja L(loop_start)
+ testq %rdx, %rdx
+ jne L(handle_end)
+ xorl %eax, %eax
+ ret
+
+
+ ALIGN (4)
+L(different):
+ bsfq %rax, %rdx
+ movzbl (%rdi,%rdx), %eax
+ movzbl (%rsi,%rdx), %edx
+ subl %edx, %eax
+ ret
+L(different2):
+ bsfq %rcx, %rdx
+ movzbl (%rdi,%rdx), %eax
+ movzbl (%rsi,%rdx), %edx
+ subl %edx, %eax
+ ret
+
+ ALIGN (4)
+L(loop):
+ subq $64, %rdx
+ addq $64, %rdi
+ addq $64, %rsi
+ cmpq $64, %rdx
+ jbe L(less_64_bytes)
+L(loop_start):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm3
+ pcmpeqb (%rdi), %xmm0
+ movdqu 32(%rsi), %xmm2
+ pcmpeqb 16(%rdi), %xmm3
+ movdqu 48(%rsi), %xmm1
+ pcmpeqb 32(%rdi), %xmm2
+ pminub %xmm3, %xmm0
+ pcmpeqb 48(%rdi), %xmm1
+ pminub %xmm2, %xmm0
+ pminub %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ testl %eax, %eax
+ je L(loop)
+ jmp L(handle_end)
+
+ ALIGN (4)
+L(less_64_bytes):
+ testq %rdx, %rdx
+ jne L(handle_end)
+ xorl %eax, %eax
+ ret
+
+
+
+ /* Byte by byte loop, this should be fast enough as
+ we expect mismatch in first 5 bytes.
+
+ for (i=0;i<64;i++)
+ {
+ if (i==n)
+ return 0;
+ if (a[i]!=b[i])
+ return a[i]-b[i];
+ }
+ */
+ ALIGN (4)
+L(cross_page):
+ testq %rdx, %rdx
+ je L(return_zero)
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ cmpb %cl, %al
+ jne L(cross_page_different)
+ movl $1, %r8d
+ jmp L(cross_page_loop_start)
+
+ ALIGN (4)
+L(cross_page_loop):
+ movzbl (%rdi,%r8), %eax
+ movzbl (%rsi,%r8), %ecx
+ cmpb %cl, %al
+ jne L(cross_page_different)
+ addq $1, %r8
+ cmpq $65, %r8
+ je L(align_loop)
+L(cross_page_loop_start):
+ cmpq %rdx, %r8
+ jne L(cross_page_loop)
+L(return_zero):
+ xorl %eax, %eax
+ ret
+L(cross_page_different):
+ subl %ecx, %eax
+ ret
+
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index da88af2..9095f00 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -35,9 +35,9 @@ ENTRY(memcmp)
leaq __memcmp_sse2(%rip), %rax
ret
-2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
+2: testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
jz 3f
- leaq __memcmp_sse4_1(%rip), %rax
+ leaq __memcmp_sse2_unaligned(%rip), %rax
ret
3: leaq __memcmp_ssse3(%rip), %rax
--
1.8.3.2