
Re: [PATCH 1/3] Adding strcasecmp/strncasecmp functionality to unaligned strcmp


On Fri, Sep 13, 2013 at 10:53:03PM +0200, Ondřej Bílka wrote:
> Hi,
> I tried to gather data also for strcasecmp/strncasecmp and I got
> that they are used rarely on my system.
> 
Thanks to Andreas, I have an implementation ready.

It works by first finding differing characters with the strcmp code, then
checking whether they differ only in case. As it is likely that these
characters are genuinely different, performance should be similar to that
of strcmp. I checked this property on my computer with the following code,
and the number of case comparisons needed is mostly 1 in my tests:

#include <stdio.h>
#include <ctype.h>

/* Test model of the new strcasecmp: walk the strings the way strcmp
   would and count how many byte positions actually need a tolower
   comparison.  */
int strcasecmp (unsigned char *x, unsigned char *y)
{
  int casecmp = 0;
  int i = 0;
  while (1) {
    if (x[i] != y[i]) {
      if (tolower (x[i]) == tolower (y[i]))
        casecmp++;  /* Bytes differ only in case.  */
      else {
        fprintf (stderr, "dif chars %i tolower_needed %i\n", i, casecmp + 1);
        return tolower (x[i]) - tolower (y[i]);
      }
    }
    if (!x[i]) {
      fprintf (stderr, "same chars %i tolower_needed %i\n", i, casecmp);
      return 0;
    }
    i++;
  }
}

A downside of this implementation is that comparing aaaa vs AAAA will be
slower; as that looks like an unlikely case, the tradeoff seems acceptable.
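
For illustration, a tiny driver can be appended to the test program above
(the sample strings here are only examples, not from any benchmark). It
exercises both situations: the usual case, where the first genuinely
different bytes decide the result after a single tolower lookup, and the
aaaa vs AAAA case, where every position differs only in case:

int main (void)
{
  /* Usual case: the first real difference (at index 6) decides the
     result, so only one tolower comparison is needed.  */
  strcasecmp ((unsigned char *) "hello world", (unsigned char *) "hello there");

  /* Worst case for the new code: every byte differs only in case, so
     all four positions need a tolower comparison before the
     terminating NUL is reached.  */
  strcasecmp ((unsigned char *) "aaaa", (unsigned char *) "AAAA");
  return 0;
}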

I added it in a generic way, as I also plan to add an SSSE3 loop version,
which will come in a separate patch.

	* sysdeps/x86_64/locale-defines.sym (LOCALE_TOLOWER): Add.
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
	Add strcasecmp_l-sse2-unaligned.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
	strcasecmp_sse2_unaligned.
	* sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S: New file.
	* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add strcasecmp
	implementation.
	* sysdeps/x86_64/multiarch/strcmp.S: Update ifunc.

---
 sysdeps/x86_64/locale-defines.sym                  |   1 +
 sysdeps/x86_64/multiarch/Makefile                  |   1 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c         |   2 +
 .../x86_64/multiarch/strcasecmp_l-sse2-unaligned.S |   2 +
 sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S   | 117 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strcmp.S                  |   9 +-
 6 files changed, 127 insertions(+), 5 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S

diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym
index aebff9a..804debb 100644
--- a/sysdeps/x86_64/locale-defines.sym
+++ b/sysdeps/x86_64/locale-defines.sym
@@ -8,4 +8,5 @@ LOCALE_T___LOCALES		offsetof (struct __locale_struct, __locales)
 LC_CTYPE
 _NL_CTYPE_NONASCII_CASE
 LOCALE_DATA_VALUES		offsetof (struct __locale_data, values)
+LOCALE_TOLOWER			offsetof (struct __locale_struct, __ctype_tolower)
 SIZEOF_VALUES			sizeof (((struct __locale_data *) 0)->values[0])
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5ab950a..551923c 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
 		   memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
+		   strcasecmp_l-sse2-unaligned \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1a65ac0..40f8895 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -81,6 +81,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strcasecmp_avx)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
 			      __strcasecmp_sse42)
+	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1,
+			      __strcasecmp_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
 			      __strcasecmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
new file mode 100644
index 0000000..62ce37e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
@@ -0,0 +1,2 @@
+#define AS_STRCASECMP
+#include "strcmp-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index eed8432..c93d2f5 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -16,10 +16,33 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifndef NOT_IN_libc
+
 #include "sysdep.h"
 #define ALIGN(x)	.p2align x
 
+#ifdef AS_STRCASECMP
+# include "locale-defines.h"
+
+#define __strcasecmp_sse2_unaligned strcasecmp_new
+
+# define  __strcmp_sse2_unaligned __strcasecmp_sse2_unaligned_l
+ENTRY (__strcasecmp_sse2_unaligned)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %rdx
+        // XXX 5 byte should be before the function
+        /* 5-byte NOP.  */
+        .byte   0x0f,0x1f,0x44,0x00,0x00
+
+END (__strcasecmp_sse2_unaligned)
+
+#endif
+
 ENTRY ( __strcmp_sse2_unaligned)
+
+#ifdef AS_STRCASECMP
+	mov	LOCALE_TOLOWER(%rdx), %r11
+#endif
 	movl	%edi, %eax
 	xorl	%edx, %edx
 	pxor	%xmm7, %xmm7
@@ -36,12 +59,16 @@ ENTRY ( __strcmp_sse2_unaligned)
 	pmovmskb	%xmm0, %eax
 	testq	%rax, %rax
 	je	L(next_48_bytes)
+#ifndef AS_STRCASECMP
 L(return):
 	bsfq	%rax, %rdx
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %edx
 	subl	%edx, %eax
 	ret
+#else
+	jmp	L(caseloop1)
+#endif
 
 	ALIGN (4)
 L(next_48_bytes):
@@ -85,6 +112,76 @@ L(main_loop_header):
 	movq	%rcx, %rsi
 	jmp	L(loop_start)
 
+#ifdef AS_STRCASECMP
+L(caseloop1):
+	bsfq	%rax, %rdx
+	leaq	-1(%rax), %rcx
+	andq	%rax, %rcx
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	movl	(%r11, %rax, 4), %eax
+	movl	(%r11, %rdx, 4), %edx
+	testl	%eax, %eax
+	je	L(zero1)
+	cmpl	%edx, %eax
+	je	L(casecnt1)
+L(zero1):
+	subl	%edx, %eax
+	ret
+L(casecnt1):
+	testq	%rcx, %rcx
+	je	L(next_48_bytes)
+	movq	%rcx, %rax
+	jmp	L(caseloop1)
+
+L(return):
+L(caseloop2):
+	bsfq	%rax, %rdx
+	leaq	-1(%rax), %rcx
+	andq	%rax, %rcx
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	movl	(%r11, %rax, 4), %eax
+	movl	(%r11, %rdx, 4), %edx
+	testl	%eax, %eax
+	je	L(zero2)
+	cmpl	%edx, %eax
+	je	L(casecnt2)
+L(zero2):
+	subl	%edx, %eax
+	ret
+L(casecnt2):
+	testq	%rcx, %rcx
+	je	L(main_loop_header)
+	movq	%rcx, %rax
+	jmp	L(caseloop2)
+
+L(caseloop3):
+	bsfq	%rax, %rdx
+	leaq	-1(%rax), %r10
+	andq	%rax, %r10
+	movzbl	(%rdi, %rdx), %eax
+	movzbl	(%rsi, %rdx), %edx
+	movl	(%r11, %rax, 4), %eax
+	movl	(%r11, %rdx, 4), %edx
+	testl	%eax, %eax
+	je	L(zero3)
+	cmpl	%edx, %eax
+	je	L(casecnt3)
+L(zero3):
+	subl	%edx, %eax
+	ret
+L(casecnt3):
+	movq	%rdi, %rax
+	movq	%rsi, %rdx
+	testq	%r10, %r10
+	je	L(back_to_loop)
+	movq	%r10, %rax
+	jmp	L(caseloop3)
+
+#endif
+
+
 	ALIGN	(4)
 L(loop):
 	addq	$64, %rax
@@ -135,11 +232,18 @@ L(back_to_loop):
 	orq	%rdi, %rcx
 	salq	$48, %rsi
 	orq	%rsi, %rcx
+#ifndef AS_STRCASECMP
 	bsfq	%rcx, %rcx
 	movzbl	(%rax, %rcx), %eax
 	movzbl	(%rdx, %rcx), %edx
 	subl	%edx, %eax
 	ret
+#else
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	%rcx, %rax
+	jmp	L(return)
+#endif
 
 	ALIGN (4)
 L(loop_cross_page):
@@ -185,11 +289,19 @@ L(loop_cross_page):
 	shrq	%cl, %rdi
 	test	%rdi, %rdi
 	je	L(back_to_loop)
+#ifndef AS_STRCASECMP
 	bsfq	%rdi, %rcx
 	movzbl	(%rax, %rcx), %eax
 	movzbl	(%rdx, %rcx), %edx
 	subl	%edx, %eax
 	ret
+#else
+	movq	%rdi, %r10
+	movq	%rax, %rdi
+	movq	%rdx, %rsi
+	movq	%r10, %rax
+	jmp	L(caseloop3)
+#endif
 
 	ALIGN (4)
 L(cross_page_loop):
@@ -201,6 +313,10 @@ L(cross_page_loop):
 L(cross_page):
 	movzbl	(%rdi, %rdx), %eax
 	movzbl	(%rsi, %rdx), %ecx
+#ifdef AS_STRCASECMP
+	movl	(%r11, %rax, 4), %eax
+	movl	(%r11, %rcx, 4), %ecx
+#endif
 	testb	%al, %al
 	jne	L(cross_page_loop)
 	xorl	%eax, %eax
@@ -208,3 +324,4 @@ L(different):
 	subl	%ecx, %eax
 	ret
 END (__strcmp_sse2_unaligned)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index c5dcd1a..818aa31 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -115,16 +115,15 @@ ENTRY(__strcasecmp)
 	jne	1f
 	call	__init_cpu_features
 1:
+	leaq	__strcasecmp_sse2_unaligned(%rip), %rax
+	testl   $bit_Fast_Unaligned_Load, __cpu_features+CPUID_OFFSET+index_Fast_Unaligned_Load(%rip)
+	jnz     3f
+
 #  ifdef HAVE_AVX_SUPPORT
 	leaq	__strcasecmp_avx(%rip), %rax
 	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
 	jnz	3f
 #  endif
-	testl	$bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
-	jnz	2f
-	leaq	__strcasecmp_sse42(%rip), %rax
-	testl	$bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-	jnz	3f
 2:	leaq	__strcasecmp_ssse3(%rip), %rax
 	testl	$bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
 	jnz	3f
-- 
1.8.3.2

