This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH 1/3] Adding strcasecmp/strncasecmp functionality to unaligned strcmp
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 16 Sep 2013 14:32:34 +0200
- Subject: Re: [PATCH 1/3] Adding strcasecmp/strncasecmp functionality to unaligned strcmp
- Authentication-results: sourceware.org; auth=none
- References: <20130913200552 dot GA31992 at domone> <20130913205303 dot GA3620 at domone>
On Fri, Sep 13, 2013 at 10:53:03PM +0200, Ondřej Bílka wrote:
> Hi,
> I tried to gather data also for strcasecmp/strncasecmp and I got
> that they are used rarely on my system.
>
Thanks to Andreas I have an implementation ready.
It works by first finding differing characters with the strcmp code, then
checking whether they differ only in case. As it is likely that these
characters really are different, performance should be similar to that of
strcmp. I checked this property on my computer with the following code, and
the number of case comparisons needed is mostly 1 in my test:
#include <ctype.h>
#include <stdio.h>
/* Demo implementation of the proposed strategy: compare raw bytes first
   and fold case only when the bytes actually differ.  Counts how many
   tolower() lookups such a strcasecmp would need and reports the count
   on stderr, to show that the slow case-folding path is rarely taken.

   x, y: NUL-terminated byte strings to compare case-insensitively.
   Returns <0, 0 or >0 like strcasecmp.

   Requires <ctype.h> for tolower(); the original snippet relied on an
   implicit declaration, which is invalid since C99.  */
int strcasecmp(unsigned char *x, unsigned char *y)
{
int casecmp = 0;  /* byte mismatches that turned out to be case-only */
int i = 0;
while (1) {
  /* Fold case only on a raw-byte mismatch; identical bytes need no
     tolower() at all.  (x[i] is unsigned char, so it is a valid
     argument to tolower() without a cast.)  */
  if (x[i] != y[i])
    {
      if (tolower(x[i]) == tolower(y[i]))
        casecmp++;
      else
        {
          fprintf(stderr,"dif chars %i tolower_needed %i\n", i, casecmp+1);
          return tolower(x[i])-tolower(y[i]);
        }
    }
  /* Equal bytes here, so a NUL in x implies a NUL in y: both ended.  */
  if (!x[i])
    {
      fprintf(stderr,"same chars %i tolower_needed %i \n",i, casecmp);
      return 0;
    }
  i++;
}
/* Not reached: the loop only exits via the returns above.  */
}
A downside of this implementation is that checking aaaa vs AAAA will be
slower; as this looks like an unlikely case, we can make this tradeoff.
I added it in a generic way, as I also plan to add an ssse3 loop version,
which will come in a separate patch.
* sysdeps/x86_64/locale-defines.sym (LOCALE_TOLOWER): Add.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines):
Add strcasecmp_l-sse2-unaligned.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add
strcasecmp_sse2_unaligned.
* sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S: New file.
* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add strcasecmp
implementation.
* sysdeps/x86_64/multiarch/strcmp.S: Update ifunc.
---
sysdeps/x86_64/locale-defines.sym | 1 +
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 2 +
.../x86_64/multiarch/strcasecmp_l-sse2-unaligned.S | 2 +
sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 117 +++++++++++++++++++++
sysdeps/x86_64/multiarch/strcmp.S | 9 +-
6 files changed, 127 insertions(+), 5 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
diff --git a/sysdeps/x86_64/locale-defines.sym b/sysdeps/x86_64/locale-defines.sym
index aebff9a..804debb 100644
--- a/sysdeps/x86_64/locale-defines.sym
+++ b/sysdeps/x86_64/locale-defines.sym
@@ -8,4 +8,5 @@ LOCALE_T___LOCALES offsetof (struct __locale_struct, __locales)
LC_CTYPE
_NL_CTYPE_NONASCII_CASE
LOCALE_DATA_VALUES offsetof (struct __locale_data, values)
+LOCALE_TOLOWER offsetof (struct __locale_struct, __ctype_tolower)
SIZEOF_VALUES sizeof (((struct __locale_data *) 0)->values[0])
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5ab950a..551923c 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
+ strcasecmp_l-sse2-unaligned \
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1a65ac0..40f8895 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -81,6 +81,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strcasecmp_avx)
IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSE4_2,
__strcasecmp_sse42)
+ IFUNC_IMPL_ADD (array, i, strcasecmp, 1,
+ __strcasecmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcasecmp, HAS_SSSE3,
__strcasecmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
new file mode 100644
index 0000000..62ce37e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2-unaligned.S
@@ -0,0 +1,2 @@
+#define AS_STRCASECMP
+#include "strcmp-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index eed8432..c93d2f5 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -16,10 +16,33 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#ifndef NOT_IN_libc
+
#include "sysdep.h"
#define ALIGN(x) .p2align x
+#ifdef AS_STRCASECMP
+# include "locale-defines.h"
+
+#define __strcasecmp_sse2_unaligned strcasecmp_new
+
+# define __strcmp_sse2_unaligned __strcasecmp_sse2_unaligned_l
+ENTRY (__strcasecmp_sse2_unaligned)
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %rdx
+ // XXX 5 byte should be before the function
+ /* 5-byte NOP. */
+ .byte 0x0f,0x1f,0x44,0x00,0x00
+
+END (__strcasecmp_sse2_unaligned)
+
+#endif
+
ENTRY ( __strcmp_sse2_unaligned)
+
+#ifdef AS_STRCASECMP
+ mov LOCALE_TOLOWER(%rdx), %r11
+#endif
movl %edi, %eax
xorl %edx, %edx
pxor %xmm7, %xmm7
@@ -36,12 +59,16 @@ ENTRY ( __strcmp_sse2_unaligned)
pmovmskb %xmm0, %eax
testq %rax, %rax
je L(next_48_bytes)
+#ifndef AS_STRCASECMP
L(return):
bsfq %rax, %rdx
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
ret
+#else
+ jmp L(caseloop1)
+#endif
ALIGN (4)
L(next_48_bytes):
@@ -85,6 +112,76 @@ L(main_loop_header):
movq %rcx, %rsi
jmp L(loop_start)
+#ifdef AS_STRCASECMP
+L(caseloop1):
+ bsfq %rax, %rdx
+ leaq -1(%rax), %rcx
+ andq %rax, %rcx
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ movl (%r11, %rax, 4), %eax
+ movl (%r11, %rdx, 4), %edx
+ testl %eax, %eax
+ je L(zero1)
+ cmpl %edx, %eax
+ je L(casecnt1)
+L(zero1):
+ subl %edx, %eax
+ ret
+L(casecnt1):
+ testq %rcx, %rcx
+ je L(next_48_bytes)
+ movq %rcx, %rax
+ jmp L(caseloop1)
+
+L(return):
+L(caseloop2):
+ bsfq %rax, %rdx
+ leaq -1(%rax), %rcx
+ andq %rax, %rcx
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ movl (%r11, %rax, 4), %eax
+ movl (%r11, %rdx, 4), %edx
+ testl %eax, %eax
+ je L(zero2)
+ cmpl %edx, %eax
+ je L(casecnt2)
+L(zero2):
+ subl %edx, %eax
+ ret
+L(casecnt2):
+ testq %rcx, %rcx
+ je L(main_loop_header)
+ movq %rcx, %rax
+ jmp L(caseloop2)
+
+L(caseloop3):
+ bsfq %rax, %rdx
+ leaq -1(%rax), %r10
+ andq %rax, %r10
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ movl (%r11, %rax, 4), %eax
+ movl (%r11, %rdx, 4), %edx
+ testl %eax, %eax
+ je L(zero3)
+ cmpl %edx, %eax
+ je L(casecnt3)
+L(zero3):
+ subl %edx, %eax
+ ret
+L(casecnt3):
+ movq %rdi, %rax
+ movq %rsi, %rdx
+ testq %r10, %r10
+ je L(back_to_loop)
+ movq %r10, %rax
+ jmp L(caseloop3)
+
+#endif
+
+
ALIGN (4)
L(loop):
addq $64, %rax
@@ -135,11 +232,18 @@ L(back_to_loop):
orq %rdi, %rcx
salq $48, %rsi
orq %rsi, %rcx
+#ifndef AS_STRCASECMP
bsfq %rcx, %rcx
movzbl (%rax, %rcx), %eax
movzbl (%rdx, %rcx), %edx
subl %edx, %eax
ret
+#else
+ movq %rax, %rdi
+ movq %rdx, %rsi
+ movq %rcx, %rax
+ jmp L(return)
+#endif
ALIGN (4)
L(loop_cross_page):
@@ -185,11 +289,19 @@ L(loop_cross_page):
shrq %cl, %rdi
test %rdi, %rdi
je L(back_to_loop)
+#ifndef AS_STRCASECMP
bsfq %rdi, %rcx
movzbl (%rax, %rcx), %eax
movzbl (%rdx, %rcx), %edx
subl %edx, %eax
ret
+#else
+ movq %rdi, %r10
+ movq %rax, %rdi
+ movq %rdx, %rsi
+ movq %r10, %rax
+ jmp L(caseloop3)
+#endif
ALIGN (4)
L(cross_page_loop):
@@ -201,6 +313,10 @@ L(cross_page_loop):
L(cross_page):
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %ecx
+#ifdef AS_STRCASECMP
+ movl (%r11, %rax, 4), %eax
+ movl (%r11, %rcx, 4), %ecx
+#endif
testb %al, %al
jne L(cross_page_loop)
xorl %eax, %eax
@@ -208,3 +324,4 @@ L(different):
subl %ecx, %eax
ret
END (__strcmp_sse2_unaligned)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index c5dcd1a..818aa31 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -115,16 +115,15 @@ ENTRY(__strcasecmp)
jne 1f
call __init_cpu_features
1:
+ leaq __strcasecmp_sse2_unaligned(%rip), %rax
+ testl $bit_Fast_Unaligned_Load, __cpu_features+CPUID_OFFSET+index_Fast_Unaligned_Load(%rip)
+ jnz 3f
+
# ifdef HAVE_AVX_SUPPORT
leaq __strcasecmp_avx(%rip), %rax
testl $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
jnz 3f
# endif
- testl $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
- jnz 2f
- leaq __strcasecmp_sse42(%rip), %rax
- testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
- jnz 3f
2: leaq __strcasecmp_ssse3(%rip), %rax
testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
jnz 3f
--
1.8.3.2