This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch, master, updated. glibc-2.12-85-g73507d3
- From: drepper at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 1 Aug 2010 04:42:30 -0000
- Subject: GNU C Library master sources branch, master, updated. glibc-2.12-85-g73507d3
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via 73507d3ae077c2dcf0da857a5a244deff3d4d223 (commit)
from 66f6765a472452937fa821d6c30e2396e113bd60 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=73507d3ae077c2dcf0da857a5a244deff3d4d223
commit 73507d3ae077c2dcf0da857a5a244deff3d4d223
Author: Ulrich Drepper <drepper@redhat.com>
Date: Sat Jul 31 21:41:09 2010 -0700
Add support for SSSE3 and SSE4.2 versions of strcasecmp on x86-64.
diff --git a/ChangeLog b/ChangeLog
index 38df1ae..7a90d21 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+2010-07-31 Ulrich Drepper <drepper@redhat.com>
+
+ * sysdeps/x86_64/multiarch/Makefile [subdir=string] (sysdep_routines):
+ Add strcasecmp_l-ssse3.
+ * sysdeps/x86_64/multiarch/strcmp.S: Add support to compile for
+ strcasecmp.
+ * sysdeps/x86_64/strcmp.S: Allow more flexible compiling of strcasecmp.
+ * sysdeps/x86_64/multiarch/strcasecmp_l.S: New file.
+ * sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S: New file.
+
2010-07-30 Ulrich Drepper <drepper@redhat.com>
* sysdeps/x86_64/multiarch/strcmp.S: Pretty printing.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f1251a0..5113dc1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -7,7 +7,7 @@ ifeq ($(subdir),string)
sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
- memmove-ssse3-back strcasestr-nonascii
+ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
new file mode 100644
index 0000000..bc0eb5b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
@@ -0,0 +1,5 @@
+#define USE_SSSE3 1
+#define USE_AS_STRCASECMP_L
+#define STRCMP __strcasecmp_l_ssse3
+#define __strcasecmp __strcasecmp_ssse3
+#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l.S b/sysdeps/x86_64/multiarch/strcasecmp_l.S
new file mode 100644
index 0000000..5456b3a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l.S
@@ -0,0 +1,6 @@
+#define STRCMP __strcasecmp_l
+#define USE_AS_STRCASECMP_L
+#include "strcmp.S"
+
+weak_alias (__strcasecmp_l, strcasecmp_l)
+libc_hidden_def (strcasecmp_l)
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index 2de1191..3726dbe 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -37,6 +37,15 @@
# define STRCMP_SSSE3 __strncmp_ssse3
# define STRCMP_SSE2 __strncmp_sse2
# define __GI_STRCMP __GI_strncmp
+#elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+
+# define UPDATE_STRNCMP_COUNTER
+
+# define STRCMP_SSE42 __strcasecmp_l_sse42
+# define STRCMP_SSSE3 __strcasecmp_l_ssse3
+# define STRCMP_SSE2 __strcasecmp_l_sse2
+# define __GI_STRCMP __GI___strcasecmp_l
#else
# define UPDATE_STRNCMP_COUNTER
# ifndef STRCMP
@@ -73,6 +82,25 @@ ENTRY(STRCMP)
2: ret
END(STRCMP)
+# ifdef USE_AS_STRCASECMP_L
+ENTRY(__strcasecmp)
+ .type __strcasecmp, @gnu_indirect_function
+ cmpl $0, __cpu_features+KIND_OFFSET(%rip)
+ jne 1f
+ call __init_cpu_features
+1:
+ leaq __strcasecmp_sse42(%rip), %rax
+ testl $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
+ jnz 2f
+ leaq __strcasecmp_ssse3(%rip), %rax
+ testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jnz 2f
+ leaq __strcasecmp_sse2(%rip), %rax
+2: ret
+END(__strcasecmp)
+weak_alias (__strcasecmp, strcasecmp)
+# endif
+
/* We use 0x1a:
_SIDD_SBYTE_OPS
| _SIDD_CMP_EQUAL_EACH
@@ -103,6 +131,16 @@ END(STRCMP)
.section .text.sse4.2,"ax",@progbits
.align 16
.type STRCMP_SSE42, @function
+#ifdef USE_AS_STRCASECMP_L
+ /* 5-byte NOP. */
+ .byte 0x0f,0x1f,0x44,0x00,0x00
+ENTRY (__strcasecmp_sse42)
+ movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
+ movq %fs:(%rax),%rdx
+END (__strcasecmp_sse42)
+ /* FALLTHROUGH to strcasecmp_l. */
+#endif
+
STRCMP_SSE42:
cfi_startproc
CALL_MCOUNT
@@ -110,6 +148,18 @@ STRCMP_SSE42:
/*
* This implementation uses SSE to compare up to 16 bytes at a time.
*/
+#ifdef USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales
+ with encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ movq LOCALE_T___LOCALES+LC_CTYPE*8(%rdx), %rax
+# else
+ movq (%rdx), %rax
+# endif
+ testl $0, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
+ jne __strcasecmp_l_nonascii
+#endif
+
#ifdef USE_AS_STRNCMP
test %rdx, %rdx
je LABEL(strcmp_exitz_sse4_2)
@@ -122,12 +172,52 @@ STRCMP_SSE42:
/* Use 64bit AND here to avoid long NOP padding. */
and $0x3f, %rcx /* rsi alignment in cache line */
and $0x3f, %rax /* rdi alignment in cache line */
+#ifdef USE_AS_STRCASECMP_L
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+.Lbelowupper_sse4:
+ .quad 0x4040404040404040
+ .quad 0x4040404040404040
+.Ltopupper_sse4:
+ .quad 0x5b5b5b5b5b5b5b5b
+ .quad 0x5b5b5b5b5b5b5b5b
+.Ltouppermask_sse4:
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+ movdqa .Lbelowupper_sse4(%rip), %xmm4
+# define UCLOW_reg %xmm4
+ movdqa .Ltopupper_sse4(%rip), %xmm5
+# define UCHIGH_reg %xmm5
+ movdqa .Ltouppermask_sse4(%rip), %xmm6
+# define LCQWORD_reg %xmm6
+#endif
cmp $0x30, %ecx
ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */
cmp $0x30, %eax
ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */
movdqu (%rdi), %xmm1
movdqu (%rsi), %xmm2
+# ifdef USE_AS_STRCASECMP_L
+# define TOLOWER(reg1, reg2) \
+ movdqa reg1, %xmm7; \
+ movdqa UCHIGH_reg, %xmm8; \
+ movdqa reg2, %xmm9; \
+ movdqa UCHIGH_reg, %xmm10; \
+ pcmpgtb UCLOW_reg, %xmm7; \
+ pcmpgtb reg1, %xmm8; \
+ pcmpgtb UCLOW_reg, %xmm9; \
+ pcmpgtb reg2, %xmm10; \
+ pand %xmm8, %xmm7; \
+ pand %xmm10, %xmm9; \
+ pand LCQWORD_reg, %xmm7; \
+ pand LCQWORD_reg, %xmm9; \
+ por %xmm7, reg1; \
+ por %xmm9, reg2
+ TOLOWER (%xmm1, %xmm2)
+# else
+# define TOLOWER(reg1, reg2)
+# endif
pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
@@ -180,7 +270,13 @@ LABEL(ashr_0_sse4_2):
movdqa (%rsi), %xmm1
pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+#ifndef USE_AS_STRCASECMP_L
pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
+#else
+ movdqa (%rdi), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
+#endif
psubb %xmm0, %xmm1 /* packed sub of comparison results*/
pmovmskb %xmm1, %r9d
shr %cl, %edx /* adjust 0xffff for offset */
@@ -204,7 +300,13 @@ LABEL(ashr_0_sse4_2):
.p2align 4
LABEL(ashr_0_use_sse4_2):
movdqa (%rdi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
lea 16(%rdx), %rdx
jbe LABEL(ashr_0_use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
@@ -213,7 +315,13 @@ LABEL(ashr_0_use_sse4_2):
#endif
movdqa (%rdi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
lea 16(%rdx), %rdx
jbe LABEL(ashr_0_use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
@@ -233,12 +341,16 @@ LABEL(ashr_0_use_sse4_2_exit):
lea -16(%rdx, %rcx), %rcx
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %edx
+# ifdef USE_AS_STRCASECMP_L
+ leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
+ movl (%rcx,%rax,4), %eax
+ movl (%rcx,%rdx,4), %edx
+# endif
sub %edx, %eax
ret
-
/*
* The following cases will be handled by ashr_1
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
@@ -251,6 +363,7 @@ LABEL(ashr_1_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0 /* Any null chars? */
pslldq $15, %xmm2 /* shift first string to align with second */
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
psubb %xmm0, %xmm2 /* packed sub of comparison results*/
pmovmskb %xmm2, %r9d
@@ -281,7 +394,13 @@ LABEL(loop_ashr_1_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $1, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -294,7 +413,13 @@ LABEL(loop_ashr_1_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $1, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -330,6 +455,7 @@ LABEL(ashr_2_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $14, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -360,7 +486,13 @@ LABEL(loop_ashr_2_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $2, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -373,7 +505,13 @@ LABEL(loop_ashr_2_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $2, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -409,6 +547,7 @@ LABEL(ashr_3_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $13, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -439,7 +578,13 @@ LABEL(loop_ashr_3_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $3, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -452,7 +597,13 @@ LABEL(loop_ashr_3_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $3, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -488,6 +639,7 @@ LABEL(ashr_4_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $12, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -519,7 +671,13 @@ LABEL(loop_ashr_4_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $4, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -532,7 +690,13 @@ LABEL(loop_ashr_4_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $4, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -568,6 +732,7 @@ LABEL(ashr_5_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $11, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -599,7 +764,13 @@ LABEL(loop_ashr_5_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $5, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -613,7 +784,13 @@ LABEL(loop_ashr_5_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $5, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -649,6 +826,7 @@ LABEL(ashr_6_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $10, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -680,7 +858,13 @@ LABEL(loop_ashr_6_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $6, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -693,7 +877,13 @@ LABEL(loop_ashr_6_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $6, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -729,6 +919,7 @@ LABEL(ashr_7_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $9, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -760,7 +951,13 @@ LABEL(loop_ashr_7_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $7, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -773,7 +970,13 @@ LABEL(loop_ashr_7_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $7, -16(%rdi, %rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -809,6 +1012,7 @@ LABEL(ashr_8_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $8, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -840,7 +1044,13 @@ LABEL(loop_ashr_8_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $8, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -853,7 +1063,13 @@ LABEL(loop_ashr_8_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $8, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -889,6 +1105,7 @@ LABEL(ashr_9_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $7, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -921,7 +1138,13 @@ LABEL(loop_ashr_9_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $9, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -934,7 +1157,13 @@ LABEL(loop_ashr_9_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $9, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -970,6 +1199,7 @@ LABEL(ashr_10_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $6, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -1001,7 +1231,13 @@ LABEL(loop_ashr_10_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $10, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1014,7 +1250,13 @@ LABEL(loop_ashr_10_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $10, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1050,6 +1292,7 @@ LABEL(ashr_11_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $5, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -1081,7 +1324,13 @@ LABEL(loop_ashr_11_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $11, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1094,7 +1343,13 @@ LABEL(loop_ashr_11_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $11, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1130,6 +1385,7 @@ LABEL(ashr_12_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $4, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -1161,7 +1417,13 @@ LABEL(loop_ashr_12_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $12, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1174,7 +1436,13 @@ LABEL(loop_ashr_12_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $12, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1210,6 +1478,7 @@ LABEL(ashr_13_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $3, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -1242,7 +1511,13 @@ LABEL(loop_ashr_13_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $13, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1255,7 +1530,13 @@ LABEL(loop_ashr_13_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $13, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1291,6 +1572,7 @@ LABEL(ashr_14_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $2, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -1323,7 +1605,13 @@ LABEL(loop_ashr_14_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $14, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1336,7 +1624,13 @@ LABEL(loop_ashr_14_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $14, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1372,6 +1666,7 @@ LABEL(ashr_15_sse4_2):
movdqa (%rsi), %xmm1
pcmpeqb %xmm1, %xmm0
pslldq $1, %xmm2
+ TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm2
psubb %xmm0, %xmm2
pmovmskb %xmm2, %r9d
@@ -1406,7 +1701,13 @@ LABEL(loop_ashr_15_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $15, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1419,7 +1720,13 @@ LABEL(loop_ashr_15_use_sse4_2):
movdqa (%rdi, %rdx), %xmm0
palignr $15, -16(%rdi, %rdx), %xmm0
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+#ifndef USE_AS_STRCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+#else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+#endif
jbe LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
sub $16, %r11
@@ -1458,6 +1765,12 @@ LABEL(use_sse4_2_exit):
jz LABEL(use_sse4_2_ret_sse4_2)
xchg %eax, %edx
LABEL(use_sse4_2_ret_sse4_2):
+# ifdef USE_AS_STRCASECMP_L
+ leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
+ movl (%rcx,%rdx,4), %edx
+ movl (%rcx,%rax,4), %eax
+# endif
+
sub %edx, %eax
ret
@@ -1480,6 +1793,12 @@ LABEL(less16bytes_sse4_2):
movzbl (%rsi, %rdx), %ecx
movzbl (%rdi, %rdx), %eax
+# ifdef USE_AS_STRCASECMP_L
+ leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+ movl (%rdx,%rcx,4), %ecx
+ movl (%rdx,%rax,4), %eax
+# endif
+
sub %ecx, %eax
ret
@@ -1488,15 +1807,27 @@ LABEL(strcmp_exitz_sse4_2):
ret
.p2align 4
+ // XXX Same as code above
LABEL(Byte0_sse4_2):
movzx (%rsi), %ecx
movzx (%rdi), %eax
+# ifdef USE_AS_STRCASECMP_L
+ leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+ movl (%rdx,%rcx,4), %ecx
+ movl (%rdx,%rax,4), %eax
+# endif
+
sub %ecx, %eax
ret
cfi_endproc
.size STRCMP_SSE42, .-STRCMP_SSE42
+# undef UCLOW_reg
+# undef UCHIGH_reg
+# undef LCQWORD_reg
+# undef TOLOWER
+
/* Put all SSE 4.2 functions together. */
.section .rodata.sse4.2,"a",@progbits
.p2align 3
@@ -1528,6 +1859,17 @@ LABEL(unaligned_table_sse4_2):
# undef END
# define END(name) \
cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2
+
+# ifdef USE_AS_STRCASECMP_L
+# define ENTRY2(name) \
+ .type __strcasecmp_sse2, @function; \
+ .align 16; \
+ __strcasecmp_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# define END2(name) \
+ cfi_endproc; .size __strcasecmp_sse2, .-__strcasecmp_sse2
+# endif
+
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal strcmp calls through a PLT.
The speedup we get from using SSE4.2 instruction is likely eaten away
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 7b2b246..d36fef2 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -74,15 +74,24 @@
#endif
#ifdef USE_AS_STRCASECMP_L
-ENTRY (__strcasecmp)
+# ifndef ENTRY2
+# define ENTRY2(name) ENTRY (name)
+# define END2(name) END (name)
+# define NO_NOLOCALE_ALIAS
+# endif
+
+ ENTRY2 (__strcasecmp)
movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
movq %fs:(%rax),%rdx
+ // XXX 5 byte should be before the function
/* 5-byte NOP. */
.byte 0x0f,0x1f,0x44,0x00,0x00
-END (__strcasecmp)
+END2 (__strcasecmp)
+# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strcasecmp, strcasecmp)
libc_hidden_def (__strcasecmp)
+# endif
/* FALLTHROUGH to strcasecmp_l. */
#endif
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 10 +
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 5 +
sysdeps/x86_64/{ => multiarch}/strcasecmp_l.S | 0
sysdeps/x86_64/multiarch/strcmp.S | 380 +++++++++++++++++++++++--
sysdeps/x86_64/strcmp.S | 13 +-
6 files changed, 388 insertions(+), 22 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
copy sysdeps/x86_64/{ => multiarch}/strcasecmp_l.S (100%)
hooks/post-receive
--
GNU C Library master sources