This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH 2/2 neleai/string-x64] Add strcmp with avx2
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sat, 20 Jun 2015 13:15:20 +0200
- Subject: Re: [PATCH 2/2 neleai/string-x64] Add strcmp with avx2
- References: <20150620083525 dot GA31992 at domone>
On Sat, Jun 20, 2015 at 10:35:25AM +0200, Ondřej Bílka wrote:
>
> Hi,
>
> When I read strcmp again to improve strncmp and add an avx2 strcmp,
> I found that I had made several mistakes, mainly caused by first
> optimizing the C template and then fixing the assembly afterwards.
>
> The first was mainly my idea to simplify the cross-page check by ORing
> src and dest (see the C sketch after the quote). I recall that I first
> wrote complex cross-page handling where false positives were cheap. Then
> I found that, due to its size, it had a big overhead and a simple loop
> was faster when testing with firefox. That turned the original decision
> into a bad one.
>
> The second is to reorganize the loop instructions so that after the loop
> ends I can find the last byte without recalculating much, using the trick
> that the last 16-bit mask can be ORed with the previous three, as it is
> only relevant when the previous three are zero.
>
> The third is that gcc generates bad loops with respect to where pointers
> are incremented. The increments should be placed after the loads that use
> them, not at the start of the loop as gcc does. That change is responsible
> for a 10% improvement for large sizes.
>
> The last are microoptimizations that save a few bytes without measurable
> performance impact, like using eax instead of rax to save a byte, or
> dropping zeroing instructions where they are not needed.
>
> Profile data are here; the avx2 results for haswell that I will submit
> next will follow shortly.
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile.html
>
> OK to commit this?
>
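For illustration, the cross-page test that the quoted first point refers to
can be sketched in C roughly as follows (a minimal sketch, not the code in
the patch; the 4096-byte page size and the 64-byte chunk are assumptions):

#include <stdint.h>

#define PAGE_SIZE 4096
#define CHUNK 64	/* bytes compared per iteration */

/* Conservative test: (a | b) >= max (a, b), so if either pointer's page
   offset lies within CHUNK bytes of the page end, the OR of the offsets
   does too.  One branch guards both unaligned reads, at the cost of
   occasional false positives (e.g. offsets 0x800 and 0x7ff).  */
static inline int
may_cross_page (const char *s1, const char *s2)
{
  uintptr_t off = ((uintptr_t) s1 | (uintptr_t) s2) & (PAGE_SIZE - 1);
  return off > PAGE_SIZE - CHUNK;
}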
Here is the avx2 loop that I promised earlier. Luckily it gives a small 2%
practical benefit on the gcc workload. It also roughly doubles performance
on longer inputs.
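As a rough C rendering of what one iteration of the new loop computes (a
sketch only: the name first_diff_in_64 is made up, page crossing is handled
separately in the real code, and this is not the committed implementation):

#include <immintrin.h>
#include <stdint.h>

/* Return the offset of the first mismatch or NUL within a 64-byte chunk,
   or -1 if there is none.  cmpeq gives 0xff where bytes match; the
   unsigned min with the source byte is then zero exactly where the
   strings differ or where s1 holds a NUL.  The combined mask of both
   32-byte halves only matters when the first half's mask is zero, so it
   can be folded in with a shift and an OR, as in the quoted mail.  */
static inline long
first_diff_in_64 (const unsigned char *s1, const unsigned char *s2)
{
  __m256i zero = _mm256_setzero_si256 ();
  __m256i a0 = _mm256_loadu_si256 ((const __m256i *) s1);
  __m256i a1 = _mm256_loadu_si256 ((const __m256i *) (s1 + 32));
  __m256i b0 = _mm256_loadu_si256 ((const __m256i *) s2);
  __m256i b1 = _mm256_loadu_si256 ((const __m256i *) (s2 + 32));

  __m256i m0 = _mm256_min_epu8 (_mm256_cmpeq_epi8 (a0, b0), a0);
  __m256i m1 = _mm256_min_epu8 (_mm256_cmpeq_epi8 (a1, b1), a1);

  __m256i both = _mm256_min_epu8 (m0, m1);
  uint32_t any = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (both, zero));
  if (any == 0)
    return -1;			/* nothing in this chunk, keep looping */

  uint64_t mask = (uint64_t) any << 32;
  mask |= (uint32_t) _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (m0, zero));
  return __builtin_ctzll (mask);
}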
* sysdeps/x86_64/multiarch/Makefile: Add strcmp-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c: Add __strcmp_avx2.
* sysdeps/x86_64/multiarch/strcmp-avx2.S: New file.
* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S: Add avx2
loop.
* sysdeps/x86_64/multiarch/strcmp.S: Add ifunc.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index d01bbbe..bf48283 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -30,7 +30,7 @@ CFLAGS-strspn-c.c += -msse4
endif
ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2 strcmp-avx2
endif
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cc6f9f2..57ce237 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -126,7 +126,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcmp.S. */
IFUNC_IMPL (i, name, strcmp,
- IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSE4_2, __strcmp_sse42)
+ IFUNC_IMPL_ADD (array, i, strcmp, HAS_AVX2, __strcmp_avx2)
IFUNC_IMPL_ADD (array, i, strcmp, HAS_SSSE3, __strcmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
new file mode 100644
index 0000000..b2f8478
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AVX2
+#define __strcmp_sse2_unaligned __strcmp_avx2
+#include "strcmp-sse2-unaligned.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 03d1b11..10bed9a 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -89,12 +89,35 @@ L(main_loop_header):
subq %rsi, %rcx
shrq $6, %rcx
movq %rcx, %rsi
-
+#ifdef USE_AVX2
+ vpxor %xmm7, %xmm7, %xmm7
+#endif
.p2align 4
L(loop):
add $-1, %rsi
ja L(loop_cross_page)
L(back_to_loop):
+#ifdef USE_AVX2
+ vmovdqu (%rdx), %ymm0
+ vmovdqu 32(%rdx), %ymm1
+ vpcmpeqb (%rax), %ymm0, %ymm0
+ vpminub (%rax), %ymm0, %ymm0
+ vpcmpeqb 32(%rax), %ymm1, %ymm1
+ vpminub 32(%rax), %ymm1, %ymm1
+ vpminub %ymm0, %ymm1, %ymm2
+ vpcmpeqb %ymm7, %ymm2, %ymm2
+ addq $64, %rax
+ addq $64, %rdx
+ vpmovmskb %ymm2, %esi
+ test %esi, %esi
+ je L(loop)
+ shl $32, %rsi
+ vpcmpeqb %ymm7, %ymm0, %ymm0
+ vpmovmskb %ymm0, %ecx
+ or %rsi, %rcx
+ vzeroupper
+#else
+
movdqu (%rdx), %xmm0
movdqu 16(%rdx), %xmm1
movdqa (%rax), %xmm2
@@ -132,14 +155,17 @@ L(back_to_loop):
orq %rdi, %rcx
sal $16, %esi
orq %rsi, %rcx
+#endif
bsfq %rcx, %rcx
movzbl -64(%rax, %rcx), %eax
movzbl -64(%rdx, %rcx), %edx
subl %edx, %eax
ret
-
.p2align 4
L(loop_cross_page):
+#ifdef USE_AVX2
+ vzeroupper
+#endif
xor %ecx, %ecx
movq %rdx, %r9
and $63, %r9
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index f50f26c..867e9d4 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -90,6 +90,12 @@ ENTRY(STRCMP)
call __init_cpu_features
1:
#ifdef USE_AS_STRCMP
+# ifdef HAVE_AVX2_SUPPORT
+
+ leaq __strcmp_avx2(%rip), %rax
+ testl $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
+ jnz 3f
+# endif
leaq __strcmp_sse2_unaligned(%rip), %rax
testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
jnz 3f