[PATCH neleai/string-x64] Microoptimize strcmp-sse2-unaligned.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Sat, 20 Jun 2015 10:35:25 +0200
- Subject: [PATCH neleai/string-x64] Microoptimize strcmp-sse2-unaligned.
Hi,
When I read strcmp again to improve strncmp and add an avx2 strcmp,
I found that I had made several mistakes, mainly caused by first
optimizing the C template and then fixing up the assembly.
The first was my idea to simplify the cross-page check by ORing src
and dest. I recall that I originally had complex cross-page handling
in which false positives were cheap. Then I found that, due to its
size, it had a big overhead, and a simple loop was faster when
testing with Firefox. That turned the original decision into a bad
one.
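
To illustrate (a sketch of the two variants, not the patch itself):
with 4096-byte pages and 64-byte reads, a page offset above
4096 - 64 = 4032 is unsafe. ORing the two offsets is conservative,
e.g. offsets 0x801 and 0x7c0 are both safe, but
0x801 | 0x7c0 = 0xfc1 fails the combined check:

        # old: one combined check, with false positives
        movl    %edi, %eax
        orl     %esi, %eax
        andl    $4095, %eax
        cmpl    $4032, %eax
        jg      L(cross_page)

        # new: one exact check per pointer, so the slow path runs
        # only when a read would really cross a page boundary
        movl    %esi, %eax
        andl    $4095, %eax
        cmpl    $4032, %eax
        jg      L(cross_page)
        movl    %edi, %eax
        andl    $4095, %eax
        cmpl    $4032, %eax
        jg      L(cross_page)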
The second change is to reorganize the loop instructions so that
after the loop ends I can find the differing byte without much
recalculation, using the trick that the last 16-bit mask can be
ORed with the previous three, as it is only relevant when the
previous three are zero.
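
Concretely (a sketch using the patch's register assignments): the
loop folds the pcmpeqb/pminub results of all four 16-byte blocks
into %xmm6, so on exit the pmovmskb of %xmm6, already in %ecx, can
stand in for the last block's mask. Only the first three masks need
to be recomputed; the extra bits %ecx carries are harmless, since
bsf reaches bits 48..63 only when the lower 48 bits are zero, and
in that case the combined mask equals the last block's mask:

        pcmpeqb %xmm7, %xmm0    # recompute blocks 0..2 exactly
        pcmpeqb %xmm7, %xmm1
        pcmpeqb %xmm7, %xmm5
        pmovmskb %xmm0, %edi    # exact mask of block 0
        pmovmskb %xmm1, %esi    # exact mask of block 1
        pmovmskb %xmm5, %r8d    # exact mask of block 2
        salq    $48, %rcx       # combined mask doubles as block 3
        salq    $32, %r8
        orq     %r8, %rcx
        orq     %rdi, %rcx
        sal     $16, %esi
        orq     %rsi, %rcx
        bsfq    %rcx, %rcx      # first zero/differing byte of the 64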
The third is that gcc generates bad loops with respect to where
pointers are incremented. The increments should be placed after the
loads that use them, not at the start of the loop as gcc does. That
change is responsible for a 10% improvement for large sizes.
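
Schematically (a simplified one-pointer sketch, not the actual
loop): with the increment at the top, the loads of an iteration
depend on the add issued just before them; with the increment after
the last load, the loads use the value computed in the previous
iteration and the add overlaps with them. The patch compensates in
the tail by addressing with -64(%rax, %rcx):

        # gcc-style loop body: increment first, loads depend on it
        addq    $64, %rax
        movdqa  (%rax), %xmm0
        movdqa  16(%rax), %xmm1

        # reordered body: loads first, then the increment
        movdqa  (%rax), %xmm0
        movdqa  16(%rax), %xmm1
        addq    $64, %rax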
Finally, there are microoptimizations that save a few bytes without
measurable performance impact, such as using eax instead of rax to
save a byte, or dropping zeroing instructions where they are not
needed.
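
For example, the 32-bit forms below do the same work here but omit
the REX.W prefix, saving a byte each (writing a 32-bit register
zero-extends into the full 64-bit register, and the pmovmskb masks
fit in 16 bits anyway):

        bsfq    %rax, %rdx      # 48 0f bc d0 (4 bytes)
        bsf     %eax, %edx      # 0f bc d0    (3 bytes), same result

        salq    $16, %rdx       # 48 c1 e2 10 (4 bytes)
        sal     $16, %edx       # c1 e2 10    (3 bytes), same result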
Profile data are here; results with avx2 for Haswell, which I will
submit next, will follow shortly:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile.html
OK to commit this?
* sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
(__strcmp_sse2_unaligned): Add several microoptimizations.
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index 20b65fa..03d1b11 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -19,10 +19,13 @@
#include "sysdep.h"
ENTRY ( __strcmp_sse2_unaligned)
- movl %edi, %eax
- xorl %edx, %edx
pxor %xmm7, %xmm7
- orl %esi, %eax
+ movl %esi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+
+ movl %edi, %eax
andl $4095, %eax
cmpl $4032, %eax
jg L(cross_page)
@@ -30,13 +33,11 @@ ENTRY ( __strcmp_sse2_unaligned)
movdqu (%rsi), %xmm0
pcmpeqb %xmm1, %xmm0
pminub %xmm1, %xmm0
- pxor %xmm1, %xmm1
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- testq %rax, %rax
+ pcmpeqb %xmm7, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
je L(next_48_bytes)
-L(return):
- bsfq %rax, %rdx
+ bsf %eax, %edx
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
@@ -50,29 +51,35 @@ L(next_48_bytes):
pcmpeqb %xmm6, %xmm3
movdqu 32(%rsi), %xmm2
pminub %xmm6, %xmm3
- pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm7, %xmm3
movdqu 48(%rdi), %xmm4
pcmpeqb %xmm5, %xmm2
- pmovmskb %xmm3, %edx
+ pmovmskb %xmm3, %edx
movdqu 48(%rsi), %xmm0
pminub %xmm5, %xmm2
- pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm7, %xmm2
pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm2, %eax
- salq $16, %rdx
+ pmovmskb %xmm2, %eax
+ sal $16, %edx
pminub %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm7, %xmm0
salq $32, %rax
orq %rdx, %rax
- pmovmskb %xmm0, %ecx
- movq %rcx, %rdx
- salq $48, %rdx
- orq %rdx, %rax
- jne L(return)
+ pmovmskb %xmm0, %ecx
+ salq $48, %rcx
+ orq %rcx, %rax
+ je L(main_loop_header)
+L(return):
+ bsf %rax, %rdx
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+ ret
+
+
L(main_loop_header):
leaq 64(%rdi), %rdx
movl $4096, %ecx
- pxor %xmm9, %xmm9
andq $-64, %rdx
subq %rdi, %rdx
leaq (%rdi, %rdx), %rax
@@ -82,16 +89,11 @@ L(main_loop_header):
subq %rsi, %rcx
shrq $6, %rcx
movq %rcx, %rsi
- jmp L(loop_start)
.p2align 4
L(loop):
- addq $64, %rax
- addq $64, %rdx
-L(loop_start):
- testq %rsi, %rsi
- leaq -1(%rsi), %rsi
- je L(loop_cross_page)
+ add $-1, %rsi
+ ja L(loop_cross_page)
L(back_to_loop):
movdqu (%rdx), %xmm0
movdqu 16(%rdx), %xmm1
@@ -104,61 +106,57 @@ L(back_to_loop):
movdqu 48(%rdx), %xmm6
pminub %xmm3, %xmm1
movdqa 32(%rax), %xmm2
- pminub %xmm1, %xmm0
movdqa 48(%rax), %xmm3
pcmpeqb %xmm2, %xmm5
pcmpeqb %xmm3, %xmm6
+ addq $64, %rax
pminub %xmm2, %xmm5
pminub %xmm3, %xmm6
- pminub %xmm5, %xmm0
- pminub %xmm6, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %ecx
+ addq $64, %rdx
+ pminub %xmm5, %xmm6
+ pminub %xmm1, %xmm6
+ pminub %xmm0, %xmm6
+ pcmpeqb %xmm7, %xmm6
+ pmovmskb %xmm6, %ecx
testl %ecx, %ecx
je L(loop)
- pcmpeqb %xmm7, %xmm5
- movdqu (%rdx), %xmm0
- pcmpeqb %xmm7, %xmm1
- movdqa (%rax), %xmm2
- pcmpeqb %xmm2, %xmm0
- pminub %xmm2, %xmm0
- pcmpeqb %xmm7, %xmm6
pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
+ pcmpeqb %xmm7, %xmm1
+ pcmpeqb %xmm7, %xmm5
+ pmovmskb %xmm0, %edi
+ pmovmskb %xmm1, %esi
+ pmovmskb %xmm5, %r8d
+ salq $48, %rcx
salq $32, %r8
- pmovmskb %xmm6, %esi
orq %r8, %rcx
orq %rdi, %rcx
- salq $48, %rsi
+ sal $16, %esi
orq %rsi, %rcx
bsfq %rcx, %rcx
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
+ movzbl -64(%rax, %rcx), %eax
+ movzbl -64(%rdx, %rcx), %edx
subl %edx, %eax
ret
.p2align 4
L(loop_cross_page):
- xor %r10, %r10
+ xor %ecx, %ecx
movq %rdx, %r9
and $63, %r9
- subq %r9, %r10
+ subq %r9, %rcx
- movdqa (%rdx, %r10), %xmm0
- movdqa 16(%rdx, %r10), %xmm1
- movdqu (%rax, %r10), %xmm2
- movdqu 16(%rax, %r10), %xmm3
+ movdqa (%rdx, %rcx), %xmm0
+ movdqa 16(%rdx, %rcx), %xmm1
+ movdqu (%rax, %rcx), %xmm2
+ movdqu 16(%rax, %rcx), %xmm3
pcmpeqb %xmm2, %xmm0
- movdqa 32(%rdx, %r10), %xmm5
+ movdqa 32(%rdx, %rcx), %xmm5
pcmpeqb %xmm3, %xmm1
pminub %xmm2, %xmm0
- movdqa 48(%rdx, %r10), %xmm6
+ movdqa 48(%rdx, %rcx), %xmm6
pminub %xmm3, %xmm1
- movdqu 32(%rax, %r10), %xmm2
- movdqu 48(%rax, %r10), %xmm3
+ movdqu 32(%rax, %rcx), %xmm2
+ movdqu 48(%rax, %rcx), %xmm3
pcmpeqb %xmm2, %xmm5
pcmpeqb %xmm3, %xmm6
pminub %xmm2, %xmm5
@@ -169,12 +167,12 @@ L(loop_cross_page):
pcmpeqb %xmm7, %xmm5
pcmpeqb %xmm7, %xmm6
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
+ pmovmskb %xmm1, %ecx
+ pmovmskb %xmm5, %r8d
+ pmovmskb %xmm0, %edi
+ sal $16, %ecx
salq $32, %r8
- pmovmskb %xmm6, %esi
+ pmovmskb %xmm6, %esi
orq %r8, %rdi
orq %rcx, %rdi
salq $48, %rsi
@@ -190,20 +188,21 @@ L(loop_cross_page):
subl %edx, %eax
ret
+L(cross_page):
+ xorl %edx, %edx
+ jmp L(cross_page_loop_start)
.p2align 4
L(cross_page_loop):
- cmpb %cl, %al
- jne L(different)
- addq $1, %rdx
- cmpq $64, %rdx
+ add $1, %edx
+ cmp $64, %edx
je L(main_loop_header)
-L(cross_page):
+L(cross_page_loop_start):
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %ecx
- testb %al, %al
+ subl %ecx, %eax
+ jne L(different)
+ test %ecx, %ecx
jne L(cross_page_loop)
- xorl %eax, %eax
L(different):
- subl %ecx, %eax
ret
END (__strcmp_sse2_unaligned)