[PATCH 2/3] Clean up strcmp-sse2-unaligned
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Mon, 16 Sep 2013 15:11:12 +0200
- Subject: [PATCH 2/3] Clean up strcmp-sse2-unaligned
- Authentication-results: sourceware.org; auth=none
- References: <20130913200552 dot GA31992 at domone> <20130913205303 dot GA3620 at domone> <20130916123234 dot GA24928 at domone>
In my implementation I was so far concerned with getting the big picture
right rather than with microoptimizations.
Now I also focus on microoptimizations; they are of several types.
1. Prepare for better scheduling by my evolutionary algorithm (as I
described in http://www.sourceware.org/ml/libc-alpha/2013-09/msg00404.html).
I will add functionality so that it also checks alternative instructions
when they are written in the following way:
variant1 #| variant2 #| variant3
I will post that separately, as the generated schedulings are more chaotic
than ones that were written to be easy to read.
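Purely for illustration (not part of the patch), this is roughly how a
preprocessing script could expand such a line into separate candidates to
benchmark; the helper pick_alternative and its behavior are my own sketch
here, not the actual generator:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return a copy of the IDX-th "#|"-separated alternative of LINE.
   Caller frees the result.  Illustrative only.  */
static char *
pick_alternative (const char *line, int idx)
{
  char *copy = strdup (line);
  char *alt = copy;
  for (int i = 0; i < idx; i++)
    {
      char *sep = strstr (alt, "#|");
      if (sep == NULL)
	break;			/* Fewer alternatives than requested.  */
      alt = sep + 2;
    }
  /* Cut the string at the next separator, if any.  */
  char *end = strstr (alt, "#|");
  if (end != NULL)
    *end = '\0';
  char *result = strdup (alt);
  free (copy);
  return result;
}

int
main (void)
{
  const char *line = "bsfq %rax, %rdx #| bsf %ax, %dx #| bsf %eax, %edx";
  for (int i = 0; i < 3; i++)
    {
      char *alt = pick_alternative (line, i);
      printf ("candidate %d: %s\n", i, alt);
      free (alt);
    }
  return 0;
}

Each expanded candidate can then be benchmarked like any other scheduling.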
2. I realized about a week ago that I could make the loop exits a bit more
effective. In the loop we now do four 16-byte checks, each producing a mask
whose i-th bit is 1 if the result is determined by byte a+i. Modulo
optimizations, this can be implemented as:
while (1)
  {
    m1 = mask16 (a);
    m2 = mask16 (a + 16);
    m3 = mask16 (a + 32);
    m4 = mask16 (a + 48);
    if (m1 | m2 | m3 | m4)
      {
        m = m1 | (m2 << 16) | (m3 << 32) | (m4 << 48);
        result = a + ffs (m);
        ...
      }
    a += 64;
  }
This is problematic on x64 as we use destructive instructions, and it is
faster to recompute one mask than to do one extra mov instruction in each
iteration. However, we can notice that bits 48-63 are relevant only when
bits 0-47 are all zero, so the following transformation gives the same answer:
while (1)
  {
    m1 = mask16 (a);
    m2 = mask16 (a + 16);
    m3 = mask16 (a + 32);
    m4 = mask16 (a + 48);
    mt = m1 | m2 | m3 | m4;
    if (mt)
      {
        m = m1 | (m2 << 16) | (m3 << 32) | (mt << 48);
        result = a + ffs (m);
        ...
      }
    a += 64;
  }
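As a quick sanity check of that claim (a throwaway test, not part of the
patch): when any of the three lower masks is nonzero, the first set bit
already lies below bit 48, and when they are all zero, mt equals m4, so the
top 16 bits agree as well. The harness below uses rand() and
__builtin_ctzll purely to stand in for the masks and for ffs:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main (void)
{
  srand (1);
  for (int i = 0; i < 1000000; i++)
    {
      /* Four random 16-bit masks, as produced by the four 16-byte
	 checks in the loop.  */
      uint64_t m1 = rand () & 0xffff;
      uint64_t m2 = rand () & 0xffff;
      uint64_t m3 = rand () & 0xffff;
      uint64_t m4 = rand () & 0xffff;
      uint64_t mt = m1 | m2 | m3 | m4;
      if (mt == 0)
	continue;		/* The loop only exits when some mask is nonzero.  */
      uint64_t exact = m1 | (m2 << 16) | (m3 << 32) | (m4 << 48);
      uint64_t cheap = m1 | (m2 << 16) | (m3 << 32) | (mt << 48);
      /* Bits 0-47 are identical; bits 48-63 can differ only when a lower
	 mask is nonzero, in which case the first set bit is already
	 below bit 48.  */
      assert (__builtin_ctzll (exact) == __builtin_ctzll (cheap));
    }
  puts ("transformation preserves the position of the first set bit");
  return 0;
}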
3. Rename registers. gcc generated an extra register, xmm1, set to zero,
which can be replaced by xmm7, which is also set to zero. I also renamed
the other registers accordingly.
---
sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 93 ++++++++++++------------
1 file changed, 45 insertions(+), 48 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
index d536fa4..05f90f9 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S
@@ -44,7 +44,7 @@ ENTRY ( __strcmp_sse2_unaligned)
mov LOCALE_TOLOWER(%rdx), %r11
#endif
movl %edi, %eax
- xorl %edx, %edx
+ xorl %edx, %edx /* Counter in cross-page loop. */
pxor %xmm7, %xmm7
orl %esi, %eax
andl $4095, %eax
@@ -54,14 +54,13 @@ ENTRY ( __strcmp_sse2_unaligned)
movdqu (%rsi), %xmm0
pcmpeqb %xmm1, %xmm0
pminub %xmm1, %xmm0
- pxor %xmm1, %xmm1
- pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm7, %xmm0
pmovmskb %xmm0, %eax
- testq %rax, %rax
+ testq %rax, %rax #| test %eax, %eax
je L(next_48_bytes)
#ifndef AS_STRCASECMP
L(return):
- bsfq %rax, %rdx
+ bsfq %rax, %rdx #| bsf %ax, %dx #| bsf %eax, %edx
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %edx
subl %edx, %eax
@@ -78,18 +77,18 @@ L(next_48_bytes):
pcmpeqb %xmm6, %xmm3
movdqu 32(%rsi), %xmm2
pminub %xmm6, %xmm3
- pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm7, %xmm3
movdqu 48(%rdi), %xmm4
pcmpeqb %xmm5, %xmm2
pmovmskb %xmm3, %edx
movdqu 48(%rsi), %xmm0
pminub %xmm5, %xmm2
- pcmpeqb %xmm1, %xmm2
+ pcmpeqb %xmm7, %xmm2
pcmpeqb %xmm4, %xmm0
pmovmskb %xmm2, %eax
salq $16, %rdx
pminub %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm7, %xmm0
salq $32, %rax
orq %rdx, %rax
pmovmskb %xmm0, %ecx
@@ -100,7 +99,6 @@ L(next_48_bytes):
L(main_loop_header):
leaq 64(%rdi), %rdx
movl $4096, %ecx
- pxor %xmm9, %xmm9
andq $-64, %rdx
subq %rdi, %rdx
leaq (%rdi, %rdx), %rax
@@ -187,50 +185,49 @@ L(loop):
addq $64, %rax
addq $64, %rdx
L(loop_start):
- testq %rsi, %rsi
+ testq %rsi, %rsi /* TODO sub $1, %rsi; je L(loop_cross_page) */
leaq -1(%rsi), %rsi
je L(loop_cross_page)
L(back_to_loop):
- movdqu (%rdx), %xmm0
- movdqu 16(%rdx), %xmm1
- movdqa (%rax), %xmm2
- movdqa 16(%rax), %xmm3
- pcmpeqb %xmm2, %xmm0
- movdqu 32(%rdx), %xmm5
- pcmpeqb %xmm3, %xmm1
- pminub %xmm2, %xmm0
movdqu 48(%rdx), %xmm6
- pminub %xmm3, %xmm1
- movdqa 32(%rax), %xmm2
- pminub %xmm1, %xmm0
movdqa 48(%rax), %xmm3
- pcmpeqb %xmm2, %xmm5
pcmpeqb %xmm3, %xmm6
- pminub %xmm2, %xmm5
pminub %xmm3, %xmm6
- pminub %xmm5, %xmm0
- pminub %xmm6, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %ecx
- testl %ecx, %ecx
- je L(loop)
- pcmpeqb %xmm7, %xmm5
+
movdqu (%rdx), %xmm0
- pcmpeqb %xmm7, %xmm1
movdqa (%rax), %xmm2
pcmpeqb %xmm2, %xmm0
pminub %xmm2, %xmm0
+ pminub %xmm0, %xmm6
+
+ movdqu 16(%rdx), %xmm1
+ movdqa 16(%rax), %xmm8
+ pcmpeqb %xmm8, %xmm1
+ pminub %xmm8, %xmm1
+ pminub %xmm1, %xmm6
+
+ movdqu 32(%rdx), %xmm2
+ movdqa 32(%rax), %xmm5
+ pcmpeqb %xmm5, %xmm2
+ pminub %xmm5, %xmm2
+ pminub %xmm2, %xmm6
+
pcmpeqb %xmm7, %xmm6
+ pmovmskb %xmm6, %ecx
+ testl %ecx, %ecx
+ je L(loop)
pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm1, %ecx
- pmovmskb %xmm5, %r8d
- pmovmskb %xmm0, %edi
- salq $16, %rcx
- salq $32, %r8
- pmovmskb %xmm6, %esi
+ pcmpeqb %xmm7, %xmm1
+ pcmpeqb %xmm7, %xmm2
+
+ pmovmskb %xmm0, %r8d
+ pmovmskb %xmm1, %esi
+ pmovmskb %xmm2, %edi
+ salq $48, %rcx
orq %r8, %rcx
+ salq $16, %rsi
+ salq $32, %rdi
orq %rdi, %rcx
- salq $48, %rsi
orq %rsi, %rcx
#ifndef AS_STRCASECMP
bsfq %rcx, %rcx
@@ -247,23 +244,23 @@ L(back_to_loop):
ALIGN (4)
L(loop_cross_page):
- xor %r10, %r10
+ xor %rdi, %rdi
movq %rdx, %r9
and $63, %r9
- subq %r9, %r10
+ subq %r9, %rdi
- movdqa (%rdx, %r10), %xmm0
- movdqa 16(%rdx, %r10), %xmm1
- movdqu (%rax, %r10), %xmm2
- movdqu 16(%rax, %r10), %xmm3
+ movdqa (%rdx, %rdi), %xmm0
+ movdqa 16(%rdx, %rdi), %xmm1
+ movdqu (%rax, %rdi), %xmm2
+ movdqu 16(%rax, %rdi), %xmm3
pcmpeqb %xmm2, %xmm0
- movdqa 32(%rdx, %r10), %xmm5
+ movdqa 32(%rdx, %rdi), %xmm5
pcmpeqb %xmm3, %xmm1
pminub %xmm2, %xmm0
- movdqa 48(%rdx, %r10), %xmm6
+ movdqa 48(%rdx, %rdi), %xmm6
pminub %xmm3, %xmm1
- movdqu 32(%rax, %r10), %xmm2
- movdqu 48(%rax, %r10), %xmm3
+ movdqu 32(%rax, %rdi), %xmm2
+ movdqu 48(%rax, %rdi), %xmm3
pcmpeqb %xmm2, %xmm5
pcmpeqb %xmm3, %xmm6
pminub %xmm2, %xmm5
--
1.8.3.2