This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
[RFC] faster strcmp by avoiding sse42.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Tue, 6 Aug 2013 23:30:33 +0200
- Subject: [RFC] faster strcmp by avoiding sse42.
Hi,
Continuing the effort to improve implementations that needlessly use sse42, we
move on to strcmp. The strcmp_sse42 variant is actually faster than the existing
implementations; this is mostly caused by the lack of unrolling in the other
implementations rather than by sse4 itself.
I decided to write a new implementation. It mirrors ideas that I used in the
strlen implementation: the header consists of checking whether we cross a page,
then first checking 16 bytes unaligned, then the remaining 48 bytes unaligned.
The loop is nearly the same as in strlen; we exploit the fact that pcmpeqb
xmma,xmmb will produce zeros only in places that differ.
The implementation is complete up to micro-tuning; notably, I currently handle
cross-page situations with a byte-by-byte loop.
I also did obvious conversion to avx2 but as I do not have haswell I do
not have performance data about it.
I used similar benchmark as in strrchr that is available here:
http://kam.mff.cuni.cz/~ondra/benchmark_string/strcmp_profile.html
http://kam.mff.cuni.cz/~ondra/strcmp_profile060813.tar.bz2
As far as real performance is concerned, my strcmp_new is between 10% and 20%
faster than the alternatives on all tested architectures.
The workload used is quite representative, since in most apps strcmp will
inspect at most 16 characters with high probability. One of the reasons is
that strcmp is used for sorting and searching, where a difference in the first
character is quite likely.
As far as asymptotic performance is concerned, I outperform sse4.2 by 50% on
architectures that support sse4.2. This allows me to replace the sse42
implementation with mine.
As for strcmp_sse2 and strcmp_ssse3, we need to replace their headers with the
strcmp_new header to improve performance. They are asymptotically faster on
processors that are not new enough to support sse4.2 but also not so old that
they lack ssse3.
For old processors, unrolling will beat the strcmp_sse2 implementation
despite the fact that I use unaligned loads, which are a bit inefficient there. See
http://kam.mff.cuni.cz/~ondra/benchmark_string/athlon_x2/strcmp_profile/results_rand/result.html
Comments?
A new implementation is following:
# int strcmp_new(const char *s1, const char *s2)
# GCC-emitted x86-64 assembly (AT&T syntax), SysV AMD64 ABI:
#   in:  rdi = s1, rsi = s2;  out: eax = signed byte difference at the first
#        index where the strings differ or s1 terminates (paths below).
# Strategy visible in the code: if a 64-byte unaligned read from either
# pointer could cross a 4 KiB page boundary, compare the first 64 bytes
# byte-by-byte (.L11/.L8); otherwise check the first 64 bytes with unaligned
# SSE2 loads (.L15/.L3/.L4).  Then the main loop (.L12) processes 64 bytes
# per iteration with s1 reads aligned to 64 bytes.
.file "strcmp.c"
.text
.p2align 4,,15
.globl strcmp_new
.type strcmp_new, @function
strcmp_new:
.LFB519:
.cfi_startproc
# Page-cross guard: if ((s1 | s2) & 4095) <= 4031 (= 4096 - 65), then a
# 64-byte unaligned read from either string stays within its page.
movl %esi, %eax
orl %edi, %eax
andl $4095, %eax
cmpl $4031, %eax
jle .L15                        # safe: take the SSE2 header path
# Near a page end: compare the first 64 bytes one byte at a time.
# rdx walks s1, rcx walks s2, r9 = s1 + 64 is the limit.
movq %rsi, %rcx
leaq 64(%rdi), %r9
movq %rdi, %rdx
jmp .L11
.p2align 4,,10
.p2align 3
.L8:
movzbl (%rcx), %r8d             # r8b = *s2
cmpb %r8b, %al                  # bytes differ?
jne .L16
addq $1, %rdx
addq $1, %rcx
cmpq %r9, %rdx                  # finished the first 64 bytes?
je .L7                          # yes: switch to the aligned vector loop
.L11:
movzbl (%rdx), %eax             # al = *s1
movq %rdx, %r8
subq %rdi, %r8                  # r8 = current offset into s1
testb %al, %al
jne .L8                         # s1 byte nonzero: compare against s2
# *s1 == 0 here: result is 0 - (signed char)s2[offset]
movzbl (%rsi,%r8), %r8d
xorl %eax, %eax
movsbl %r8b, %r8d
subl %r8d, %eax
ret
.p2align 4,,10
.p2align 3
.L15:
movdqu (%rsi), %xmm5            # xmm5 = first 16 bytes of s2
.L3:
# First 16 bytes.  After pcmpeqb, xmm5 is 0xff where s1/s2 bytes are equal
# and 0x00 where they differ; pminub with the s1 bytes then forces a zero
# also where s1 has a NUL, so the pcmpeqb-with-zero mask marks every
# position that is a mismatch or the end of s1.
movdqu (%rdi), %xmm0
pcmpeqb %xmm0, %xmm5
pminub %xmm0, %xmm5
pxor %xmm0, %xmm0
pcmpeqb %xmm0, %xmm5
pmovmskb %xmm5, %eax
cltq                            # widen eax to rax (mask fits in 16 bits)
testq %rax, %rax
je .L4                          # no hit in bytes 0..15: check bytes 16..63
.L6:
# rax = bit mask of mismatch/NUL positions; lowest set bit is the first
# such byte.  Return (signed char)s1[i] - (signed char)s2[i].
bsfq %rax, %rax
movslq %eax, %rdx
movsbl (%rdi,%rdx), %eax
movsbl (%rsi,%rdx), %edx
subl %edx, %eax
ret
.p2align 4,,10
.p2align 3
.L16:
# Byte loop found differing bytes: return their signed difference.
movsbl %al, %eax
movsbl %r8b, %r8d
subl %r8d, %eax
ret
.p2align 4,,10
.p2align 3
.L4:
# Bytes 16..63: the same pcmpeqb/pminub trick per 16-byte chunk.  The three
# 16-bit masks are shifted to bit positions 16, 32 and 48 and OR-ed into
# rax, so .L6's bsf yields the byte offset from s1 directly.
movdqu 16(%rdi), %xmm4
movdqu 16(%rsi), %xmm3
pcmpeqb %xmm4, %xmm3
movdqu 32(%rsi), %xmm2
pminub %xmm4, %xmm3
pcmpeqb %xmm0, %xmm3
movdqu 48(%rsi), %xmm1
pmovmskb %xmm3, %edx
movdqu 32(%rdi), %xmm3
pcmpeqb %xmm3, %xmm2
movslq %edx, %rdx
salq $16, %rdx                  # mask for bytes 16..31
pminub %xmm3, %xmm2
pcmpeqb %xmm0, %xmm2
pmovmskb %xmm2, %eax
movdqu 48(%rdi), %xmm2
pcmpeqb %xmm2, %xmm1
salq $32, %rax                  # mask for bytes 32..47
orq %rdx, %rax
pminub %xmm2, %xmm1
pcmpeqb %xmm0, %xmm1
pmovmskb %xmm1, %ecx
movq %rcx, %rdx
salq $48, %rdx                  # mask for bytes 48..63
orq %rdx, %rax                  # flags from this orq feed the jne below
jne .L6                         # any mismatch/NUL in bytes 16..63?
leaq 64(%rdi), %r9
.L7:
# Enter the main loop: advance both pointers by the same delta so that rdi
# lands on the 64-byte boundary at or below s1 + 64 (bytes before it were
# already checked above).  s2 stays possibly unaligned; the loop uses
# movdqu for it and movdqa for s1.
andq $-64, %r9
pxor %xmm6, %xmm6               # xmm6 = constant zero for the loop's test
subq %rdi, %r9
movslq %r9d, %r9
addq %r9, %rdi
addq %r9, %rsi
jmp .L12
.p2align 4,,10
.p2align 3
.L17:
addq $64, %rdi
addq $64, %rsi
.L12:
# Main loop, 64 bytes per iteration: compare four 16-byte chunks and
# pminub-fold all results into xmm0, which ends up holding a zero byte
# iff some chunk had a mismatch or an s1 NUL.  On a hit, jump back to .L3
# to rescan from the current pointers and pinpoint the byte (xmm5 already
# holds the first 16 s2 bytes, as .L3 expects).
movdqa (%rdi), %xmm4            # s1 is 64-byte aligned here
movdqu (%rsi), %xmm5
movdqa %xmm4, %xmm0
movdqu 16(%rsi), %xmm3
pcmpeqb %xmm5, %xmm0
movdqu 32(%rsi), %xmm2
pminub %xmm4, %xmm0
movdqa 16(%rdi), %xmm4
pcmpeqb %xmm4, %xmm3
pminub %xmm4, %xmm0
movdqu 48(%rsi), %xmm1
pminub %xmm3, %xmm0
movdqa 32(%rdi), %xmm3
pcmpeqb %xmm3, %xmm2
pminub %xmm3, %xmm0
pminub %xmm2, %xmm0
movdqa 48(%rdi), %xmm2
pcmpeqb %xmm2, %xmm1
pminub %xmm2, %xmm0
pminub %xmm1, %xmm0
pcmpeqb %xmm6, %xmm0            # any zero byte accumulated in xmm0?
pmovmskb %xmm0, %eax
testl %eax, %eax
je .L17                         # all 64 bytes equal, no NUL: next block
jmp .L3                         # hit: rescan to locate the exact byte
.cfi_endproc
.LFE519:
.size strcmp_new, .-strcmp_new
.ident "GCC: (Debian 4.5.3-12) 4.5.3"
.section .note.GNU-stack,"",@progbits