This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH 1/2] Fix strrchr regression.


On Mon, Aug 05, 2013 at 10:11:40PM +0200, Ondřej Bílka wrote:
> On Mon, Aug 05, 2013 at 07:54:14PM +0200, Liubov Dmitrieva wrote:
> >    I see you didn't check for Haswell. Can you please contribute your
> >    profiler to glibc to let everyone reproduce measurements of your good
> >    benchmarks. I want to be able to get data for Haswell, Atom and Silvermont
> >    for your benchmark to be sure that we didn't miss something. There was the
> >    discussion already that it is important to improve glibc benchmarks.
> >
> Hi,
> 
I made an obvious modification of the loop to compute the same expression with AVX2 instead of SSE2.

A profiler with the AVX2 version added is here:
http://kam.mff.cuni.cz/~ondra/strrchr_profile060813_v2.tar.bz2

And the AVX2 implementation follows:


	.file	"strrchr_avx.c"
	.text
	.p2align 4,,15
	.globl	strrchr_new
	.type	strrchr_new, @function
# strrchr_new: AVX2 search for the last occurrence of a byte in a
# NUL-terminated string (compiler output from strrchr_avx.c, AT&T syntax).
# Presumably the C signature is char *strrchr_new (const char *s, int c).
#
# In:   %rdi = s, %esi = c (only the low byte is broadcast and compared)
# Out:  %rax = address of the last byte equal to c located at or before
#       the first NUL byte, or 0 if there is no such byte
# Uses: %rcx, %rdx, %rsi, %rdi, %r8-%r10, %xmm0-%xmm7, %ymm0-%ymm5, flags
#
# Mask convention: bit i of a 64-bit mask describes byte i of the 64-byte
# window being examined.  For a NUL mask z, (z - 1) ^ z keeps bits 0..k
# where k is the lowest set bit, so ANDing it with the match mask discards
# matches after the terminator; bsrq then picks the highest remaining match.
strrchr_new:
.LFB990:
	.cfi_startproc
	movq	%rdi, %rax
	vmovd	%esi, %xmm0
	andl	$4095, %eax		# eax = s & 4095
	cmpq	$4032, %rax
	vpbroadcastb	%xmm0, %xmm0	# xmm0 = 16 copies of the low byte of c
	ja	.L2			# s & 4095 > 4032: use the aligned-load path
	# Unaligned head: examine the 16 bytes at s first.
	vpxor	%xmm3, %xmm3, %xmm3	# xmm3 = 0, reused by .L3
	vmovdqu	(%rdi), %xmm1
	vpcmpeqb	%xmm3, %xmm1, %xmm4
	vpcmpeqb	%xmm0, %xmm1, %xmm1
	vpmovmskb	%xmm4, %edx	# edx = NUL mask of bytes 0..15
	vpmovmskb	%xmm1, %ecx	# ecx = match mask of bytes 0..15
	movslq	%edx, %rdx
	testq	%rdx, %rdx
	movslq	%ecx, %rcx
	je	.L3			# no NUL in the first 16 bytes
	leaq	-1(%rdx), %rax
	xorq	%rdx, %rax		# rax = bits up to and including first NUL
	andq	%rcx, %rax
	je	.L4			# no match before the terminator: return 0
#APP
# 63 "header.h" 1
	bsrq	%rax, %rax
# 0 "" 2
#NO_APP
	addq	%rdi, %rax
	ret
	.p2align 4,,10
	.p2align 3
.L2:
	# Aligned head: load the 64-byte aligned window containing s, build the
	# 64-bit NUL mask (rax) and match mask (rdx), then shift both right by
	# (s - window start) so that bit 0 corresponds to s[0].
	movq	%rdi, %rcx
	vpxor	%xmm1, %xmm1, %xmm1
	andq	$-64, %rcx		# rcx = s rounded down to a multiple of 64
	vmovdqu	(%rcx), %xmm6
	vmovdqu	16(%rcx), %xmm5
	vpcmpeqb	%xmm1, %xmm6, %xmm7
	vpcmpeqb	%xmm0, %xmm6, %xmm6
	vmovdqu	32(%rcx), %xmm4
	vpmovmskb	%xmm7, %r9d
	vpcmpeqb	%xmm1, %xmm5, %xmm7
	vmovdqu	48(%rcx), %xmm3
	vpcmpeqb	%xmm0, %xmm5, %xmm5
	vpmovmskb	%xmm7, %eax
	vpcmpeqb	%xmm1, %xmm4, %xmm7
	vpcmpeqb	%xmm1, %xmm3, %xmm1
	cltq
	vpmovmskb	%xmm7, %r8d
	salq	$16, %rax
	vpmovmskb	%xmm1, %edx
	salq	$32, %r8
	vpcmpeqb	%xmm0, %xmm4, %xmm4
	orq	%r8, %rax
	movslq	%r9d, %r8
	salq	$48, %rdx
	orq	%r8, %rax
	vpmovmskb	%xmm4, %r9d
	vpmovmskb	%xmm6, %r8d
	orq	%rdx, %rax		# rax = NUL mask of the aligned window
	vpmovmskb	%xmm5, %edx
	vpcmpeqb	%xmm0, %xmm3, %xmm0
	salq	$32, %r9
	movslq	%r8d, %r8
	movslq	%edx, %rdx
	salq	$16, %rdx
	vpmovmskb	%xmm0, %r10d
	orq	%r9, %rdx
	orq	%r8, %rdx
	movq	%r10, %r8
	salq	$48, %r8
	orq	%r8, %rdx		# rdx = match mask of the aligned window
	movl	%edi, %r8d
	subl	%ecx, %r8d
	movl	%r8d, %ecx		# cl = s - window start
	shrq	%cl, %rax
	shrq	%cl, %rdx
	testq	%rax, %rax
	je	.L6			# no NUL from s to the end of the window
	leaq	-1(%rax), %rcx
	xorq	%rax, %rcx
	andq	%rcx, %rdx
	je	.L4
.L33:
	# Shared exit: rdx = non-zero match mask, rdi = base of the window.
#APP
# 63 "header.h" 1
	bsrq	%rdx, %rdx
# 0 "" 2
#NO_APP
	leaq	(%rdi,%rdx), %rax
	vzeroupper
	ret
	.p2align 4,,10
	.p2align 3
.L6:
	# Head window held no NUL.  rax = 0 is the "no match yet" result;
	# .L35 replaces it with the last match of the head window, if any.
	xorl	%eax, %eax
	testq	%rdx, %rdx
	jne	.L35
.L8:
	# Loop setup: advance rdi to the next 64-byte boundary and widen the
	# broadcast of c to 32 bytes in ymm2; ymm5 stays zero for the NUL test.
	vmovd	%esi, %xmm2
	addq	$64, %rdi
	andq	$-64, %rdi
	vpxor	%xmm5, %xmm5, %xmm5
	vpbroadcastb	%xmm2, %xmm2
	vinserti128	$1, %xmm2, %ymm2, %ymm2
	jmp	.L13
	.p2align 4,,10
	.p2align 3
.L11:
	# Window without NUL: remember its last match (if any) in rax.
	testq	%rdx, %rdx
	je	.L12
#APP
# 63 "header.h" 1
	bsrq	%rdx, %rdx
# 0 "" 2
#NO_APP
	leaq	(%rdi,%rdx), %rax
.L12:
	addq	$64, %rdi
.L13:
	# Main loop: one 64-byte aligned window per iteration.
	vmovdqa	32(%rdi), %ymm0
	vpxor	%xmm4, %xmm4, %xmm4
	vmovdqa	(%rdi), %ymm1
	vpminub	%ymm0, %ymm1, %ymm3	# a zero byte in either half yields zero
	vpcmpeqb	%ymm5, %ymm3, %ymm3
	vpmovmskb	%ymm3, %ecx	# ecx != 0 iff the window contains a NUL
	vpcmpeqb	%ymm2, %ymm1, %ymm3
	# The 32-bit destination write leaves bits 32..63 of rsi zero.  The mask
	# must not be sign-extended: a match in byte 31 would otherwise set bits
	# 32..63 and fake matches in the upper half of the window.
	vpmovmskb	%ymm3, %esi	# rsi = match mask of bytes 0..31
	vpcmpeqb	%ymm2, %ymm0, %ymm3
	vpmovmskb	%ymm3, %edx
	salq	$32, %rdx
	orq	%rsi, %rdx		# rdx = match mask of bytes 0..63
	testl	%ecx, %ecx
	je	.L11
	# Terminator found: build the exact NUL mask and finish.
	vpcmpeqb	%ymm4, %ymm1, %ymm1
	vpcmpeqb	%ymm4, %ymm0, %ymm0
	vpmovmskb	%ymm1, %ecx	# rcx = NUL mask of bytes 0..31, zero-extended
	vpmovmskb	%ymm0, %esi
	salq	$32, %rsi
	orq	%rcx, %rsi		# rsi = NUL mask of bytes 0..63
	leaq	-1(%rsi), %rcx
	xorq	%rsi, %rcx
	andq	%rcx, %rdx
	jne	.L33
	vzeroupper			# no match in the last window: return rax as is
	ret
	.p2align 4,,10
	.p2align 3
.L3:
	# Unaligned head, continued: bytes 16..63 at s.  xmm3 is still zero and
	# rcx still holds the match mask of bytes 0..15 from the entry code.
	vmovdqu	16(%rdi), %xmm5
	vmovdqu	32(%rdi), %xmm4
	vpcmpeqb	%xmm3, %xmm5, %xmm6
	vpcmpeqb	%xmm0, %xmm5, %xmm5
	vmovdqu	48(%rdi), %xmm1
	vpmovmskb	%xmm6, %edx
	vpcmpeqb	%xmm3, %xmm4, %xmm6
	vpcmpeqb	%xmm3, %xmm1, %xmm3
	movslq	%edx, %rdx
	vpmovmskb	%xmm6, %eax
	salq	$16, %rdx
	vpmovmskb	%xmm3, %r8d
	salq	$32, %rax
	vpcmpeqb	%xmm0, %xmm4, %xmm4
	orq	%rdx, %rax
	vpmovmskb	%xmm5, %edx
	movq	%r8, %r9
	vpmovmskb	%xmm4, %r8d
	salq	$48, %r9
	movslq	%edx, %rdx
	vpcmpeqb	%xmm0, %xmm1, %xmm0
	salq	$16, %rdx
	salq	$32, %r8
	orq	%r8, %rdx
	orq	%rcx, %rdx
	vpmovmskb	%xmm0, %ecx
	salq	$48, %rcx
	orq	%rcx, %rdx		# rdx = match mask of bytes 0..63
	orq	%r9, %rax		# rax = NUL mask of bytes 16..63; sets ZF
	je	.L6
	leaq	-1(%rax), %rcx
	xorq	%rax, %rcx
	andq	%rcx, %rdx
	jne	.L36
	.p2align 4,,10
	.p2align 3
.L4:
	# No match before the terminator: return 0.
	xorl	%eax, %eax
	ret
	.p2align 4,,10
	.p2align 3
.L35:
	# Record the last match of the head window, then enter the main loop.
#APP
# 63 "header.h" 1
	bsrq	%rdx, %rax
# 0 "" 2
#NO_APP
	addq	%rdi, %rax
	jmp	.L8
	.p2align 4,,10
	.p2align 3
.L36:
	# Match found in the unaligned head window: return its address.
#APP
# 63 "header.h" 1
	bsrq	%rdx, %rax
# 0 "" 2
#NO_APP
	addq	%rdi, %rax
	ret
	.cfi_endproc
.LFE990:
	.size	strrchr_new, .-strrchr_new
	.ident	"GCC: (Debian 4.7.1-2) 4.7.1"
	.section	.note.GNU-stack,"",@progbits


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]