This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH RFC] Improve 64-bit memset performance for Haswell CPUs with AVX2 instructions


On Tue, Jul 01, 2014 at 12:03:15PM +0300, Marko Myllynen wrote:
> Hi,
> 
> On 2014-04-04 10:34, ling.ma.program@gmail.com wrote:
> > From: Ling Ma <ling.ml@alibaba-inc.com>
> > 
> > In this patch we reduce branch mispredictions by avoiding branch
> > instructions, and we force the destination to be aligned using AVX
> > instructions.
> > 
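The trick behind the small-size paths in the patch below is worth spelling
out: rather than branching per byte, the code picks a size class and issues
a fixed set of possibly overlapping stores, one group from the start of the
buffer and a mirrored group ending at its last byte (%r8 holds dst + n).
A minimal C sketch of the idea, assuming 16 <= n <= 32, with memset/memcpy
standing in for vpshufb and vmovups (a hypothetical illustration, not the
glibc code):

    #include <string.h>

    /* Hypothetical sketch, not the glibc code: for any 16 <= n <= 32,
       one 16-byte store at dst and one ending at dst + n cover every
       byte of the buffer with no length-dependent branches.  */
    static void set16to32 (char *dst, int c, size_t n)
    {
      char pattern[16];
      memset (pattern, c, sizeof pattern);  /* the vpshufb broadcast */
      memcpy (dst, pattern, 16);            /* vmovups %xmm0, (%rdi) */
      memcpy (dst + n - 16, pattern, 16);   /* vmovups %xmm0, -0x10(%r8) */
    }

Because the head and tail stores write the same byte value, the overlap is
harmless, and one branch-free path serves a whole range of lengths.
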
> > ---
> >  In this version we removed the prefetch instructions and appended vmovd.
> > 
> >  ChangeLog                              |   9 ++
> >  sysdeps/x86_64/multiarch/Makefile      |   4 +-
> >  sysdeps/x86_64/multiarch/memset-avx2.S | 192 +++++++++++++++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/memset.S      |  59 ++++++++++
> >  sysdeps/x86_64/multiarch/memset_chk.S  |  44 ++++++++
> >  5 files changed, 307 insertions(+), 1 deletion(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
> >  create mode 100644 sysdeps/x86_64/multiarch/memset.S
> >  create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
> > 
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
> > new file mode 100644
> > index 0000000..08e8ee8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
> > @@ -0,0 +1,192 @@
> > +/* memset with AVX2
> > +   Copyright (C) 2014 Free Software Foundation, Inc.
> > +   Contributed by Alibaba Group.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <http://www.gnu.org/licenses/>.  */
> > +
> > +#include <sysdep.h>
> > +
> > +#if !defined NOT_IN_libc
> > +
> > +#include "asm-syntax.h"
> > +#ifndef ALIGN
> > +# define ALIGN(n)	.p2align n
> > +#endif
> > +#ifndef MEMSET
> > +# define MEMSET	__memset_avx2
> > +# define MEMSET_CHK	__memset_chk_avx2
> > +#endif
> > +
> > +	.section .text.avx2,"ax",@progbits
> > +#if defined PIC
> > +ENTRY (MEMSET_CHK)
> > +	cmpq	%rdx, %rcx
> > +	jb	HIDDEN_JUMPTARGET (__chk_fail)
> > +END (MEMSET_CHK)
> > +#endif
> > +
> > +ENTRY (MEMSET)
> > +	vpxor	%xmm0, %xmm0, %xmm0
> > +	vmovd %esi, %xmm1
> > +	lea	(%rdi, %rdx), %r8
> > +	vpshufb	%xmm0, %xmm1, %xmm0
> > +	mov	%rdi, %rax
> > +	cmp	$256, %rdx
> > +	jae	L(256bytesormore)
> > +	vmovq %xmm0, %rcx
> > +	cmp	$128, %rdx
> > +	jb	L(less_128bytes)
> > +	vmovups %xmm0, (%rdi)
> > +	vmovups %xmm0, 0x10(%rdi)
> > +	vmovups %xmm0, 0x20(%rdi)
> > +	vmovups %xmm0, 0x30(%rdi)
> > +	vmovups %xmm0, 0x40(%rdi)
> > +	vmovups %xmm0, 0x50(%rdi)
> > +	vmovups %xmm0, 0x60(%rdi)
> > +	vmovups %xmm0, 0x70(%rdi)
> > +	vmovups %xmm0, -0x80(%r8)
> > +	vmovups %xmm0, -0x70(%r8)
> > +	vmovups %xmm0, -0x60(%r8)
> > +	vmovups %xmm0, -0x50(%r8)
> > +	vmovups %xmm0, -0x40(%r8)
> > +	vmovups %xmm0, -0x30(%r8)
> > +	vmovups %xmm0, -0x20(%r8)
> > +	vmovups %xmm0, -0x10(%r8)
> > +	ret
> > +	ALIGN(4)
> > +L(less_128bytes):
> > +	cmp	$64, %edx
> > +	jb	L(less_64bytes)
> > +	vmovups %xmm0, (%rdi)
> > +	vmovups %xmm0, 0x10(%rdi)
> > +	vmovups %xmm0, 0x20(%rdi)
> > +	vmovups %xmm0, 0x30(%rdi)
> > +	vmovups %xmm0, -0x40(%r8)
> > +	vmovups %xmm0, -0x30(%r8)
> > +	vmovups %xmm0, -0x20(%r8)
> > +	vmovups %xmm0, -0x10(%r8)
> > +	ret
> > +	ALIGN(4)
> > +L(less_64bytes):
> > +	cmp	$32, %edx
> > +	jb	L(less_32bytes)
> > +	vmovups %xmm0, (%rdi)
> > +	vmovups %xmm0, 0x10(%rdi)
> > +	vmovups %xmm0, -0x20(%r8)
> > +	vmovups %xmm0, -0x10(%r8)
> > +	ret
> > +	ALIGN(4)
> > +L(less_32bytes):
> > +	cmp	$16, %edx
> > +	jb	L(less_16bytes)
> > +	vmovups %xmm0, (%rdi)
> > +	vmovups %xmm0, -0x10(%r8)
> > +	ret
> > +	ALIGN(4)
> > +L(less_16bytes):
> > +	cmp	$8, %edx
> > +	jb	L(less_8bytes)
> > +	mov %rcx, (%rdi)
> > +	mov %rcx, -0x08(%r8)
> > +	ret
> > +	ALIGN(4)
> > +L(less_8bytes):
> > +	cmp	$4, %edx
> > +	jb	L(less_4bytes)
> > +	mov %ecx, (%rdi)
> > +	mov %ecx, -0x04(%r8)
> > +	ret
> > +	ALIGN(4)
> > +L(less_4bytes):
> > +	cmp	$2, %edx
> > +	jb	L(less_2bytes)
> > +	mov	%cx, (%rdi)
> > +	mov	%cx, -0x02(%r8)
> > +	ret
> > +	ALIGN(4)
> > +L(less_2bytes):
> > +	cmp	$1, %edx
> > +	jb	L(less_1bytes)
> > +	mov	%cl, (%rdi)
> > +L(less_1bytes):
> > +	ret
> > +
> > +	ALIGN(4)
> > +L(256bytesormore):
> > +	vinserti128 $1, %xmm0, %ymm0, %ymm0
> 
> this breaks the build on RHEL 6 x86_64:
> 
> ../sysdeps/x86_64/multiarch/memset-avx2.S: Assembler messages:
> ../sysdeps/x86_64/multiarch/memset-avx2.S:132: Error: no such
> instruction: `vinserti128 $1,%xmm0,%ymm0,%ymm0'
> 
> Cheers,
> 
What version of gcc?
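
For the record, vinserti128 is an AVX2 instruction (unlike the AVX-level
vinsertf128), so it is only recognized by an assembler built with AVX2
support; the binutils shipped on RHEL 6 predates AVX2. The register setup
the patch performs can be written with intrinsics roughly as follows (a
hypothetical sketch compiled with -mavx2, not part of the patch):

    #include <immintrin.h>

    /* Hypothetical sketch of the patch's register setup: broadcast the
       fill byte across a 256-bit register, mirroring the
       vpxor + vmovd + vpshufb + vinserti128 sequence.  */
    static __m256i broadcast_byte (int c)
    {
      __m128i zero = _mm_setzero_si128 ();       /* vpxor %xmm0, %xmm0, %xmm0 */
      __m128i byte = _mm_cvtsi32_si128 (c);      /* vmovd %esi, %xmm1 */
      __m128i x = _mm_shuffle_epi8 (byte, zero); /* byte 0 of %xmm1 everywhere */
      /* vinserti128 $1, %xmm0, %ymm0, %ymm0: copy the low lane high.  */
      return _mm256_inserti128_si256 (_mm256_castsi128_si256 (x), x, 1);
    }

Note that the gcc version alone does not settle this: gcc hands the .S file
to the system assembler, so it is the binutils version that must know the
instruction.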

