This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH RFC] Improve 64bit memset performance for Haswell CPU with AVX2 instruction
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: Marko Myllynen <myllynen at redhat dot com>
- Cc: ling dot ma dot program at gmail dot com, libc-alpha at sourceware dot org, liubov dot dmitrieva at gmail dot com, Ling Ma <ling dot ml at alibaba-inc dot com>, "H.J. Lu" <hjl dot tools at gmail dot com>
- Date: Tue, 1 Jul 2014 11:10:53 +0200
- Subject: Re: [PATCH RFC] Improve 64bit memset performance for Haswell CPU with AVX2 instruction
- Authentication-results: sourceware.org; auth=none
- References: <1396596849-21891-1-git-send-email-ling dot ma dot program at gmail dot com> <53B27953 dot 3010905 at redhat dot com>
On Tue, Jul 01, 2014 at 12:03:15PM +0300, Marko Myllynen wrote:
> Hi,
>
> On 2014-04-04 10:34, ling.ma.program@gmail.com wrote:
> > From: Ling Ma <ling.ml@alibaba-inc.com>
> >
> > In this patch we manage to reduce branch mispredictions by
> > avoiding branch instructions and by forcing the destination to be
> > aligned using AVX instructions.
> >
> > ---
> > In this version we removed prefetch and append vmovd.
> >
> > ChangeLog | 9 ++
> > sysdeps/x86_64/multiarch/Makefile | 4 +-
> > sysdeps/x86_64/multiarch/memset-avx2.S | 192 +++++++++++++++++++++++++++++++++
> > sysdeps/x86_64/multiarch/memset.S | 59 ++++++++++
> > sysdeps/x86_64/multiarch/memset_chk.S | 44 ++++++++
> > 5 files changed, 307 insertions(+), 1 deletion(-)
> > create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
> > create mode 100644 sysdeps/x86_64/multiarch/memset.S
> > create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
> > new file mode 100644
> > index 0000000..08e8ee8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
> > @@ -0,0 +1,192 @@
> > +/* memset with AVX2
> > + Copyright (C) 2014 Free Software Foundation, Inc.
> > + Contributed by Alibaba Group.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <http://www.gnu.org/licenses/>. */
> > +
> > +#include <sysdep.h>
> > +
> > +#if !defined NOT_IN_libc
> > +
> > +#include "asm-syntax.h"
> > +#ifndef ALIGN
> > +# define ALIGN(n) .p2align n
> > +#endif
> > +#ifndef MEMSET
> > +# define MEMSET __memset_avx2
> > +# define MEMSET_CHK __memset_chk_avx2
> > +#endif
> > +
> > + .section .text.avx2,"ax",@progbits
> > +#if defined PIC
> > +ENTRY (MEMSET_CHK)
> > + cmpq %rdx, %rcx
> > + jb HIDDEN_JUMPTARGET (__chk_fail)
> > +END (MEMSET_CHK)
> > +#endif
> > +
> > +ENTRY (MEMSET)
> > + vpxor %xmm0, %xmm0, %xmm0
> > + vmovd %esi, %xmm1
> > + lea (%rdi, %rdx), %r8
> > + vpshufb %xmm0, %xmm1, %xmm0
> > + mov %rdi, %rax
> > + cmp $256, %rdx
> > + jae L(256bytesormore)
> > + vmovd %xmm0, %rcx
> > + cmp $128, %rdx
> > + jb L(less_128bytes)
> > + vmovups %xmm0, (%rdi)
> > + vmovups %xmm0, 0x10(%rdi)
> > + vmovups %xmm0, 0x20(%rdi)
> > + vmovups %xmm0, 0x30(%rdi)
> > + vmovups %xmm0, 0x40(%rdi)
> > + vmovups %xmm0, 0x50(%rdi)
> > + vmovups %xmm0, 0x60(%rdi)
> > + vmovups %xmm0, 0x70(%rdi)
> > + vmovups %xmm0, -0x80(%r8)
> > + vmovups %xmm0, -0x70(%r8)
> > + vmovups %xmm0, -0x60(%r8)
> > + vmovups %xmm0, -0x50(%r8)
> > + vmovups %xmm0, -0x40(%r8)
> > + vmovups %xmm0, -0x30(%r8)
> > + vmovups %xmm0, -0x20(%r8)
> > + vmovups %xmm0, -0x10(%r8)
> > + ret
> > + ALIGN(4)
> > +L(less_128bytes):
> > + cmp $64, %edx
> > + jb L(less_64bytes)
> > + vmovups %xmm0, (%rdi)
> > + vmovups %xmm0, 0x10(%rdi)
> > + vmovups %xmm0, 0x20(%rdi)
> > + vmovups %xmm0, 0x30(%rdi)
> > + vmovups %xmm0, -0x40(%r8)
> > + vmovups %xmm0, -0x30(%r8)
> > + vmovups %xmm0, -0x20(%r8)
> > + vmovups %xmm0, -0x10(%r8)
> > + ret
> > + ALIGN(4)
> > +L(less_64bytes):
> > + cmp $32, %edx
> > + jb L(less_32bytes)
> > + vmovups %xmm0, (%rdi)
> > + vmovups %xmm0, 0x10(%rdi)
> > + vmovups %xmm0, -0x20(%r8)
> > + vmovups %xmm0, -0x10(%r8)
> > + ret
> > + ALIGN(4)
> > +L(less_32bytes):
> > + cmp $16, %edx
> > + jb L(less_16bytes)
> > + vmovups %xmm0, (%rdi)
> > + vmovups %xmm0, -0x10(%r8)
> > + ret
> > + ALIGN(4)
> > +L(less_16bytes):
> > + cmp $8, %edx
> > + jb L(less_8bytes)
> > + mov %rcx, (%rdi)
> > + mov %rcx, -0x08(%r8)
> > + ret
> > + ALIGN(4)
> > +L(less_8bytes):
> > + cmp $4, %edx
> > + jb L(less_4bytes)
> > + mov %ecx, (%rdi)
> > + mov %ecx, -0x04(%r8)
> > + ALIGN(4)
> > +L(less_4bytes):
> > + cmp $2, %edx
> > + jb L(less_2bytes)
> > + mov %cx, (%rdi)
> > + mov %cx, -0x02(%r8)
> > + ret
> > + ALIGN(4)
> > +L(less_2bytes):
> > + cmp $1, %edx
> > + jb L(less_1bytes)
> > + mov %cl, (%rdi)
> > +L(less_1bytes):
> > + ret
> > +
> > + ALIGN(4)
> > +L(256bytesormore):
> > + vinserti128 $1, %xmm0, %ymm0, %ymm0
>
> this breaks build on RHEL 6 x86_64:
>
> ../sysdeps/x86_64/multiarch/memset-avx2.S:
> ../sysdeps/x86_64/multiarch/memset-avx2.S: Assembler messages:
> Assembler messages:
> ../sysdeps/x86_64/multiarch/memset-avx2.S:132:
> ../sysdeps/x86_64/multiarch/memset-avx2.S:132: Error: Error: no such
> instruction: `vinserti128 $1,%xmm0,%ymm0,%ymm0'no such instruction:
> `vinserti128 $1,%xmm0,%ymm0,%ymm0'
>
> Cheers,
>
What version of gcc?