This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Enable AVX2 optimized memset only if -mavx2 works


On Tue, Jul 1, 2014 at 9:10 AM, Marko Myllynen <myllynen@redhat.com> wrote:
> Hi,
>
> On 2014-07-01 18:56, H.J. Lu wrote:
>> On Tue, Jul 1, 2014 at 2:03 AM, Marko Myllynen <myllynen@redhat.com> wrote:
>>> On 2014-04-04 10:34, ling.ma.program@gmail.com wrote:
>>>> From: Ling Ma <ling.ml@alibaba-inc.com>
>>>>
>>>> In this patch we manage to reduce branch mispredictions by
>>>> avoiding branch instructions and forcing the destination to be
>>>> aligned using AVX instructions.
>>>>
>>>> ---
>>>>  In this version we removed the prefetch and appended vmovd.
>>>>
>>>>  ChangeLog                              |   9 ++
>>>>  sysdeps/x86_64/multiarch/Makefile      |   4 +-
>>>>  sysdeps/x86_64/multiarch/memset-avx2.S | 192 +++++++++++++++++++++++++++++++++
>>>>  sysdeps/x86_64/multiarch/memset.S      |  59 ++++++++++
>>>>  sysdeps/x86_64/multiarch/memset_chk.S  |  44 ++++++++
>>>>  5 files changed, 307 insertions(+), 1 deletion(-)
>>>>  create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
>>>>  create mode 100644 sysdeps/x86_64/multiarch/memset.S
>>>>  create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
>>>>
>>>> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
>>>> new file mode 100644
>>>> index 0000000..08e8ee8
>>>> --- /dev/null
>>>> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
>>>> @@ -0,0 +1,192 @@
>>>> +/* memset with AVX2
>>>> +   Copyright (C) 2014 Free Software Foundation, Inc.
>>>> +   Contributed by Alibaba Group.
>>>> +   This file is part of the GNU C Library.
>>>> +
>>>> +   The GNU C Library is free software; you can redistribute it and/or
>>>> +   modify it under the terms of the GNU Lesser General Public
>>>> +   License as published by the Free Software Foundation; either
>>>> +   version 2.1 of the License, or (at your option) any later version.
>>>> +
>>>> +   The GNU C Library is distributed in the hope that it will be useful,
>>>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>>>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>>>> +   Lesser General Public License for more details.
>>>> +
>>>> +   You should have received a copy of the GNU Lesser General Public
>>>> +   License along with the GNU C Library; if not, see
>>>> +   <http://www.gnu.org/licenses/>.  */
>>>> +
>>>> +#include <sysdep.h>
>>>> +
>>>> +#if !defined NOT_IN_libc
>>>> +
>>>> +#include "asm-syntax.h"
>>>> +#ifndef ALIGN
>>>> +# define ALIGN(n)    .p2align n
>>>> +#endif
>>>> +#ifndef MEMSET
>>>> +# define MEMSET      __memset_avx2
>>>> +# define MEMSET_CHK  __memset_chk_avx2
>>>> +#endif
>>>> +
>>>> +     .section .text.avx2,"ax",@progbits
>>>> +#if defined PIC
>>>> +ENTRY (MEMSET_CHK)
>>>> +     cmpq    %rdx, %rcx
>>>> +     jb      HIDDEN_JUMPTARGET (__chk_fail)
>>>> +END (MEMSET_CHK)
>>>> +#endif
>>>> +
>>>> +ENTRY (MEMSET)
>>>> +     vpxor   %xmm0, %xmm0, %xmm0
>>>> +     vmovd %esi, %xmm1
>>>> +     lea     (%rdi, %rdx), %r8
>>>> +     vpshufb %xmm0, %xmm1, %xmm0
>>>> +     mov     %rdi, %rax
>>>> +     cmp     $256, %rdx
>>>> +     jae     L(256bytesormore)
>>>> +     vmovd %xmm0, %rcx
>>>> +     cmp     $128, %rdx
>>>> +     jb      L(less_128bytes)
>>>> +     vmovups %xmm0, (%rdi)
>>>> +     vmovups %xmm0, 0x10(%rdi)
>>>> +     vmovups %xmm0, 0x20(%rdi)
>>>> +     vmovups %xmm0, 0x30(%rdi)
>>>> +     vmovups %xmm0, 0x40(%rdi)
>>>> +     vmovups %xmm0, 0x50(%rdi)
>>>> +     vmovups %xmm0, 0x60(%rdi)
>>>> +     vmovups %xmm0, 0x70(%rdi)
>>>> +     vmovups %xmm0, -0x80(%r8)
>>>> +     vmovups %xmm0, -0x70(%r8)
>>>> +     vmovups %xmm0, -0x60(%r8)
>>>> +     vmovups %xmm0, -0x50(%r8)
>>>> +     vmovups %xmm0, -0x40(%r8)
>>>> +     vmovups %xmm0, -0x30(%r8)
>>>> +     vmovups %xmm0, -0x20(%r8)
>>>> +     vmovups %xmm0, -0x10(%r8)
>>>> +     ret
>>>> +     ALIGN(4)
>>>> +L(less_128bytes):
>>>> +     cmp     $64, %edx
>>>> +     jb      L(less_64bytes)
>>>> +     vmovups %xmm0, (%rdi)
>>>> +     vmovups %xmm0, 0x10(%rdi)
>>>> +     vmovups %xmm0, 0x20(%rdi)
>>>> +     vmovups %xmm0, 0x30(%rdi)
>>>> +     vmovups %xmm0, -0x40(%r8)
>>>> +     vmovups %xmm0, -0x30(%r8)
>>>> +     vmovups %xmm0, -0x20(%r8)
>>>> +     vmovups %xmm0, -0x10(%r8)
>>>> +     ret
>>>> +     ALIGN(4)
>>>> +L(less_64bytes):
>>>> +     cmp     $32, %edx
>>>> +     jb      L(less_32bytes)
>>>> +     vmovups %xmm0, (%rdi)
>>>> +     vmovups %xmm0, 0x10(%rdi)
>>>> +     vmovups %xmm0, -0x20(%r8)
>>>> +     vmovups %xmm0, -0x10(%r8)
>>>> +     ret
>>>> +     ALIGN(4)
>>>> +L(less_32bytes):
>>>> +     cmp     $16, %edx
>>>> +     jb      L(less_16bytes)
>>>> +     vmovups %xmm0, (%rdi)
>>>> +     vmovups %xmm0, -0x10(%r8)
>>>> +     ret
>>>> +     ALIGN(4)
>>>> +L(less_16bytes):
>>>> +     cmp     $8, %edx
>>>> +     jb      L(less_8bytes)
>>>> +     mov %rcx, (%rdi)
>>>> +     mov %rcx, -0x08(%r8)
>>>> +     ret
>>>> +     ALIGN(4)
>>>> +L(less_8bytes):
>>>> +     cmp     $4, %edx
>>>> +     jb      L(less_4bytes)
>>>> +     mov %ecx, (%rdi)
>>>> +     mov %ecx, -0x04(%r8)
>>>> +     ALIGN(4)
>>>> +L(less_4bytes):
>>>> +     cmp     $2, %edx
>>>> +     jb      L(less_2bytes)
>>>> +     mov     %cx, (%rdi)
>>>> +     mov     %cx, -0x02(%r8)
>>>> +     ret
>>>> +     ALIGN(4)
>>>> +L(less_2bytes):
>>>> +     cmp     $1, %edx
>>>> +     jb      L(less_1bytes)
>>>> +     mov     %cl, (%rdi)
>>>> +L(less_1bytes):
>>>> +     ret
>>>> +
>>>> +     ALIGN(4)
>>>> +L(256bytesormore):
>>>> +     vinserti128 $1, %xmm0, %ymm0, %ymm0
>>>
>>> this breaks build on RHEL 6 x86_64:
>>>
>>> ../sysdeps/x86_64/multiarch/memset-avx2.S:
>>> ../sysdeps/x86_64/multiarch/memset-avx2.S: Assembler messages:
>>> Assembler messages:
>>> ../sysdeps/x86_64/multiarch/memset-avx2.S:132:
>>> ../sysdeps/x86_64/multiarch/memset-avx2.S:132: Error: Error: no such
>>> instruction: `vinserti128 $1,%xmm0,%ymm0,%ymm0'no such instruction:
>>> `vinserti128 $1,%xmm0,%ymm0,%ymm0'
>>>
>>> Cheers,
>>>
>>
>> This patch enables the AVX2-optimized memset only if -mavx2 works.  Tested
>> with GCC 4.6 and 4.8 on Fedora 20/x86-64.  OK to install?
>
> thanks, this fixed the issue also on RHEL 6 x86_64.
>

If there is no objection within 24 hours, I will check it in.


-- 
H.J.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]