This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH RFC] Improve 64-bit memset performance for Haswell CPUs with AVX2 instructions


On Mon, Apr 14, 2014 at 6:08 AM,  <ling.ma.program@gmail.com> wrote:
> From: Ling Ma <ling.ml@alibaba-inc.com>
>
> In this patch we take advantage of Haswell (HSW) memory bandwidth: we
> reduce branch mispredictions by avoiding branch instructions and we
> force the destination to be aligned, using AVX and AVX2 instructions.
>
> Following Ondra's suggestion, this version shrinks the code size and
> improves store throughput; all cases were re-tested against the SSE2
> memset.  The CPU2006 403.gcc benchmark indicates this patch improves
> performance by 23.7% to 57%.
>
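For reference, the branch avoidance described above relies on overlapping
stores: each size class issues a fixed set of stores anchored at the start
and at the end of the buffer, so every length in the class is covered
without a per-length loop or a branchy tail.  A minimal C sketch of the
idea for the 16-to-32-byte class, mirroring L(less_32bytes) below
(illustrative only; the function name is mine, not part of the patch):

  #include <immintrin.h>
  #include <stddef.h>

  /* Fill n bytes (16 <= n <= 32) with byte c using two possibly
     overlapping unaligned 16-byte stores: one at the start and one
     ending exactly at dst + n.  */
  static void set_16_to_32 (void *dst, int c, size_t n)
  {
    __m128i v = _mm_set1_epi8 ((char) c);                       /* splat fill byte */
    _mm_storeu_si128 ((__m128i *) dst, v);                      /* head store */
    _mm_storeu_si128 ((__m128i *) ((char *) dst + n - 16), v);  /* tail store */
  }

The same pattern scales up: the 128-to-256-byte class simply uses eight
such stores from the front and eight from the back.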
> ---
>
>  ChangeLog                              |   9 ++
>  sysdeps/x86_64/multiarch/Makefile      |   4 +-
>  sysdeps/x86_64/multiarch/memset-avx2.S | 195 +++++++++++++++++++++++++++++++++
>  sysdeps/x86_64/multiarch/memset.S      |  58 ++++++++++
>  sysdeps/x86_64/multiarch/memset_chk.S  |  43 ++++++++
>  5 files changed, 308 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
>  create mode 100644 sysdeps/x86_64/multiarch/memset.S
>  create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
>
> diff --git a/ChangeLog b/ChangeLog
> index ba8980c..9e5c3bb 100644
> --- a/ChangeLog
> +++ b/ChangeLog
> @@ -1,3 +1,12 @@
> +2014-04-04  Ling Ma  <ling.ml@alibaba-inc.com>
> +
> +       * sysdeps/x86_64/multiarch/Makefile: Add memset-avx2.
> +       * sysdeps/x86_64/multiarch/memset-avx2.S: New file for AVX2 memset.
> +       * sysdeps/x86_64/multiarch/memset.S: New file for multiple memset
> +       versions.
> +       * sysdeps/x86_64/multiarch/memset_chk.S: New file for multiple memset_chk
> +       versions.
> +
>  2014-04-04  Sihai Yao  <sihai.ysh@alibaba-inc.com>
>         * sysdeps/x86_64/multiarch/ifunc-defines.sym: Add COMMON_CPU_INDEX_7 and
>         FEATURE_INDEX_7.
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 57a3c13..42df96f 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -17,7 +17,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>                    strcpy-sse2-unaligned strncpy-sse2-unaligned \
>                    stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
>                    strcat-sse2-unaligned strncat-sse2-unaligned \
> -                  strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned
> +                  strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
> +                  memset-avx2
> +
>  ifeq (yes,$(config-cflags-sse4))
>  sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
>  CFLAGS-varshift.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
> new file mode 100644
> index 0000000..fea1f5a
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
> @@ -0,0 +1,195 @@
> +/* memset with AVX2
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +#if !defined NOT_IN_libc
> +
> +#include "asm-syntax.h"
> +#ifndef MEMSET
> +# define MEMSET        __memset_avx2
> +# define MEMSET_CHK    __memset_chk_avx2
> +#endif
> +
> +       .section .text.avx2,"ax",@progbits
> +#if defined PIC
> +ENTRY (MEMSET_CHK)
> +       cmpq    %rdx, %rcx
> +       jb      HIDDEN_JUMPTARGET (__chk_fail)
> +END (MEMSET_CHK)
> +#endif
> +
> +ENTRY (MEMSET)
> +       vpxor   %xmm0, %xmm0, %xmm0
> +       vmovd %esi, %xmm1
> +       mov     %rdi, %rsi
> +       mov     %rdi, %rax
> +       vpshufb %xmm0, %xmm1, %xmm0
> +       cmp     $256, %rdx
> +       ja      L(256bytesormore)
> +       add     %rdx, %rsi
> +       vmovq %xmm0, %rcx
> +       cmp     $128, %edx
> +       jb      L(less_128bytes)
> +       vmovups %xmm0, (%rdi)
> +       vmovups %xmm0, 0x10(%rdi)
> +       vmovups %xmm0, 0x20(%rdi)
> +       vmovups %xmm0, 0x30(%rdi)
> +       vmovups %xmm0, 0x40(%rdi)
> +       vmovups %xmm0, 0x50(%rdi)
> +       vmovups %xmm0, 0x60(%rdi)
> +       vmovups %xmm0, 0x70(%rdi)
> +       vmovups %xmm0, -0x80(%rsi)
> +       vmovups %xmm0, -0x70(%rsi)
> +       vmovups %xmm0, -0x60(%rsi)
> +       vmovups %xmm0, -0x50(%rsi)
> +       vmovups %xmm0, -0x40(%rsi)
> +       vmovups %xmm0, -0x30(%rsi)
> +       vmovups %xmm0, -0x20(%rsi)
> +       vmovups %xmm0, -0x10(%rsi)
> +       ret
> +
> +       .p2align 4
> +L(less_128bytes):
> +       cmp     $64, %dl
> +       jb      L(less_64bytes)
> +       vmovups %xmm0, (%rdi)
> +       vmovups %xmm0, 0x10(%rdi)
> +       vmovups %xmm0, 0x20(%rdi)
> +       vmovups %xmm0, 0x30(%rdi)
> +       vmovups %xmm0, -0x40(%rsi)
> +       vmovups %xmm0, -0x30(%rsi)
> +       vmovups %xmm0, -0x20(%rsi)
> +       vmovups %xmm0, -0x10(%rsi)
> +       ret
> +
> +       .p2align 4
> +L(less_64bytes):
> +       cmp     $32, %dl
> +       jb      L(less_32bytes)
> +       vmovups %xmm0, (%rdi)
> +       vmovups %xmm0, 0x10(%rdi)
> +       vmovups %xmm0, -0x20(%rsi)
> +       vmovups %xmm0, -0x10(%rsi)
> +       ret
> +
> +       .p2align 4
> +L(less_32bytes):
> +       cmp     $16, %dl
> +       jb      L(less_16bytes)
> +       vmovups %xmm0, (%rdi)
> +       vmovups %xmm0, -0x10(%rsi)
> +       ret
> +       .p2align 4
> +L(less_16bytes):
> +       cmp     $8, %dl
> +       jb      L(less_8bytes)
> +       mov %rcx, (%rdi)
> +       mov %rcx, -0x08(%rsi)
> +       ret
> +
> +       .p2align 4
> +L(less_8bytes):
> +       cmp     $4, %dl
> +       jb      L(less_4bytes)
> +       mov %ecx, (%rdi)
> +       mov %ecx, -0x04(%rsi)
> +
> +       .p2align 4
> +L(less_4bytes):
> +       cmp     $2, %dl
> +       jb      L(less_2bytes)
> +       mov     %cx, (%rdi)
> +       mov     %cx, -0x02(%rsi)
> +       ret
> +       .p2align 4
> +L(less_2bytes):
> +       cmp     $1, %dl
> +       jb      L(less_1bytes)
> +       mov     %cl, (%rdi)
> +L(less_1bytes):
> +       ret
> +
> +       .p2align 4
> +L(256bytesormore):
> +       vinserti128 $1, %xmm0, %ymm0, %ymm0
> +       mov     $0x80, %rcx
> +       add     %rdx, %rsi
> +       mov     %rdi, %r9
> +       vmovups %ymm0, (%rdi)
> +       and     $-0x20, %rdi
> +       add     $32, %rdi
> +       sub     %rdi, %r9
> +       add     %r9, %rdx
> +       cmp     $4096, %rdx
> +       ja      L(gobble_data)
> +       sub     %ecx, %edx
> +L(gobble_128_loop):
> +       vmovaps %ymm0, (%rdi)
> +       vmovaps %ymm0, 0x20(%rdi)
> +       vmovaps %ymm0, 0x40(%rdi)
> +       vmovaps %ymm0, 0x60(%rdi)
> +       add     %rcx, %rdi
> +       sub     %ecx, %edx
> +       jae     L(gobble_128_loop)
> +       vmovups %ymm0, -0x80(%rsi)
> +       vmovups %ymm0, -0x60(%rsi)
> +       vmovups %ymm0, -0x40(%rsi)
> +       vmovups %ymm0, -0x20(%rsi)
> +       vzeroupper
> +       ret
> +
> +       .p2align 4
> +L(gobble_data):
> +#ifdef SHARED_CACHE_SIZE_HALF
> +       mov     $SHARED_CACHE_SIZE_HALF, %r9
> +#else
> +       mov     __x86_shared_cache_size_half(%rip), %r9
> +#endif
> +       shl     $4, %r9
> +       cmp     %r9, %rdx
> +       ja      L(gobble_big_data)
> +       mov     %rax, %rsi
> +       vmovd %xmm0, %eax
> +       mov     %rdx, %rcx
> +       rep     stosb
> +       mov     %rsi, %rax
> +       vzeroupper
> +       ret
> +
> +       .p2align 4
> +L(gobble_big_data):
> +       sub     %rcx, %rdx
> +L(gobble_big_data_loop):
> +       vmovntdq        %ymm0, (%rdi)
> +       vmovntdq        %ymm0, 0x20(%rdi)
> +       vmovntdq        %ymm0, 0x40(%rdi)
> +       vmovntdq        %ymm0, 0x60(%rdi)
> +       add     %rcx, %rdi
> +       sub     %rcx, %rdx
> +       jae     L(gobble_big_data_loop)
> +       vmovups %ymm0, -0x80(%rsi)
> +       vmovups %ymm0, -0x60(%rsi)
> +       vmovups %ymm0, -0x40(%rsi)
> +       vmovups %ymm0, -0x20(%rsi)
> +       vzeroupper
> +       sfence
> +       ret
> +
>
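As I read the patch, the large-size path (L(256bytesormore)) first writes
32 bytes unaligned, rounds the destination up to the next 32-byte boundary
so the main loops can use aligned stores, and then picks a strategy by the
remaining length: an aligned vmovaps loop up to 4 KiB, rep stosb up to 16
times half the shared cache size, and a non-temporal vmovntdq loop followed
by sfence beyond that.  A small C sketch of just the alignment prologue,
with made-up names, to spell out the pointer arithmetic (this is my reading,
not code from the patch):

  #include <immintrin.h>
  #include <stddef.h>
  #include <stdint.h>

  /* Store the first 32 bytes unaligned, then advance dst to the next
     32-byte boundary and reduce the remaining length by the distance
     advanced (the "and $-0x20; add $32" sequence above).  */
  static void align_prologue (unsigned char **dst, size_t *n, __m256i v)
  {
    unsigned char *p = *dst;
    _mm256_storeu_si256 ((__m256i *) p, v);                       /* unaligned head */
    unsigned char *aligned =
      (unsigned char *) (((uintptr_t) p & ~(uintptr_t) 31) + 32); /* next 32B boundary */
    *n -= (size_t) (aligned - p);                                 /* distance advanced */
    *dst = aligned;
  }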

I see mixed FP and INT vector instructions, like vmovntdq, vmovups,
vmovaps, vinserti128, vpxor and vpshufb.  Is there any particular
reason for that?



-- 
H.J.

