This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH RFC V2] Improve 64bit memset for Corei7 with avx2 instruction
- From: Ling Ma <ling dot ma dot program at gmail dot com>
- To: libc-alpha at sourceware dot org
- Cc: neleai at seznam dot cz, liubov dot dmitrieva at gmail dot com, aj at suse dot com, Ma Ling <ling dot ml at alibaba-inc dot com>
- Date: Mon, 22 Jul 2013 15:01:47 +0800
- Subject: Re: [PATCH RFC V2] Improve 64bit memset for Corei7 with avx2 instruction
- References: <1373981861-3498-1-git-send-email-ling dot ma dot program at gmail dot com>
Any comments on it?
Thanks
Ling
2013/7/16, ling.ma.program@gmail.com <ling.ma.program@gmail.com>:
> From: Ma Ling <ling.ml@alibaba-inc.com>
>
> In this patch we use a similar approach to memcpy to avoid branch
> instructions and force the destination to be aligned with AVX
> instructions.
> With the gcc.403 benchmark we find memset spends 5~10 times more time
> than memcpy.  The benchmark also indicates that this patch improves
> performance by 30% to 100% compared with the original __memset_sse2.
>
> Ondra, I sent the gcc.403 test suite, the patch for glibc and readme.txt
> as well.
>
> Thanks
> Ling
> ---
> In this version we add the vzeroupper instruction to avoid the AVX-SSE
> transition (save & restore) penalty.
> vpshufb needs only one cycle to fill the xmm0 register, thanks Ondra.
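>
> As an intrinsics sketch of those two points (illustration only, not the
> patch itself): vpshufb with an all-zero shuffle mask broadcasts the low
> byte of a register, and _mm256_zeroupper() emits vzeroupper so that SSE
> code executed afterwards does not pay the AVX-SSE transition penalty.
> The helper names below are made up:
>
>   #include <immintrin.h>
>
>   /* Requires SSSE3 + AVX2 (e.g. gcc -mavx2).  */
>   static __m256i broadcast_byte (int c)
>   {
>     __m128i zero = _mm_setzero_si128 ();
>     __m128i byte = _mm_cvtsi32_si128 (c);          /* vmovd             */
>     __m128i xmm  = _mm_shuffle_epi8 (byte, zero);  /* vpshufb broadcast */
>     /* vinserti128: copy the 16-byte pattern into both ymm lanes.  */
>     return _mm256_inserti128_si256 (_mm256_castsi128_si256 (xmm), xmm, 1);
>   }
>
>   static void leave_avx_code (void)
>   {
>     _mm256_zeroupper ();  /* vzeroupper before returning to SSE code */
>   }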
>
> sysdeps/x86_64/multiarch/Makefile          |   2 +-
> sysdeps/x86_64/multiarch/ifunc-impl-list.c |   2 +
> sysdeps/x86_64/multiarch/memset-avx2.S     | 202 +++++++++++++++++++++++++++++
> 3 files changed, 205 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f92cf18..ae666bf 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -18,7 +18,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> strcat-sse2-unaligned strncat-sse2-unaligned \
> strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
> strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
> - memcmp-ssse3
> + memcmp-ssse3 memset-avx2
> ifeq (yes,$(config-cflags-sse4))
> sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
> CFLAGS-varshift.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 5639702..24d05d7 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -67,12 +67,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/memset_chk.S. */
> IFUNC_IMPL (i, name, __memset_chk,
> + IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_AVX2, __memset_chk_avx2)
> IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2)
> IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
> __memset_chk_x86_64))
>
> /* Support sysdeps/x86_64/multiarch/memset.S. */
> IFUNC_IMPL (i, name, memset,
> + IFUNC_IMPL_ADD (array, i, memset, HAS_AVX2, __memset_avx2)
> IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
> IFUNC_IMPL_ADD (array, i, memset, 1, __memset_x86_64))
>
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
> new file mode 100644
> index 0000000..dc778c8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
> @@ -0,0 +1,202 @@
> +/* memset with AVX2
> + Copyright (C) 2013 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#if !defined NOT_IN_libc
> +
> +#include "asm-syntax.h"
> +#ifndef ALIGN
> +# define ALIGN(n) .p2align n
> +#endif
> +#ifndef MEMSET
> +# define MEMSET __memset_avx2
> +# define MEMSET_CHK __memset_chk_avx2
> +#endif
> +
> + .section .text.avx2,"ax",@progbits
> +#if defined PIC
> +ENTRY (MEMSET_CHK)
> + cmpq %rdx, %rcx
> + jb HIDDEN_JUMPTARGET (__chk_fail)
> +END (MEMSET_CHK)
> +#endif
> +
> +ENTRY (MEMSET)
> + vpxor %xmm0, %xmm0, %xmm0
> + vmovd %esi, %xmm1
> + lea (%rdi, %rdx), %r8
> + vpshufb %xmm0, %xmm1, %xmm0
> + mov %rdi, %rax
> + cmp $256, %rdx
> + jae L(256bytesormore)
> + xor %ecx, %ecx
> + mov %sil, %cl
> + mov %cl, %ch
> + cmp $128, %rdx
> + jb L(less_128bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, 0x10(%rdi)
> + vmovups %xmm0, 0x20(%rdi)
> + vmovups %xmm0, 0x30(%rdi)
> + vmovups %xmm0, 0x40(%rdi)
> + vmovups %xmm0, 0x50(%rdi)
> + vmovups %xmm0, 0x60(%rdi)
> + vmovups %xmm0, 0x70(%rdi)
> + vmovups %xmm0, -0x80(%r8)
> + vmovups %xmm0, -0x70(%r8)
> + vmovups %xmm0, -0x60(%r8)
> + vmovups %xmm0, -0x50(%r8)
> + vmovups %xmm0, -0x40(%r8)
> + vmovups %xmm0, -0x30(%r8)
> + vmovups %xmm0, -0x20(%r8)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_128bytes):
> + xor %esi, %esi
> + mov %ecx, %esi
> + shl $16, %ecx
> + cmp $64, %edx
> + jb L(less_64bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, 0x10(%rdi)
> + vmovups %xmm0, 0x20(%rdi)
> + vmovups %xmm0, 0x30(%rdi)
> + vmovups %xmm0, -0x40(%r8)
> + vmovups %xmm0, -0x30(%r8)
> + vmovups %xmm0, -0x20(%r8)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_64bytes):
> + orl %esi, %ecx
> + mov %ecx, %esi
> + cmp $32, %edx
> + jb L(less_32bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, 0x10(%rdi)
> + vmovups %xmm0, -0x20(%r8)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_32bytes):
> + shl $32, %rcx
> + cmp $16, %edx
> + jb L(less_16bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_16bytes):
> + or %rsi, %rcx
> + cmp $8, %edx
> + jb L(less_8bytes)
> + mov %rcx, (%rdi)
> + mov %rcx, -0x08(%r8)
> + ret
> + ALIGN(4)
> +L(less_8bytes):
> + cmp $4, %edx
> + jb L(less_4bytes)
> + mov %ecx, (%rdi)
> + mov %ecx, -0x04(%r8)
> + ret
> + ALIGN(4)
> +L(less_4bytes):
> + cmp $2, %edx
> + jb L(less_2bytes)
> + mov %cx, (%rdi)
> + mov %cx, -0x02(%r8)
> + ret
> + ALIGN(4)
> +L(less_2bytes):
> + cmp $1, %edx
> + jb L(less_1bytes)
> + mov %cl, (%rdi)
> +L(less_1bytes):
> + ret
> +
> + ALIGN(4)
> +L(256bytesormore):
> + vinserti128 $1, %xmm0, %ymm0, %ymm0
> + vmovups %ymm0, (%rdi)
> + mov %rdi, %r9
> + and $-0x20, %rdi
> + add $32, %rdi
> + sub %rdi, %r9
> + add %r9, %rdx
> + cmp $4096, %rdx
> + ja L(gobble_data)
> +
> + sub $0x80, %rdx
> +L(gobble_128_loop):
> + prefetcht0 0x1c0(%rdi)
> + vmovaps %ymm0, (%rdi)
> + prefetcht0 0x280(%rdi)
> + vmovaps %ymm0, 0x20(%rdi)
> + vmovaps %ymm0, 0x40(%rdi)
> + vmovaps %ymm0, 0x60(%rdi)
> + lea 0x80(%rdi), %rdi
> + sub $0x80, %rdx
> + jae L(gobble_128_loop)
> + vmovups %ymm0, -0x80(%r8)
> + vmovups %ymm0, -0x60(%r8)
> + vmovups %ymm0, -0x40(%r8)
> + vmovups %ymm0, -0x20(%r8)
> + vzeroupper
> + ret
> +
> + ALIGN(4)
> +L(gobble_data):
> +#ifdef SHARED_CACHE_SIZE_HALF
> + mov $SHARED_CACHE_SIZE_HALF, %r9
> +#else
> + mov __x86_64_shared_cache_size_half(%rip), %r9
> +#endif
> + shl $4, %r9
> + cmp %r9, %rdx
> + ja L(gobble_big_data)
> + mov %rax, %r9
> + mov %esi, %eax
> + mov %rdx, %rcx
> + rep stosb
> + mov %r9, %rax
> + vzeroupper
> + ret
> +
> + ALIGN(4)
> +L(gobble_big_data):
> + sub $0x80, %rdx
> +L(gobble_big_data_loop):
> + vmovntdq %ymm0, (%rdi)
> + vmovntdq %ymm0, 0x20(%rdi)
> + vmovntdq %ymm0, 0x40(%rdi)
> + vmovntdq %ymm0, 0x60(%rdi)
> + lea 0x80(%rdi), %rdi
> + sub $0x80, %rdx
> + jae L(gobble_big_data_loop)
> + vmovups %ymm0, -0x80(%r8)
> + vmovups %ymm0, -0x60(%r8)
> + vmovups %ymm0, -0x40(%r8)
> + vmovups %ymm0, -0x20(%r8)
> + vzeroupper
> + sfence
> + ret
> +
> +END (MEMSET)
> +#endif
> --
> 1.8.1.4
>
>