This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH RFC V2] Improve 64bit memset for Corei7 with avx2 instruction
- From: Ling Ma <ling dot ma dot program at gmail dot com>
- To: libc-alpha at sourceware dot org
- Cc: neleai at seznam dot cz, liubov dot dmitrieva at gmail dot com, aj at suse dot com, Ma Ling <ling dot ml at alibaba-inc dot com>
- Date: Mon, 22 Jul 2013 15:01:47 +0800
- Subject: Re: [PATCH RFC V2] Improve 64bit memset for Corei7 with avx2 instruction
- References: <1373981861-3498-1-git-send-email-ling dot ma dot program at gmail dot com>
Any comments on it?
Thanks
Ling
2013/7/16, ling.ma.program@gmail.com <ling.ma.program@gmail.com>:
> From: Ma Ling <ling.ml@alibaba-inc.com>
>
> In this patch we use a similar approach to memcpy to avoid branch
> instructions and force the destination to be aligned with AVX
> instructions.
> With the gcc.403 benchmark we find memset spends 5~10 times more time
> than memcpy.  The benchmark also indicates that this patch improves
> performance by 30% to 100% compared with the original __memset_sse2.
>
> Ondra, I sent the gcc.403 test suite, the patch for glibc and readme.txt
> as well.
>
> Thanks
> Ling
> ---
> In this version we add the vzeroupper instruction to avoid the AVX-SSE
> transition (save & restore) penalty.
> vpshufb needs only one cycle to fill the xmm0 register, thanks Ondra.
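>
> As an intrinsics sketch of those two points (illustration only, not the
> patch itself): vpshufb with an all-zero shuffle mask broadcasts the low
> byte of a register, and _mm256_zeroupper() emits vzeroupper so that SSE
> code executed afterwards does not pay the AVX-SSE transition penalty.
> The helper names below are made up:
>
>   #include <immintrin.h>
>
>   /* Requires SSSE3 + AVX2 (e.g. gcc -mavx2).  */
>   static __m256i broadcast_byte (int c)
>   {
>     __m128i zero = _mm_setzero_si128 ();
>     __m128i byte = _mm_cvtsi32_si128 (c);          /* vmovd             */
>     __m128i xmm  = _mm_shuffle_epi8 (byte, zero);  /* vpshufb broadcast */
>     /* vinserti128: copy the 16-byte pattern into both ymm lanes.  */
>     return _mm256_inserti128_si256 (_mm256_castsi128_si256 (xmm), xmm, 1);
>   }
>
>   static void leave_avx_code (void)
>   {
>     _mm256_zeroupper ();  /* vzeroupper before returning to SSE code */
>   }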
>
> sysdeps/x86_64/multiarch/Makefile          |   2 +-
> sysdeps/x86_64/multiarch/ifunc-impl-list.c |   2 +
> sysdeps/x86_64/multiarch/memset-avx2.S     | 202 +++++++++++++++++++++++++++++
> 3 files changed, 205 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f92cf18..ae666bf 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -18,7 +18,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
> strcat-sse2-unaligned strncat-sse2-unaligned \
> strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \
> strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \
> - memcmp-ssse3
> + memcmp-ssse3 memset-avx2
> ifeq (yes,$(config-cflags-sse4))
> sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
> CFLAGS-varshift.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 5639702..24d05d7 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -67,12 +67,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/memset_chk.S. */
> IFUNC_IMPL (i, name, __memset_chk,
> + IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_AVX2, __memset_chk_avx2)
> IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2)
> IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
> __memset_chk_x86_64))
>
> /* Support sysdeps/x86_64/multiarch/memset.S. */
> IFUNC_IMPL (i, name, memset,
> + IFUNC_IMPL_ADD (array, i, memset, HAS_AVX2, __memset_avx2)
> IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
> IFUNC_IMPL_ADD (array, i, memset, 1, __memset_x86_64))
>
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
> new file mode 100644
> index 0000000..dc778c8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
> @@ -0,0 +1,202 @@
> +/* memset with AVX2
> + Copyright (C) 2013 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +#if !defined NOT_IN_libc
> +
> +#include "asm-syntax.h"
> +#ifndef ALIGN
> +# define ALIGN(n) .p2align n
> +#endif
> +#ifndef MEMSET
> +# define MEMSET __memset_avx2
> +# define MEMSET_CHK __memset_chk_avx2
> +#endif
> +
> + .section .text.avx2,"ax",@progbits
> +#if defined PIC
> +ENTRY (MEMSET_CHK)
> + cmpq %rdx, %rcx
> + jb HIDDEN_JUMPTARGET (__chk_fail)
> +END (MEMSET_CHK)
> +#endif
> +
> +ENTRY (MEMSET)
> + vpxor %xmm0, %xmm0, %xmm0
> + vmovd %esi, %xmm1
> + lea (%rdi, %rdx), %r8
> + vpshufb %xmm0, %xmm1, %xmm0
> + mov %rdi, %rax
> + cmp $256, %rdx
> + jae L(256bytesormore)
> + xor %ecx, %ecx
> + mov %sil, %cl
> + mov %cl, %ch
> + cmp $128, %rdx
> + jb L(less_128bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, 0x10(%rdi)
> + vmovups %xmm0, 0x20(%rdi)
> + vmovups %xmm0, 0x30(%rdi)
> + vmovups %xmm0, 0x40(%rdi)
> + vmovups %xmm0, 0x50(%rdi)
> + vmovups %xmm0, 0x60(%rdi)
> + vmovups %xmm0, 0x70(%rdi)
> + vmovups %xmm0, -0x80(%r8)
> + vmovups %xmm0, -0x70(%r8)
> + vmovups %xmm0, -0x60(%r8)
> + vmovups %xmm0, -0x50(%r8)
> + vmovups %xmm0, -0x40(%r8)
> + vmovups %xmm0, -0x30(%r8)
> + vmovups %xmm0, -0x20(%r8)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_128bytes):
> + xor %esi, %esi
> + mov %ecx, %esi
> + shl $16, %ecx
> + cmp $64, %edx
> + jb L(less_64bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, 0x10(%rdi)
> + vmovups %xmm0, 0x20(%rdi)
> + vmovups %xmm0, 0x30(%rdi)
> + vmovups %xmm0, -0x40(%r8)
> + vmovups %xmm0, -0x30(%r8)
> + vmovups %xmm0, -0x20(%r8)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_64bytes):
> + orl %esi, %ecx
> + mov %ecx, %esi
> + cmp $32, %edx
> + jb L(less_32bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, 0x10(%rdi)
> + vmovups %xmm0, -0x20(%r8)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_32bytes):
> + shl $32, %rcx
> + cmp $16, %edx
> + jb L(less_16bytes)
> + vmovups %xmm0, (%rdi)
> + vmovups %xmm0, -0x10(%r8)
> + ret
> + ALIGN(4)
> +L(less_16bytes):
> + or %rsi, %rcx
> + cmp $8, %edx
> + jb L(less_8bytes)
> + mov %rcx, (%rdi)
> + mov %rcx, -0x08(%r8)
> + ret
> + ALIGN(4)
> +L(less_8bytes):
> + cmp $4, %edx
> + jb L(less_4bytes)
> + mov %ecx, (%rdi)
> + mov %ecx, -0x04(%r8)
> + ret
> + ALIGN(4)
> +L(less_4bytes):
> + cmp $2, %edx
> + jb L(less_2bytes)
> + mov %cx, (%rdi)
> + mov %cx, -0x02(%r8)
> + ret
> + ALIGN(4)
> +L(less_2bytes):
> + cmp $1, %edx
> + jb L(less_1bytes)
> + mov %cl, (%rdi)
> +L(less_1bytes):
> + ret
> +
> + ALIGN(4)
> +L(256bytesormore):
> + vinserti128 $1, %xmm0, %ymm0, %ymm0
> + vmovups %ymm0, (%rdi)
> + mov %rdi, %r9
> + and $-0x20, %rdi
> + add $32, %rdi
> + sub %rdi, %r9
> + add %r9, %rdx
> + cmp $4096, %rdx
> + ja L(gobble_data)
> +
> + sub $0x80, %rdx
> +L(gobble_128_loop):
> + prefetcht0 0x1c0(%rdi)
> + vmovaps %ymm0, (%rdi)
> + prefetcht0 0x280(%rdi)
> + vmovaps %ymm0, 0x20(%rdi)
> + vmovaps %ymm0, 0x40(%rdi)
> + vmovaps %ymm0, 0x60(%rdi)
> + lea 0x80(%rdi), %rdi
> + sub $0x80, %rdx
> + jae L(gobble_128_loop)
> + vmovups %ymm0, -0x80(%r8)
> + vmovups %ymm0, -0x60(%r8)
> + vmovups %ymm0, -0x40(%r8)
> + vmovups %ymm0, -0x20(%r8)
> + vzeroupper
> + ret
> +
> + ALIGN(4)
> +L(gobble_data):
> +#ifdef SHARED_CACHE_SIZE_HALF
> + mov $SHARED_CACHE_SIZE_HALF, %r9
> +#else
> + mov __x86_64_shared_cache_size_half(%rip), %r9
> +#endif
> + shl $4, %r9
> + cmp %r9, %rdx
> + ja L(gobble_big_data)
> + mov %rax, %r9
> + mov %esi, %eax
> + mov %rdx, %rcx
> + rep stosb
> + mov %r9, %rax
> + vzeroupper
> + ret
> +
> + ALIGN(4)
> +L(gobble_big_data):
> + sub $0x80, %rdx
> +L(gobble_big_data_loop):
> + vmovntdq %ymm0, (%rdi)
> + vmovntdq %ymm0, 0x20(%rdi)
> + vmovntdq %ymm0, 0x40(%rdi)
> + vmovntdq %ymm0, 0x60(%rdi)
> + lea 0x80(%rdi), %rdi
> + sub $0x80, %rdx
> + jae L(gobble_big_data_loop)
> + vmovups %ymm0, -0x80(%r8)
> + vmovups %ymm0, -0x60(%r8)
> + vmovups %ymm0, -0x40(%r8)
> + vmovups %ymm0, -0x20(%r8)
> + vzeroupper
> + sfence
> + ret
> +
> +END (MEMSET)
> +#endif
> --
> 1.8.1.4
>
>