This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH] Add x86-64 memset with unaligned store and rep stosb


On 03/29/2016 03:23 PM, H.J. Lu wrote:
> The goal of this patch is to replace the SSE2 and AVX2 memset.S with
> faster and smaller alternatives, and to add support for the 64-byte
> vector register size.  bench-memset data on various Intel and AMD
> processors is at
> 
> https://sourceware.org/bugzilla/show_bug.cgi?id=19881
> 
> Any comments or feedback?

Caveats about Penryn being slower apply here, and I expect your answer
is the same: the selection of the ifunc will not change, and so Penryn
will not use the newer versions.
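
For context, the resolver runs once, when the dynamic linker binds the
symbol, and its answer is cached, so the choice is fixed for the life of
the process.  A minimal C sketch of the mechanism, with made-up names
and a simplified feature check rather than the glibc-internal selector:

#include <stddef.h>

typedef void *(*memset_fn) (void *, int, size_t);

/* Two stand-in implementations.  */
static void *
memset_generic (void *d, int c, size_t n)
{
  unsigned char *p = d;
  while (n--)
    *p++ = (unsigned char) c;
  return d;
}

static void *
memset_avx2ish (void *d, int c, size_t n)
{
  /* Imagine the unaligned/ERMS version here.  */
  return memset_generic (d, c, n);
}

/* The resolver runs once, when the dynamic linker binds my_memset;
   its result is cached, so the choice never changes afterwards.  */
static memset_fn
resolve_my_memset (void)
{
  __builtin_cpu_init ();
  /* ERMS would also need a CPUID leaf 7 check; AVX2 shows the idea.  */
  if (__builtin_cpu_supports ("avx2"))
    return memset_avx2ish;
  return memset_generic;
}

void *my_memset (void *, int, size_t)
  __attribute__ ((ifunc ("resolve_my_memset")));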

This looks good to me.

Again, same question about thresholding below.
 
> -- H.J.
> 
> 
> 0001-Add-x86-64-memset-with-unaligned-store-and-rep-stosb.patch
> 
> 
> From d0d3495951be16568656971dd2c825da68c2660c Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Fri, 25 Mar 2016 08:20:17 -0700
> Subject: [PATCH] Add x86-64 memset with unaligned store and rep stosb
> 
> Implement x86-64 memset with unaligned store and rep stosb.  Support
> 16-byte, 32-byte and 64-byte vector register sizes.  A single file
> provides two implementations of memset, one with rep stosb and the
> other without it.  They share the same code when size is between
> twice the vector register size and REP_STOSB_THRESHOLD, which is 1KB
> for the 16-byte vector register size and scales up with larger
> vector register sizes.
> 
> Key features:
> 
> 1. Use overlapping stores to avoid branches.
> 2. For sizes <= 4 times the vector register size, fully unroll the loop.
> 3. For sizes > 4 times the vector register size, store 4 times the
> vector register size at a time.
> 
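
The overlapping-store trick in item 1 is the key idea: for any n between
VEC_SIZE and 2 * VEC_SIZE, one store at the start and one store ending
at the last byte cover the whole range with no length-dependent branch.
A minimal C sketch for VEC_SIZE == 16 (illustrative only, not the
patch's code):

#include <emmintrin.h>
#include <stddef.h>

/* Set any n bytes, 16 <= n <= 32, with two possibly-overlapping
   unaligned 16-byte stores; [d, d+16) and [d+n-16, d+n) always cover
   [d, d+n) and only fail to overlap when n == 32.  */
static void
set_16_to_32 (unsigned char *d, __m128i v, size_t n)
{
  _mm_storeu_si128 ((__m128i *) (d + n - 16), v);  /* ends at d + n  */
  _mm_storeu_si128 ((__m128i *) d, v);             /* starts at d    */
}
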
> 	[BZ #19881]
> 	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
> 	memset-sse2-unaligned-erms, memset-avx2-unaligned-erms and
> 	memset-avx512-unaligned-erms.
> 	* sysdeps/x86_64/multiarch/ifunc-impl-list.c
> 	(__libc_ifunc_impl_list): Test __memset_chk_sse2_unaligned,
> 	__memset_chk_sse2_unaligned_erms, __memset_chk_avx2_unaligned,
> 	__memset_chk_avx2_unaligned_erms, __memset_chk_avx512_unaligned,
> 	__memset_chk_avx512_unaligned_erms, __memset_sse2_unaligned,
> 	__memset_sse2_unaligned_erms, __memset_erms,
> 	__memset_avx2_unaligned, __memset_avx2_unaligned_erms,
> 	__memset_avx512_unaligned_erms and __memset_avx512_unaligned.
> 	* sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S: New
> 	file.
> 	* sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S:
> 	Likewise.
> 	* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S:
> 	Likewise.
> 	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S:
> 	Likewise.
> 
> Memset
> ---
>  sysdeps/x86_64/multiarch/Makefile                  |   5 +-
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c         |  33 +++
>  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  |  14 ++
>  .../multiarch/memset-avx512-unaligned-erms.S       |  17 ++
>  .../x86_64/multiarch/memset-sse2-unaligned-erms.S  |  16 ++
>  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 246 +++++++++++++++++++++
>  6 files changed, 330 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>  create mode 100644 sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>  create mode 100644 sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
>  create mode 100644 sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> 
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index ef4dbc0..8878efb 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -23,7 +23,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>  		   memset-avx512-no-vzeroupper \
>  		   memmove-sse2-unaligned-erms \
>  		   memmove-avx-unaligned-erms \
> -		   memmove-avx512-unaligned-erms
> +		   memmove-avx512-unaligned-erms \
> +		   memset-sse2-unaligned-erms \
> +		   memset-avx2-unaligned-erms \
> +		   memset-avx512-unaligned-erms

OK.

>  CFLAGS-varshift.c += -msse4
>  CFLAGS-strcspn-c.c += -msse4
>  CFLAGS-strpbrk-c.c += -msse4
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 9204da4..1e880f6 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -118,12 +118,28 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    IFUNC_IMPL (i, name, __memset_chk,
>  	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
>  			      __memset_chk_sse2)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
> +			      __memset_chk_sse2_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
> +			      __memset_chk_sse2_unaligned_erms)
>  	      IFUNC_IMPL_ADD (array, i, __memset_chk,
>  			      HAS_ARCH_FEATURE (AVX2_Usable),
>  			      __memset_chk_avx2)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
> +			      HAS_ARCH_FEATURE (AVX2_Usable),
> +			      __memset_chk_avx2_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
> +			      HAS_ARCH_FEATURE (AVX2_Usable),
> +			      __memset_chk_avx2_unaligned_erms)
>  #ifdef HAVE_AVX512_ASM_SUPPORT
>  	      IFUNC_IMPL_ADD (array, i, __memset_chk,
>  			      HAS_ARCH_FEATURE (AVX512F_Usable),
> +			      __memset_chk_avx512_unaligned_erms)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
> +			      HAS_ARCH_FEATURE (AVX512F_Usable),
> +			      __memset_chk_avx512_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
> +			      HAS_ARCH_FEATURE (AVX512F_Usable),
>  			      __memset_chk_avx512_no_vzeroupper)
>  #endif
>  	      )
> @@ -131,12 +147,29 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/memset.S.  */
>    IFUNC_IMPL (i, name, memset,
>  	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
> +	      IFUNC_IMPL_ADD (array, i, memset, 1,
> +			      __memset_sse2_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, memset, 1,
> +			      __memset_sse2_unaligned_erms)
> +	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_erms)
>  	      IFUNC_IMPL_ADD (array, i, memset,
>  			      HAS_ARCH_FEATURE (AVX2_Usable),
>  			      __memset_avx2)
> +	      IFUNC_IMPL_ADD (array, i, memset,
> +			      HAS_ARCH_FEATURE (AVX2_Usable),
> +			      __memset_avx2_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, memset,
> +			      HAS_ARCH_FEATURE (AVX2_Usable),
> +			      __memset_avx2_unaligned_erms)
>  #ifdef HAVE_AVX512_ASM_SUPPORT
>  	      IFUNC_IMPL_ADD (array, i, memset,
>  			      HAS_ARCH_FEATURE (AVX512F_Usable),
> +			      __memset_avx512_unaligned_erms)
> +	      IFUNC_IMPL_ADD (array, i, memset,
> +			      HAS_ARCH_FEATURE (AVX512F_Usable),
> +			      __memset_avx512_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, memset,
> +			      HAS_ARCH_FEATURE (AVX512F_Usable),
>  			      __memset_avx512_no_vzeroupper)
>  #endif
>  	     )
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> new file mode 100644
> index 0000000..e0dc565
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -0,0 +1,14 @@
> +#define VEC_SIZE	32
> +#define VEC(i)		ymm##i
> +#define VMOVU		vmovdqu
> +#define VMOVA		vmovdqa
> +
> +#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +  vmovd d, %xmm0; \
> +  movq r, %rax; \
> +  vpbroadcastb %xmm0, %ymm0
> +
> +#define SECTION(p)		p##.avx
> +#define MEMSET_SYMBOL(p,s)	p##_avx2_##s

OK.
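
For reference, the splat corresponds to roughly the intrinsic sequence
below (illustrative, assumes -mavx2); the movq into %rax only saves the
destination pointer as memset's return value:

#include <immintrin.h>

/* Roughly the AVX2 VDUP_TO_VEC0_AND_SET_RETURN broadcast.  */
static __m256i
splat_byte_avx2 (int c)
{
  __m128i x = _mm_cvtsi32_si128 (c);   /* vmovd         */
  return _mm256_broadcastb_epi8 (x);   /* vpbroadcastb  */
}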

> +
> +#include "memset-vec-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> new file mode 100644
> index 0000000..72f4095
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -0,0 +1,17 @@
> +#ifdef HAVE_AVX512_ASM_SUPPORT
> +# define VEC_SIZE	64
> +# define VEC(i)		zmm##i
> +# define VMOVU		vmovdqu64
> +# define VMOVA		vmovdqa64
> +
> +# define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +  vmovd d, %xmm0; \
> +  movq r, %rax; \
> +  vpbroadcastb %xmm0, %xmm0; \
> +  vpbroadcastq %xmm0, %zmm0
> +
> +# define SECTION(p)		p##.avx512
> +# define MEMSET_SYMBOL(p,s)	p##_avx512_##s

OK.
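
Likewise, the two-step AVX512 splat is roughly (illustrative, assumes
-mavx2 -mavx512f):

#include <immintrin.h>

/* Byte across xmm0, then qword across zmm0.  */
static __m512i
splat_byte_avx512 (int c)
{
  __m128i x = _mm_cvtsi32_si128 (c);    /* vmovd         */
  x = _mm_broadcastb_epi8 (x);          /* vpbroadcastb  */
  return _mm512_broadcastq_epi64 (x);   /* vpbroadcastq  */
}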

> +
> +# include "memset-vec-unaligned-erms.S"
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> new file mode 100644
> index 0000000..437a858
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -0,0 +1,16 @@
> +#define VEC_SIZE	16
> +#define VEC(i)		xmm##i
> +#define VMOVU		movdqu
> +#define VMOVA		movdqa
> +
> +#define VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> +  movd d, %xmm0; \
> +  movq r, %rax; \
> +  punpcklbw %xmm0, %xmm0; \
> +  punpcklwd %xmm0, %xmm0; \
> +  pshufd $0, %xmm0, %xmm0
> +
> +#define SECTION(p)		p
> +#define MEMSET_SYMBOL(p,s)	p##_sse2_##s

OK.
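
And the SSE2 splat, which has no byte-broadcast instruction to lean on
(pshufb is SSSE3), is roughly (illustrative):

#include <emmintrin.h>

/* Widen the low byte step by step until it fills all 16 lanes.  */
static __m128i
splat_byte_sse2 (int c)
{
  __m128i x = _mm_cvtsi32_si128 (c);   /* movd       */
  x = _mm_unpacklo_epi8 (x, x);        /* punpcklbw  */
  x = _mm_unpacklo_epi16 (x, x);       /* punpcklwd  */
  return _mm_shuffle_epi32 (x, 0);     /* pshufd $0  */
}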

> +
> +#include "memset-vec-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> new file mode 100644
> index 0000000..dd04789
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -0,0 +1,246 @@
> +/* memset/bzero with unaligned store and rep stosb
> +   Copyright (C) 2016 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +/* memset is implemented as:
> +   1. Use overlapping store to avoid branch.
> +   2. Force 32-bit displacement for branches to avoid long nop between
> +      instructions.
> +   3. If size is less than VEC, use integer register stores.
> +   4. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
> +   5. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
> +   6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
> +      4 VEC stores and store 4 * VEC at a time until done.
> + */

Use GNU formatting please.

e.g.

/* foo */

not

/* foo
*/
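
Also for reference, the steps above correspond to roughly the following
C, shown for VEC_SIZE == 16 with unaligned stores throughout; a sketch
of the control flow only, not the actual implementation:

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE 16

static void
store_vec (unsigned char *p, __m128i v)
{
  _mm_storeu_si128 ((__m128i *) p, v);
}

static void *
memset_sketch (void *dst, int c, size_t n)
{
  unsigned char *d = dst;
  __m128i v = _mm_set1_epi8 ((char) c);

  if (n < VEC_SIZE)
    {
      /* Step 3: the real code uses overlapping 8/4/2/1-byte integer
         stores; a byte loop keeps the sketch short.  */
      for (size_t i = 0; i < n; i++)
        d[i] = (unsigned char) c;
    }
  else if (n <= 2 * VEC_SIZE)
    {
      /* Step 4: two overlapping stores, branch-free in the length.  */
      store_vec (d + n - VEC_SIZE, v);
      store_vec (d, v);
    }
  else if (n <= 4 * VEC_SIZE)
    {
      /* Step 5: two stores from the front, two from the back.  */
      store_vec (d, v);
      store_vec (d + VEC_SIZE, v);
      store_vec (d + n - VEC_SIZE, v);
      store_vec (d + n - 2 * VEC_SIZE, v);
    }
  else
    {
      /* Step 6: unaligned stores cover the first and last 4 * VEC_SIZE
         bytes; the aligned middle is filled 4 * VEC_SIZE at a time.
         The _erms entry point would branch to rep stosb instead when
         n > REP_STOSB_THRESHOLD.  */
      store_vec (d, v);
      store_vec (d + VEC_SIZE, v);
      store_vec (d + 2 * VEC_SIZE, v);
      store_vec (d + 3 * VEC_SIZE, v);
      store_vec (d + n - VEC_SIZE, v);
      store_vec (d + n - 2 * VEC_SIZE, v);
      store_vec (d + n - 3 * VEC_SIZE, v);
      store_vec (d + n - 4 * VEC_SIZE, v);
      unsigned char *p = (unsigned char *)
        (((uintptr_t) d + 4 * VEC_SIZE) & -(uintptr_t) (4 * VEC_SIZE));
      unsigned char *end = (unsigned char *)
        (((uintptr_t) d + n) & -(uintptr_t) (4 * VEC_SIZE));
      for (; p < end; p += 4 * VEC_SIZE)
        {
          store_vec (p, v);
          store_vec (p + VEC_SIZE, v);
          store_vec (p + 2 * VEC_SIZE, v);
          store_vec (p + 3 * VEC_SIZE, v);
        }
    }
  return dst;
}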

> +#include <sysdep.h>
> +
> +#ifndef VZEROUPPER
> +# if VEC_SIZE > 16
> +#  define VZEROUPPER			vzeroupper
> +# else
> +#  define VZEROUPPER
> +# endif
> +#endif
> +
> +#ifndef VZEROUPPER_SHORT_RETURN
> +# if VEC_SIZE > 16
> +#  define VZEROUPPER_SHORT_RETURN	vzeroupper
> +# else
> +#  define VZEROUPPER_SHORT_RETURN	rep
> +# endif
> +#endif
> +
> +#ifndef MOVQ
> +# if VEC_SIZE > 16
> +#  define MOVQ				vmovq
> +# else
> +#  define MOVQ				movq
> +# endif
> +#endif
> +
> +/* Threshold to use Enhanced REP STOSB.  */
> +#ifndef REP_STOSB_THRESHOLD
> +# define REP_STOSB_THRESHOLD	(1024 * (VEC_SIZE / 16))

Same question as your other patch. How are we selecting this threshold?
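
In case it helps that discussion, the crossover is easy to measure
directly.  A rough harness along these lines (inline rep stosb against
the installed memset; not the bench-memset infrastructure, and the
sizes and iteration counts are arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* rep stosb: RDI = destination, RCX = count, AL = byte.  */
static void
rep_stosb (void *dst, int c, size_t n)
{
  asm volatile ("rep stosb"
                : "+D" (dst), "+c" (n)
                : "a" (c)
                : "memory");
}

static void
libc_memset (void *dst, int c, size_t n)
{
  memset (dst, c, n);
}

static double
time_one (void (*fn) (void *, int, size_t), void *buf, size_t n, int iters)
{
  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < iters; i++)
    fn (buf, i & 0xff, n);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  return ((t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec)) / iters;
}

int
main (void)
{
  enum { MAX = 64 * 1024, ITERS = 100000 };
  unsigned char *buf = malloc (MAX);
  for (size_t n = 128; n <= MAX; n *= 2)
    printf ("%6zu bytes: rep stosb %7.1f ns, memset %7.1f ns\n", n,
            time_one (rep_stosb, buf, n, ITERS),
            time_one (libc_memset, buf, n, ITERS));
  return (int) buf[0];   /* keep the stores live  */
}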

> +#endif
> +
> +#ifndef SECTION
> +# error SECTION is not defined!
> +#endif
> +
> +#if !defined USE_MULTIARCH && IS_IN (libc)
> +	.section SECTION(.text),"ax",@progbits
> +ENTRY (__bzero)
> +	movq	%rdi, %rax /* Set return value.  */
> +	movq	%rsi, %rdx /* Set n.  */
> +	pxor	%xmm0, %xmm0
> +	jmp	L(entry_from_bzero)
> +END (__bzero)
> +weak_alias (__bzero, bzero)
> +#endif
> +
> +#if defined SHARED && IS_IN (libc)
> +ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
> +	cmpq	%rdx, %rcx
> +	jb	HIDDEN_JUMPTARGET (__chk_fail)
> +END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned))
> +#endif
> +
> +ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> +L(memset_entry):
> +	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +L(entry_from_bzero):
> +	cmpq	$VEC_SIZE, %rdx
> +	jb	L(less_vec)
> +	cmpq	$(VEC_SIZE * 2), %rdx
> +	ja	L(more_2x_vec)
> +	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
> +	VMOVU	%VEC(0), (%rdi)
> +	VZEROUPPER
> +	ret
> +END (MEMSET_SYMBOL (__memset, unaligned))
> +
> +#if VEC_SIZE == 16
> +/* Only used to measure performance of REP STOSB.  */
> +ENTRY (__memset_erms)
> +#else
> +/* Provide a symbol to debugger.  */
> +ENTRY (MEMSET_SYMBOL (__memset, erms))
> +#endif
> +L(stosb):
> +	movq	%rdx, %rcx
> +	movzbl	%sil, %eax
> +	movq	%rdi, %rdx
> +	rep stosb
> +	movq	%rdx, %rax
> +	ret
> +#if VEC_SIZE == 16
> +END (__memset_erms)
> +#else
> +END (MEMSET_SYMBOL (__memset, erms))
> +#endif
> +
> +#if defined SHARED && IS_IN (libc)
> +ENTRY_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
> +	cmpq	%rdx, %rcx
> +	jb	HIDDEN_JUMPTARGET (__chk_fail)
> +END_CHK (MEMSET_SYMBOL (__memset_chk, unaligned_erms))
> +#endif
> +
> +ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
> +	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +	cmpq	$VEC_SIZE, %rdx
> +	jb	L(less_vec)
> +	cmpq	$(VEC_SIZE * 2), %rdx
> +	ja	L(stosb_more_2x_vec)
> +	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
> +	VMOVU	%VEC(0), (%rdi)
> +	VZEROUPPER
> +	ret
> +
> +	.p2align 4
> +L(stosb_more_2x_vec):
> +	cmpq	$REP_STOSB_THRESHOLD, %rdx
> +	/* Force 32-bit displacement to avoid long nop between
> +	   instructions.  */
> +	ja.d32	L(stosb)
> +	.p2align 4
> +L(more_2x_vec):
> +	cmpq  $(VEC_SIZE * 4), %rdx
> +	ja	L(loop_start)
> +	VMOVU	%VEC(0), (%rdi)
> +	VMOVU	%VEC(0), VEC_SIZE(%rdi)
> +	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
> +	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
> +L(return):
> +	VZEROUPPER
> +	ret
> +
> +	.p2align 4
> +L(loop_start):
> +	leaq	(VEC_SIZE * 4)(%rdi), %rcx
> +	VMOVU	%VEC(0), (%rdi)
> +	andq	$-(VEC_SIZE * 4), %rcx
> +	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
> +	VMOVU	%VEC(0), VEC_SIZE(%rdi)
> +	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
> +	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
> +	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
> +	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
> +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
> +	addq	%rdi, %rdx
> +	andq	$-(VEC_SIZE * 4), %rdx
> +	cmpq	%rdx, %rcx
> +# if VEC_SIZE == 32 || VEC_SIZE == 64
> +	/* Force 32-bit displacement to avoid long nop between
> +	   instructions.  */
> +	je.d32	L(return)
> +# else
> +	je	L(return)
> +# endif
> +	.p2align 4
> +L(loop):
> +	VMOVA	%VEC(0), (%rcx)
> +	VMOVA	%VEC(0), VEC_SIZE(%rcx)
> +	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
> +	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
> +	addq	$(VEC_SIZE * 4), %rcx
> +	cmpq	%rcx, %rdx
> +	jne	L(loop)
> +	VZEROUPPER_SHORT_RETURN
> +	ret
> +L(less_vec):
> +	/* Less than 1 VEC.  */
> +# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> +#  error Unsupported VEC_SIZE!
> +# endif
> +# if VEC_SIZE > 32
> +	cmpb	$32, %dl
> +	jae	L(between_32_63)
> +# endif
> +# if VEC_SIZE > 16
> +	cmpb	$16, %dl
> +	jae	L(between_16_31)
> +# endif
> +	MOVQ	%xmm0, %rcx
> +	cmpb	$8, %dl
> +	jae	L(between_8_15)
> +	cmpb	$4, %dl
> +	jae	L(between_4_7)
> +	cmpb	$1, %dl
> +	ja	L(between_2_3)
> +	jb	1f
> +	movb	%cl, (%rdi)
> +1:
> +	VZEROUPPER
> +	ret
> +# if VEC_SIZE > 32
> +	/* From 32 to 63.  No branch when size == 32.  */
> +L(between_32_63):
> +	vmovdqu	%ymm0, -32(%rdi,%rdx)
> +	vmovdqu	%ymm0, (%rdi)
> +	VZEROUPPER
> +	ret
> +# endif
> +# if VEC_SIZE > 16
> +	/* From 16 to 31.  No branch when size == 16.  */
> +L(between_16_31):
> +	vmovdqu	%xmm0, -16(%rdi,%rdx)
> +	vmovdqu	%xmm0, (%rdi)
> +	VZEROUPPER
> +	ret
> +# endif
> +	/* From 8 to 15.  No branch when size == 8.  */
> +L(between_8_15):
> +	movq	%rcx, -8(%rdi,%rdx)
> +	movq	%rcx, (%rdi)
> +	VZEROUPPER
> +	ret
> +L(between_4_7):
> +	/* From 4 to 7.  No branch when size == 4.  */
> +	movl	%ecx, -4(%rdi,%rdx)
> +	movl	%ecx, (%rdi)
> +	VZEROUPPER
> +	ret
> +L(between_2_3):
> +	/* From 2 to 3.  No branch when size == 2.  */
> +	movw	%cx, -2(%rdi,%rdx)
> +	movw	%cx, (%rdi)
> +	VZEROUPPER
> +	ret
> +END (MEMSET_SYMBOL (__memset, unaligned_erms))
> -- 2.5.5

OK.

-- 
Cheers,
Carlos.

