This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Re: [PATCH RFC] Improve 64bit memset performance for Haswell CPU with AVX2 instruction


On Mon, Apr 07, 2014 at 01:57:18AM -0400, ling.ma.program@gmail.com wrote:
> From: Ling Ma <ling.ml@alibaba-inc.com>
> 
> In this patch we take advantage of HSW memory bandwidth, manage to
> reduce branch mispredictions by avoiding branch instructions, and
> force the destination to be aligned using AVX instructions.
> 
> The CPU2006 403.gcc benchmark also indicates this patch improves performance
> by 22.9% to 59% compared with the original memset implemented with SSE2.
>
Looks mostly OK except for some mostly mechanical changes below.


> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
> new file mode 100644
> index 0000000..5d4a487
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
> @@ -0,0 +1,192 @@
> +/* memset with AVX2
> +   Copyright (C) 2014 Free Software Foundation, Inc.
> +   Contributed by Alibaba Group.

We no longer add "Contributed by" lines.

> +#include "asm-syntax.h"
> +#ifndef ALIGN
> +# define ALIGN(n)	.p2align n
> +#endif

Also, in the meantime we decided to remove the ALIGN macro, so drop that and
do s/ALIGN(3)/.p2align 4/ and s/ALIGN(4)/.p2align 4/.
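
In other words, each ALIGN use in the new file would become something like:

	-	ALIGN(4)
	+	.p2align 4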

> +
> +ENTRY (MEMSET)
> +	vpxor	%xmm0, %xmm0, %xmm0
> +	vmovd %esi, %xmm1
> +	lea	(%rdi, %rdx), %r8

< snip >

> +	vmovups %xmm0, 0x70(%rdi)
> +	vmovups %xmm0, -0x80(%r8)

I would globally replace %r8 with %rsi; that makes each instruction a byte
shorter, since the legacy register does not need the extra REX/three-byte-VEX
prefix byte. The same applies to %r9.
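
For example, assuming the end pointer is kept in %rsi (which is free to reuse
once the fill byte has been broadcast):

	lea	(%rdi, %rdx), %rsi	/* end pointer in a legacy register */
	...
	vmovups	%xmm0, 0x70(%rdi)
	vmovups	%xmm0, -0x80(%rsi)	/* 2-byte VEX prefix instead of 3-byte */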

> +L(less_4bytes):
> +	cmp	$2, %edx
> +	jb	L(less_2bytes)
> +	mov	%cx, (%rdi)
> +	mov	%cx, -0x02(%r8)
> +	ret
> +	ALIGN(4)
> +L(less_2bytes):
> +	cmp	$1, %edx
> +	jb	L(less_1bytes)
> +	mov	%cl, (%rdi)
> +L(less_1bytes):
> +	ret
> +

Here the current implementation saves one comparison (the jb below reuses
the flags set by the first cmp):

L(less_4bytes):
     cmp     $1, %edx
     jbe     L(less_2bytes)
     mov     %cx, (%rdi)
     mov     %cx, -0x02(%r8)
     ret
     ALIGN(4)
L(less_2bytes):
     jb      L(less_1bytes)
     mov     %cl, (%rdi)
L(less_1bytes):
     ret

> +	ALIGN(4)
> +L(256bytesormore):
> +	vinserti128 $1, %xmm0, %ymm0, %ymm0
> +	vmovups	%ymm0, (%rdi)
> +	mov	%rdi, %r9
> +	and	$-0x20, %rdi
> +	add	$32, %rdi
> +	sub	%rdi, %r9
> +	add	%r9, %rdx
> +	cmp	$4096, %rdx
> +	ja	L(gobble_data)
> +
> +	sub	$0x80, %rdx
> +L(gobble_128_loop):
> +	vmovaps	%ymm0, (%rdi)
> +	vmovaps	%ymm0, 0x20(%rdi)
> +	vmovaps	%ymm0, 0x40(%rdi)
> +	vmovaps	%ymm0, 0x60(%rdi)
> +	lea	0x80(%rdi), %rdi
> +	sub	$0x80, %rdx
> +	jae	L(gobble_128_loop)
> +	vmovups	%ymm0, -0x80(%r8)
> +	vmovups	%ymm0, -0x60(%r8)
> +	vmovups	%ymm0, -0x40(%r8)
> +	vmovups	%ymm0, -0x20(%r8)
> +	vzeroupper
> +	ret
> +

I looked into this with objdump and the loop is misaligned by 5 bytes, which
could be a problem if Haswell cannot handle that well.
If you align the loop as below, does it improve performance?  (The five
one-byte rets are never executed; they are just padding to push the loop
below onto a 16-byte boundary.)

	.p2align 4
	ret; ret; ret; ret; ret
L(256bytesormore):


Also, in this pattern:

> +
> +	sub	$0x80, %rdx

gcc saves three bytes here by using

	add	$-0x80, %rdx

instead, since -0x80 fits in a sign-extended 8-bit immediate while 0x80 needs
a full 32-bit one.
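
For reference, these are the encodings gas should emit for the two forms:

	sub	$0x80, %rdx	/* 48 81 ea 80 00 00 00    7 bytes, imm32 */
	add	$-0x80, %rdx	/* 48 83 c2 80             4 bytes, imm8  */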

A third possible optimization is to move the trailing vmovups stores before
the loop, which improves latency, but it needs to be tested on Haswell.
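
Roughly (untested, keeping %r8 as in the patch; the tail stores write the same
value, so it is harmless that the loop may overwrite part of that range
afterwards):

	sub	$0x80, %rdx
	vmovups	%ymm0, -0x80(%r8)
	vmovups	%ymm0, -0x60(%r8)
	vmovups	%ymm0, -0x40(%r8)
	vmovups	%ymm0, -0x20(%r8)
L(gobble_128_loop):
	vmovaps	%ymm0, (%rdi)
	vmovaps	%ymm0, 0x20(%rdi)
	vmovaps	%ymm0, 0x40(%rdi)
	vmovaps	%ymm0, 0x60(%rdi)
	lea	0x80(%rdi), %rdi
	sub	$0x80, %rdx
	jae	L(gobble_128_loop)
	vzeroupper
	ret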


> +	ALIGN(4)
> +L(gobble_data):
> +#ifdef SHARED_CACHE_SIZE_HALF
> +	mov	$SHARED_CACHE_SIZE_HALF, %r9
> +#else
> +	mov	__x86_shared_cache_size_half(%rip), %r9
> +#endif

Typo here: this should be __x86_64_shared_cache_size_half.

> +	shl	$4, %r9
> +	cmp	%r9, %rdx
> +	ja	L(gobble_big_data)
> +	mov	%rax, %r9
> +	mov	%esi, %eax
> +	mov	%rdx, %rcx
> +	rep	stosb

How does this compare with the stosq equivalent?
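
A rough sketch of what a stosq variant could look like, assuming %rcx still
holds the fill byte replicated across the whole register at this point (the
2-byte %cx stores earlier suggest at least the low word is), with a final
unaligned store covering the rdx % 8 tail:

	mov	%rax, %r9
	mov	%rcx, %rax		/* 8-byte fill pattern for stosq */
	mov	%rdx, %rcx
	shr	$3, %rcx
	rep	stosq
	vmovups	%ymm0, -0x20(%r8)	/* covers the remaining rdx % 8 bytes */
	mov	%r9, %rax
	vzeroupper
	ret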

> +	mov	%r9, %rax
> +	vzeroupper
> +	ret
> +

