This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



Re: [PATCH RFC] Improve 64bit memcpy performance for Haswell CPU with AVX instruction


Ondra,

As with memset, we changed the code as described below and have sent out a
new version.

Thanks
Ling

2014-04-11 6:50 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>:
> On Fri, Apr 04, 2014 at 03:17:42AM -0400, ling.ma.program@gmail.com wrote:
>> From: Ling Ma <ling.ml@alibaba-inc.com>
>>
>> In this patch we reduce branch mispredictions by avoiding branch
>> instructions, and we force the destination to be aligned for the AVX
>> instructions.
>>
>> The CPU2006 403.gcc benchmark also indicates that this patch improves
>> performance by 2% to 12% and by 2% to 21% compared with the original
>> memcpy implemented with SSE2 and SSSE3, respectively.
>>
>>                 memcpy-AVX    memcpy-SSE2   memcpy-SSSE3  AVX vs SSE2  AVX vs SSSE3
>> gcc.166.i       302551459     332189574     345378682     1.097960575  1.141553517
>> gcc.200.i       138036144     155904648     168229120     1.129448009  1.218732392
>> gcc.cp-decl.i   283963419     296759183     312970805     1.045061311  1.102151841
>> gcc.c-typeck.i  616484068     664855801     682119551     1.078463882  1.106467444
>> gcc.expr2.i     781639964     858486085     893803320     1.098313961  1.143497468
>> gcc.expr.i      580765337     593709446     596005444     1.022288019  1.02624142
>> gcc.g23.i       1063726457    1162692750    1177232886    1.093037352  1.106706408
>> gcc.s04.i       892109530     948328853     963836294     1.063018409  1.080401298
>> gcc.scilab.i    62298843      66606465      72922104      1.069144494  1.170521
>>
>
> Similar comments as for memset apply here.

Fixed in new version
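
To illustrate the idea from the description: the branch avoidance comes
from copying the head and the tail of the buffer with unaligned vector
moves that are allowed to overlap, so no byte loop and no per-size
branches are needed once the size class is known. A minimal C sketch of
that trick at 16-byte width (illustrative only, made-up names, needs SSE2;
not the patch code itself):

    #include <immintrin.h>
    #include <stddef.h>

    /* Copy 16..32 bytes: the two 16-byte moves may overlap in the
       middle, which is harmless and avoids any loop or extra branch.  */
    static void copy_16_to_32 (char *dst, const char *src, size_t n)
    {
      __m128i head = _mm_loadu_si128 ((const __m128i *) src);
      __m128i tail = _mm_loadu_si128 ((const __m128i *) (src + n - 16));
      _mm_storeu_si128 ((__m128i *) dst, head);
      _mm_storeu_si128 ((__m128i *) (dst + n - 16), tail);
    }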

>> +/* memcpy with AVX
>> +   Copyright (C) 2014 Free Software Foundation, Inc.
>> +   Contributed by Alibaba Group.
> We no longer add "Contributed by" lines.
>
>> +#include "asm-syntax.h"
>> +#ifndef ALIGN
>> +# define ALIGN(n)	.p2align n
>> +#endif
>
> Expand this to .p2align directly instead of defining an ALIGN macro.

Fixed in new version
>
>> +	lea	(%rsi, %rdx), %r8
>> +	lea	(%rdi, %rdx), %r9
>
> Using %rcx instead of %r8 saves a byte; changing %r9 needs some work.
> Also, could you save something by using ymm registers here, or is that
> killed by latency?

We tested performance with ymm registers, but it hurt performance, so we
keep the original version.
>
>> +	cmp	$256, %rdx
>> +	ja	L(256bytesormore)
>> +	cmp	$128, %edx
>> +	jb	L(less_128bytes)
>> +	vmovups (%rsi), %xmm0
>> +	vmovups 0x10(%rsi), %xmm1
>> +	vmovups 0x20(%rsi), %xmm2
>
> snip
Fixed in new version
>
>> +	ALIGN(4)
>> +L(less_16bytes):
>> +	cmp	$8, %edx
>> +	jb	L(less_8bytes)
>> +	movq (%rsi),	%rcx
>> +	movq -0x08(%r8),	%r10
>> +	movq %rcx, (%rdi)
>> +	movq %r10, -0x08(%r9)
>
> %rdx instead of %r10 saves 2 bytes.

Fixed with a similar method in the new version.
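
The less_16bytes case above is the same overlap trick at 8-byte width, and
it is also why the end pointers (%r8 = src + len, %r9 = dst + len) are
computed up front: the tail load and store become fixed -0x08 offsets from
the end, independent of the length. Roughly in C (illustrative only):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Copy 8..16 bytes with two possibly overlapping 8-byte moves
       addressed from the start and from the end of the buffers.  */
    static void copy_8_to_16 (char *dst, const char *src, size_t n)
    {
      const char *src_end = src + n;    /* %r8 */
      char *dst_end = dst + n;          /* %r9 */
      uint64_t head, tail;

      memcpy (&head, src, 8);           /* movq (%rsi), %rcx */
      memcpy (&tail, src_end - 8, 8);   /* movq -0x08(%r8), %r10 */
      memcpy (dst, &head, 8);           /* movq %rcx, (%rdi) */
      memcpy (dst_end - 8, &tail, 8);   /* movq %r10, -0x08(%r9) */
    }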
>
>> +L(less_4bytes):
>> +	cmp	$2, %edx
>> +	jb	L(less_2bytes)
>> +	mov (%rsi),	%cx
>> +	mov -0x02(%r8),	%dx
>> +	mov %cx, (%rdi)
>> +	mov %dx, -0x02(%r9)
>> +	ret
>> +	ALIGN(4)
>> +L(less_2bytes):
>> +	cmp	$1, %rdx
>> +	jb	L(less_0bytes)
>> +	mov	(%rsi), %cl
>> +	mov	%cl,	(%rdi)
>> +L(less_0bytes):
>> +	ret
>> +
> Again, you could save a comparison here.
Fixed in new version
>
>> +	ALIGN(4)
>> +L(256bytesormore):
>> +
>> +#ifdef USE_AS_MEMMOVE
>> +	cmp	%rsi, %rdi
>> +	jae	L(copy_backward)
>> +#endif
>
> This could be an unpredictable branch; copying backward only when the
> regions actually overlap would be better.

If we check whether the regions overlap, we have to introduce another
branch instruction, so we keep it as it is.
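
To spell out the trade-off: testing for a real overlap needs one more
comparison than the single dst >= src test used in the patch, and the
dst >= src backward copy is still correct for memmove even when the
buffers do not overlap; the reviewer's concern is only that this branch
may be hard to predict. Conceptually, in C (illustrative only, not the
patch code):

    #include <stddef.h>

    /* Reviewer's suggestion: go backward only when a forward copy
       would really clobber not-yet-read source bytes.  */
    static int overlap_needs_backward (const char *dst, const char *src,
                                       size_t n)
    {
      return dst > src && dst < src + n;   /* two comparisons */
    }

    /* Patch's choice: one comparison; it also goes backward for
       non-overlapping dst > src, which is still correct for memmove.  */
    static int patch_goes_backward (const char *dst, const char *src,
                                    size_t n)
    {
      (void) n;
      return dst >= src;                   /* one comparison */
    }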
>
>> +	mov	%rdi, %r10
>> +	cmp	$2048, %rdx
>> +	jae	L(gobble_data_movsb)
>> +	vmovups -0x80(%r8), %xmm8
>> +	vmovups -0x70(%r8), %xmm9
>> +	and	$-32, %rdi
>> +	add	$32, %rdi
>> +	vmovups -0x60(%r8), %xmm10
>> +	vmovups -0x50(%r8), %xmm11
>> +	mov	%rdi, %r11
>> +	sub	%r10, %r11
>> +	vmovups -0x40(%r8), %xmm12
>> +	vmovups -0x30(%r8), %xmm13
>> +	sub	%r11, %rdx
>> +	vmovups -0x20(%r8), %xmm14
>> +	vmovups -0x10(%r8), %xmm15
>> +	vmovups	(%rsi), %ymm4
>> +	add	%r11, %rsi
>
> Does moving the vmovups %xmm8, -0x80(%r9)... stores up here help?
>
We changed and tested the code as below; the original code is better, as
with memset.
	cmp	$2048, %rdx
	jae	L(gobble_data_movsb)
	lea	(%rdi, %rdx), %r9
	.....
	vmovdqu	%xmm5, -0x80(%r9)
	vmovdqu	%xmm6, -0x70(%r9)
	vmovdqu	%xmm7, -0x60(%r9)
	vmovdqu	%xmm8, -0x50(%r9)
	vmovdqu	%xmm9, -0x40(%r9)
	vmovdqu	%xmm10, -0x30(%r9)
	vmovdqu	%xmm11, -0x20(%r9)
	vmovdqu	%xmm12, -0x10(%r9)
L(goble_128_loop):



> Also, check whether aligning the loop helps.
Fixed in the new version.
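
For reference, the structure of the 256-byte-to-2KB path being discussed
is roughly: save the first 32 and the last 128 bytes into registers,
round the destination up to a 32-byte boundary, run a 128-byte-per-
iteration loop with unaligned loads and aligned stores, and finally store
the saved head and tail so the loop needs no edge-case branches. A loose
C sketch with AVX intrinsics (illustrative names, assumptions as noted in
the comments; not the actual glibc code):

    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Sketch for 256 < n < 2048, compiled with -mavx.  */
    static void copy_256_to_2k (char *dst, const char *src, size_t n)
    {
      /* Save the first 32 bytes (ymm4 in the assembly) and the last
         128 bytes (xmm8-xmm15) before the pointers are adjusted.  */
      __m256i head = _mm256_loadu_si256 ((const __m256i *) src);
      __m256i t0 = _mm256_loadu_si256 ((const __m256i *) (src + n - 128));
      __m256i t1 = _mm256_loadu_si256 ((const __m256i *) (src + n - 96));
      __m256i t2 = _mm256_loadu_si256 ((const __m256i *) (src + n - 64));
      __m256i t3 = _mm256_loadu_si256 ((const __m256i *) (src + n - 32));

      char *dst_end = dst + n;

      /* and $-32, %rdi; add $32, %rdi: round the store pointer up to
         the next 32-byte boundary; the skipped prefix (at most 32
         bytes) is covered by the saved head.  */
      char *d = (char *) (((uintptr_t) dst & ~(uintptr_t) 31) + 32);
      const char *s = src + (d - dst);
      size_t left = (size_t) (dst_end - d);

      /* Main loop: unaligned 32-byte loads, aligned 32-byte stores,
         128 bytes per iteration.  */
      while (left >= 128)
        {
          __m256i a = _mm256_loadu_si256 ((const __m256i *) s);
          __m256i b = _mm256_loadu_si256 ((const __m256i *) (s + 32));
          __m256i c = _mm256_loadu_si256 ((const __m256i *) (s + 64));
          __m256i e = _mm256_loadu_si256 ((const __m256i *) (s + 96));
          _mm256_store_si256 ((__m256i *) d, a);
          _mm256_store_si256 ((__m256i *) (d + 32), b);
          _mm256_store_si256 ((__m256i *) (d + 64), c);
          _mm256_store_si256 ((__m256i *) (d + 96), e);
          s += 128;
          d += 128;
          left -= 128;
        }

      /* The skipped prefix is covered by the saved head, and whatever
         the loop left over (< 128 bytes) is covered by the saved tail,
         so plain unaligned stores finish the copy.  */
      _mm256_storeu_si256 ((__m256i *) dst, head);
      _mm256_storeu_si256 ((__m256i *) (dst_end - 128), t0);
      _mm256_storeu_si256 ((__m256i *) (dst_end - 96), t1);
      _mm256_storeu_si256 ((__m256i *) (dst_end - 64), t2);
      _mm256_storeu_si256 ((__m256i *) (dst_end - 32), t3);
    }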
>
>> +	sub	$0x80, %rdx
>> +L(goble_128_loop):
>> +	vmovups (%rsi), %ymm0
>> +	vmovups 0x20(%rsi), %ymm1
>> +	vmovups 0x40(%rsi), %ymm2
>> +	vmovups 0x60(%rsi), %ymm3
>> +	lea	0x80(%rsi), %rsi
>> +	vmovaps %ymm0, (%rdi)
>> +	vmovaps %ymm1, 0x20(%rdi)
>> +	vmovaps %ymm2, 0x40(%rdi)
>> +	vmovaps %ymm3, 0x60(%rdi)
>> +	lea	0x80(%rdi), %rdi
>> +	sub	$0x80, %rdx
>> +	jae	L(goble_128_loop)
>> +	vmovups	%ymm4, (%r10)
>> +	vzeroupper
>> +	vmovups %xmm8, -0x80(%r9)
>> +	vmovups %xmm9, -0x70(%r9)
>> +	vmovups %xmm10, -0x60(%r9)
>> +	vmovups %xmm11, -0x50(%r9)
>> +	vmovups %xmm12, -0x40(%r9)
>> +	vmovups %xmm13, -0x30(%r9)
>> +	vmovups %xmm14, -0x20(%r9)
>> +	vmovups %xmm15, -0x10(%r9)
>> +	ret
>> +
>> +L(gobble_data_movsb):
>> +
>> +#ifdef SHARED_CACHE_SIZE_HALF
>> +	mov	$SHARED_CACHE_SIZE_HALF, %rcx
>> +#else
>> +	mov	__x86_shared_cache_size_half(%rip), %rcx
>
> same typo.
The same answer as for memset.
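
For completeness, the L(gobble_data_movsb) path quoted above handles
copies of 2048 bytes and more and loads half of the shared cache size,
presumably to decide between rep movsb (fast on Haswell thanks to ERMS)
for cache-sized copies and non-temporal stores for copies much larger
than the cache. A rough C-level sketch of that kind of dispatch (the
threshold value and the names are made up, and the exact split is my
reading of the path, not necessarily the patch's):

    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Stand-in for __x86_shared_cache_size_half.  */
    static size_t shared_cache_half = 4 * 1024 * 1024;

    static void copy_large (char *dst, const char *src, size_t n)
    {
      if (n <= shared_cache_half)
        {
          /* Data still fits in cache: a plain rep movsb bulk copy.  */
          __asm__ volatile ("rep movsb"
                            : "+D" (dst), "+S" (src), "+c" (n)
                            :: "memory");
          return;
        }

      /* Much larger than the cache: copy bytes until dst is 32-byte
         aligned, then use non-temporal 32-byte stores so the copy does
         not evict useful cache lines.  */
      size_t i = 0;
      while (((uintptr_t) (dst + i) & 31) != 0)
        {
          dst[i] = src[i];
          i++;
        }
      for (; i + 32 <= n; i += 32)
        {
          __m256i v = _mm256_loadu_si256 ((const __m256i *) (src + i));
          _mm256_stream_si256 ((__m256i *) (dst + i), v);
        }
      _mm_sfence ();               /* order the non-temporal stores */
      for (; i < n; i++)           /* remaining tail bytes */
        dst[i] = src[i];
    }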
>
>

