This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Re: [PATCH RFC] Improve 64bit memcpy performance for Haswell CPU with AVX instruction
- From: Ling Ma <ling dot ma dot program at gmail dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: libc-alpha at sourceware dot org, liubov dot dmitrieva at gmail dot com, "H.J. Lu" <hjl dot tools at gmail dot com>
- Date: Fri, 18 Apr 2014 16:02:59 +0800
- Subject: Re: [PATCH RFC] Improve 64bit memcpy performance for Haswell CPU with AVX instruction
- Authentication-results: sourceware.org; auth=none
- References: <1396595862-21707-1-git-send-email-ling dot ma dot program at gmail dot com> <20140410225018 dot GD9478 at domone dot podge>
Ondra,
As with memset, we changed the code as below and have sent the new version out.
Thanks
Ling
2014-04-11 6:50 GMT+08:00, Ondřej Bílka <neleai@seznam.cz>:
> On Fri, Apr 04, 2014 at 03:17:42AM -0400, ling.ma.program@gmail.com wrote:
>> From: Ling Ma <ling.ml@alibaba-inc.com>
>>
>> In this patch we manage to reduce branch mispredictions by
>> avoiding branch instructions and forcing the destination to be aligned
>> with AVX instructions.
>>
>> The CPU2006 403.gcc benchmark also indicates this patch improves
>> performance by 2% to 12% and by 2% to 21% compared with the original
>> memcpy implemented with SSE2 and SSSE3 respectively.
>>
>>                  memcpy-AVX  memcpy-SSE2  memcpy-SSSE3  AVX vs SSE2  AVX vs SSSE3
>> gcc.166.i         302551459    332189574     345378682  1.097960575   1.141553517
>> gcc.200.i         138036144    155904648     168229120  1.129448009   1.218732392
>> gcc.cp-decl.i     283963419    296759183     312970805  1.045061311   1.102151841
>> gcc.c-typeck.i    616484068    664855801     682119551  1.078463882   1.106467444
>> gcc.expr2.i       781639964    858486085     893803320  1.098313961   1.143497468
>> gcc.expr.i        580765337    593709446     596005444  1.022288019   1.02624142
>> gcc.g23.i        1063726457   1162692750    1177232886  1.093037352   1.106706408
>> gcc.s04.i         892109530    948328853     963836294  1.063018409   1.080401298
>> gcc.scilab.i       62298843     66606465      72922104  1.069144494   1.170521
>>
>
> similar comments as for memset here.
Fixed in new version
>> +/* memcpy with AVX
>> + Copyright (C) 2014 Free Software Foundation, Inc.
>> + Contributed by Alibaba Group.
> no 'Contributed by' lines now.
>
>> +#include "asm-syntax.h"
>> +#ifndef ALIGN
>> +# define ALIGN(n) .p2align n
>> +#endif
>
> expand to p2align
Fixed in new version
>
>> + lea (%rsi, %rdx), %r8
>> + lea (%rdi, %rdx), %r9
>
> Using rcx instead of r8 saves a byte; changing r9 needs some work. Also, could
> you save something by using ymm registers, or is that killed by latency?
We tested performance with ymm, but it hurt performance, so we keep the
original version.
>
>> + cmp $256, %rdx
>> + ja L(256bytesormore)
>> + cmp $128, %edx
>> + jb L(less_128bytes)
>> + vmovups (%rsi), %xmm0
>> + vmovups 0x10(%rsi), %xmm1
>> + vmovups 0x20(%rsi), %xmm2
>
> snip
Fixed in new version
>
>> + ALIGN(4)
>> +L(less_16bytes):
>> + cmp $8, %edx
>> + jb L(less_8bytes)
>> + movq (%rsi), %rcx
>> + movq -0x08(%r8), %r10
>> + movq %rcx, (%rdi)
>> + movq %r10, -0x08(%r9)
>
> Using rdx instead of r10 saves 2 bytes.
Fixed with a similar method in the new version
>
>> +L(less_4bytes):
>> + cmp $2, %edx
>> + jb L(less_2bytes)
>> + mov (%rsi), %cx
>> + mov -0x02(%r8), %dx
>> + mov %cx, (%rdi)
>> + mov %dx, -0x02(%r9)
>> + ret
>> + ALIGN(4)
>> +L(less_2bytes):
>> + cmp $1, %rdx
>> + jb L(less_0bytes)
>> + mov (%rsi), %cl
>> + mov %cl, (%rdi)
>> +L(less_0bytes):
>> + ret
>> +
> again you could save a comparison here.
Fixed in new version
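For reference, the small-size paths above boil down to the following rough C
sketch (a hypothetical helper, not code from the patch): two possibly overlapping
8-byte accesses cover any length from 8 to 16 bytes without a loop or a per-size
branch.

    #include <string.h>
    #include <stdint.h>

    /* Sketch of L(less_16bytes): copy n bytes, 8 <= n <= 16, with two
       possibly overlapping 8-byte moves.  */
    static void copy_8_to_16 (char *dst, const char *src, size_t n)
    {
      uint64_t head, tail;
      memcpy (&head, src, 8);            /* movq (%rsi), %rcx      */
      memcpy (&tail, src + n - 8, 8);    /* movq -0x08(%r8), %r10  */
      memcpy (dst, &head, 8);            /* movq %rcx, (%rdi)      */
      memcpy (dst + n - 8, &tail, 8);    /* movq %r10, -0x08(%r9)  */
    }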
>
>> + ALIGN(4)
>> +L(256bytesormore):
>> +
>> +#ifdef USE_AS_MEMMOVE
>> + cmp %rsi, %rdi
>> + jae L(copy_backward)
>> +#endif
>
> this could be an unpredictable branch; copying backward only when the regions
> actually overlap would be better.
If we check whether the regions actually overlap, we have to introduce another
branch instruction, so we keep it as is.
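In C terms the two options look roughly like this (a hypothetical sketch with
made-up helper names, not the patch itself); the suggested form needs the extra
comparison against src + n that we want to avoid.

    #include <stddef.h>

    /* Hypothetical helpers standing in for the forward and backward loops.  */
    void copy_forward  (char *dst, const char *src, size_t n);
    void copy_backward (char *dst, const char *src, size_t n);

    /* Patch as posted: one comparison, copy backward whenever dst >= src
       (cmp %rsi, %rdi; jae L(copy_backward)).  */
    void move_as_posted (char *dst, const char *src, size_t n)
    {
      if (dst >= src)
        copy_backward (dst, src, n);
      else
        copy_forward (dst, src, n);
    }

    /* Suggested alternative: copy backward only on a real overlap, which
       costs a second comparison against src + n.  */
    void move_overlap_only (char *dst, const char *src, size_t n)
    {
      if (dst > src && dst < src + n)
        copy_backward (dst, src, n);
      else
        copy_forward (dst, src, n);
    }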
>
>> + mov %rdi, %r10
>> + cmp $2048, %rdx
>> + jae L(gobble_data_movsb)
>> + vmovups -0x80(%r8), %xmm8
>> + vmovups -0x70(%r8), %xmm9
>> + and $-32, %rdi
>> + add $32, %rdi
>> + vmovups -0x60(%r8), %xmm10
>> + vmovups -0x50(%r8), %xmm11
>> + mov %rdi, %r11
>> + sub %r10, %r11
>> + vmovups -0x40(%r8), %xmm12
>> + vmovups -0x30(%r8), %xmm13
>> + sub %r11, %rdx
>> + vmovups -0x20(%r8), %xmm14
>> + vmovups -0x10(%r8), %xmm15
>> + vmovups (%rsi), %ymm4
>> + add %r11, %rsi
>
> does moving the vmovups %xmm8, -0x80(%r9)... stores here help?
>
We changed and tested the code as below; the original code is better, as with memset.
cmp $2048, %rdx
jae L(gobble_data_movsb)
lea (%rdi, %rdx), %r9
.....
vmovdqu %xmm5, -0x80(%r9)
vmovdqu %xmm6, -0x70(%r9)
vmovdqu %xmm7, -0x60(%r9)
vmovdqu %xmm8, -0x50(%r9)
vmovdqu %xmm9, -0x40(%r9)
vmovdqu %xmm10, -0x30(%r9)
vmovdqu %xmm11, -0x20(%r9)
vmovdqu %xmm12, -0x10(%r9)
L(goble_128_loop):
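For reference, the original 256..2048-byte path quoted further up (the and/add
alignment and the xmm8-xmm15 tail saves) does roughly the following; this is a
hypothetical C rendering with AVX intrinsics, not the actual patch.

    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Sketch of the 256..2048-byte path, assuming 256 < n < 2048.  The
       unaligned head and the last 128 bytes are loaded before the loop,
       so the loop needs no remainder handling and no extra branches.  */
    static void copy_256_to_2048 (char *dst, const char *src, size_t n)
    {
      char *dst_end = dst + n;
      const char *src_end = src + n;

      /* Save the first 32 bytes and the last 128 bytes of the source.  */
      __m256i head = _mm256_loadu_si256 ((const __m256i *) src);
      __m128i tail[8];
      for (int i = 0; i < 8; i++)
        tail[i] = _mm_loadu_si128 ((const __m128i *) (src_end - 128 + 16 * i));

      /* Round the destination up to the next 32-byte boundary
         (the "and $-32, %rdi; add $32, %rdi" sequence).  */
      char *adst = (char *) ((((uintptr_t) dst) & ~(uintptr_t) 31) + 32);
      size_t slack = (size_t) (adst - dst);
      src += slack;
      n -= slack;

      /* Main loop: 128 bytes per iteration, unaligned loads, aligned stores.  */
      while (n >= 128)
        {
          __m256i a = _mm256_loadu_si256 ((const __m256i *) src);
          __m256i b = _mm256_loadu_si256 ((const __m256i *) (src + 32));
          __m256i c = _mm256_loadu_si256 ((const __m256i *) (src + 64));
          __m256i d = _mm256_loadu_si256 ((const __m256i *) (src + 96));
          _mm256_store_si256 ((__m256i *) adst, a);
          _mm256_store_si256 ((__m256i *) (adst + 32), b);
          _mm256_store_si256 ((__m256i *) (adst + 64), c);
          _mm256_store_si256 ((__m256i *) (adst + 96), d);
          src += 128;
          adst += 128;
          n -= 128;
        }

      /* The saved head covers the unaligned start; the saved tail covers
         whatever the loop left unwritten at the end.  */
      _mm256_storeu_si256 ((__m256i *) dst, head);
      for (int i = 0; i < 8; i++)
        _mm_storeu_si128 ((__m128i *) (dst_end - 128 + 16 * i), tail[i]);
    }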
> Also check if alignment for the loop helps.
Fixed in the new version.
>
>> + sub $0x80, %rdx
>> +L(goble_128_loop):
>> + vmovups (%rsi), %ymm0
>> + vmovups 0x20(%rsi), %ymm1
>> + vmovups 0x40(%rsi), %ymm2
>> + vmovups 0x60(%rsi), %ymm3
>> + lea 0x80(%rsi), %rsi
>> + vmovaps %ymm0, (%rdi)
>> + vmovaps %ymm1, 0x20(%rdi)
>> + vmovaps %ymm2, 0x40(%rdi)
>> + vmovaps %ymm3, 0x60(%rdi)
>> + lea 0x80(%rdi), %rdi
>> + sub $0x80, %rdx
>> + jae L(goble_128_loop)
>> + vmovups %ymm4, (%r10)
>> + vzeroupper
>> + vmovups %xmm8, -0x80(%r9)
>> + vmovups %xmm9, -0x70(%r9)
>> + vmovups %xmm10, -0x60(%r9)
>> + vmovups %xmm11, -0x50(%r9)
>> + vmovups %xmm12, -0x40(%r9)
>> + vmovups %xmm13, -0x30(%r9)
>> + vmovups %xmm14, -0x20(%r9)
>> + vmovups %xmm15, -0x10(%r9)
>> + ret
>> +
>> +L(gobble_data_movsb):
>> +
>> +#ifdef SHARED_CACHE_SIZE_HALF
>> + mov $SHARED_CACHE_SIZE_HALF, %rcx
>> +#else
>> + mov __x86_shared_cache_size_half(%rip), %rcx
>
> same typo.
The same answer as for memset.
>
>
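Putting the thresholds from the quoted assembly together, the overall dispatch is
roughly the following hypothetical C sketch (helper names are invented; what the
large path does after consulting half the shared cache size is an assumption based
on the L(gobble_data_movsb) label).

    #include <stddef.h>

    void copy_small    (char *dst, const char *src, size_t n);  /* n <= 256        */
    void copy_avx_loop (char *dst, const char *src, size_t n);  /* 256 < n < 2048  */
    void copy_large    (char *dst, const char *src, size_t n);  /* n >= 2048       */

    void dispatch (char *dst, const char *src, size_t n)
    {
      if (n <= 256)              /* cmp $256, %rdx; ja L(256bytesormore)      */
        copy_small (dst, src, n);
      else if (n < 2048)         /* cmp $2048, %rdx; jae L(gobble_data_movsb) */
        copy_avx_loop (dst, src, n);
      else
        /* Compares n against __x86_shared_cache_size_half and then,
           presumably, uses rep movsb or non-temporal stores.  */
        copy_large (dst, src, n);
    }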