This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH] x86_64: memcpy/memmove family optimized with AVX512
- From: Adhemerval Zanella <adhemerval dot zanella at linaro dot org>
- To: libc-alpha at sourceware dot org
- Date: Wed, 13 Jan 2016 16:36:51 -0200
- Subject: Re: [PATCH] x86_64: memcpy/memmove family optimized with AVX512
- Authentication-results: sourceware.org; auth=none
- References: <CAMXFM3uGLiFE+pKPzFgWP6Sx4C3w2Ktd4w3+35O0Bj=B1s0naA at mail dot gmail dot com> <CAMe9rOrTWce2vy0_OUWRH4FVRUSndpuSh2QpkmcMxgSBBEo0Fg at mail dot gmail dot com>
On 13-01-2016 12:21, H.J. Lu wrote:
> On Tue, Jan 12, 2016 at 6:13 AM, Andrew Senkevich
> <andrew.n.senkevich@gmail.com> wrote:
>> Hi,
>>
>> here is AVX512 implementations of memcpy, mempcpy, memmove,
>> memcpy_chk, mempcpy_chk, memmove_chk.
>> It shows an average improvement of more than 30% over AVX versions on
>> KNL hardware; performance results are attached.
>> Ok for trunk?
>>
>> 2016-01-12 Andrew Senkevich <andrew.senkevich@intel.com>
>>
>> * sysdeps/x86_64/multiarch/Makefile: Added new files.
>> * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Added new tests.
>> * sysdeps/x86_64/multiarch/memcpy-avx512-no-vzeroupper.S: New file.
>> * sysdeps/x86_64/multiarch/mempcpy-avx512-no-vzeroupper.S: Likewise.
>> * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: Likewise.
>> * sysdeps/x86_64/multiarch/memcpy.S: Added new IFUNC branch.
>> * sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise.
>> * sysdeps/x86_64/multiarch/memmove.c: Likewise.
>> * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
>> * sysdeps/x86_64/multiarch/mempcpy.S: Likewise.
>> * sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise.
>>
>> diff --git a/sysdeps/x86_64/multiarch/Makefile
>> b/sysdeps/x86_64/multiarch/Makefile
>> index b2e31ef..d234f4a 100644
>> --- a/sysdeps/x86_64/multiarch/Makefile
>> +++ b/sysdeps/x86_64/multiarch/Makefile
>> @@ -7,11 +7,12 @@ ifeq ($(subdir),string)
>>
>> sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
>> strcmp-sse2-unaligned strncmp-ssse3 \
>> - memcmp-sse4 memcpy-ssse3 \
>> - memcpy-sse2-unaligned mempcpy-ssse3 \
>> - memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
>> - memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
>> - memmove-ssse3-back strcasecmp_l-ssse3 \
>> + memcmp-sse4 memcpy-ssse3 memcpy-sse2-unaligned \
>> + memcpy-avx512-no-vzeroupper mempcpy-ssse3 memmove-ssse3 \
>> + memcpy-ssse3-back mempcpy-ssse3-back memmove-avx-unaligned \
>> + memcpy-avx-unaligned mempcpy-avx-unaligned \
>> + mempcpy-avx512-no-vzeroupper memmove-ssse3-back \
>> + memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
>> strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
>> strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
>> strcpy-sse2-unaligned strncpy-sse2-unaligned \
>> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> index 5f600dc..7746d79 100644
>> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> @@ -24,7 +24,7 @@
>> #include "init-arch.h"
>>
>> /* Maximum number of IFUNC implementations. */
>> -#define MAX_IFUNC 4
>> +#define MAX_IFUNC 5
>>
>> /* Fill ARRAY of MAX elements with IFUNC implementations for function
>> NAME supported on target machine and return the number of valid
>> @@ -46,9 +46,12 @@ __libc_ifunc_impl_list (const char *name, struct
>> libc_ifunc_impl *array,
>> __memcmp_ssse3)
>> IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>>
>> - /* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
>> + /* Support sysdeps/x86_64/multiarch/memmove_chk.c. */
>> IFUNC_IMPL (i, name, __memmove_chk,
>> IFUNC_IMPL_ADD (array, i, __memmove_chk,
>> + HAS_ARCH_FEATURE (AVX512F_Usable),
>> + __memmove_chk_avx512_no_vzeroupper)
>> + IFUNC_IMPL_ADD (array, i, __memmove_chk,
>> HAS_ARCH_FEATURE (AVX_Usable),
>> __memmove_chk_avx_unaligned)
>> IFUNC_IMPL_ADD (array, i, __memmove_chk,
>> @@ -65,6 +68,9 @@ __libc_ifunc_impl_list (const char *name, struct
>> libc_ifunc_impl *array,
>> IFUNC_IMPL_ADD (array, i, memmove,
>> HAS_ARCH_FEATURE (AVX_Usable),
>> __memmove_avx_unaligned)
>> + IFUNC_IMPL_ADD (array, i, memmove,
>> + HAS_ARCH_FEATURE (AVX512F_Usable),
>> + __memmove_avx512_no_vzeroupper)
>> IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
>> __memmove_ssse3_back)
>> IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3),
>> @@ -274,6 +280,9 @@ __libc_ifunc_impl_list (const char *name, struct
>> libc_ifunc_impl *array,
>> __memcpy_ssse3_back)
>> IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3),
>> __memcpy_ssse3)
>> + IFUNC_IMPL_ADD (array, i, memcpy,
>> + HAS_ARCH_FEATURE (AVX512F_Usable),
>> + __memcpy_avx512_no_vzeroupper)
>> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
>> IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2))
>>
>> @@ -294,6 +303,9 @@ __libc_ifunc_impl_list (const char *name, struct
>> libc_ifunc_impl *array,
>> /* Support sysdeps/x86_64/multiarch/mempcpy.S. */
>> IFUNC_IMPL (i, name, mempcpy,
>> IFUNC_IMPL_ADD (array, i, mempcpy,
>> + HAS_ARCH_FEATURE (AVX512F_Usable),
>> + __mempcpy_avx512_no_vzeroupper)
>> + IFUNC_IMPL_ADD (array, i, mempcpy,
>> HAS_ARCH_FEATURE (AVX_Usable),
>> __mempcpy_avx_unaligned)
>> IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3),
>
> Please add _chk tests.
>
>
>> diff --git a/sysdeps/x86_64/multiarch/memcpy.S
>> b/sysdeps/x86_64/multiarch/memcpy.S
>> index 27fca29..64a1bcd 100644
>> --- a/sysdeps/x86_64/multiarch/memcpy.S
>> +++ b/sysdeps/x86_64/multiarch/memcpy.S
>> @@ -30,19 +30,27 @@
>> ENTRY(__new_memcpy)
>> .type __new_memcpy, @gnu_indirect_function
>> LOAD_RTLD_GLOBAL_RO_RDX
>> - leaq __memcpy_avx_unaligned(%rip), %rax
>> +#ifdef HAVE_AVX512_ASM_SUPPORT
>> + HAS_ARCH_FEATURE (AVX512F_Usable)
>> + jz 1f
>> + HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
>> + jz 1f
>> + leaq __memcpy_avx512_no_vzeroupper(%rip), %rax
>> + ret
>> +#endif
>> +1: leaq __memcpy_avx_unaligned(%rip), %rax
>> HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
>> - jz 1f
>> + jz 2f
>> ret
>> -1: leaq __memcpy_sse2(%rip), %rax
>> +2: leaq __memcpy_sse2(%rip), %rax
>> HAS_ARCH_FEATURE (Slow_BSF)
>> - jnz 2f
>> + jnz 3f
>> leaq __memcpy_sse2_unaligned(%rip), %rax
>> ret
>> -2: HAS_CPU_FEATURE (SSSE3)
>> - jz 3f
>> +3: HAS_CPU_FEATURE (SSSE3)
>> + jz 4f
>> leaq __memcpy_ssse3(%rip), %rax
>> -3: ret
>> +4: ret
>> END(__new_memcpy)
>>
>> # undef ENTRY
>
> Please find a way not to re-order labels when adding a new
> implementation next time.
>
Maybe use 'libc_ifunc(...)' instead and let the compiler handle it?