This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Enable AVX2 optimized memset only if -mavx2 works


On Tue, Jul 1, 2014 at 2:03 AM, Marko Myllynen <myllynen@redhat.com> wrote:
> Hi,
>
> On 2014-04-04 10:34, ling.ma.program@gmail.com wrote:
>> From: Ling Ma <ling.ml@alibaba-inc.com>
>>
>> In this patch we manage to reduce branch mispredictions by
>> avoiding branch instructions and forcing the destination to be
>> aligned with AVX instructions.
>>
>> ---
>>  In this version we removed prefetch and appended vmovd.
>>
>>  ChangeLog                              |   9 ++
>>  sysdeps/x86_64/multiarch/Makefile      |   4 +-
>>  sysdeps/x86_64/multiarch/memset-avx2.S | 192 +++++++++++++++++++++++++++++++++
>>  sysdeps/x86_64/multiarch/memset.S      |  59 ++++++++++
>>  sysdeps/x86_64/multiarch/memset_chk.S  |  44 ++++++++
>>  5 files changed, 307 insertions(+), 1 deletion(-)
>>  create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S
>>  create mode 100644 sysdeps/x86_64/multiarch/memset.S
>>  create mode 100644 sysdeps/x86_64/multiarch/memset_chk.S
>>
>> diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
>> new file mode 100644
>> index 0000000..08e8ee8
>> --- /dev/null
>> +++ b/sysdeps/x86_64/multiarch/memset-avx2.S
>> @@ -0,0 +1,192 @@
>> +/* memset with AVX2
>> +   Copyright (C) 2014 Free Software Foundation, Inc.
>> +   Contributed by Alibaba Group.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#include <sysdep.h>
>> +
>> +#if !defined NOT_IN_libc
>> +
>> +#include "asm-syntax.h"
>> +#ifndef ALIGN
>> +# define ALIGN(n)    .p2align n
>> +#endif
>> +#ifndef MEMSET
>> +# define MEMSET      __memset_avx2
>> +# define MEMSET_CHK  __memset_chk_avx2
>> +#endif
>> +
>> +     .section .text.avx2,"ax",@progbits
>> +#if defined PIC
>> +ENTRY (MEMSET_CHK)
>> +     cmpq    %rdx, %rcx
>> +     jb      HIDDEN_JUMPTARGET (__chk_fail)
>> +END (MEMSET_CHK)
>> +#endif
>> +
>> +ENTRY (MEMSET)
>> +     vpxor   %xmm0, %xmm0, %xmm0
>> +     vmovd %esi, %xmm1
>> +     lea     (%rdi, %rdx), %r8
>> +     vpshufb %xmm0, %xmm1, %xmm0
>> +     mov     %rdi, %rax
>> +     cmp     $256, %rdx
>> +     jae     L(256bytesormore)
>> +     vmovd %xmm0, %rcx
>> +     cmp     $128, %rdx
>> +     jb      L(less_128bytes)
>> +     vmovups %xmm0, (%rdi)
>> +     vmovups %xmm0, 0x10(%rdi)
>> +     vmovups %xmm0, 0x20(%rdi)
>> +     vmovups %xmm0, 0x30(%rdi)
>> +     vmovups %xmm0, 0x40(%rdi)
>> +     vmovups %xmm0, 0x50(%rdi)
>> +     vmovups %xmm0, 0x60(%rdi)
>> +     vmovups %xmm0, 0x70(%rdi)
>> +     vmovups %xmm0, -0x80(%r8)
>> +     vmovups %xmm0, -0x70(%r8)
>> +     vmovups %xmm0, -0x60(%r8)
>> +     vmovups %xmm0, -0x50(%r8)
>> +     vmovups %xmm0, -0x40(%r8)
>> +     vmovups %xmm0, -0x30(%r8)
>> +     vmovups %xmm0, -0x20(%r8)
>> +     vmovups %xmm0, -0x10(%r8)
>> +     ret
>> +     ALIGN(4)
>> +L(less_128bytes):
>> +     cmp     $64, %edx
>> +     jb      L(less_64bytes)
>> +     vmovups %xmm0, (%rdi)
>> +     vmovups %xmm0, 0x10(%rdi)
>> +     vmovups %xmm0, 0x20(%rdi)
>> +     vmovups %xmm0, 0x30(%rdi)
>> +     vmovups %xmm0, -0x40(%r8)
>> +     vmovups %xmm0, -0x30(%r8)
>> +     vmovups %xmm0, -0x20(%r8)
>> +     vmovups %xmm0, -0x10(%r8)
>> +     ret
>> +     ALIGN(4)
>> +L(less_64bytes):
>> +     cmp     $32, %edx
>> +     jb      L(less_32bytes)
>> +     vmovups %xmm0, (%rdi)
>> +     vmovups %xmm0, 0x10(%rdi)
>> +     vmovups %xmm0, -0x20(%r8)
>> +     vmovups %xmm0, -0x10(%r8)
>> +     ret
>> +     ALIGN(4)
>> +L(less_32bytes):
>> +     cmp     $16, %edx
>> +     jb      L(less_16bytes)
>> +     vmovups %xmm0, (%rdi)
>> +     vmovups %xmm0, -0x10(%r8)
>> +     ret
>> +     ALIGN(4)
>> +L(less_16bytes):
>> +     cmp     $8, %edx
>> +     jb      L(less_8bytes)
>> +     mov %rcx, (%rdi)
>> +     mov %rcx, -0x08(%r8)
>> +     ret
>> +     ALIGN(4)
>> +L(less_8bytes):
>> +     cmp     $4, %edx
>> +     jb      L(less_4bytes)
>> +     mov %ecx, (%rdi)
>> +     mov %ecx, -0x04(%r8)
>> +     ALIGN(4)
>> +L(less_4bytes):
>> +     cmp     $2, %edx
>> +     jb      L(less_2bytes)
>> +     mov     %cx, (%rdi)
>> +     mov     %cx, -0x02(%r8)
>> +     ret
>> +     ALIGN(4)
>> +L(less_2bytes):
>> +     cmp     $1, %edx
>> +     jb      L(less_1bytes)
>> +     mov     %cl, (%rdi)
>> +L(less_1bytes):
>> +     ret
>> +
>> +     ALIGN(4)
>> +L(256bytesormore):
>> +     vinserti128 $1, %xmm0, %ymm0, %ymm0
>
> this breaks build on RHEL 6 x86_64:
>
> ../sysdeps/x86_64/multiarch/memset-avx2.S: Assembler messages:
> ../sysdeps/x86_64/multiarch/memset-avx2.S:132: Error: no such
> instruction: `vinserti128 $1,%xmm0,%ymm0,%ymm0'
>
> Cheers,
>

This patch enables AVX2 optimized memset only if -mavx2 works.  Tested
with GCC 4.6 and 4.8 on Fedora 20/x86-64.  OK to install?

Thanks.

H.J.
---
2014-07-01  H.J. Lu  <hongjiu.lu@intel.com>

* config.h.in (HAVE_AVX2_SUPPORT): New #undef.
* sysdeps/i386/configure.ac: Set HAVE_AVX2_SUPPORT and
config-cflags-avx2.
* sysdeps/x86_64/configure.ac: Likewise.
* sysdeps/i386/configure: Regenerated.
* sysdeps/x86_64/configure: Likewise.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memset-avx2 only if config-cflags-avx2 is yes.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
Tests for memset_chk and memset only if HAVE_AVX2_SUPPORT is
defined.
* sysdeps/x86_64/multiarch/memset.S: Define multiple versions
only if HAVE_AVX2_SUPPORT is defined.
* sysdeps/x86_64/multiarch/memset_chk.S: Likewise.

-- 
H.J.
From 99d75c147abe8ba91f3ad7123126bda4e3f31045 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 1 Jul 2014 08:52:47 -0700
Subject: [PATCH] Enable AVX2 optimized memset only if -mavx2 works

	* config.h.in (HAVE_AVX2_SUPPORT): New #undef.
	* sysdeps/i386/configure.ac: Set HAVE_AVX2_SUPPORT and
	config-cflags-avx2.
	* sysdeps/x86_64/configure.ac: Likewise.
	* sysdeps/i386/configure: Regenerated.
	* sysdeps/x86_64/configure: Likewise.
	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
	memset-avx2 only if config-cflags-avx2 is yes.
	* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
	Tests for memset_chk and memset only if HAVE_AVX2_SUPPORT is
	defined.
	* sysdeps/x86_64/multiarch/memset.S: Define multiple versions
	only if HAVE_AVX2_SUPPORT is defined.
	* sysdeps/x86_64/multiarch/memset_chk.S: Likewise.
---
 ChangeLog                                  | 17 +++++++++++++++++
 config.h.in                                |  3 +++
 sysdeps/i386/configure                     | 26 ++++++++++++++++++++++++++
 sysdeps/i386/configure.ac                  |  9 +++++++++
 sysdeps/x86_64/configure                   | 26 ++++++++++++++++++++++++++
 sysdeps/x86_64/configure.ac                |  9 +++++++++
 sysdeps/x86_64/multiarch/Makefile          |  7 +++++--
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  2 ++
 sysdeps/x86_64/multiarch/memset.S          | 24 +++++++++++++-----------
 sysdeps/x86_64/multiarch/memset_chk.S      |  2 +-
 10 files changed, 111 insertions(+), 14 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index a1e44b1..20abae7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,20 @@
+2014-07-01  H.J. Lu  <hongjiu.lu@intel.com>
+
+	* config.h.in (HAVE_AVX2_SUPPORT): New #undef.
+	* sysdeps/i386/configure.ac: Set HAVE_AVX2_SUPPORT and
+	config-cflags-avx2.
+	* sysdeps/x86_64/configure.ac: Likewise.
+	* sysdeps/i386/configure: Regenerated.
+	* sysdeps/x86_64/configure: Likewise.
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	memset-avx2 only if config-cflags-avx2 is yes.
+	* sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list):
+	Tests for memset_chk and memset only if HAVE_AVX2_SUPPORT is
+	defined.
+	* sysdeps/x86_64/multiarch/memset.S: Define multiple versions
+	only if HAVE_AVX2_SUPPORT is defined.
+	* sysdeps/x86_64/multiarch/memset_chk.S: Likewise.
+
 2014-07-01  Stefan Liebler  <stli@linux.vnet.ibm.com>
 
 	* sysdeps/s390/fpu/libm-test-ulps: Regenerate.
diff --git a/config.h.in b/config.h.in
index 2dcd135..97b5571 100644
--- a/config.h.in
+++ b/config.h.in
@@ -103,6 +103,9 @@
 /* Define if gcc supports FMA4.  */
 #undef	HAVE_FMA4_SUPPORT
 
+/* Define if gcc supports AVX2.  */
+#undef	HAVE_AVX2_SUPPORT
+
 /* Define if the compiler's exception support is based on libunwind.  */
 #undef	HAVE_CC_WITH_LIBUNWIND
 
diff --git a/sysdeps/i386/configure b/sysdeps/i386/configure
index f0a20e3..6e89b59 100644
--- a/sysdeps/i386/configure
+++ b/sysdeps/i386/configure
@@ -240,6 +240,32 @@ $as_echo "$libc_cv_cc_novzeroupper" >&6; }
 config_vars="$config_vars
 config-cflags-novzeroupper = $libc_cv_cc_novzeroupper"
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5
+$as_echo_n "checking for AVX2 support... " >&6; }
+if ${libc_cv_cc_avx2+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if { ac_try='${CC-cc} -mavx2 -xc /dev/null -S -o /dev/null'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+  libc_cv_cc_avx2=yes
+else
+  libc_cv_cc_avx2=no
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_avx2" >&5
+$as_echo "$libc_cv_cc_avx2" >&6; }
+if test $libc_cv_cc_avx2 = yes; then
+  $as_echo "#define HAVE_AVX2_SUPPORT 1" >>confdefs.h
+
+fi
+config_vars="$config_vars
+config-cflags-avx2 = $libc_cv_cc_avx2"
+
 $as_echo "#define USE_REGPARMS 1" >>confdefs.h
 
 
diff --git a/sysdeps/i386/configure.ac b/sysdeps/i386/configure.ac
index dfe0b47..35c4522 100644
--- a/sysdeps/i386/configure.ac
+++ b/sysdeps/i386/configure.ac
@@ -88,6 +88,15 @@ LIBC_TRY_CC_OPTION([-mno-vzeroupper],
 ])
 LIBC_CONFIG_VAR([config-cflags-novzeroupper], [$libc_cv_cc_novzeroupper])
 
+dnl Check if -mavx2 works.
+AC_CACHE_CHECK(for AVX2 support, libc_cv_cc_avx2, [dnl
+LIBC_TRY_CC_OPTION([-mavx2], [libc_cv_cc_avx2=yes], [libc_cv_cc_avx2=no])
+])
+if test $libc_cv_cc_avx2 = yes; then
+  AC_DEFINE(HAVE_AVX2_SUPPORT)
+fi
+LIBC_CONFIG_VAR([config-cflags-avx2], [$libc_cv_cc_avx2])
+
 AC_DEFINE(USE_REGPARMS)
 
 dnl It is always possible to access static and hidden symbols in an
diff --git a/sysdeps/x86_64/configure b/sysdeps/x86_64/configure
index 45d868d..7d4dadd 100644
--- a/sysdeps/x86_64/configure
+++ b/sysdeps/x86_64/configure
@@ -249,6 +249,32 @@ if test $libc_cv_asm_mpx == yes; then
 
 fi
 
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for AVX2 support" >&5
+$as_echo_n "checking for AVX2 support... " >&6; }
+if ${libc_cv_cc_avx2+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if { ac_try='${CC-cc} -mavx2 -xc /dev/null -S -o /dev/null'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then :
+  libc_cv_cc_avx2=yes
+else
+  libc_cv_cc_avx2=no
+fi
+
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_avx2" >&5
+$as_echo "$libc_cv_cc_avx2" >&6; }
+if test $libc_cv_cc_avx2 = yes; then
+  $as_echo "#define HAVE_AVX2_SUPPORT 1" >>confdefs.h
+
+fi
+config_vars="$config_vars
+config-cflags-avx2 = $libc_cv_cc_avx2"
+
 $as_echo "#define PI_STATIC_AND_HIDDEN 1" >>confdefs.h
 
 # work around problem with autoconf and empty lines at the end of files
diff --git a/sysdeps/x86_64/configure.ac b/sysdeps/x86_64/configure.ac
index 9138f63..c9f9a51 100644
--- a/sysdeps/x86_64/configure.ac
+++ b/sysdeps/x86_64/configure.ac
@@ -90,6 +90,15 @@ if test $libc_cv_asm_mpx == yes; then
   AC_DEFINE(HAVE_MPX_SUPPORT)
 fi
 
+dnl Check if -mavx2 works.
+AC_CACHE_CHECK(for AVX2 support, libc_cv_cc_avx2, [dnl
+LIBC_TRY_CC_OPTION([-mavx2], [libc_cv_cc_avx2=yes], [libc_cv_cc_avx2=no])
+])
+if test $libc_cv_cc_avx2 = yes; then
+  AC_DEFINE(HAVE_AVX2_SUPPORT)
+fi
+LIBC_CONFIG_VAR([config-cflags-avx2], [$libc_cv_cc_avx2])
+
 dnl It is always possible to access static and hidden symbols in an
 dnl position independent way.
 AC_DEFINE(PI_STATIC_AND_HIDDEN)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 42df96f..3bb9702 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,8 +17,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
-		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
-		   memset-avx2
+		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned
 
 ifeq (yes,$(config-cflags-sse4))
 sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
@@ -27,6 +26,10 @@ CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
 endif
+
+ifeq (yes,$(config-cflags-avx2))
+sysdep_routines += memset-avx2
+endif
 endif
 
 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f1593c5..7e93e59 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -61,6 +61,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memmove_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2))
 
+#ifdef HAVE_AVX2_SUPPORT
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2)
@@ -71,6 +72,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, memset,
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
 	      IFUNC_IMPL_ADD (array, i, memset, HAS_AVX2, __memset_avx2))
+#endif
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.S.  */
   IFUNC_IMPL (i, name, stpncpy,
diff --git a/sysdeps/x86_64/multiarch/memset.S b/sysdeps/x86_64/multiarch/memset.S
index 3113d1c..00d46d1 100644
--- a/sysdeps/x86_64/multiarch/memset.S
+++ b/sysdeps/x86_64/multiarch/memset.S
@@ -17,12 +17,13 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifdef HAVE_AVX2_SUPPORT
 #include <sysdep.h>
 #include <shlib-compat.h>
 #include <init-arch.h>
 
 /* Define multiple versions only for the definition in lib.  */
-#ifndef NOT_IN_libc
+# ifndef NOT_IN_libc
 ENTRY(memset)
 	.type	memset, @gnu_indirect_function
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
@@ -34,26 +35,27 @@ ENTRY(memset)
 	leaq	__memset_avx2(%rip), %rax
 2:	ret
 END(memset)
-#endif
+# endif
 
-#if !defined NOT_IN_libc
-# undef memset
-# define memset __memset_sse2
+# if !defined NOT_IN_libc
+#  undef memset
+#  define memset __memset_sse2
 
-# undef __memset_chk
-# define __memset_chk __memset_chk_sse2
+#  undef __memset_chk
+#  define __memset_chk __memset_chk_sse2
 
-# ifdef SHARED
+#  ifdef SHARED
 #  undef libc_hidden_builtin_def
 /* It doesn't make sense to send libc-internal memset calls through a PLT.
    The speedup we get from using GPR instruction is likely eaten away
    by the indirect call in the PLT.  */
 #  define libc_hidden_builtin_def(name) \
 	.globl __GI_memset; __GI_memset = __memset_sse2
-# endif
+#  endif
 
-# undef strong_alias
-# define strong_alias(original, alias)
+#  undef strong_alias
+#  define strong_alias(original, alias)
+# endif
 #endif
 
 #include "../memset.S"
diff --git a/sysdeps/x86_64/multiarch/memset_chk.S b/sysdeps/x86_64/multiarch/memset_chk.S
index 2182780..8a607bd 100644
--- a/sysdeps/x86_64/multiarch/memset_chk.S
+++ b/sysdeps/x86_64/multiarch/memset_chk.S
@@ -22,7 +22,7 @@
 
 /* Define multiple versions only for the definition in lib.  */
 #ifndef NOT_IN_libc
-# ifdef SHARED
+# if defined SHARED && defined HAVE_AVX2_SUPPORT
 ENTRY(__memset_chk)
 	.type	__memset_chk, @gnu_indirect_function
 	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
-- 
1.9.3


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]