This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Other format:	[Raw text]
Re: [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)

From: "H.J. Lu" <hjl dot tools at gmail dot com>
To: OndÅej BÃlka <neleai at seznam dot cz>, GNU C Library <libc-alpha at sourceware dot org>
Date: Fri, 30 Jan 2015 06:56:56 -0800
Subject: Re: [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)
Authentication-results: sourceware.org; auth=none
References: <20150106142939 dot GB5835 at domone> <CAMe9rOo4tmQc0bJ1Z=pjURvPBBMndwQ-ynbHc=Mpz3eD=eqjbg at mail dot gmail dot com>
On Tue, Jan 06, 2015 at 06:54:50AM -0800, H.J. Lu wrote:
> On Tue, Jan 6, 2015 at 6:29 AM, OndÅej BÃlka <neleai@seznam.cz> wrote:
> > H. J, in this commit there slipped performance regression by review.
> >
> > commit 05f3633da4f9df870d04dd77336e793746e57ed4
> > Author: Ling Ma <ling.ml@alibaba-inc.com>
> > Date:   Mon Jul 14 00:02:52 2014 -0400
> >
> >     Improve 64bit memcpy performance for Haswell CPU with AVX
> > instruction
> >
> >
> > I seem to recall that I mentioned something about avx being typo and
> > should be avx2 but did not look it further.
> >
> > As I assumed its avx2 only I was ok with that nad haswell specific
> > optimizations like using rep movsq. However ifunc checks for avx which
> > is bad as we already know that avx loads/stores are slow on sandy
> > bridge.
> >
> > Also testing on affected architectures would reveal that. Especially amd
> > bulldozer where its five times slower on 2kb-16kb range, see
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/fx10/memcpy_profile_avx/results_rand/result.html
> > because movsb is slow.
> >
> > On sandy bridge its only 20% regression on same range.
> > http://kam.mff.cuni.cz/~ondra/benchmark_string/i7_ivy_bridge/memcpy_profile_avx/results_rand/result.html
> >
> >
> > Also avx loop for 128-2024 bytes is slower there so there is no point
> > using it.
> >
> > What about following change?
> >
> >
> >         * sysdeps/x86_64/multiarch/memcpy.S: Fix performance regression.
> >
> > diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
> > index 992e40d..27f89e4 100644
> > --- a/sysdeps/x86_64/multiarch/memcpy.S
> > +++ b/sysdeps/x86_64/multiarch/memcpy.S
> > @@ -32,10 +32,13 @@ ENTRY(__new_memcpy)
> >         cmpl    $0, KIND_OFFSET+__cpu_features(%rip)
> >         jne     1f
> >         call    __init_cpu_features
> > +#ifdef HAVE_AVX2_SUPPORT
> >  1:     leaq    __memcpy_avx_unaligned(%rip), %rax
> > -       testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
> > +       testl   $bit_AVX2_Usable, __cpu_features+FEATURE_OFFSET+index_AVX2_Usable(%rip)
> > +
> >         jz 1f
> >         ret
> > +#endif
> >  1:     leaq    __memcpy_sse2(%rip), %rax
> >         testl   $bit_Slow_BSF, __cpu_features+FEATURE_OFFSET+index_Slow_BSF(%rip)
> >         jnz     2f
> 
> Please add a new feature bit, bit_Fast_AVX_Unaligned_Load, and turn it
> on together
> with bit_AVX2_Usable.
> 

I know we are in freeze.  But I'd like to fix this regression in 2.21.
OK for master?

Thanks.


H.J.
---
>From 56d25c11b64a97255a115901d136d753c86de24e Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 30 Jan 2015 06:50:20 -0800
Subject: [PATCH] Use AVX unaligned memcpy only if AVX2 is available

memcpy with unaligned 256-bit AVX register loads/stores are slow on older
processorsl like Sandy Bridge.  This patch adds bit_AVX_Fast_Unaligned_Load
and sets it only when AVX2 is available.

	[BZ #17801]
	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
	New.
	(index_AVX_Fast_Unaligned_Load): Likewise.
	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
---
 ChangeLog                              | 18 ++++++++++++++++++
 sysdeps/x86_64/multiarch/init-arch.c   |  9 +++++++--
 sysdeps/x86_64/multiarch/init-arch.h   |  4 ++++
 sysdeps/x86_64/multiarch/memcpy.S      |  2 +-
 sysdeps/x86_64/multiarch/memcpy_chk.S  |  2 +-
 sysdeps/x86_64/multiarch/memmove.c     |  2 +-
 sysdeps/x86_64/multiarch/memmove_chk.c |  2 +-
 sysdeps/x86_64/multiarch/mempcpy.S     |  2 +-
 sysdeps/x86_64/multiarch/mempcpy_chk.S |  2 +-
 9 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 26f7f3f..a696e39 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+2015-01-30  H.J. Lu  <hongjiu.lu@intel.com>
+
+	[BZ #17801]
+	* sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+	Set the bit_AVX_Fast_Unaligned_Load bit for AVX2.
+	* sysdeps/x86_64/multiarch/init-arch.h (bit_AVX_Fast_Unaligned_Load):
+	New.
+	(index_AVX_Fast_Unaligned_Load): Likewise.
+	(HAS_AVX_FAST_UNALIGNED_LOAD): Likewise.
+	* sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Check the
+	bit_AVX_Fast_Unaligned_Load bit instead of the bit_AVX_Usable bit.
+	* sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Likewise.
+	* sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Likewise.
+	* sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Likewise.
+	* sysdeps/x86_64/multiarch/memmove.c (__libc_memmove): Replace
+	HAS_AVX with HAS_AVX_FAST_UNALIGNED_LOAD.
+	* sysdeps/x86_64/multiarch/memmove_chk.c (__memmove_chk): Likewise.
+
 2015-01-29  Andreas Schwab  <schwab@suse.de>
 
 	* sysdeps/nptl/allocrtsig.c: Include <signal.h>.
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 9299360..7dec218 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -171,9 +171,14 @@ __init_cpu_features (void)
 	  /* Determine if AVX is usable.  */
 	  if (CPUID_AVX)
 	    __cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
-	  /* Determine if AVX2 is usable.  */
+#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
+#endif
+	  /* Determine if AVX2 is usable.  Unaligned load with 256-bit
+	     AVX registers are faster on processors with AVX2.  */
 	  if (CPUID_AVX2)
-	    __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
+	    __cpu_features.feature[index_AVX2_Usable]
+	      |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
 	  /* Determine if FMA is usable.  */
 	  if (CPUID_FMA)
 	    __cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 55f1c5b..e6b5ba5 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -25,6 +25,7 @@
 #define bit_FMA4_Usable			(1 << 8)
 #define bit_Slow_SSE4_2			(1 << 9)
 #define bit_AVX2_Usable			(1 << 10)
+#define bit_AVX_Fast_Unaligned_Load	(1 << 11)
 
 /* CPUID Feature flags.  */
 
@@ -74,6 +75,7 @@
 # define index_FMA4_Usable		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_Slow_SSE4_2		FEATURE_INDEX_1*FEATURE_SIZE
 # define index_AVX2_Usable		FEATURE_INDEX_1*FEATURE_SIZE
+# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1*FEATURE_SIZE
 
 #else	/* __ASSEMBLER__ */
 
@@ -169,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_FMA4_Usable		FEATURE_INDEX_1
 # define index_Slow_SSE4_2		FEATURE_INDEX_1
 # define index_AVX2_Usable		FEATURE_INDEX_1
+# define index_AVX_Fast_Unaligned_Load	FEATURE_INDEX_1
 
 # define HAS_ARCH_FEATURE(name) \
   ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
@@ -181,5 +184,6 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define HAS_AVX2			HAS_ARCH_FEATURE (AVX2_Usable)
 # define HAS_FMA			HAS_ARCH_FEATURE (FMA_Usable)
 # define HAS_FMA4			HAS_ARCH_FEATURE (FMA4_Usable)
+# define HAS_AVX_FAST_UNALIGNED_LOAD	HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
 
 #endif	/* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcpy.S b/sysdeps/x86_64/multiarch/memcpy.S
index 992e40d..4e18cd3 100644
--- a/sysdeps/x86_64/multiarch/memcpy.S
+++ b/sysdeps/x86_64/multiarch/memcpy.S
@@ -33,7 +33,7 @@ ENTRY(__new_memcpy)
 	jne	1f
 	call	__init_cpu_features
 1:	leaq	__memcpy_avx_unaligned(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz 1f
 	ret
 1:	leaq	__memcpy_sse2(%rip), %rax
diff --git a/sysdeps/x86_64/multiarch/memcpy_chk.S b/sysdeps/x86_64/multiarch/memcpy_chk.S
index 5e9cf00..1e756ea 100644
--- a/sysdeps/x86_64/multiarch/memcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/memcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__memcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__memcpy_chk_ssse3_back(%rip), %rax
-	testl   $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl   $bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz  2f
 	leaq    __memcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/memmove.c b/sysdeps/x86_64/multiarch/memmove.c
index d93bfd0..dd153a3 100644
--- a/sysdeps/x86_64/multiarch/memmove.c
+++ b/sysdeps/x86_64/multiarch/memmove.c
@@ -49,7 +49,7 @@ extern __typeof (__redirect_memmove) __memmove_avx_unaligned attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__redirect_memmove) __libc_memmove;
 libc_ifunc (__libc_memmove,
-	    HAS_AVX
+	    HAS_AVX_FAST_UNALIGNED_LOAD
 	    ? __memmove_avx_unaligned
 	    : (HAS_SSSE3
 	       ? (HAS_FAST_COPY_BACKWARD
diff --git a/sysdeps/x86_64/multiarch/memmove_chk.c b/sysdeps/x86_64/multiarch/memmove_chk.c
index 743ca2a..8b12d00 100644
--- a/sysdeps/x86_64/multiarch/memmove_chk.c
+++ b/sysdeps/x86_64/multiarch/memmove_chk.c
@@ -30,7 +30,7 @@ extern __typeof (__memmove_chk) __memmove_chk_avx_unaligned attribute_hidden;
 #include "debug/memmove_chk.c"
 
 libc_ifunc (__memmove_chk,
-	    HAS_AVX ? __memmove_chk_avx_unaligned :
+	    HAS_AVX_FAST_UNALIGNED_LOAD ? __memmove_chk_avx_unaligned :
 	    (HAS_SSSE3
 	    ? (HAS_FAST_COPY_BACKWARD
 	       ? __memmove_chk_ssse3_back : __memmove_chk_ssse3)
diff --git a/sysdeps/x86_64/multiarch/mempcpy.S b/sysdeps/x86_64/multiarch/mempcpy.S
index cdf1dab..2eaacdf 100644
--- a/sysdeps/x86_64/multiarch/mempcpy.S
+++ b/sysdeps/x86_64/multiarch/mempcpy.S
@@ -37,7 +37,7 @@ ENTRY(__mempcpy)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__mempcpy_avx_unaligned(%rip), %rax
 2:	ret
diff --git a/sysdeps/x86_64/multiarch/mempcpy_chk.S b/sysdeps/x86_64/multiarch/mempcpy_chk.S
index b7f9e89..17b8470 100644
--- a/sysdeps/x86_64/multiarch/mempcpy_chk.S
+++ b/sysdeps/x86_64/multiarch/mempcpy_chk.S
@@ -39,7 +39,7 @@ ENTRY(__mempcpy_chk)
 	testl	$bit_Fast_Copy_Backward, __cpu_features+FEATURE_OFFSET+index_Fast_Copy_Backward(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_ssse3_back(%rip), %rax
-	testl	$bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
+	testl	$bit_AVX_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_AVX_Fast_Unaligned_Load(%rip)
 	jz	2f
 	leaq	__mempcpy_chk_avx_unaligned(%rip), %rax
 2:	ret
-- 
2.1.0
Follow-Ups:
- Re: [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)
  - From: H.J. Lu
References:
- [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)
  - From: OndÅej BÃlka
- Re: [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)
  - From: H.J. Lu
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]