This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)
- From: "H.J. Lu" <hjl dot tools at gmail dot com>
- To: "Carlos O'Donell" <carlos at redhat dot com>
- Cc: Ondřej Bílka <neleai at seznam dot cz>, GNU C Library <libc-alpha at sourceware dot org>
- Date: Sat, 31 Jan 2015 06:12:21 -0800
- Subject: Re: [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)
- Authentication-results: sourceware.org; auth=none
- References: <20150106142939 dot GB5835 at domone> <CAMe9rOo4tmQc0bJ1Z=pjURvPBBMndwQ-ynbHc=Mpz3eD=eqjbg at mail dot gmail dot com> <20150130145656 dot GA26219 at gmail dot com> <CAMe9rOr9Vn93958O=1fFBTU8zTRJ_j-15EvrOq7J05SwTRejgw at mail dot gmail dot com> <54CC7B38 dot 9060608 at redhat dot com>
On Fri, Jan 30, 2015 at 10:50 PM, Carlos O'Donell <carlos@redhat.com> wrote:
> On 01/30/2015 03:04 PM, H.J. Lu wrote:
>>>>
>>>> Please add a new feature bit, bit_Fast_AVX_Unaligned_Load, and turn it
>>>> on together
>>>> with bit_AVX2_Usable.
>>>>
>>>
>>> I know we are in freeze. But I'd like to fix this regression in 2.21.
>>> OK for master?
>>
>> Since this is a serious performance regression, I will check it in
>> before the end of the day unless I am told otherwise.
>
> In the future please TO: me so that I have high visibility on this change
> as the release manager. I'm testing each of the changes to make sure things
> are in good shape for the release.
>
> Could you explain in detail why this is needed?
>
> +#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +#endif
>
> Why do they have to be on the same index in the feature
> array of bits? I don't see anywhere that checks them
> both simultaneously. At the very least please add a detailed
> comment why the error condition exists and how to fix it in
> the future if another author needs to fix it.
>
There are
/* Unaligned load versions are faster than SSSE3
on Silvermont. */
#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
#endif
#if index_Fast_Unaligned_Load != index_Slow_SSE4_2
# error index_Fast_Unaligned_Load != index_Slow_SSE4_2
#endif
__cpu_features.feature[index_Fast_Unaligned_Load]
|= (bit_Fast_Unaligned_Load
| bit_Prefer_PMINUB_for_stringop
| bit_Slow_SSE4_2);
and
#if index_Fast_Rep_String != index_Fast_Copy_Backward
# error index_Fast_Rep_String != index_Fast_Copy_Backward
#endif
#if index_Fast_Rep_String != index_Fast_Unaligned_Load
# error index_Fast_Rep_String != index_Fast_Unaligned_Load
#endif
#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
#endif
__cpu_features.feature[index_Fast_Rep_String]
|= (bit_Fast_Rep_String
| bit_Fast_Copy_Backward
| bit_Fast_Unaligned_Load
| bit_Prefer_PMINUB_for_stringop);
before. Since there are
extern struct cpu_features
{
enum cpu_features_kind
{
arch_kind_unknown = 0,
arch_kind_intel,
arch_kind_amd,
arch_kind_other
} kind;
int max_cpuid;
struct cpuid_registers
{
unsigned int eax;
unsigned int ebx;
unsigned int ecx;
unsigned int edx;
} cpuid[COMMON_CPUID_INDEX_MAX];
unsigned int family;
unsigned int model;
unsigned int feature[FEATURE_INDEX_MAX];
} __cpu_features attribute_hidden;
Each feature element can hold up to 32 features. We use
# define index_Fast_Rep_String FEATURE_INDEX_1
# define index_Fast_Copy_Backward FEATURE_INDEX_1
# define index_Slow_BSF FEATURE_INDEX_1
# define index_Fast_Unaligned_Load FEATURE_INDEX_1
# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
# define index_AVX_Usable FEATURE_INDEX_1
# define index_FMA_Usable FEATURE_INDEX_1
# define index_FMA4_Usable FEATURE_INDEX_1
# define index_Slow_SSE4_2 FEATURE_INDEX_1
# define index_AVX2_Usable FEATURE_INDEX_1
# define index_AVX_Fast_Unaligned_Load FEATURE_INDEX_1
to indicate which element the feature bit is in and use a single
statement
#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
#endif
/* Determine if AVX2 is usable. Unaligned load with 256-bit
AVX registers are faster on processors with AVX2. */
if (CPUID_AVX2)
__cpu_features.feature[index_AVX2_Usable]
|= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
to update 2 features. It works only if they have the same index_XXX.
We need this check when we update more than one feature bit
in a single statement.
--
H.J.