This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Re: [PATCH][BZ #17801] Fix memcpy regression (five times slower on bulldozer.)


On Fri, Jan 30, 2015 at 10:50 PM, Carlos O'Donell <carlos@redhat.com> wrote:
> On 01/30/2015 03:04 PM, H.J. Lu wrote:
>>>>
>>>> Please add a new feature bit, bit_Fast_AVX_Unaligned_Load, and turn it
>>>> on together
>>>> with bit_AVX2_Usable.
>>>>
>>>
>>> I know we are in freeze.  But I'd like to fix this regression in 2.21.
>>> OK for master?
>>
>> Since this is a serious performance regression, I will check it in
>> before the end of the day unless I am told otherwise.
>
> In the future please TO: me so that I have high visibility on this change
> as the release manager. I'm testing each of the changes to make sure things
> are in good shape for the release.
>
> Could you explain in detail why this is needed?
>
> +#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
> +#endif
>
> Why do they have to be on the same index in the feature
> array of bits? I don't see anywhere that checks them
> both simultaneously. At the very least please add a detailed
> comment why the error condition exists and how to fix it in
> the future if another author needs to fix it.
>

There are

              /* Unaligned load versions are faster than SSSE3
                 on Silvermont.  */
#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
#endif
#if index_Fast_Unaligned_Load != index_Slow_SSE4_2
# error index_Fast_Unaligned_Load != index_Slow_SSE4_2
#endif
              __cpu_features.feature[index_Fast_Unaligned_Load]
                |= (bit_Fast_Unaligned_Load
                    | bit_Prefer_PMINUB_for_stringop
                    | bit_Slow_SSE4_2);

and

#if index_Fast_Rep_String != index_Fast_Copy_Backward
# error index_Fast_Rep_String != index_Fast_Copy_Backward
#endif
#if index_Fast_Rep_String != index_Fast_Unaligned_Load
# error index_Fast_Rep_String != index_Fast_Unaligned_Load
#endif
#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
#endif
              __cpu_features.feature[index_Fast_Rep_String]
                |= (bit_Fast_Rep_String
                    | bit_Fast_Copy_Backward
                    | bit_Fast_Unaligned_Load
                    | bit_Prefer_PMINUB_for_stringop);

before.  The feature bits are stored in

extern struct cpu_features
{
  enum cpu_features_kind
    {
      arch_kind_unknown = 0,
      arch_kind_intel,
      arch_kind_amd,
      arch_kind_other
    } kind;
  int max_cpuid;
  struct cpuid_registers
  {
    unsigned int eax;
    unsigned int ebx;
    unsigned int ecx;
    unsigned int edx;
  } cpuid[COMMON_CPUID_INDEX_MAX];
  unsigned int family;
  unsigned int model;
  unsigned int feature[FEATURE_INDEX_MAX];
} __cpu_features attribute_hidden;

Each element of the feature array can hold up to 32 feature bits.  We use

# define index_Fast_Rep_String          FEATURE_INDEX_1
# define index_Fast_Copy_Backward       FEATURE_INDEX_1
# define index_Slow_BSF                 FEATURE_INDEX_1
# define index_Fast_Unaligned_Load      FEATURE_INDEX_1
# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
# define index_AVX_Usable               FEATURE_INDEX_1
# define index_FMA_Usable               FEATURE_INDEX_1
# define index_FMA4_Usable              FEATURE_INDEX_1
# define index_Slow_SSE4_2              FEATURE_INDEX_1
# define index_AVX2_Usable              FEATURE_INDEX_1
# define index_AVX_Fast_Unaligned_Load  FEATURE_INDEX_1

to indicate which element the feature bit is in and use a single
statement

#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
#endif
          /* Determine if AVX2 is usable.  Unaligned load with 256-bit
             AVX registers are faster on processors with AVX2.  */
          if (CPUID_AVX2)
            __cpu_features.feature[index_AVX2_Usable]
              |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;

to update two features at once.  This works only if both features have
the same index_XXX, which is what the #error check enforces.  We need
such a check whenever we update more than one feature bit in a single
statement.
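
To make that concrete, here is a minimal standalone sketch (with
hypothetical index and bit values, not the real glibc definitions) of
why the check is needed:

/* Hypothetical values for illustration only.  */
#define FEATURE_INDEX_1 0
#define FEATURE_INDEX_2 1
#define FEATURE_INDEX_MAX 2

#define index_AVX2_Usable              FEATURE_INDEX_1
#define index_AVX_Fast_Unaligned_Load  FEATURE_INDEX_1
#define bit_AVX2_Usable                (1 << 0)
#define bit_AVX_Fast_Unaligned_Load    (1 << 1)

static unsigned int feature[FEATURE_INDEX_MAX];

/* If someone later moves one of these features to FEATURE_INDEX_2,
   the single statement below would OR its bit into the wrong
   element, so the build fails instead.  */
#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
#endif

static void
set_avx2_features (void)
{
  /* One store updates both feature bits because they share an index.  */
  feature[index_AVX2_Usable]
    |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
}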

-- 
H.J.

