This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH RFC] X86_64 Avx2 Detection
- From: "H.J. Lu" <hjl dot tools at gmail dot com>
- To: Ling Ma <ling dot ma dot program at gmail dot com>
- Cc: GNU C Library <libc-alpha at sourceware dot org>, Ondrej Bilka <neleai at seznam dot cz>, Liubov Dmitrieva <liubov dot dmitrieva at gmail dot com>, Sihai Yao <sihai dot ysh at alibaba-inc dot com>
- Date: Wed, 9 Apr 2014 09:28:58 -0700
- Subject: Re: [PATCH RFC] X86_64 Avx2 Detection
- Authentication-results: sourceware.org; auth=none
- References: <1396595802-21567-1-git-send-email-ling dot ma dot program at gmail dot com> <CAMe9rOrJiLNC4XAQ2HvTiKPNCvn7OxfZj=yxRyfFv=O3OLojiQ at mail dot gmail dot com>
On Wed, Apr 9, 2014 at 9:12 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Fri, Apr 4, 2014 at 12:16 AM, <ling.ma.program@gmail.com> wrote:
>> From: Sihai Yao <sihai.ysh@alibaba-inc.com>
>>
>> This patch sets bit_AVX2_Usable in __cpu_features.feature by checking
>> COMMON_CPUID_INDEX_7 for Haswell. Architecture-specific assembler files
>> can use this bit to select the code path.
>>
>> ---
>> ChangeLog | 9 +++++++++
>> sysdeps/x86_64/multiarch/ifunc-defines.sym | 2 ++
>> sysdeps/x86_64/multiarch/init-arch.c | 3 +++
>> sysdeps/x86_64/multiarch/init-arch.h | 9 +++++++++
>> 4 files changed, 23 insertions(+)
>>
>> diff --git a/ChangeLog b/ChangeLog
>> index da8ea6d..ab23a3a 100644
>> --- a/ChangeLog
>> +++ b/ChangeLog
>> @@ -1,3 +1,12 @@
>> +2014-04-04 Sihai Yao <sihai.ysh@alibaba-inc.com>
>> +
>> + * sysdeps/x86_64/multiarch/ifunc-defines.sym: Add COMMON_CPUID_INDEX_7 and
>> + FEATURE_INDEX_7.
>> + * sysdeps/x86_64/multiarch/init-arch.c: Add AVX2 detection from cpu
>> + features word of COMMON_CPUID_INDEX_7.
>> + * sysdeps/x86_64/multiarch/init-arch.h: Add bit_AVX2_Usable for memset.S
>> + to determine calling path.
>> +
>> 2014-04-03 David Svoboda <svoboda@cert.org>
>>
>> [BZ #5666]
>> diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
>> index eb1538a..448b8c4 100644
>> --- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
>> +++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
>> @@ -17,4 +17,6 @@ FEATURE_OFFSET offsetof (struct cpu_features, feature)
>> FEATURE_SIZE sizeof (unsigned int)
>>
>> COMMON_CPUID_INDEX_1
>> +COMMON_CPUID_INDEX_7
>> FEATURE_INDEX_1
>> +FEATURE_INDEX_7
>> diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
>> index db74d97..2bbc5eb 100644
>> --- a/sysdeps/x86_64/multiarch/init-arch.c
>> +++ b/sysdeps/x86_64/multiarch/init-arch.c
>> @@ -106,6 +106,7 @@ __init_cpu_features (void)
>> case 0x2c:
>> case 0x2e:
>> case 0x2f:
>> + case 0x3c:
>
> This isn't mentioned in the ChangeLog. The IA Optimization Reference Manual
> shows that models 0x45 and 0x46 are also Haswell. This should be in a
> separate patch.
>
>> /* Rep string instructions, copy backward, unaligned loads
>> and pminub are fast on Intel Core i3, i5 and i7. */
>> #if index_Fast_Rep_String != index_Fast_Copy_Backward
>> @@ -153,6 +154,8 @@ __init_cpu_features (void)
>> __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ebx,
>> __cpu_features.cpuid[COMMON_CPUID_INDEX_7].ecx,
>> __cpu_features.cpuid[COMMON_CPUID_INDEX_7].edx);
>> + if (CPUID_AVX2)
>> + __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
> ^^^^^^^^^^
>
> This should be inside if (CPUID_OSXSAVE), similar to bit_AVX_Usable.
>
>> /* Can we call xgetbv? */
>> if (CPUID_OSXSAVE)
>> diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
>> index 793707a..e453ccc 100644
>> --- a/sysdeps/x86_64/multiarch/init-arch.h
>> +++ b/sysdeps/x86_64/multiarch/init-arch.h
>> @@ -24,6 +24,7 @@
>> #define bit_FMA_Usable (1 << 7)
>> #define bit_FMA4_Usable (1 << 8)
>> #define bit_Slow_SSE4_2 (1 << 9)
>> +#define bit_AVX2_Usable (1 << 10)
>>
>> /* CPUID Feature flags. */
>>
>> @@ -40,6 +41,7 @@
>>
>> /* COMMON_CPUID_INDEX_7. */
>> #define bit_RTM (1 << 11)
>> +#define bit_AVX2 (1 << 5)
>>
>> /* XCR0 Feature flags. */
>> #define bit_XMM_state (1 << 1)
>> @@ -54,6 +56,7 @@
>> # define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
>> # define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
>> # define index_AVX COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
>> +# define index_AVX2 COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
>>
>> # define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
>> # define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
>> @@ -64,6 +67,7 @@
>> # define index_FMA_Usable FEATURE_INDEX_1*FEATURE_SIZE
>> # define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE
>> # define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE
>> +# define index_AVX2_Usable FEATURE_INDEX_7*FEATURE_SIZE
>>
>> #else /* __ASSEMBLER__ */
>>
>> @@ -81,6 +85,7 @@ enum
>> enum
>> {
>> FEATURE_INDEX_1 = 0,
>> + FEATURE_INDEX_7,
>> /* Keep the following line at the end. */
>> FEATURE_INDEX_MAX
>> };
>> @@ -145,6 +150,8 @@ extern const struct cpu_features *__get_cpu_features (void)
>> HAS_CPUID_FLAG (COMMON_CPUID_INDEX_80000001, ecx, bit_FMA4)
>> # define CPUID_RTM \
>> HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
>> +# define CPUID_AVX2 \
>> + HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX2)
>>
>> /* HAS_* evaluates to true if we may use the feature at runtime. */
>> # define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, bit_SSE2)
>> @@ -153,6 +160,7 @@ extern const struct cpu_features *__get_cpu_features (void)
>> # define HAS_SSE4_1 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_1)
>> # define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_2)
>> # define HAS_RTM HAS_CPU_FEATURE (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
>> +# define HAS_AVX2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_7, ebx, bit_AVX2)
>>
>> # define index_Fast_Rep_String FEATURE_INDEX_1
>> # define index_Fast_Copy_Backward FEATURE_INDEX_1
>> @@ -163,6 +171,7 @@ extern const struct cpu_features *__get_cpu_features (void)
>> # define index_FMA_Usable FEATURE_INDEX_1
>> # define index_FMA4_Usable FEATURE_INDEX_1
>> # define index_Slow_SSE4_2 FEATURE_INDEX_1
>> +# define index_AVX2_Usable FEATURE_INDEX_7
>>
>> # define HAS_ARCH_FEATURE(name) \
>> ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
>> --
>> 1.8.1.4
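Here is the updated patch. We don't need FEATURE_INDEX_7: bit_AVX2_Usable (1 << 10) fits in the existing FEATURE_INDEX_1 word, so index_AVX2_Usable maps to FEATURE_INDEX_1.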
--
H.J.
diff --git a/sysdeps/x86_64/multiarch/ifunc-defines.sym b/sysdeps/x86_64/multiarch/ifunc-defines.sym
index eb1538a..a410d88 100644
--- a/sysdeps/x86_64/multiarch/ifunc-defines.sym
+++ b/sysdeps/x86_64/multiarch/ifunc-defines.sym
@@ -17,4 +17,5 @@ FEATURE_OFFSET offsetof (struct cpu_features, feature)
FEATURE_SIZE sizeof (unsigned int)
COMMON_CPUID_INDEX_1
+COMMON_CPUID_INDEX_7
FEATURE_INDEX_1
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index db74d97..2a6dcb7 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -167,6 +167,9 @@ __init_cpu_features (void)
/* Determine if AVX is usable. */
if (CPUID_AVX)
__cpu_features.feature[index_AVX_Usable] |= bit_AVX_Usable;
+ /* Determine if AVX2 is usable. */
+ if (CPUID_AVX2)
+ __cpu_features.feature[index_AVX2_Usable] |= bit_AVX2_Usable;
/* Determine if FMA is usable. */
if (CPUID_FMA)
__cpu_features.feature[index_FMA_Usable] |= bit_FMA_Usable;
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 793707a..813b6de 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -24,6 +24,7 @@
#define bit_FMA_Usable (1 << 7)
#define bit_FMA4_Usable (1 << 8)
#define bit_Slow_SSE4_2 (1 << 9)
+#define bit_AVX2_Usable (1 << 10)
/* CPUID Feature flags. */
@@ -40,6 +41,7 @@
/* COMMON_CPUID_INDEX_7. */
#define bit_RTM (1 << 11)
+#define bit_AVX2 (1 << 5)
/* XCR0 Feature flags. */
#define bit_XMM_state (1 << 1)
@@ -54,6 +56,7 @@
# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_AVX COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
+# define index_AVX2 COMMON_CPUID_INDEX_7*CPUID_SIZE+CPUID_EBX_OFFSET
# define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
# define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
@@ -64,6 +67,7 @@
# define index_FMA_Usable FEATURE_INDEX_1*FEATURE_SIZE
# define index_FMA4_Usable FEATURE_INDEX_1*FEATURE_SIZE
# define index_Slow_SSE4_2 FEATURE_INDEX_1*FEATURE_SIZE
+# define index_AVX2_Usable FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */
@@ -145,6 +149,8 @@ extern const struct cpu_features *__get_cpu_features (void)
HAS_CPUID_FLAG (COMMON_CPUID_INDEX_80000001, ecx, bit_FMA4)
# define CPUID_RTM \
HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
+# define CPUID_AVX2 \
+ HAS_CPUID_FLAG (COMMON_CPUID_INDEX_7, ebx, bit_AVX2)
/* HAS_* evaluates to true if we may use the feature at runtime. */
# define HAS_SSE2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, edx, bit_SSE2)
@@ -153,6 +159,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define HAS_SSE4_1 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_1)
# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, bit_SSE4_2)
# define HAS_RTM HAS_CPU_FEATURE (COMMON_CPUID_INDEX_7, ebx, bit_RTM)
+# define HAS_AVX2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_7, ebx, bit_AVX2)
# define index_Fast_Rep_String FEATURE_INDEX_1
# define index_Fast_Copy_Backward FEATURE_INDEX_1
@@ -163,6 +170,7 @@ extern const struct cpu_features *__get_cpu_features (void)
# define index_FMA_Usable FEATURE_INDEX_1
# define index_FMA4_Usable FEATURE_INDEX_1
# define index_Slow_SSE4_2 FEATURE_INDEX_1
+# define index_AVX2_Usable FEATURE_INDEX_1
# define HAS_ARCH_FEATURE(name) \
((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)