This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Don't use SSE4_2 instructions on Intel Silvermont Micro Architecture.


On Thu, Jun 27, 2013 at 9:29 AM, Carlos O'Donell <carlos@redhat.com> wrote:
> On 06/27/2013 12:17 PM, Liubov Dmitrieva wrote:
>> I checked glibc benchmark suite and results look good.
>>
>> You probably missed my results.
>>
>> http://sourceware.org/ml/libc-alpha/2013-06/msg00792.html
>
> I did, thanks for reposting.
>
> Yes, the numbers look good — a ~30% speedup in some of the cases
> I inspected.
>
> I'm happy with the results. If nobody objects I'd say check
> in your patches on Friday.
>
> What we really need now is a comparison script to collate
> and compare two sets of results. It was annoying to look at
> the result data manually :-(
>
> Cheers,
> Carlos.
>

This is what I checked in for Liubov.

Thanks.

--
H.J.
---
diff --git a/ChangeLog b/ChangeLog
index c5551b8..406ca28 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2013-06-28  Liubov Dmitrieva  <liubov.dmitrieva@intel.com>
+
+    * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): Set
+    bit_Slow_SSE4_2 and bit_Prefer_PMINUB_for_stringop for Intel
+    Silvermont.
+    * sysdeps/x86_64/multiarch/init-arch.h (bit_Slow_SSE4_2): New
+    macro.
+    (index_Slow_SSE4_2): Likewise.
+    (index_Prefer_PMINUB_for_stringop): Likewise.
+    * sysdeps/x86_64/multiarch/strchr.S: Skip SSE4.2 version if
+    bit_Slow_SSE4_2 is set.
+    * sysdeps/x86_64/multiarch/strcmp.S: Likewise.
+    * sysdeps/x86_64/multiarch/strrchr.S: Likewise.
+
 2013-06-28  Ryan S. Arnold  <rsa@linux.vnet.ibm.com>

     * sysdeps/powerpc/Makefile: Add comment about generating an offset to
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index 9524aee..5583961 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -81,8 +81,16 @@ __init_cpu_features (void)
         case 0x37:
           /* Unaligned load versions are faster than SSSE3
          on Silvermont.  */
+#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
+# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
+#endif
+#if index_Fast_Unaligned_Load != index_Slow_SSE4_2
+# error index_Fast_Unaligned_Load != index_Slow_SSE4_2
+#endif
           __cpu_features.feature[index_Fast_Unaligned_Load]
-        |= bit_Fast_Unaligned_Load;
+        |= (bit_Fast_Unaligned_Load
+            | bit_Prefer_PMINUB_for_stringop
+            | bit_Slow_SSE4_2);
           break;

         default:
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index 28edbf7..0cb5f5b 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -23,6 +23,7 @@
 #define bit_AVX_Usable            (1 << 6)
 #define bit_FMA_Usable            (1 << 7)
 #define bit_FMA4_Usable            (1 << 8)
+#define bit_Slow_SSE4_2            (1 << 9)

 /* CPUID Feature flags.  */

@@ -62,6 +63,7 @@
 # define index_AVX_Usable        FEATURE_INDEX_1*FEATURE_SIZE
 # define index_FMA_Usable        FEATURE_INDEX_1*FEATURE_SIZE
 # define index_FMA4_Usable        FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Slow_SSE4_2        FEATURE_INDEX_1*FEATURE_SIZE

 #else    /* __ASSEMBLER__ */

@@ -156,9 +158,11 @@ extern const struct cpu_features *__get_cpu_features (void)
 # define index_Fast_Copy_Backward    FEATURE_INDEX_1
 # define index_Slow_BSF            FEATURE_INDEX_1
 # define index_Fast_Unaligned_Load    FEATURE_INDEX_1
+# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1
 # define index_AVX_Usable        FEATURE_INDEX_1
 # define index_FMA_Usable        FEATURE_INDEX_1
 # define index_FMA4_Usable        FEATURE_INDEX_1
+# define index_Slow_SSE4_2        FEATURE_INDEX_1

 # define HAS_ARCH_FEATURE(name) \
   ((__get_cpu_features ()->feature[index_##name] & (bit_##name)) != 0)
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index 6860329..f170238 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -29,6 +29,8 @@ ENTRY(strchr)
     jne    1f
     call    __init_cpu_features
 1:    leaq    __strchr_sse2(%rip), %rax
+    testl    $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
+    jnz    2f
     testl    $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
     jz    2f
     leaq    __strchr_sse42(%rip), %rax
diff --git a/sysdeps/x86_64/multiarch/strcmp.S b/sysdeps/x86_64/multiarch/strcmp.S
index f69aaf4..1d4d711 100644
--- a/sysdeps/x86_64/multiarch/strcmp.S
+++ b/sysdeps/x86_64/multiarch/strcmp.S
@@ -88,14 +88,16 @@ ENTRY(STRCMP)
     jne    1f
     call    __init_cpu_features
 1:
+    testl    $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
+    jnz    2f
     leaq    STRCMP_SSE42(%rip), %rax
     testl    $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-    jnz    2f
-    leaq    STRCMP_SSSE3(%rip), %rax
+    jnz    3f
+2:    leaq    STRCMP_SSSE3(%rip), %rax
     testl    $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
-    jnz    2f
+    jnz    3f
     leaq    STRCMP_SSE2(%rip), %rax
-2:    ret
+3:    ret
 END(STRCMP)

 # ifdef USE_AS_STRCASECMP_L
@@ -109,16 +111,18 @@ ENTRY(__strcasecmp)
 #  ifdef HAVE_AVX_SUPPORT
     leaq    __strcasecmp_avx(%rip), %rax
     testl    $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
-    jnz    2f
+    jnz    3f
 #  endif
+    testl    $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
+    jnz    2f
     leaq    __strcasecmp_sse42(%rip), %rax
     testl    $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-    jnz    2f
-    leaq    __strcasecmp_ssse3(%rip), %rax
+    jnz    3f
+2:    leaq    __strcasecmp_ssse3(%rip), %rax
     testl    $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
-    jnz    2f
+    jnz    3f
     leaq    __strcasecmp_sse2(%rip), %rax
-2:    ret
+3:    ret
 END(__strcasecmp)
 weak_alias (__strcasecmp, strcasecmp)
 # endif
@@ -133,16 +137,18 @@ ENTRY(__strncasecmp)
 #  ifdef HAVE_AVX_SUPPORT
     leaq    __strncasecmp_avx(%rip), %rax
     testl    $bit_AVX_Usable, __cpu_features+FEATURE_OFFSET+index_AVX_Usable(%rip)
-    jnz    2f
+    jnz    3f
 #  endif
+    testl    $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
+    jnz    2f
     leaq    __strncasecmp_sse42(%rip), %rax
     testl    $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
-    jnz    2f
-    leaq    __strncasecmp_ssse3(%rip), %rax
+    jnz    3f
+2:    leaq    __strncasecmp_ssse3(%rip), %rax
     testl    $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
-    jnz    2f
+    jnz    3f
     leaq    __strncasecmp_sse2(%rip), %rax
-2:    ret
+3:    ret
 END(__strncasecmp)
 weak_alias (__strncasecmp, strncasecmp)
 # endif
diff --git a/sysdeps/x86_64/multiarch/strrchr.S b/sysdeps/x86_64/multiarch/strrchr.S
index ee6af6e..3f92a41 100644
--- a/sysdeps/x86_64/multiarch/strrchr.S
+++ b/sysdeps/x86_64/multiarch/strrchr.S
@@ -32,6 +32,8 @@ ENTRY(strrchr)
     jne    1f
     call    __init_cpu_features
 1:    leaq    __strrchr_sse2(%rip), %rax
+    testl    $bit_Slow_SSE4_2, __cpu_features+CPUID_OFFSET+index_Slow_SSE4_2(%rip)
+    jnz    2f
     testl    $bit_SSE4_2, __cpu_features+CPUID_OFFSET+index_SSE4_2(%rip)
     jz    2f
     leaq    __strrchr_sse42(%rip), %rax


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]