This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] [x86_32] Don't use SSE4_2 instructions on Intel Silvermont Micro Architecture.


It doesn't look very good when I summarized the results because there
are too many small-length cases in your benchmarks.
And small cases are more inaccurate.
Ssse3 looks asymptotically faster. Actually it would be better to use
the sse2 unaligned versions for Silvermont, but we don't have them in
glibc at the moment.
Both sse4_2 and ssse3 are not the best for SLM.

--
Liubov

On Mon, Jul 1, 2013 at 12:35 AM, Carlos O'Donell <carlos@redhat.com> wrote:
> On 06/30/2013 03:57 PM, Liubov Dmitrieva wrote:
>> Same patch as recently committed but for 32 bit.
>> Attached performance results for current bench glibc test suite.
>> Hopefully I caught a moment before the freeze of 2.18.
>
> To make life easier for the reviewer could you please summarize
> the performance differences?
>
>> 2013-06-30  Liubov Dmitrieva  <liubov.dmitrieva@intel.com>
>>
>>   * sysdeps/i386/i686/multiarch/memcmp.S: Skip SSE4_2
>>   version if bit_Slow_SSE4_2 is set.
>>   * sysdeps/i386/i686/multiarch/strcmp.S: Likewise.
>>   * sysdeps/i386/i686/multiarch/strncase.S: Likewise.
>>   * sysdeps/i386/i686/multiarch/strcasecmp.S: Likewise.
>>   * sysdeps/i386/i686/multiarch/wmemcmp.S: Likewise.
>>
>> --
>> Liubov
>>
>>
>> silvermont2.patch
>>
>>
>> diff --git a/sysdeps/i386/i686/multiarch/memcmp.S b/sysdeps/i386/i686/multiarch/memcmp.S
>> index 73d1363..8f4b38e 100644
>> --- a/sysdeps/i386/i686/multiarch/memcmp.S
>> +++ b/sysdeps/i386/i686/multiarch/memcmp.S
>> @@ -40,6 +40,8 @@ ENTRY(memcmp)
>>       leal    __memcmp_ssse3@GOTOFF(%ebx), %eax
>>       testl   $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
>>       jz      2f
>> +     testl   $bit_Slow_SSE4_2, CPUID_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
>> +     jnz     2f
>>       leal    __memcmp_sse4_2@GOTOFF(%ebx), %eax
>>  2:   popl    %ebx
>>       cfi_adjust_cfa_offset (-4)
>
> OK.
>
>> @@ -59,6 +61,8 @@ ENTRY(memcmp)
>>       leal    __memcmp_ssse3, %eax
>>       testl   $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
>>       jz      2f
>> +     testl   $bit_Slow_SSE4_2, CPUID_OFFSET+index_Slow_SSE4_2+__cpu_features
>> +     jnz     2f
>>       leal    __memcmp_sse4_2, %eax
>>  2:   ret
>>  END(memcmp)
>
> OK.
>
>> diff --git a/sysdeps/i386/i686/multiarch/strcasecmp.S b/sysdeps/i386/i686/multiarch/strcasecmp.S
>> index 3b38214..79a154e 100644
>> --- a/sysdeps/i386/i686/multiarch/strcasecmp.S
>> +++ b/sysdeps/i386/i686/multiarch/strcasecmp.S
>> @@ -37,6 +37,8 @@ ENTRY(__strcasecmp)
>>       leal    __strcasecmp_ssse3@GOTOFF(%ebx), %eax
>>       testl   $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
>>       jz      2f
>> +     testl   $bit_Slow_SSE4_2, CPUID_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
>> +     jnz     2f
>>       leal    __strcasecmp_sse4_2@GOTOFF(%ebx), %eax
>>  2:   popl    %ebx
>>       cfi_adjust_cfa_offset (-4)
>
> OK.
>
>> @@ -58,6 +60,8 @@ ENTRY(__strcasecmp)
>>       // XXX Temporarily
>>       testl   $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
>>       jz      2f
>> +     testl   $bit_Slow_SSE4_2, CPUID_OFFSET+index_Slow_SSE4_2+__cpu_features
>> +     jnz     2f
>>       leal    __strcasecmp_sse4_2, %eax
>>  #endif
>>  2:   ret
>
> You're adding code to an #ifdef'd out block, which is OK,
> but I'd like to know why it's disabled.
>
> Could you please find the history behind this?
>
>> diff --git a/sysdeps/i386/i686/multiarch/strcmp.S b/sysdeps/i386/i686/multiarch/strcmp.S
>> index 7dc2cef..41dd3b3 100644
>> --- a/sysdeps/i386/i686/multiarch/strcmp.S
>> +++ b/sysdeps/i386/i686/multiarch/strcmp.S
>> @@ -68,6 +68,8 @@ ENTRY(STRCMP)
>>       leal    __STRCMP_SSSE3@GOTOFF(%ebx), %eax
>>       testl   $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
>>       jz      2f
>> +     testl   $bit_Slow_SSE4_2, CPUID_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
>> +     jnz     2f
>>       leal    __STRCMP_SSE4_2@GOTOFF(%ebx), %eax
>>  2:   popl    %ebx
>>       cfi_adjust_cfa_offset (-4)
>
> OK.
>
>> @@ -87,6 +89,8 @@ ENTRY(STRCMP)
>>       leal    __STRCMP_SSSE3, %eax
>>       testl   $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
>>       jz      2f
>> +     testl   $bit_Slow_SSE4_2, CPUID_OFFSET+index_Slow_SSE4_2+__cpu_features
>> +     jnz     2f
>>       leal    __STRCMP_SSE4_2, %eax
>>  2:   ret
>>  END(STRCMP)
>
> OK.
>
>> diff --git a/sysdeps/i386/i686/multiarch/strncase.S b/sysdeps/i386/i686/multiarch/strncase.S
>> index 51c6d72..4045f71 100644
>> --- a/sysdeps/i386/i686/multiarch/strncase.S
>> +++ b/sysdeps/i386/i686/multiarch/strncase.S
>> @@ -37,6 +37,8 @@ ENTRY(__strncasecmp)
>>       leal    __strncasecmp_ssse3@GOTOFF(%ebx), %eax
>>       testl   $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
>>       jz      2f
>> +     testl   $bit_Slow_SSE4_2, CPUID_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
>> +     jnz     2f
>>       leal    __strncasecmp_sse4_2@GOTOFF(%ebx), %eax
>>  2:   popl    %ebx
>>       cfi_adjust_cfa_offset (-4)
>
> OK.
>
>> @@ -58,6 +60,8 @@ ENTRY(__strncasecmp)
>>       // XXX Temporarily
>>       testl   $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features
>>       jz      2f
>> +     testl   $bit_Slow_SSE4_2, CPUID_OFFSET+index_Slow_SSE4_2+__cpu_features
>> +     jnz     2f
>>       leal    __strncasecmp_sse4_2, %eax
>>  #endif
>>  2:   ret
>
> Same #ifdef'd out code block as above, OK, but I'd like to
> know why it's disabled.
>
>> diff --git a/sysdeps/i386/i686/multiarch/wmemcmp.S b/sysdeps/i386/i686/multiarch/wmemcmp.S
>> index e994038..e685a9f 100644
>> --- a/sysdeps/i386/i686/multiarch/wmemcmp.S
>> +++ b/sysdeps/i386/i686/multiarch/wmemcmp.S
>> @@ -40,6 +40,8 @@ ENTRY(wmemcmp)
>>       leal    __wmemcmp_ssse3@GOTOFF(%ebx), %eax
>>       testl   $bit_SSE4_2, CPUID_OFFSET+index_SSE4_2+__cpu_features@GOTOFF(%ebx)
>>       jz      2f
>> +     testl   $bit_Slow_SSE4_2, CPUID_OFFSET+index_Slow_SSE4_2+__cpu_features@GOTOFF(%ebx)
>> +     jnz     2f
>>       leal    __wmemcmp_sse4_2@GOTOFF(%ebx), %eax
>>  2:   popl    %ebx
>>       cfi_adjust_cfa_offset (-4)
>
> OK.
>
> Please post a v2.
>
> Cheers,
> Carlos.

Attachment: benchmarks.txt
Description: Text document


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]