This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: [PATCH v2 neleai/string-x64] Improve memcmp performance and fix regression.
- From: Ondřej Bílka <neleai at seznam dot cz>
- To: libc-alpha at sourceware dot org
- Date: Fri, 19 Jun 2015 17:53:04 +0200
- Subject: Re: [PATCH v2 neleai/string-x64] Improve memcmp performance and fix regression.
- Authentication-results: sourceware.org; auth=none
- References: <20150618080910 dot GA27306 at domone>
On Thu, Jun 18, 2015 at 10:09:10AM +0200, Ondřej Bílka wrote:
> Hi,
>
> As I submitted a memcmp improvement before in 2013, here is a new version
> that improves performance a bit more.
>
> Also when I browsed results I found that memcmp-sse4 is in fact
> regression for i7 nehalem, ivy bridge and haswell architectures. There
> its beaten by old sse2 code by more than 10%.
>
> The main idea of the new implementation is the same; the problem with
> performance is that a lot of inputs were identical with small n.
> For that I found that following approach gives best performance when
> n<64 is likely.
>
> if (!cross_page (s1) && !cross_page (s2))
> {
> mask = get_mask(EQ(EQ(LOAD(s1),LOAD(s2)),zero))
> mask2 = mask & (2 << (n-1));
> if (mask2)
> return s1[first_byte(mask2)]-s2[first_byte(mask2)];
> if (n<=16)
> return 0;
> mask |= get_mask(EQ(EQ(LOAD(s1+16),LOAD(s2+16)),zero)) << 16;
> mask |= get_mask(EQ(EQ(LOAD(s1+32),LOAD(s2+32)),zero)) << 32;
> mask |= get_mask(EQ(EQ(LOAD(s1+48),LOAD(s2+48)),zero)) << 48;
> mask2 = mask & (2 << (n-1));
> if (mask2)
> return s1[first_byte(mask2)]-s2[first_byte(mask2)];
> if (n<=64)
> return 0;
> if (mask)
> return s1[first_byte(mask)]-s2[first_byte(mask)];
> }
>
> I haven't yet checked using just registers and byteswap to eliminate the
> need of getting the exact byte position, as I wrote in a related thread.
>
> I could improve this a bit more; I lose a lot of cycles in loop ending
> conditions. Problem is that I need to handle that unaligned s2 may read
> from next page, I would need to add more complicated logic to compute
> number of loop iterations.
>
> That's related to avx2. I included it as an RFC, but it harms performance
> on haswell.
>
> Last is wmemcmp, which I would also need to convert; for now I just moved
> memcmp-sse4 there.
>
> A profile is found here.
>
> http://kam.mff.cuni.cz/~ondra/benchmark_string/memcmp_profile.html
>
I updated that new version. I removed avx2 for now, I will submit it
when I find how it could improve performance.
Second change is that I added wmemcmp conditionals so now I could delete
memcmp-sse4 and wmemcmp-sse4.
* sysdeps/x86_64/memcmp.S: New implementation.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Remove memcmp-sse4
* sysdeps/x86_64/multiarch/Makefile(routines): Remove memcmp-sse4.
* sysdeps/x86_64/multiarch/memcmp.S: Likewise.
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Removed.
* sysdeps/x86_64/multiarch/wmemcmp-sse4.S: Likewise.
---
sysdeps/x86_64/memcmp.S | 495 ++++++++---------------
sysdeps/x86_64/multiarch/Makefile | 6 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 9 +-
sysdeps/x86_64/multiarch/memcmp-avx2.S | 3 +
sysdeps/x86_64/multiarch/memcmp.S | 25 +-
sysdeps/x86_64/multiarch/wmemcmp-sse4.S | 4 -
sysdeps/x86_64/multiarch/wmemcmp.S | 12 +-
8 files changed, 203 insertions(+), 360 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2.S
delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse4.S
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index f636716..55377fe 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -19,340 +19,185 @@
#include <sysdep.h>
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
.text
-ENTRY (memcmp)
- test %rdx, %rdx
- jz L(finz)
- cmpq $1, %rdx
- jle L(finr1b)
- subq %rdi, %rsi
- movq %rdx, %r10
- cmpq $32, %r10
- jge L(gt32)
- /* Handle small chunks and last block of less than 32 bytes. */
-L(small):
- testq $1, %r10
- jz L(s2b)
- movzbl (%rdi), %eax
- movzbl (%rdi, %rsi), %edx
- subq $1, %r10
- je L(finz1)
- addq $1, %rdi
- subl %edx, %eax
- jnz L(exit)
-L(s2b):
- testq $2, %r10
- jz L(s4b)
- movzwl (%rdi), %eax
- movzwl (%rdi, %rsi), %edx
- subq $2, %r10
- je L(fin2_7)
- addq $2, %rdi
- cmpl %edx, %eax
- jnz L(fin2_7)
-L(s4b):
- testq $4, %r10
- jz L(s8b)
- movl (%rdi), %eax
- movl (%rdi, %rsi), %edx
- subq $4, %r10
- je L(fin2_7)
- addq $4, %rdi
- cmpl %edx, %eax
- jnz L(fin2_7)
-L(s8b):
- testq $8, %r10
- jz L(s16b)
- movq (%rdi), %rax
- movq (%rdi, %rsi), %rdx
- subq $8, %r10
- je L(fin2_7)
- addq $8, %rdi
- cmpq %rdx, %rax
- jnz L(fin2_7)
-L(s16b):
- movdqu (%rdi), %xmm1
- movdqu (%rdi, %rsi), %xmm0
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- xorl %eax, %eax
- subl $0xffff, %edx
- jz L(finz)
- bsfl %edx, %ecx
- leaq (%rdi, %rcx), %rcx
- movzbl (%rcx), %eax
- movzbl (%rsi, %rcx), %edx
- jmp L(finz1)
+ENTRY (MEMCMP)
+ testq %rdx, %rdx
+ je L(return_zero)
+#ifdef AS_WMEMCMP
+ shl $2, %rdx
+#endif
+ pxor %xmm4, %xmm4
+ movl %edi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+L(handle_end):
+ movl %esi, %eax
+ andl $4095, %eax
+ cmpl $4032, %eax
+ jg L(cross_page)
+ movdqu (%rdi), %xmm0
+ lea -1(%edx), %ecx
+ movl $2, %eax
+ movdqu (%rsi), %xmm1
+ salq %cl, %rax
+ leaq -1(%rax), %rcx
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ and %ecx, %eax
+ jne L(different)
+ cmpq $16, %rdx
+ ja L(next)
+ ret
+L(next):
+ pmovmskb %xmm0, %r8d
+ movdqu 16(%rdi), %xmm2
+ movdqu 16(%rsi), %xmm6
+ movdqu 32(%rdi), %xmm1
+ pcmpeqb %xmm6, %xmm2
+ movdqu 32(%rsi), %xmm5
+ pcmpeqb %xmm4, %xmm2
+ pcmpeqb %xmm5, %xmm1
+ movdqu 48(%rdi), %xmm7
+ pmovmskb %xmm2, %eax
+ movdqu 48(%rsi), %xmm3
+ pcmpeqb %xmm4, %xmm1
+ pmovmskb %xmm1, %r9d
+ sal $16, %eax
+ pcmpeqb %xmm3, %xmm7
+ salq $32, %r9
+ pcmpeqb %xmm4, %xmm7
+ orq %r9, %rax
+ orq %r8, %rax
+ pmovmskb %xmm7, %r8d
+ salq $48, %r8
+ orq %r8, %rax
+ movq %rax, %r8
+ andq %rcx, %rax
+ jne L(different)
+ cmpq $64, %rdx
+ jbe L(return_zero)
+ movq %r8, %rax
+ testq %rax, %rax
+ jne L(different)
+L(align_loop):
+ leaq 64(%rdi), %rax
+ andq $-64, %rax
+ subq %rdi, %rax
+ subq %rax, %rdx
+ addq %rax, %rdi
+ addq %rax, %rsi
+ cmpq $64, %rdx
+ ja L(loop_start)
+ testq %rdx, %rdx
+ jne L(handle_end)
+ xorl %eax, %eax
+ ret
- .p2align 4,, 4
-L(finr1b):
- movzbl (%rdi), %eax
- movzbl (%rsi), %edx
-L(finz1):
+ .p2align 4
+L(different):
+ bsfq %rax, %rdx
+#ifdef AS_WMEMCMP
+ and $-4, %rdx
+ mov (%rdi,%rdx), %eax
+ mov (%rsi,%rdx), %edx
subl %edx, %eax
-L(exit):
+ jg L(ret1)
+ jl L(ret_neg_1)
ret
-
- .p2align 4,, 4
-L(fin2_7):
- cmpq %rdx, %rax
- jz L(finz)
- movq %rax, %r11
- subq %rdx, %r11
- bsfq %r11, %rcx
- sarq $3, %rcx
- salq $3, %rcx
- sarq %cl, %rax
- movzbl %al, %eax
- sarq %cl, %rdx
- movzbl %dl, %edx
+L(ret1):
+ mov $1, %eax
+ ret
+L(ret_neg_1):
+ mov $-1, %eax
+ ret
+#else
+ movzbl (%rdi,%rdx), %eax
+ movzbl (%rsi,%rdx), %edx
subl %edx, %eax
ret
-
- .p2align 4,, 4
-L(finz):
+#endif
+
+ .p2align 4
+L(loop):
+ subq $64, %rdx
+ addq $64, %rdi
+ addq $64, %rsi
+ cmpq $64, %rdx
+ jbe L(less_64_bytes)
+L(loop_start):
+ movdqu (%rsi), %xmm0
+ movdqu 16(%rsi), %xmm1
+ pcmpeqb (%rdi), %xmm0
+ movdqu 32(%rsi), %xmm2
+ pcmpeqb 16(%rdi), %xmm1
+ movdqu 48(%rsi), %xmm3
+ pcmpeqb 32(%rdi), %xmm2
+ pcmpeqb 48(%rdi), %xmm3
+ pminub %xmm0, %xmm3
+ pminub %xmm1, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm4, %xmm3
+ pmovmskb %xmm3, %eax
+ testl %eax, %eax
+ je L(loop)
+ shl $48, %rax
+ pcmpeqb %xmm4, %xmm0
+ pcmpeqb %xmm4, %xmm1
+ pcmpeqb %xmm4, %xmm2
+ pmovmskb %xmm0, %r8
+ pmovmskb %xmm1, %rcx
+ pmovmskb %xmm2, %r9
+ shl $16, %ecx
+ shl $32, %r9
+ or %r8, %rax
+ or %r9, %rax
+ or %rcx, %rax
+ jmp L(different)
+
+ .p2align 4
+L(less_64_bytes):
+ testq %rdx, %rdx
+ jne L(handle_end)
xorl %eax, %eax
ret
- /* For blocks bigger than 32 bytes
- 1. Advance one of the addr pointer to be 16B aligned.
- 2. Treat the case of both addr pointers aligned to 16B
- separately to avoid movdqu.
- 3. Handle any blocks of greater than 64 consecutive bytes with
- unrolling to reduce branches.
- 4. At least one addr pointer is 16B aligned, use memory version
- of pcmbeqb.
- */
- .p2align 4,, 4
-L(gt32):
- movq %rdx, %r11
- addq %rdi, %r11
- movq %rdi, %r8
-
- andq $15, %r8
- jz L(16am)
- /* Both pointers may be misaligned. */
- movdqu (%rdi), %xmm1
- movdqu (%rdi, %rsi), %xmm0
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- subl $0xffff, %edx
- jnz L(neq)
- neg %r8
- leaq 16(%rdi, %r8), %rdi
-L(16am):
- /* Handle two 16B aligned pointers separately. */
- testq $15, %rsi
- jz L(ATR)
- testq $16, %rdi
- jz L(A32)
- movdqu (%rdi, %rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-L(A32):
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jge L(mt16)
- /* Pre-unroll to be ready for unrolled 64B loop. */
- testq $32, %rdi
- jz L(A64)
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
-L(A64):
- movq %r11, %r10
- andq $-64, %r10
- cmpq %r10, %rdi
- jge L(mt32)
-
-L(A64main):
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- cmpq %rdi, %r10
- jne L(A64main)
-
-L(mt32):
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jge L(mt16)
-
-L(A32main):
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- cmpq %rdi, %r10
- jne L(A32main)
-L(mt16):
- subq %rdi, %r11
- je L(finz)
- movq %r11, %r10
- jmp L(small)
-
- .p2align 4,, 4
-L(neq):
- bsfl %edx, %ecx
- movzbl (%rdi, %rcx), %eax
- addq %rdi, %rsi
- movzbl (%rsi,%rcx), %edx
- jmp L(finz1)
-
- .p2align 4,, 4
-L(ATR):
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jge L(mt16)
- testq $16, %rdi
- jz L(ATR32)
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
- cmpq %rdi, %r10
- je L(mt16)
-
-L(ATR32):
- movq %r11, %r10
- andq $-64, %r10
- testq $32, %rdi
- jz L(ATR64)
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
-L(ATR64):
- cmpq %rdi, %r10
- je L(mt32)
-
-L(ATR64main):
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
- cmpq %rdi, %r10
- jne L(ATR64main)
-
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jge L(mt16)
-
-L(ATR32res):
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- cmpq %r10, %rdi
- jne L(ATR32res)
-
- subq %rdi, %r11
- je L(finz)
- movq %r11, %r10
- jmp L(small)
- /* Align to 16byte to improve instruction fetch. */
- .p2align 4,, 4
-END(memcmp)
+ .p2align 4
+L(cross_page):
+ testq %rdx, %rdx
+ je L(return_zero)
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ cmpb %cl, %al
+ jne L(cross_page_different)
+ movl $1, %r8d
+ jmp L(cross_page_loop_start)
+
+ .p2align 4
+L(cross_page_loop):
+ movzbl (%rdi,%r8), %eax
+ movzbl (%rsi,%r8), %ecx
+ cmpb %cl, %al
+ jne L(cross_page_different)
+ addq $1, %r8
+ cmpq $65, %r8
+ je L(align_loop)
+L(cross_page_loop_start):
+ cmpq %rdx, %r8
+ jne L(cross_page_loop)
+L(return_zero):
+ xorl %eax, %eax
+ ret
+L(cross_page_different):
+ subl %ecx, %eax
+ ret
+END(MEMCMP)
-#undef bcmp
+#undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index c573744..679db2a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -8,7 +8,7 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strcmp-sse2-unaligned strncmp-ssse3 \
- memcmp-sse4 memcpy-ssse3 \
+ memcpy-ssse3 \
memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
memmove-avx-unaligned memcpy-avx-unaligned mempcpy-avx-unaligned \
@@ -29,10 +29,10 @@ CFLAGS-strspn-c.c += -msse4
endif
ifeq (yes,$(config-cflags-avx2))
-sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2
+sysdep_routines += memset-avx2 strcpy-avx2 stpcpy-avx2 memcmp-avx2
endif
endif
ifeq ($(subdir),wcsmbs)
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
+sysdep_routines += wmemcmp-sse2-unaligned wmemcmp-ssse3 wmemcmp-c wcscpy-ssse3 wcscpy-c
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d398e43..b3dbe65 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -39,10 +39,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/memcmp.S. */
IFUNC_IMPL (i, name, memcmp,
- IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSE4_1,
- __memcmp_sse4_1)
+ IFUNC_IMPL_ADD (array, i, memcmp, HAS_AVX2, __memcmp_avx2)
IFUNC_IMPL_ADD (array, i, memcmp, HAS_SSSE3, __memcmp_ssse3)
- IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
+ IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2_unaligned))
/* Support sysdeps/x86_64/multiarch/memmove_chk.S. */
IFUNC_IMPL (i, name, __memmove_chk,
@@ -211,8 +210,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wmemcmp.S. */
IFUNC_IMPL (i, name, wmemcmp,
- IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSE4_1,
- __wmemcmp_sse4_1)
+ IFUNC_IMPL_ADD (array, i, wmemcmp, 1,
+ __wmemcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, wmemcmp, HAS_SSSE3,
__wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2.S b/sysdeps/x86_64/multiarch/memcmp-avx2.S
new file mode 100644
index 0000000..60483bf
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AVX2
+#define MEMCMP __memcmp_avx2
+#include "../memcmp.S"
diff --git a/sysdeps/x86_64/multiarch/memcmp.S b/sysdeps/x86_64/multiarch/memcmp.S
index f8b4636..5d87a17 100644
--- a/sysdeps/x86_64/multiarch/memcmp.S
+++ b/sysdeps/x86_64/multiarch/memcmp.S
@@ -29,33 +29,28 @@ ENTRY(memcmp)
cmpl $0, KIND_OFFSET+__cpu_features(%rip)
jne 1f
call __init_cpu_features
-
-1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
jnz 2f
- leaq __memcmp_sse2(%rip), %rax
- ret
-
-2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
- jz 3f
- leaq __memcmp_sse4_1(%rip), %rax
+1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jnz 3f
+2: leaq __memcmp_sse2_unaligned(%rip), %rax
ret
3: leaq __memcmp_ssse3(%rip), %rax
ret
-
END(memcmp)
# undef ENTRY
# define ENTRY(name) \
- .type __memcmp_sse2, @function; \
+ .type __memcmp_sse2_unaligned, @function; \
.p2align 4; \
- .globl __memcmp_sse2; \
- .hidden __memcmp_sse2; \
- __memcmp_sse2: cfi_startproc; \
+ .globl __memcmp_sse2_unaligned; \
+ .hidden __memcmp_sse2_unaligned; \
+ __memcmp_sse2_unaligned: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
- cfi_endproc; .size __memcmp_sse2, .-__memcmp_sse2
+ cfi_endproc; .size __memcmp_sse2_unaligned, .-__memcmp_sse2_unaligned
# ifdef SHARED
# undef libc_hidden_builtin_def
@@ -63,7 +58,7 @@ END(memcmp)
they will be called without setting up EBX needed for PLT which is
used by IFUNC. */
# define libc_hidden_builtin_def(name) \
- .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2
+ .globl __GI_memcmp; __GI_memcmp = __memcmp_sse2_unaligned
# endif
#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S b/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
deleted file mode 100644
index b07973a..0000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-sse4.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_sse4_1
-
-#include "memcmp-sse4.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp.S b/sysdeps/x86_64/multiarch/wmemcmp.S
index 109e245..dabd3ed 100644
--- a/sysdeps/x86_64/multiarch/wmemcmp.S
+++ b/sysdeps/x86_64/multiarch/wmemcmp.S
@@ -30,18 +30,16 @@ ENTRY(wmemcmp)
jne 1f
call __init_cpu_features
-1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ testl $bit_Fast_Unaligned_Load, __cpu_features+FEATURE_OFFSET+index_Fast_Unaligned_Load(%rip)
jnz 2f
- leaq __wmemcmp_sse2(%rip), %rax
- ret
-
-2: testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip)
- jz 3f
- leaq __wmemcmp_sse4_1(%rip), %rax
+1: testl $bit_SSSE3, __cpu_features+CPUID_OFFSET+index_SSSE3(%rip)
+ jnz 3f
+2: leaq __wmemcmp_sse2_unaligned(%rip), %rax
ret
3: leaq __wmemcmp_ssse3(%rip), %rax
ret
+
END(wmemcmp)
#endif
--
1.8.4.rc3