This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch master updated. glibc-2.25-608-ge94c310
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 27 Jun 2017 14:55:41 -0000
- Subject: GNU C Library master sources branch master updated. glibc-2.25-608-ge94c310
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master, has been updated
via e94c31035739b693c3699b3c4cad0206631fbee7 (commit)
from 6980be7cbf4f108a4936ac64242f58340d56c806 (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e94c31035739b693c3699b3c4cad0206631fbee7
commit e94c31035739b693c3699b3c4cad0206631fbee7
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Tue Jun 27 07:55:00 2017 -0700
x86-64: Optimize memcmp-avx2-movbe.S for short difference
Check the first 32 bytes before checking size when size >= 32 bytes
to avoid unnecessary branch if the difference is in the first 32 bytes.
Replace vpmovmskb/subl/jnz with vptest/jnc.
On Haswell, the new version is as fast as the previous one. On Skylake,
the new version is a little bit faster.
* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S (MEMCMP): Check
the first 32 bytes before checking size when size >= 32 bytes.
Replace vpmovmskb/subl/jnz with vptest/jnc.
diff --git a/ChangeLog b/ChangeLog
index 48821c0..88dde2b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2017-06-27 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S (MEMCMP): Check
+ the first 32 bytes before checking size when size >= 32 bytes.
+ Replace vpmovmskb/subl/jnz with vptest/jnc.
+
2017-06-27 Stefan Liebler <stli@linux.vnet.ibm.com>
* sysdeps/s390/s390-32/tls-macros.h (TLS_IE): Use r12 for GOT address.
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index abcc61c..16f4630 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -62,9 +62,68 @@ ENTRY (MEMCMP)
# endif
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
+
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
cmpq $(VEC_SIZE * 2), %rdx
- ja L(more_2x_vec)
+ jbe L(last_vec)
+
+ VPCMPEQ %ymm0, %ymm0, %ymm0
+ /* More than 2 * VEC. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+
+ /* From 4 * VEC to 8 * VEC, inclusively. */
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+ vpand %ymm1, %ymm2, %ymm5
+ vpand %ymm3, %ymm4, %ymm6
+ vpand %ymm5, %ymm6, %ymm5
+
+ vptest %ymm0, %ymm5
+ jnc L(4x_vec_end)
+
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+ vpand %ymm2, %ymm1, %ymm5
+
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ vpand %ymm3, %ymm5, %ymm5
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ vpand %ymm4, %ymm5, %ymm5
+
+ vptest %ymm0, %ymm5
+ jnc L(4x_vec_end)
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+ .p2align 4
L(last_2x_vec):
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
vmovdqu (%rsi), %ymm2
@@ -219,58 +278,6 @@ L(between_16_31):
ret
.p2align 4
-L(more_2x_vec):
- /* More than 2 * VEC. */
- cmpq $(VEC_SIZE * 8), %rdx
- ja L(more_8x_vec)
- cmpq $(VEC_SIZE * 4), %rdx
- jb L(last_4x_vec)
-
- /* From 4 * VEC to 8 * VEC, inclusively. */
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
-
- vpand %ymm1, %ymm2, %ymm5
- vpand %ymm3, %ymm4, %ymm6
- vpand %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
-
- leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
- leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
- vpand %ymm2, %ymm1, %ymm5
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
- vpand %ymm3, %ymm5, %ymm5
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
- vpand %ymm4, %ymm5, %ymm5
-
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
- VZEROUPPER
- ret
-
- .p2align 4
L(more_8x_vec):
/* More than 8 * VEC. Check the first VEC. */
vmovdqu (%rsi), %ymm2
@@ -309,9 +316,8 @@ L(loop_4x_vec):
VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
vpand %ymm4, %ymm5, %ymm5
- vpmovmskb %ymm5, %eax
- subl $VEC_MASK, %eax
- jnz L(4x_vec_end)
+ vptest %ymm0, %ymm5
+ jnc L(4x_vec_end)
addq $(VEC_SIZE * 4), %rdi
addq $(VEC_SIZE * 4), %rsi
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 6 ++
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 118 ++++++++++++++------------
2 files changed, 68 insertions(+), 56 deletions(-)
hooks/post-receive
--
GNU C Library master sources