This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.
GNU C Library master sources branch hjl/erms/hybrid updated. glibc-2.23-142-g08eb09c
- From: hjl@sourceware.org
- To: glibc-cvs@sourceware.org
- Date: 23 Mar 2016 21:44:46 -0000
- Subject: GNU C Library master sources branch hjl/erms/hybrid updated. glibc-2.23-142-g08eb09c
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/erms/hybrid has been updated
via 08eb09c6eeca2038316693245595d0cadd475aad (commit)
via 14c6a52283c862765c4a39d708c0759bd51f6a7c (commit)
from 942d5a67c652603257c4edcf9ee5d05951a454cb (commit)
Those revisions listed above that are new to this repository have
not appeared in any other notification email, so we list them
in full below.
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=08eb09c6eeca2038316693245595d0cadd475aad
commit 08eb09c6eeca2038316693245595d0cadd475aad
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Mar 23 11:45:38 2016 -0700
Add memcpy-avx-unaligned-erms.S
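For readers following the branch: the dispatch this hybrid variant
implements can be summarized in C. The sketch below is illustrative
only; the function name is invented, the medium-size loop is simplified
(the real assembly copies in pairs from both ends and switches to an
aligned-store loop above 256 bytes), and the threshold value is a
placeholder (REP_MOVSB_THRESHOLD is a build-time macro in the real
code, not the constant used here).

#include <stddef.h>
#include <string.h>

enum { SKETCH_REP_MOVSB_THRESHOLD = 2048 };   /* placeholder value */

static void *
memcpy_avx_erms_sketch (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  if (n == 0)
    return dst;
  if (n > SKETCH_REP_MOVSB_THRESHOLD)
    {
      /* Large copies: the assembly issues "rep movsb" and lets the
         Enhanced REP MOVSB (ERMS) microcode move the bytes.  */
      memcpy (d, s, n);                 /* stand-in for rep movsb */
      return dst;
    }
  if (n >= 32)
    {
      /* Medium copies: unaligned 32-byte AVX chunks, with the last
         chunk taken from the end of the buffer; it may overlap an
         earlier chunk, so no scalar tail loop is needed.  */
      size_t i = 0;
      do
        {
          memcpy (d + i, s + i, 32);    /* one vmovdqu load/store pair */
          i += 32;
        }
      while (i < n - 32);
      memcpy (d + n - 32, s + n - 32, 32);   /* overlapping tail */
      return dst;
    }
  /* Small copies (n < 32) branch on the bits of n; see the sketch
     after the new file below.  */
  while (n--)
    *d++ = *s++;
  return dst;
}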
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index fdb8448..73dc7a9 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -21,6 +21,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
strcspn-c strpbrk-c strspn-c varshift memset-avx2 \
memset-avx512-no-vzeroupper \
+ memcpy-avx-unaligned-erms \
memcpy-erms mempcpy-erms memmove-erms \
memset-erms
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned-erms.S
new file mode 100644
index 0000000..7d1ab7e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned-erms.S
@@ -0,0 +1,158 @@
+/* memcpy with AVX unaligned loads and rep movsb
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+#include <sysdep.h>
+
+#include "asm-syntax.h"
+
+# ifdef SHARED
+ENTRY (__mempcpy_chk_avx_unaligned_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_avx_unaligned_erms)
+# endif
+
+ENTRY (__mempcpy_avx_unaligned_erms)
+ mov %rdi, %rax
+ add %rdx, %rax
+ jmp L(start)
+END (__mempcpy_avx_unaligned_erms)
+
+# ifdef SHARED
+ENTRY (__memcpy_chk_avx_unaligned_erms)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memcpy_chk_avx_unaligned_erms)
+# endif
+
+ENTRY(__memcpy_avx_unaligned_erms)
+ movq %rdi, %rax
+L(start):
+ testq %rdx, %rdx
+ je L(return)
+ cmpq $32, %rdx
+ je L(32)
+ jb L(less_32)
+ cmpq $REP_MOVSB_THRESHOLD, %rdx
+ ja L(movsb)
+ vmovdqu (%rsi), %ymm0
+ vmovdqu -32(%rsi,%rdx), %ymm1
+ vmovdqu %ymm0, (%rdi)
+ vmovdqu %ymm1, -32(%rdi,%rdx)
+ cmpq $64, %rdx
+ ja L(more_64)
+ ret
+
+L(32):
+ vmovdqu (%rsi), %ymm0
+ vmovdqu %ymm0, (%rdi)
+ ret
+
+L(movsb):
+ mov %rdx, %rcx
+ rep movsb
+L(return):
+ ret
+
+ .p2align 4,,10
+ .p2align 4
+L(more_64):
+ vmovdqu 32(%rsi), %ymm0
+ vmovdqu -64(%rsi,%rdx), %ymm1
+ vmovdqu %ymm0, 32(%rdi)
+ vmovdqu %ymm1, -64(%rdi,%rdx)
+ cmpq $128, %rdx
+ jbe L(return)
+ vmovdqu 64(%rsi), %ymm0
+ vmovdqu -96(%rsi,%rdx), %ymm1
+ vmovdqu 96(%rsi), %ymm2
+ vmovdqu -128(%rsi,%rdx), %ymm3
+ vmovdqu %ymm0, 64(%rdi)
+ vmovdqu %ymm1, -96(%rdi,%rdx)
+ vmovdqu %ymm2, 96(%rdi)
+ vmovdqu %ymm3, -128(%rdi,%rdx)
+ cmpq $256, %rdx
+ jbe L(return)
+ leaq 128(%rdi), %rcx
+ addq %rdi, %rdx
+ andq $-128, %rdx
+ andq $-128, %rcx
+ movq %rcx, %r11
+ subq %rdi, %r11
+ addq %r11, %rsi
+ cmpq %rdx, %rcx
+ je L(return)
+ movq %rsi, %r10
+ subq %rcx, %r10
+ leaq 32(%r10), %r9
+ leaq 64(%r10), %r8
+ leaq 96(%r10), %r11
+ .p2align 4,,10
+ .p2align 4
+L(loop):
+ vmovdqu (%rcx,%r10), %ymm0
+ vmovdqu (%rcx,%r9), %ymm1
+ vmovdqu (%rcx,%r8), %ymm2
+ vmovdqu (%rcx,%r11), %ymm3
+ vmovdqa %ymm0, (%rcx)
+ vmovdqa %ymm1, 32(%rcx)
+ vmovdqa %ymm2, 64(%rcx)
+ vmovdqa %ymm3, 96(%rcx)
+ addq $128, %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+ ret
+L(less_32):
+ testb $0x30, %dl
+ jne L(between_16_31)
+ testb $8, %dl
+ jne L(between_8_15)
+ testb $4, %dl
+ jne L(between_4_7)
+ testq %rdx, %rdx
+ je L(return)
+ movzbl (%rsi), %ecx
+ testb $2, %dl
+ movb %cl, (%rdi)
+ je L(return)
+ movzwl -2(%rsi,%rdx), %ecx
+ movw %cx, -2(%rdi,%rdx)
+ ret
+L(between_16_31):
+ vmovdqu (%rsi), %xmm0
+ vmovdqu %xmm0, (%rdi)
+ vmovdqu -16(%rsi,%rdx), %xmm0
+ vmovdqu %xmm0, -16(%rdi,%rdx)
+ ret
+L(between_8_15):
+ movq (%rsi), %rcx
+ movq %rcx, (%rdi)
+ movq -8(%rsi,%rdx), %rcx
+ movq %rcx, -8(%rdi,%rdx)
+ ret
+L(between_4_7):
+ movl (%rsi), %ecx
+ movl %ecx, (%rdi)
+ movl -4(%rsi,%rdx), %ecx
+ movl %ecx, -4(%rdi,%rdx)
+ ret
+END (__memcpy_avx_unaligned_erms)
+
+#endif
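The L(less_32) tail of the new file dispatches on individual bits of
the length rather than comparing against each range, and each range is
covered by two possibly overlapping loads and stores. A hedged C
rendering of the same logic (the function name is invented; the asm
tests $0x30 where this tests 0x10, which is equivalent because bit 5
of n is known to be clear once n < 32):

#include <stddef.h>
#include <string.h>

static void
copy_below_32_sketch (unsigned char *d, const unsigned char *s, size_t n)
{
  if (n & 0x10)                     /* 16 <= n <= 31: testb $0x30 in the asm */
    {
      memcpy (d, s, 16);            /* vmovdqu %xmm0 pair in the asm */
      memcpy (d + n - 16, s + n - 16, 16);   /* overlaps when n < 32 */
    }
  else if (n & 8)                   /* 8 <= n <= 15 */
    {
      memcpy (d, s, 8);
      memcpy (d + n - 8, s + n - 8, 8);
    }
  else if (n & 4)                   /* 4 <= n <= 7 */
    {
      memcpy (d, s, 4);
      memcpy (d + n - 4, s + n - 4, 4);
    }
  else if (n != 0)                  /* 1 <= n <= 3 */
    {
      d[0] = s[0];                  /* first byte */
      if (n & 2)                    /* n is 2 or 3: overlapping 2-byte tail */
        memcpy (d + n - 2, s + n - 2, 2);
    }
}

Because each bucket writes a fixed number of bytes from both ends of
the buffer, every length in the bucket is handled without a byte loop.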
diff --git a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
index 2d42f2b..a273f29 100644
--- a/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S
@@ -33,32 +33,6 @@
.section .text.avx,"ax",@progbits
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (__mempcpy_chk_avx_unaligned_erms)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__mempcpy_chk_avx_unaligned_erms)
-
-ENTRY (__mempcpy_avx_unaligned_erms)
- mov %rdi, %rax
- add %rdx, %rax
- jmp L(start_erms)
-END (__mempcpy_avx_unaligned_erms)
-
-ENTRY (__memcpy_chk_avx_unaligned_erms)
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memcpy_chk_avx_unaligned_erms)
-
-ENTRY(__memcpy_avx_unaligned_erms)
- movq %rdi, %rax
-L(start_erms):
- cmpq $REP_MOVSB_THRESHOLD, %rdx
- jbe L(start)
- mov %rdx, %rcx
- rep movsb
- ret
-END (__memcpy_avx_unaligned_erms)
-
ENTRY (MEMPCPY_CHK)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=14c6a52283c862765c4a39d708c0759bd51f6a7c
commit 14c6a52283c862765c4a39d708c0759bd51f6a7c
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Mar 23 12:44:03 2016 -0700
Improve memcpy-sse2-unaligned.S
Use %xmm0-%xmm3 instead of a single %xmm8, and correct the label names
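As far as one can tell from the diff, the intent of the register change
is twofold: %xmm0-%xmm7 encode without a REX prefix, so the loop body
gets smaller, and grouping the four loads ahead of the four stores makes
the independence of the chunks explicit instead of leaving it to
register renaming. A hedged C illustration of the new L(loop) body
(names invented; the compiler's own register allocation is of course
its own business):

#include <stdint.h>
#include <string.h>

typedef struct { uint8_t b[16]; } chunk16;   /* stands in for one XMM register */

static void
copy_64_aligned_sketch (unsigned char *d, const unsigned char *s)
{
  chunk16 c0, c1, c2, c3;
  /* Four independent loads first (movdqu in the asm)...  */
  memcpy (&c0, s +  0, 16);
  memcpy (&c1, s + 16, 16);
  memcpy (&c2, s + 32, 16);
  memcpy (&c3, s + 48, 16);
  /* ...then four stores (movdqa to the 16-byte-aligned destination).  */
  memcpy (d +  0, &c0, 16);
  memcpy (d + 16, &c1, 16);
  memcpy (d + 32, &c2, 16);
  memcpy (d + 48, &c3, 16);
}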
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
index bced8c4..404422d 100644
--- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -48,19 +48,17 @@ L(start_erms):
testq %rdx, %rdx
je L(return)
cmpq $16, %rdx
- jbe L(less_16)
+ je L(16)
+ jb L(less_16)
cmpq $REP_MOVSB_THRESHOLD, %rdx
ja L(movsb)
- movdqu (%rsi), %xmm8
+ movdqu (%rsi), %xmm0
+ movdqu -16(%rsi,%rdx), %xmm1
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -16(%rdi,%rdx)
cmpq $32, %rdx
- movdqu %xmm8, (%rdi)
- movdqu -16(%rsi,%rdx), %xmm8
- movdqu %xmm8, -16(%rdi,%rdx)
- ja .L31
+ ja L(more_32)
ret
-
- .p2align 4,,10
- .p2align 4
L(movsb):
mov %rdx, %rcx
rep movsb
@@ -93,33 +91,40 @@ L(start):
testq %rdx, %rdx
je L(return)
cmpq $16, %rdx
- jbe L(less_16)
- movdqu (%rsi), %xmm8
+ je L(16)
+ jb L(less_16)
+ movdqu (%rsi), %xmm0
+ movdqu -16(%rsi,%rdx), %xmm1
+ movdqu %xmm0, (%rdi)
+ movdqu %xmm1, -16(%rdi,%rdx)
cmpq $32, %rdx
- movdqu %xmm8, (%rdi)
- movdqu -16(%rsi,%rdx), %xmm8
- movdqu %xmm8, -16(%rdi,%rdx)
- ja .L31
+ ja L(more_32)
L(return):
ret
+
+L(16):
+ movdqu (%rsi), %xmm0
+ movdqu %xmm0, (%rdi)
+ ret
+
.p2align 4,,10
.p2align 4
-.L31:
- movdqu 16(%rsi), %xmm8
+L(more_32):
+ movdqu 16(%rsi), %xmm0
+ movdqu -32(%rsi,%rdx), %xmm1
+ movdqu %xmm0, 16(%rdi)
+ movdqu %xmm1, -32(%rdi,%rdx)
cmpq $64, %rdx
- movdqu %xmm8, 16(%rdi)
- movdqu -32(%rsi,%rdx), %xmm8
- movdqu %xmm8, -32(%rdi,%rdx)
jbe L(return)
- movdqu 32(%rsi), %xmm8
+ movdqu 32(%rsi), %xmm0
+ movdqu -48(%rsi,%rdx), %xmm1
+ movdqu 48(%rsi), %xmm2
+ movdqu -64(%rsi,%rdx), %xmm3
+ movdqu %xmm0, 32(%rdi)
+ movdqu %xmm1, -48(%rdi,%rdx)
+ movdqu %xmm2, 48(%rdi)
+ movdqu %xmm3, -64(%rdi,%rdx)
cmpq $128, %rdx
- movdqu %xmm8, 32(%rdi)
- movdqu -48(%rsi,%rdx), %xmm8
- movdqu %xmm8, -48(%rdi,%rdx)
- movdqu 48(%rsi), %xmm8
- movdqu %xmm8, 48(%rdi)
- movdqu -64(%rsi,%rdx), %xmm8
- movdqu %xmm8, -64(%rdi,%rdx)
jbe L(return)
leaq 64(%rdi), %rcx
addq %rdi, %rdx
@@ -138,26 +143,24 @@ L(return):
.p2align 4,,10
.p2align 4
L(loop):
- movdqu (%rcx,%r10), %xmm8
- movdqa %xmm8, (%rcx)
- movdqu (%rcx,%r9), %xmm8
- movdqa %xmm8, 16(%rcx)
- movdqu (%rcx,%r8), %xmm8
- movdqa %xmm8, 32(%rcx)
- movdqu (%rcx,%r11), %xmm8
- movdqa %xmm8, 48(%rcx)
+ movdqu (%rcx,%r10), %xmm0
+ movdqu (%rcx,%r9), %xmm1
+ movdqu (%rcx,%r8), %xmm2
+ movdqu (%rcx,%r11), %xmm3
+ movdqa %xmm0, (%rcx)
+ movdqa %xmm1, 16(%rcx)
+ movdqa %xmm2, 32(%rcx)
+ movdqa %xmm3, 48(%rcx)
addq $64, %rcx
cmpq %rcx, %rdx
jne L(loop)
ret
L(less_16):
- testb $24, %dl
- jne L(between_9_16)
+ testb $0x18, %dl
+ jne L(between_8_15)
testb $4, %dl
- .p2align 4,,5
- jne L(between_5_8)
+ jne L(between_4_7)
testq %rdx, %rdx
- .p2align 4,,2
je L(return)
movzbl (%rsi), %ecx
testb $2, %dl
@@ -166,13 +169,13 @@ L(less_16):
movzwl -2(%rsi,%rdx), %ecx
movw %cx, -2(%rdi,%rdx)
ret
-L(between_9_16):
+L(between_8_15):
movq (%rsi), %rcx
movq %rcx, (%rdi)
movq -8(%rsi,%rdx), %rcx
movq %rcx, -8(%rdi,%rdx)
ret
-L(between_5_8):
+L(between_4_7):
movl (%rsi), %ecx
movl %ecx, (%rdi)
movl -4(%rsi,%rdx), %ecx
-----------------------------------------------------------------------
Summary of changes:
sysdeps/x86_64/multiarch/Makefile | 1 +
.../x86_64/multiarch/memcpy-avx-unaligned-erms.S | 158 ++++++++++++++++++++
sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S | 26 ----
sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 89 ++++++------
4 files changed, 205 insertions(+), 69 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memcpy-avx-unaligned-erms.S
hooks/post-receive
--
GNU C Library master sources