This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/benchtests/master created. glibc-2.23-158-gc245c47
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 5 Apr 2016 14:29:01 -0000
- Subject: GNU C Library master sources branch hjl/benchtests/master created. glibc-2.23-158-gc245c47
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/benchtests/master has been created
at c245c47ac392b70e05632abc1bc48d3ef14920be (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c245c47ac392b70e05632abc1bc48d3ef14920be
commit c245c47ac392b70e05632abc1bc48d3ef14920be
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Apr 3 17:21:45 2016 -0700
Copy very large data with non-temporal stores
Add __x86_shared_non_temporal_threshold
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index 96463df..cae9907 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -464,6 +464,10 @@ long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
/* Similar to __x86_shared_cache_size, but not rounded. */
long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
+/* Threshold to use non temporal stores. */
+long int __x86_shared_non_temporal_threshold attribute_hidden
+ = 1024 * 1024 * 4;
+
#ifndef DISABLE_PREFETCHW
/* PREFETCHW support flag for use in memory and string routines. */
int __x86_prefetchw attribute_hidden;
@@ -661,5 +665,6 @@ init_cacheinfo (void)
shared = shared & ~255L;
__x86_shared_cache_size_half = shared / 2;
__x86_shared_cache_size = shared;
+ __x86_shared_non_temporal_threshold = shared * 4;
}
}
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index 44711c3..94201b3 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -1,6 +1,8 @@
#if IS_IN (libc)
# define VEC_SIZE 32
# define VEC(i) ymm##i
+# define PREFETCHNT prefetchnta
+# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index c2c5293..1ebb9a3 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,6 +1,8 @@
#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
# define VEC_SIZE 64
# define VEC(i) zmm##i
+# define PREFETCHNT prefetchnta
+# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 85214fe..84b1263 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -1,8 +1,11 @@
#if IS_IN (libc)
# define VEC_SIZE 16
# define VEC(i) xmm##i
-# define VMOVU movdqu
-# define VMOVA movdqa
+# define PREFETCHNT prefetchnta
+# define VMOVNT movntdq
+/* Use movups and movaps for smaller code sizes. */
+# define VMOVU movups
+# define VMOVA movaps
# define SECTION(p) p
# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 66779a3..bc3cc92 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -18,19 +18,22 @@
/* memmove/memcpy/mempcpy is implemented as:
1. Use overlapping load and store to avoid branch.
- 2. Use 8-bit or 32-bit displacements for branches and nop paddings
- to avoid long nop between instructions.
+ 2. Use 8-bit or 32-bit displacements and nop paddings to avoid long
+ nop between instructions.
3. Load all sources into registers and store them together to avoid
possible address overlap between source and destination.
- 4. If size is 2 * VEC_SIZE or less, load all sources into registers
+ 4. If size is 8 * VEC_SIZE or less, load all sources into registers
and store them together.
- 5. If there is no address overflap, copy from both ends with
- 4 * VEC_SIZE at a time.
- 6. If size is 8 * VEC_SIZE or less, load all sources into registers
- and store them together.
- 7. If address of destination > address of source, backward copy
- 8 * VEC_SIZE at a time.
- 8. Otherwise, forward copy 8 * VEC_SIZE at a time. */
+ 5. If address of destination > address of source, backward copy
+ 4 * VEC_SIZE at a time with unaligned load and aligned store.
+ Load the first 4 * VEC and last VEC before the loop and store
+ them after the loop to support overlapping addresses.
+ 6. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
+ load and aligned store. Load the last 4 * VEC and first VEC
+ before the loop and store them after the loop to support
+ overlapping addresses.
+ 7. If size >= __x86_shared_non_temporal_threshold, use non-temporal
+ store instead of aligned store. */
#if IS_IN (libc)
@@ -186,86 +189,6 @@ L(movsb):
L(nop):
ret
- .p2align 4
-L(movsb_more_2x_vec):
- cmpq $REP_MOVSB_THRESHOLD, %rdx
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- ja.d32 L(movsb)
- .p2align 4
-L(more_2x_vec):
- /* More than 2 * VEC. */
- cmpq %rsi, %rdi
- jb L(copy_forward)
- /* Source == destination is less common. */
- je L(nop)
- leaq (%rsi,%rdx), %rcx
- cmpq %rcx, %rdi
- jb L(more_2x_vec_overlap)
-L(copy_forward):
- leaq (%rdi,%rdx), %rcx
- cmpq %rcx, %rsi
- jb L(more_2x_vec_overlap)
- VMOVU (%rsi), %VEC(0)
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
- cmpq $(VEC_SIZE * 4), %rdx
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- jbe.d32 L(return)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(1), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
- cmpq $(VEC_SIZE * 8), %rdx
-# if VEC_SIZE == 16
- jbe L(return)
-# else
- /* Use 8-bit displacement to avoid long nop between
- instructions. */
- jbe L(return_disp8)
-# endif
- leaq (VEC_SIZE * 4)(%rdi), %rcx
- addq %rdi, %rdx
- andq $-(VEC_SIZE * 4), %rdx
- andq $-(VEC_SIZE * 4), %rcx
- movq %rcx, %r11
- subq %rdi, %r11
- addq %r11, %rsi
- cmpq %rdx, %rcx
- /* Use 8-bit displacement to avoid long nop between
- instructions. */
- je L(return_disp8)
- movq %rsi, %r10
- subq %rcx, %r10
- leaq VEC_SIZE(%r10), %r9
- leaq (VEC_SIZE * 2)(%r10), %r8
- leaq (VEC_SIZE * 3)(%r10), %r11
- .p2align 4
-L(loop):
- VMOVU (%rcx,%r10), %VEC(0)
- VMOVU (%rcx,%r9), %VEC(1)
- VMOVU (%rcx,%r8), %VEC(2)
- VMOVU (%rcx,%r11), %VEC(3)
- VMOVA %VEC(0), (%rcx)
- VMOVA %VEC(1), VEC_SIZE(%rcx)
- VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
- VMOVA %VEC(3), (VEC_SIZE * 3)(%rcx)
- addq $(VEC_SIZE * 4), %rcx
- cmpq %rcx, %rdx
- jne L(loop)
-L(return_disp8):
- VZEROUPPER
- ret
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -331,11 +254,19 @@ L(between_2_3):
movw %si, (%rdi)
ret
-# if VEC_SIZE > 16
- /* Align to 16 bytes to avoid long nop between instructions. */
.p2align 4
-# endif
-L(more_2x_vec_overlap):
+L(more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
+ jmp L(start_more_2x_vec)
+
+ .p2align 4
+L(movsb_more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
+ cmpq $REP_MOVSB_THRESHOLD, %rdx
+ ja L(movsb)
+L(start_more_2x_vec):
	/* More than 2 * VEC and there is overlap between destination
and source. */
cmpq $(VEC_SIZE * 8), %rdx
@@ -374,84 +305,246 @@ L(last_4x_vec):
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
VZEROUPPER
ret
-L(between_0_and_4x_vec):
- /* Copy from 0 to 4 * VEC. */
- cmpl $(VEC_SIZE * 2), %edx
- jae L(last_4x_vec)
- /* Copy from 0 to 2 * VEC. */
- cmpl $VEC_SIZE, %edx
- jae L(last_2x_vec)
- /* Copy from 0 to VEC. */
- VZEROUPPER
- jmp L(less_vec)
L(more_8x_vec):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
-
+ /* Source == destination is less common. */
+ je L(nop)
+ /* Load the first VEC and last 4 * VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+# if VEC_SIZE == 16 || VEC_SIZE == 64
+ VMOVU.d32 (%rsi), %VEC(4)
+# else
+ VMOVU (%rsi), %VEC(4)
+# endif
+# if VEC_SIZE == 16
+ VMOVU.d32 -VEC_SIZE(%rsi, %rdx), %VEC(5)
+# else
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
+# endif
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+ /* Save start and stop of the destination buffer. */
+ movq %rdi, %r10
+# if VEC_SIZE == 32
+ leaq -VEC_SIZE(%rdi, %rdx), %rcx
+# else
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ leaq.d32 -VEC_SIZE(%rdi, %rdx), %rcx
+# endif
+ /* Align destination for aligned stores in the loop. Compute
+ how much destination is misaligned. */
+ movq %rdi, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %r8
+ /* Adjust source. */
+ subq %r8, %rsi
+ /* Adjust destination which should be aligned now. */
+ subq %r8, %rdi
+ /* Adjust length. */
+ addq %r8, %rdx
.p2align 4
-L(loop_8x_vec_forward):
- /* Copy 8 * VEC a time forward. */
+L(loop_4x_vec_forward):
+ /* Copy 4 * VEC a time forward. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- VMOVU (VEC_SIZE * 4)(%rsi), %VEC(4)
- VMOVU (VEC_SIZE * 5)(%rsi), %VEC(5)
- VMOVU (VEC_SIZE * 6)(%rsi), %VEC(6)
- VMOVU (VEC_SIZE * 7)(%rsi), %VEC(7)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(4), (VEC_SIZE * 4)(%rdi)
- VMOVU %VEC(5), (VEC_SIZE * 5)(%rdi)
- VMOVU %VEC(6), (VEC_SIZE * 6)(%rdi)
- VMOVU %VEC(7), (VEC_SIZE * 7)(%rdi)
- addq $(VEC_SIZE * 8), %rdi
- addq $(VEC_SIZE * 8), %rsi
- subq $(VEC_SIZE * 8), %rdx
- cmpq $(VEC_SIZE * 8), %rdx
- je L(between_4x_vec_and_8x_vec)
- ja L(loop_8x_vec_forward)
- /* Less than 8 * VEC to copy. */
+ addq $(VEC_SIZE * 4), %rsi
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVA %VEC(0), (%rdi)
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
+ addq $(VEC_SIZE * 4), %rdi
cmpq $(VEC_SIZE * 4), %rdx
- jb L(between_0_and_4x_vec)
- jmp L(between_4x_vec_and_8x_vec)
+ ja L(loop_4x_vec_forward)
+ /* Store the last 4 * VEC. */
+ VMOVU %VEC(5), (%rcx)
+ VMOVU %VEC(6), -VEC_SIZE(%rcx)
+ VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
+ VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
+ /* Store the first VEC. */
+ VMOVU %VEC(4), (%r10)
+ VZEROUPPER
+ ret
- .p2align 4
L(more_8x_vec_backward):
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
- leaq -VEC_SIZE(%rdi, %rdx), %r9
+ /* Load the first 4 * VEC and last VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+# if VEC_SIZE == 32
+ VMOVU.d32 VEC_SIZE(%rsi), %VEC(5)
+# else
+ VMOVU VEC_SIZE(%rsi), %VEC(5)
+# endif
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
+ /* Save stop of the destination buffer. */
+ leaq -VEC_SIZE(%rdi, %rdx), %r11
+ /* Align destination end for aligned stores in the loop. Compute
+ how much destination end is misaligned. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ leaq.d32 -VEC_SIZE(%rsi, %rdx), %rcx
+ movq %r11, %r9
+ movq %r11, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Adjust source. */
+ subq %r8, %rcx
+ /* Adjust the end of destination which should be aligned now. */
+ subq %r8, %r9
+ /* Adjust length. */
+ subq %r8, %rdx
+ .p2align 4
+L(loop_4x_vec_backward):
+ /* Copy 4 * VEC a time backward. */
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
+ VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
+ subq $(VEC_SIZE * 4), %rcx
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVA %VEC(0), (%r9)
+ VMOVA %VEC(1), -VEC_SIZE(%r9)
+ VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
+ subq $(VEC_SIZE * 4), %r9
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec_backward)
+ /* Store the first 4 * VEC. */
+ VMOVU %VEC(4), (%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
+ /* Store the last VEC. */
+ VMOVU %VEC(8), (%r11)
+ VZEROUPPER
+ ret
+
+L(large_data):
+ /* Copy very large data with non-temporal stores. */
+ cmpq %rsi, %rdi
+ ja L(copy_large_backward)
+ /* Source == destination is less common. */
+ je L(nop)
+ /* Load the first VEC and last 4 * VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+ /* Save start and stop of the destination buffer. */
+ movq %rdi, %r10
+ leaq -VEC_SIZE(%rdi, %rdx), %rcx
+ /* Align destination for non-temporal stores in the loop.
+ Compute how much destination is misaligned. */
+ movq %rdi, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %r8
+ /* Adjust source. */
+ subq %r8, %rsi
+ /* Adjust destination which should be aligned now. */
+ subq %r8, %rdi
+ /* Adjust length. */
+ addq %r8, %rdx
+ .p2align 4
+L(loop_large_forward):
+ /* Copy 4 * VEC a time forward with non-temporal stores. */
+ PREFETCHNT (VEC_SIZE * 14)(%rsi)
+ PREFETCHNT (VEC_SIZE * 20)(%rsi)
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ addq $(VEC_SIZE * 4), %rsi
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVNT %VEC(0), (%rdi)
+ VMOVNT %VEC(1), VEC_SIZE(%rdi)
+ VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
+ addq $(VEC_SIZE * 4), %rdi
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(loop_large_forward)
+ sfence
+ /* Store the last 4 * VEC. */
+ VMOVU %VEC(5), (%rcx)
+ VMOVU %VEC(6), -VEC_SIZE(%rcx)
+ VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
+ VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
+ /* Store the first VEC. */
+ VMOVU %VEC(4), (%r10)
+ VZEROUPPER
+ ret
+L(copy_large_backward):
+ /* Load the first 4 * VEC and last VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+# if VEC_SIZE == 32
+ VMOVU.d32 VEC_SIZE(%rsi), %VEC(5)
+# else
+ VMOVU VEC_SIZE(%rsi), %VEC(5)
+# endif
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
+ /* Save stop of the destination buffer. */
+ leaq -VEC_SIZE(%rdi, %rdx), %r11
+ /* Align the destination end for non-temporal stores in the loop.
+ Compute how much destination end is misaligned. */
+ leaq -VEC_SIZE(%rsi, %rdx), %rcx
+ movq %r11, %r9
+ movq %r11, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Adjust source. */
+ subq %r8, %rcx
+ /* Adjust the end of destination which should be aligned now. */
+ subq %r8, %r9
+ /* Adjust length. */
+ subq %r8, %rdx
.p2align 4
-L(loop_8x_vec_backward):
- /* Copy 8 * VEC a time backward. */
+L(loop_large_backward):
+ /* Copy 4 * VEC a time backward with non-temporal stores. */
+ PREFETCHNT -(VEC_SIZE * 14)(%rcx)
+ PREFETCHNT -(VEC_SIZE * 20)(%rcx)
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- VMOVU -(VEC_SIZE * 4)(%rcx), %VEC(4)
- VMOVU -(VEC_SIZE * 5)(%rcx), %VEC(5)
- VMOVU -(VEC_SIZE * 6)(%rcx), %VEC(6)
- VMOVU -(VEC_SIZE * 7)(%rcx), %VEC(7)
- VMOVU %VEC(0), (%r9)
- VMOVU %VEC(1), -VEC_SIZE(%r9)
- VMOVU %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVU %VEC(3), -(VEC_SIZE * 3)(%r9)
- VMOVU %VEC(4), -(VEC_SIZE * 4)(%r9)
- VMOVU %VEC(5), -(VEC_SIZE * 5)(%r9)
- VMOVU %VEC(6), -(VEC_SIZE * 6)(%r9)
- VMOVU %VEC(7), -(VEC_SIZE * 7)(%r9)
- subq $(VEC_SIZE * 8), %rcx
- subq $(VEC_SIZE * 8), %r9
- subq $(VEC_SIZE * 8), %rdx
- cmpq $(VEC_SIZE * 8), %rdx
- je L(between_4x_vec_and_8x_vec)
- ja L(loop_8x_vec_backward)
- /* Less than 8 * VEC to copy. */
+ subq $(VEC_SIZE * 4), %rcx
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVNT %VEC(0), (%r9)
+ VMOVNT %VEC(1), -VEC_SIZE(%r9)
+ VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
+ subq $(VEC_SIZE * 4), %r9
cmpq $(VEC_SIZE * 4), %rdx
- jb L(between_0_and_4x_vec)
- jmp L(between_4x_vec_and_8x_vec)
+ ja L(loop_large_backward)
+ sfence
+ /* Store the first 4 * VEC. */
+ VMOVU %VEC(4), (%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
+ /* Store the last VEC. */
+ VMOVU %VEC(8), (%r11)
+ VZEROUPPER
+ ret
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
# ifdef SHARED
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=97e117a1d2929e6633f834542dda1bc89b095620
commit 97e117a1d2929e6633f834542dda1bc89b095620
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 09:38:30 2016 -0700
Clear destination buffer set by the previous run
Clear the destination buffer set by the previous run in test-memcpy.c
and test-memmove.c to catch the error when the following implementations
don't copy anything.
PR string/19907
* string/test-memcpy.c (do_one_test): Clear the destination
buffer set by the previous run.
* string/test-memmove.c (do_one_test): Likewise.
diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
index c5a7192..9d9e7b6 100644
--- a/benchtests/bench-memcpy.c
+++ b/benchtests/bench-memcpy.c
@@ -55,6 +55,10 @@ do_one_test (impl_t *impl, char *dst, const char *src,
size_t i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
if (CALL (impl, dst, src, len) != MEMCPY_RESULT (dst, len))
{
error (0, 0, "Wrong result in function %s %p %p", impl->name,
diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
index c38596b..3858f2a 100644
--- a/benchtests/bench-memmove.c
+++ b/benchtests/bench-memmove.c
@@ -70,6 +70,7 @@ do_one_test (impl_t *impl, char *dst, char *src, const char *orig_src,
size_t i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
+ /* This also clears the destination buffer set by the previous run. */
memcpy (src, orig_src, len);
#ifdef TEST_BCOPY
CALL (impl, src, dst, len);
diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index 2a0994c..cb072f8 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -53,6 +53,12 @@ static void
do_one_test (impl_t *impl, char *dst, const char *src,
size_t len)
{
+ size_t i;
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
if (CALL (impl, dst, src, len) != MEMCPY_RESULT (dst, len))
{
error (0, 0, "Wrong result in function %s %p %p", impl->name,
diff --git a/string/test-memmove.c b/string/test-memmove.c
index d2ab3f3..4343329 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -68,6 +68,7 @@ static void
do_one_test (impl_t *impl, char *dst, char *src, const char *orig_src,
size_t len)
{
+ /* This also clears the destination buffer set by the previous run. */
memcpy (src, orig_src, len);
#ifdef TEST_BCOPY
CALL (impl, src, dst, len);
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d1a27f0f33a6c07c6842e8d1c7a67ad8758873a1
commit d1a27f0f33a6c07c6842e8d1c7a67ad8758873a1
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Mar 30 09:18:27 2016 -0700
Add --enable-large-benchtests for large data benchmarks
We'd like to run memcpy, memmove and memset benchmarks with large data sizes.
This patch adds --enable-large-benchtests to enable benchmarks with very
large data.
* benchtests/Makefile (string-benchset): Add memcpy-large,
memmove-large and memset-large for --enable-large-benchtests.
* benchtests/bench-memcpy-large.c: New file.
* benchtests/bench-memmove-large.c: Likewise.
* benchtests/bench-memset-large.c: Likewise.
* benchtests/bench-string.h (TIMEOUT): Don't redefine.
* config.make.in (run-large-benchtests): New.
* configure.ac: Add --enable-large-benchtests.
* configure: Regenerated.
diff --git a/benchtests/Makefile b/benchtests/Makefile
index a37d666..7f8ae02 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -39,6 +39,9 @@ string-benchset := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
strcoll
+ifeq (yes,$(run-large-benchtests))
+string-benchset += memcpy-large memmove-large memset-large
+endif
wcsmbs-benchset := wcslen wcsnlen wcscpy wcpcpy wcsncpy wcpncpy wcscat wcsncat \
wcscmp wcsncmp wcschr wcschrnul wcsrchr wcsspn wcspbrk wcscspn \
wmemchr wmemset wmemcmp
diff --git a/benchtests/bench-memcpy-large.c b/benchtests/bench-memcpy-large.c
new file mode 100644
index 0000000..1a9b25c
--- /dev/null
+++ b/benchtests/bench-memcpy-large.c
@@ -0,0 +1,123 @@
+/* Measure memcpy functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef MEMCPY_RESULT
+# define MEMCPY_RESULT(dst, len) dst
+# define START_SIZE (64 * 1024)
+# define MIN_PAGE_SIZE (getpagesize () + 32 * 1024 * 1024)
+# define TEST_MAIN
+# define TEST_NAME "memcpy"
+# define TIMEOUT (20 * 60)
+# include "bench-string.h"
+
+IMPL (memcpy, 1)
+#endif
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+do_one_test (impl_t *impl, char *dst, const char *src,
+ size_t len)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
+ if (CALL (impl, dst, src, len) != MEMCPY_RESULT (dst, len))
+ {
+ error (0, 0, "Wrong result in function %s %p %p", impl->name,
+ CALL (impl, dst, src, len), MEMCPY_RESULT (dst, len));
+ ret = 1;
+ return;
+ }
+
+ if (memcmp (dst, src, len) != 0)
+ {
+ error (0, 0, "Wrong result in function %s dst \"%s\" src \"%s\"",
+ impl->name, dst, src);
+ ret = 1;
+ return;
+ }
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, dst, src, len);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static void
+do_test (size_t align1, size_t align2, size_t len)
+{
+ size_t i, j;
+ char *s1, *s2;
+
+ align1 &= 63;
+ if (align1 + len >= page_size)
+ return;
+
+ align2 &= 63;
+ if (align2 + len >= page_size)
+ return;
+
+ s1 = (char *) (buf1 + align1);
+ s2 = (char *) (buf2 + align2);
+
+ for (i = 0, j = 1; i < len; i++, j += 23)
+ s1[i] = j;
+
+ printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, s2, s1, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+
+ test_init ();
+
+ printf ("%23s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, 0, i + 7);
+ do_test (0, 3, i + 15);
+ do_test (3, 0, i + 31);
+ do_test (3, 5, i + 63);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-memmove-large.c b/benchtests/bench-memmove-large.c
new file mode 100644
index 0000000..1a3fc6e
--- /dev/null
+++ b/benchtests/bench-memmove-large.c
@@ -0,0 +1,125 @@
+/* Measure memmove functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define BASE_PAGE_SIZE (1024 * 1024)
+#define START_SIZE (4 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 16 * 1024 * 1024)
+#define TEST_MAIN
+#define TEST_NAME "memmove"
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+IMPL (memmove, 1)
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+do_one_test (impl_t *impl, char *dst, char *src, const char *orig_src,
+ size_t len)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
+ memcpy (src, orig_src, len);
+
+ char *res = CALL (impl, dst, src, len);
+ if (res != dst)
+ {
+ error (0, 0, "Wrong result in function %s %p %p", impl->name,
+ res, dst);
+ ret = 1;
+ return;
+ }
+
+ if (memcmp (dst, orig_src, len) != 0)
+ {
+ error (0, 0, "Wrong result in function %s dst \"%s\" src \"%s\"",
+ impl->name, dst, src);
+ ret = 1;
+ return;
+ }
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, dst, src, len);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static void
+do_test (size_t align1, size_t align2, size_t len)
+{
+ size_t i, j;
+ char *s1, *s2;
+
+ align1 &= 127;
+ if (align1 + len >= page_size)
+ return;
+
+ align2 &= 127;
+ if (align2 + len >= page_size)
+ return;
+
+ s1 = (char *) (buf1 + align1);
+ s2 = (char *) (buf2 + align2);
+
+ for (i = 0, j = 1; i < len; i++, j += 23)
+ s1[i] = j;
+
+ printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, s2, (char *) (buf2 + align1), s1, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+
+ test_init ();
+
+ printf ("%23s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, 64, i + 7);
+ do_test (0, 3, i + 15);
+ do_test (3, 0, i + 31);
+ do_test (3, 7, i + 63);
+ do_test (9, 5, i + 127);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-memset-large.c b/benchtests/bench-memset-large.c
new file mode 100644
index 0000000..fd3972d
--- /dev/null
+++ b/benchtests/bench-memset-large.c
@@ -0,0 +1,134 @@
+/* Measure memset functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#ifndef WIDE
+# define TEST_NAME "memset"
+#else
+# define TEST_NAME "wmemset"
+#endif /* WIDE */
+#define START_SIZE (128 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#ifndef WIDE
+# define MEMSET memset
+# define CHAR char
+# define SIMPLE_MEMSET simple_memset
+# define MEMCMP memcmp
+#else
+# include <wchar.h>
+# define MEMSET wmemset
+# define CHAR wchar_t
+# define SIMPLE_MEMSET simple_wmemset
+# define MEMCMP wmemcmp
+#endif /* WIDE */
+
+#include <assert.h>
+
+IMPL (MEMSET, 1)
+
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
+
+CHAR *
+inhibit_loop_to_libcall
+SIMPLE_MEMSET (CHAR *s, int c, size_t n)
+{
+ CHAR *r = s, *end = s + n;
+ while (r < end)
+ *r++ = c;
+ return s;
+}
+
+static void
+do_one_test (impl_t *impl, CHAR *s, int c __attribute ((unused)), size_t n)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+ CHAR *tstbuf = malloc (n * sizeof (*s));
+ assert (tstbuf != NULL);
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < n; i++)
+ s[i] = 0;
+
+ CHAR *res = CALL (impl, s, c, n);
+ if (res != s
+ || SIMPLE_MEMSET (tstbuf, c, n) != tstbuf
+ || MEMCMP (s, tstbuf, n) != 0)
+ {
+ error (0, 0, "Wrong result in function %s", impl->name);
+ ret = 1;
+ free (tstbuf);
+ return;
+ }
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, s, c, n);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+
+ free (tstbuf);
+}
+
+static void
+do_test (size_t align, int c, size_t len)
+{
+ align &= 63;
+ if ((align + len) * sizeof (CHAR) > page_size)
+ return;
+
+ printf ("Length %4zd, alignment %2zd, c %2d:", len, align, c);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, (CHAR *) (buf1) + align, c, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+ int c;
+
+ test_init ();
+
+ printf ("%24s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ c = 65;
+ for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, c, i);
+ do_test (3, c, i);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-string.h b/benchtests/bench-string.h
index be4c618..9c5371e 100644
--- a/benchtests/bench-string.h
+++ b/benchtests/bench-string.h
@@ -56,7 +56,9 @@ extern impl_t __start_impls[], __stop_impls[];
# define TEST_FUNCTION test_main ()
-# define TIMEOUT (4 * 60)
+# ifndef TIMEOUT
+# define TIMEOUT (4 * 60)
+# endif
# define OPT_ITERATIONS 10000
# define OPT_RANDOM 10001
# define OPT_SEED 10002
diff --git a/config.make.in b/config.make.in
index 95c6f36..b9a4dbb 100644
--- a/config.make.in
+++ b/config.make.in
@@ -89,6 +89,7 @@ link-obsolete-rpc = @link_obsolete_rpc@
build-nscd = @build_nscd@
use-nscd = @use_nscd@
build-hardcoded-path-in-tests= @hardcoded_path_in_tests@
+run-large-benchtests = @large_benchtests@
build-pt-chown = @build_pt_chown@
enable-lock-elision = @enable_lock_elision@
diff --git a/configure b/configure
index 8fe5937..42bde65 100755
--- a/configure
+++ b/configure
@@ -668,6 +668,7 @@ all_warnings
force_install
bindnow
enable_lock_elision
+large_benchtests
hardcoded_path_in_tests
enable_timezone_tools
use_default_link
@@ -755,6 +756,7 @@ enable_shared
enable_profile
enable_timezone_tools
enable_hardcoded_path_in_tests
+enable_large_benchtests
enable_stackguard_randomization
enable_lock_elision
enable_add_ons
@@ -1411,6 +1413,8 @@ Optional Features:
--enable-hardcoded-path-in-tests
hardcode newly built glibc path in tests
[default=no]
+ --enable-large-benchtests
+ run benchtests with large data size [default=no]
--enable-stackguard-randomization
initialize __stack_chk_guard canary with a random
number at program start
@@ -3363,6 +3367,15 @@ fi
+# Check whether --enable-large-benchtests was given.
+if test "${enable_large_benchtests+set}" = set; then :
+ enableval=$enable_large_benchtests; large_benchtests=$enableval
+else
+ large_benchtests=no
+fi
+
+
+
# Check whether --enable-stackguard-randomization was given.
if test "${enable_stackguard_randomization+set}" = set; then :
enableval=$enable_stackguard_randomization; enable_stackguard_randomize=$enableval
diff --git a/configure.ac b/configure.ac
index 3c766b7..8fb93d9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -190,6 +190,13 @@ AC_ARG_ENABLE([hardcoded-path-in-tests],
[hardcoded_path_in_tests=no])
AC_SUBST(hardcoded_path_in_tests)
+AC_ARG_ENABLE([large-benchtests],
+ AC_HELP_STRING([--enable-large-benchtests],
+ [run benchtests with large data size @<:@default=no@:>@]),
+ [large_benchtests=$enableval],
+ [large_benchtests=no])
+AC_SUBST(large_benchtests)
+
AC_ARG_ENABLE([stackguard-randomization],
AC_HELP_STRING([--enable-stackguard-randomization],
[initialize __stack_chk_guard canary with a random number at program start]),
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources