This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/benchtests/master created. glibc-2.23-158-g1505553
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 5 Apr 2016 04:48:02 -0000
- Subject: GNU C Library master sources branch hjl/benchtests/master created. glibc-2.23-158-g1505553
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/benchtests/master has been created
at 1505553cb19fd7c5f4170303aa11cad17c012de8 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1505553cb19fd7c5f4170303aa11cad17c012de8
commit 1505553cb19fd7c5f4170303aa11cad17c012de8
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 18:41:53 2016 -0700
Remove L(loop)
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 100d7b6..0eba85d 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -186,93 +186,6 @@ L(movsb):
L(nop):
ret
- .p2align 4
-L(more_2x_vec):
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- jae L(large_data)
- jmp L(start_more_2x_vec)
-
- .p2align 4
-L(movsb_more_2x_vec):
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- jae L(large_data)
- cmpq $REP_MOVSB_THRESHOLD, %rdx
- ja L(movsb)
-L(start_more_2x_vec):
- /* More than 2 * VEC. */
- cmpq %rsi, %rdi
- jb L(copy_forward)
- /* Source == destination is less common. */
- je L(nop)
- leaq (%rsi,%rdx), %rcx
- cmpq %rcx, %rdi
- jb L(more_2x_vec_overlap)
-L(copy_forward):
- leaq (%rdi,%rdx), %rcx
- cmpq %rcx, %rsi
- jb L(more_2x_vec_overlap)
-# if VEC_SIZE == 64
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- VMOVU.d32 (%rsi), %VEC(0)
-# else
- VMOVU (%rsi), %VEC(0)
-# endif
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
- cmpq $(VEC_SIZE * 4), %rdx
- jbe L(return)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(1), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
- cmpq $(VEC_SIZE * 8), %rdx
- jbe L(return)
- leaq (VEC_SIZE * 4)(%rdi), %rcx
- addq %rdi, %rdx
- andq $-(VEC_SIZE * 4), %rdx
- andq $-(VEC_SIZE * 4), %rcx
- movq %rcx, %r11
- subq %rdi, %r11
- addq %r11, %rsi
- cmpq %rdx, %rcx
- je L(return)
- movq %rsi, %r10
- subq %rcx, %r10
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- leaq.d32 VEC_SIZE(%r10), %r9
-# if VEC_SIZE == 16
- leaq.d32 (VEC_SIZE * 2)(%r10), %r8
- leaq.d32 (VEC_SIZE * 3)(%r10), %r11
-# else
- leaq (VEC_SIZE * 2)(%r10), %r8
- leaq (VEC_SIZE * 3)(%r10), %r11
-# endif
- .p2align 4
-L(loop):
- VMOVU (%rcx,%r10), %VEC(0)
- VMOVU (%rcx,%r9), %VEC(1)
- VMOVU (%rcx,%r8), %VEC(2)
- VMOVU (%rcx,%r11), %VEC(3)
- VMOVA %VEC(0), (%rcx)
- VMOVA %VEC(1), VEC_SIZE(%rcx)
- VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
- VMOVA %VEC(3), (VEC_SIZE * 3)(%rcx)
- addq $(VEC_SIZE * 4), %rcx
- cmpq %rcx, %rdx
- jne L(loop)
- VZEROUPPER
- ret
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -338,11 +251,19 @@ L(between_2_3):
movw %si, (%rdi)
ret
-# if VEC_SIZE > 16
- /* Align to 16 bytes to avoid long nop between instructions. */
.p2align 4
-# endif
-L(more_2x_vec_overlap):
+L(more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
+ jmp L(start_more_2x_vec)
+
+ .p2align 4
+L(movsb_more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
+ cmpq $REP_MOVSB_THRESHOLD, %rdx
+ ja L(movsb)
+L(start_more_2x_vec):
	/* More than 2 * VEC and there is overlap between destination
and source. */
cmpq $(VEC_SIZE * 8), %rdx
@@ -416,14 +337,15 @@ L(more_8x_vec):
/* Force 32-bit displacement to avoid long nop between
instructions. */
VMOVU.d32 (%rsi), %VEC(4)
-# if VEC_SIZE == 16
VMOVU.d32 -VEC_SIZE(%rsi, %rdx), %VEC(5)
+ VMOVU.d32 -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+# if VEC_SIZE == 16
+ VMOVU.d32 -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+ VMOVU.d32 -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
# else
- VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
-# endif
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+# endif
/* Adjust source. */
subq %r8, %rsi
/* Adjust destination which should be aligned now. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cdc0fde157da2f5b33dbd8e402cb3bb4c9b4a625
commit cdc0fde157da2f5b33dbd8e402cb3bb4c9b4a625
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Apr 3 17:21:45 2016 -0700
Copy very large data with non-temporal stores
Add __x86_shared_non_temporal_threshold
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index 96463df..cae9907 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -464,6 +464,10 @@ long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
/* Similar to __x86_shared_cache_size, but not rounded. */
long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
+/* Threshold to use non temporal stores. */
+long int __x86_shared_non_temporal_threshold attribute_hidden
+ = 1024 * 1024 * 4;
+
#ifndef DISABLE_PREFETCHW
/* PREFETCHW support flag for use in memory and string routines. */
int __x86_prefetchw attribute_hidden;
@@ -661,5 +665,6 @@ init_cacheinfo (void)
shared = shared & ~255L;
__x86_shared_cache_size_half = shared / 2;
__x86_shared_cache_size = shared;
+ __x86_shared_non_temporal_threshold = shared * 4;
}
}
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index 44711c3..94201b3 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -1,6 +1,8 @@
#if IS_IN (libc)
# define VEC_SIZE 32
# define VEC(i) ymm##i
+# define PREFETCHNT prefetchnta
+# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index c2c5293..1ebb9a3 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,6 +1,8 @@
#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
# define VEC_SIZE 64
# define VEC(i) zmm##i
+# define PREFETCHNT prefetchnta
+# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 85214fe..84b1263 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -1,8 +1,11 @@
#if IS_IN (libc)
# define VEC_SIZE 16
# define VEC(i) xmm##i
-# define VMOVU movdqu
-# define VMOVA movdqa
+# define PREFETCHNT prefetchnta
+# define VMOVNT movntdq
+/* Use movups and movaps for smaller code sizes. */
+# define VMOVU movups
+# define VMOVA movaps
# define SECTION(p) p
# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 66779a3..100d7b6 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -187,13 +187,18 @@ L(nop):
ret
.p2align 4
+L(more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
+ jmp L(start_more_2x_vec)
+
+ .p2align 4
L(movsb_more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
cmpq $REP_MOVSB_THRESHOLD, %rdx
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- ja.d32 L(movsb)
- .p2align 4
-L(more_2x_vec):
+ ja L(movsb)
+L(start_more_2x_vec):
/* More than 2 * VEC. */
cmpq %rsi, %rdi
jb L(copy_forward)
@@ -206,7 +211,13 @@ L(copy_forward):
leaq (%rdi,%rdx), %rcx
cmpq %rcx, %rsi
jb L(more_2x_vec_overlap)
+# if VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(0)
+# else
VMOVU (%rsi), %VEC(0)
+# endif
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
@@ -215,9 +226,7 @@ L(copy_forward):
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
cmpq $(VEC_SIZE * 4), %rdx
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- jbe.d32 L(return)
+ jbe L(return)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
@@ -227,13 +236,7 @@ L(copy_forward):
VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
cmpq $(VEC_SIZE * 8), %rdx
-# if VEC_SIZE == 16
jbe L(return)
-# else
- /* Use 8-bit displacement to avoid long nop between
- instructions. */
- jbe L(return_disp8)
-# endif
leaq (VEC_SIZE * 4)(%rdi), %rcx
addq %rdi, %rdx
andq $-(VEC_SIZE * 4), %rdx
@@ -242,14 +245,19 @@ L(copy_forward):
subq %rdi, %r11
addq %r11, %rsi
cmpq %rdx, %rcx
- /* Use 8-bit displacement to avoid long nop between
- instructions. */
- je L(return_disp8)
+ je L(return)
movq %rsi, %r10
subq %rcx, %r10
- leaq VEC_SIZE(%r10), %r9
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ leaq.d32 VEC_SIZE(%r10), %r9
+# if VEC_SIZE == 16
+ leaq.d32 (VEC_SIZE * 2)(%r10), %r8
+ leaq.d32 (VEC_SIZE * 3)(%r10), %r11
+# else
leaq (VEC_SIZE * 2)(%r10), %r8
leaq (VEC_SIZE * 3)(%r10), %r11
+# endif
.p2align 4
L(loop):
VMOVU (%rcx,%r10), %VEC(0)
@@ -263,7 +271,6 @@ L(loop):
addq $(VEC_SIZE * 4), %rcx
cmpq %rcx, %rdx
jne L(loop)
-L(return_disp8):
VZEROUPPER
ret
L(less_vec):
@@ -387,71 +394,237 @@ L(between_0_and_4x_vec):
L(more_8x_vec):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
-
+ /* Source == destination is less common. */
+ je L(nop)
+ /* Save start and stop of the destination buffer. */
+ movq %rdi, %r10
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+# if VEC_SIZE == 32
+ leaq -VEC_SIZE(%rdi, %rdx), %rcx
+# else
+ leaq.d32 -VEC_SIZE(%rdi, %rdx), %rcx
+# endif
+ /* Align destination for aligned stores in the loop. Compute
+ how much destination is misaligned. */
+ movq %rdi, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %r8
+ /* Load the first VEC and last 4 * VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+# if VEC_SIZE == 16
+ VMOVU.d32 -VEC_SIZE(%rsi, %rdx), %VEC(5)
+# else
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
+# endif
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+ /* Adjust source. */
+ subq %r8, %rsi
+ /* Adjust destination which should be aligned now. */
+ subq %r8, %rdi
+ /* Adjust length. */
+ addq %r8, %rdx
.p2align 4
-L(loop_8x_vec_forward):
- /* Copy 8 * VEC a time forward. */
+L(loop_4x_vec_forward):
+ /* Copy 4 * VEC a time forward. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- VMOVU (VEC_SIZE * 4)(%rsi), %VEC(4)
- VMOVU (VEC_SIZE * 5)(%rsi), %VEC(5)
- VMOVU (VEC_SIZE * 6)(%rsi), %VEC(6)
- VMOVU (VEC_SIZE * 7)(%rsi), %VEC(7)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(4), (VEC_SIZE * 4)(%rdi)
- VMOVU %VEC(5), (VEC_SIZE * 5)(%rdi)
- VMOVU %VEC(6), (VEC_SIZE * 6)(%rdi)
- VMOVU %VEC(7), (VEC_SIZE * 7)(%rdi)
- addq $(VEC_SIZE * 8), %rdi
- addq $(VEC_SIZE * 8), %rsi
- subq $(VEC_SIZE * 8), %rdx
- cmpq $(VEC_SIZE * 8), %rdx
- je L(between_4x_vec_and_8x_vec)
- ja L(loop_8x_vec_forward)
- /* Less than 8 * VEC to copy. */
+ addq $(VEC_SIZE * 4), %rsi
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVA %VEC(0), (%rdi)
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
+ addq $(VEC_SIZE * 4), %rdi
cmpq $(VEC_SIZE * 4), %rdx
- jb L(between_0_and_4x_vec)
- jmp L(between_4x_vec_and_8x_vec)
+ ja L(loop_4x_vec_forward)
+ /* Store the last 4 * VEC. */
+ VMOVU %VEC(5), (%rcx)
+ VMOVU %VEC(6), -VEC_SIZE(%rcx)
+ VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
+ VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
+ /* Store the first VEC. */
+ VMOVU %VEC(4), (%r10)
+ VZEROUPPER
+ ret
- .p2align 4
L(more_8x_vec_backward):
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
- leaq -VEC_SIZE(%rdi, %rdx), %r9
+ /* Save stop of the destination buffer. */
+ leaq -VEC_SIZE(%rdi, %rdx), %r11
+ /* Load the first 4 * VEC and last VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+# if VEC_SIZE == 32
+ VMOVU.d32 VEC_SIZE(%rsi), %VEC(5)
+# else
+ VMOVU VEC_SIZE(%rsi), %VEC(5)
+# endif
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
+ /* Align destination end for aligned stores in the loop. Compute
+ how much destination end is misaligned. */
+ leaq.d32 -VEC_SIZE(%rsi, %rdx), %rcx
+ movq %r11, %r9
+ movq %r11, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Adjust source. */
+ subq %r8, %rcx
+ /* Adjust the end of destination which should be aligned now. */
+ subq %r8, %r9
+ /* Adjust length. */
+ subq %r8, %rdx
+ .p2align 4
+L(loop_4x_vec_backward):
+ /* Copy 4 * VEC a time backward. */
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
+ VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
+ subq $(VEC_SIZE * 4), %rcx
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVA %VEC(0), (%r9)
+ VMOVA %VEC(1), -VEC_SIZE(%r9)
+ VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
+ subq $(VEC_SIZE * 4), %r9
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec_backward)
+ /* Store the first 4 * VEC. */
+ VMOVU %VEC(4), (%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
+ /* Store the last VEC. */
+ VMOVU %VEC(8), (%r11)
+ VZEROUPPER
+ ret
+L(large_data):
+ /* Copy very large data with non-temporal stores. */
+ cmpq %rsi, %rdi
+ ja L(copy_large_backward)
+ /* Source == destination is less common. */
+ je L(nop)
+ /* Save start and stop of the destination buffer. */
+ movq %rdi, %r10
+ leaq -VEC_SIZE(%rdi, %rdx), %rcx
+ /* Align destination for non-temporal stores in the loop.
+ Compute how much destination is misaligned. */
+ movq %rdi, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %r8
+ /* Load the first VEC and last 4 * VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+ /* Adjust source. */
+ subq %r8, %rsi
+ /* Adjust destination which should be aligned now. */
+ subq %r8, %rdi
+ /* Adjust length. */
+ addq %r8, %rdx
.p2align 4
-L(loop_8x_vec_backward):
- /* Copy 8 * VEC a time backward. */
+L(loop_large_forward):
+ /* Copy 4 * VEC a time forward with non-temporal stores. */
+ PREFETCHNT (VEC_SIZE * 14)(%rsi)
+ PREFETCHNT (VEC_SIZE * 20)(%rsi)
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ addq $(VEC_SIZE * 4), %rsi
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVNT %VEC(0), (%rdi)
+ VMOVNT %VEC(1), VEC_SIZE(%rdi)
+ VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
+ addq $(VEC_SIZE * 4), %rdi
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(loop_large_forward)
+ sfence
+ /* Store the last 4 * VEC. */
+ VMOVU %VEC(5), (%rcx)
+ VMOVU %VEC(6), -VEC_SIZE(%rcx)
+ VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
+ VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
+ /* Store the first VEC. */
+ VMOVU %VEC(4), (%r10)
+ VZEROUPPER
+ ret
+
+L(copy_large_backward):
+ /* Save stop of the destination buffer. */
+ leaq -VEC_SIZE(%rdi, %rdx), %r11
+ /* Align the destination end for non-temporal stores in the loop.
+ Compute how much destination end is misaligned. */
+ leaq -VEC_SIZE(%rsi, %rdx), %rcx
+ movq %r11, %r9
+ movq %r11, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Adjust source. */
+ subq %r8, %rcx
+ /* Adjust the end of destination which should be aligned now. */
+ subq %r8, %r9
+ /* Load the first 4 * VEC and last VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+# if VEC_SIZE == 32
+ VMOVU.d32 VEC_SIZE(%rsi), %VEC(5)
+# else
+ VMOVU VEC_SIZE(%rsi), %VEC(5)
+# endif
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
+ /* Adjust length. */
+ subq %r8, %rdx
+ .p2align 4
+L(loop_large_backward):
+ /* Copy 4 * VEC a time backward with non-temporal stores. */
+ PREFETCHNT -(VEC_SIZE * 14)(%rcx)
+ PREFETCHNT -(VEC_SIZE * 20)(%rcx)
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- VMOVU -(VEC_SIZE * 4)(%rcx), %VEC(4)
- VMOVU -(VEC_SIZE * 5)(%rcx), %VEC(5)
- VMOVU -(VEC_SIZE * 6)(%rcx), %VEC(6)
- VMOVU -(VEC_SIZE * 7)(%rcx), %VEC(7)
- VMOVU %VEC(0), (%r9)
- VMOVU %VEC(1), -VEC_SIZE(%r9)
- VMOVU %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVU %VEC(3), -(VEC_SIZE * 3)(%r9)
- VMOVU %VEC(4), -(VEC_SIZE * 4)(%r9)
- VMOVU %VEC(5), -(VEC_SIZE * 5)(%r9)
- VMOVU %VEC(6), -(VEC_SIZE * 6)(%r9)
- VMOVU %VEC(7), -(VEC_SIZE * 7)(%r9)
- subq $(VEC_SIZE * 8), %rcx
- subq $(VEC_SIZE * 8), %r9
- subq $(VEC_SIZE * 8), %rdx
- cmpq $(VEC_SIZE * 8), %rdx
- je L(between_4x_vec_and_8x_vec)
- ja L(loop_8x_vec_backward)
- /* Less than 8 * VEC to copy. */
+ subq $(VEC_SIZE * 4), %rcx
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVNT %VEC(0), (%r9)
+ VMOVNT %VEC(1), -VEC_SIZE(%r9)
+ VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
+ subq $(VEC_SIZE * 4), %r9
cmpq $(VEC_SIZE * 4), %rdx
- jb L(between_0_and_4x_vec)
- jmp L(between_4x_vec_and_8x_vec)
+ ja L(loop_large_backward)
+ sfence
+ /* Store the first 4 * VEC. */
+ VMOVU %VEC(4), (%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
+ /* Store the last VEC. */
+ VMOVU %VEC(8), (%r11)
+ VZEROUPPER
+ ret
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
# ifdef SHARED
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=97d258a1abde09ad8bb6ff5afe38e4ff34e0f344
commit 97d258a1abde09ad8bb6ff5afe38e4ff34e0f344
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 05:54:44 2016 -0700
Force 32-bit displacement in memset-vec-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9383517..1745a71 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -159,9 +159,21 @@ L(return):
.p2align 4
L(loop_start):
leaq (VEC_SIZE * 4)(%rdi), %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 %VEC(0), (%rdi)
+# else
VMOVU %VEC(0), (%rdi)
+# endif
andq $-(VEC_SIZE * 4), %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 %VEC(0), -VEC_SIZE(%rdi,%rdx)
+# else
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+# endif
VMOVU %VEC(0), VEC_SIZE(%rdi)
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4dcc8a09a3002cc351967c4c7153ccc7ac6831b2
commit 4dcc8a09a3002cc351967c4c7153ccc7ac6831b2
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 04:52:36 2016 -0700
Add a comment in memset-sse2-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 2deba42..4bf3d36 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -1,6 +1,8 @@
#if IS_IN (libc)
# define VEC_SIZE 16
# define VEC(i) xmm##i
+/* Don't use movups and movaps since it will get larger nop paddings
+ for alignment. */
# define VMOVU movdqu
# define VMOVA movdqa
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=263a18c229afe223581b6d964a106eb10399d028
commit 263a18c229afe223581b6d964a106eb10399d028
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 09:38:30 2016 -0700
Clear destination buffer set by the previous run
Clear the destination buffer set by the previous run in test-memcpy.c
and test-memmove.c to catch the error when the following implementations
don't copy anything.
PR string/19907
* string/test-memcpy.c (do_one_test): Clear the destination
buffer set by the previous run.
* string/test-memmove.c (do_one_test): Likewise.
diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
index c5a7192..9d9e7b6 100644
--- a/benchtests/bench-memcpy.c
+++ b/benchtests/bench-memcpy.c
@@ -55,6 +55,10 @@ do_one_test (impl_t *impl, char *dst, const char *src,
size_t i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
if (CALL (impl, dst, src, len) != MEMCPY_RESULT (dst, len))
{
error (0, 0, "Wrong result in function %s %p %p", impl->name,
diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
index c38596b..3858f2a 100644
--- a/benchtests/bench-memmove.c
+++ b/benchtests/bench-memmove.c
@@ -70,6 +70,7 @@ do_one_test (impl_t *impl, char *dst, char *src, const char *orig_src,
size_t i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
+ /* This also clears the destination buffer set by the previous run. */
memcpy (src, orig_src, len);
#ifdef TEST_BCOPY
CALL (impl, src, dst, len);
diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index 2a0994c..cb072f8 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -53,6 +53,12 @@ static void
do_one_test (impl_t *impl, char *dst, const char *src,
size_t len)
{
+ size_t i;
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
if (CALL (impl, dst, src, len) != MEMCPY_RESULT (dst, len))
{
error (0, 0, "Wrong result in function %s %p %p", impl->name,
diff --git a/string/test-memmove.c b/string/test-memmove.c
index d2ab3f3..4343329 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -68,6 +68,7 @@ static void
do_one_test (impl_t *impl, char *dst, char *src, const char *orig_src,
size_t len)
{
+ /* This also clears the destination buffer set by the previous run. */
memcpy (src, orig_src, len);
#ifdef TEST_BCOPY
CALL (impl, src, dst, len);
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c62191f2c468d506bf2b169f8589ffca90ff8bcc
commit c62191f2c468d506bf2b169f8589ffca90ff8bcc
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Mar 30 09:18:27 2016 -0700
Add --enable-large-benchtests for large data benchmarks
We like to run memcpy memmove and memset benchmarks with large data sizes.
This patch adds --enable-large-benchtests to enable benchmarks with very
large data.
* benchtests/Makefile (string-benchset): Add memcpy-large,
memmove-large and memset-large for --enable-large-benchtests.
* benchtests/bench-memcpy-large.c: New file.
* benchtests/bench-memmove-large.c: Likewise.
* benchtests/bench-memset-large.c: Likewise.
* benchtests/bench-string.h (TIMEOUT): Don't redefine.
* config.make.in (run-large-benchtests): New.
* configure.ac: Add --enable-large-benchtests.
* configure: Regenerated.
diff --git a/benchtests/Makefile b/benchtests/Makefile
index a37d666..7f8ae02 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -39,6 +39,9 @@ string-benchset := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
strcoll
+ifeq (yes,$(run-large-benchtests))
+string-benchset += memcpy-large memmove-large memset-large
+endif
wcsmbs-benchset := wcslen wcsnlen wcscpy wcpcpy wcsncpy wcpncpy wcscat wcsncat \
wcscmp wcsncmp wcschr wcschrnul wcsrchr wcsspn wcspbrk wcscspn \
wmemchr wmemset wmemcmp
diff --git a/benchtests/bench-memcpy-large.c b/benchtests/bench-memcpy-large.c
new file mode 100644
index 0000000..1a9b25c
--- /dev/null
+++ b/benchtests/bench-memcpy-large.c
@@ -0,0 +1,123 @@
+/* Measure memcpy functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef MEMCPY_RESULT
+# define MEMCPY_RESULT(dst, len) dst
+# define START_SIZE (64 * 1024)
+# define MIN_PAGE_SIZE (getpagesize () + 32 * 1024 * 1024)
+# define TEST_MAIN
+# define TEST_NAME "memcpy"
+# define TIMEOUT (20 * 60)
+# include "bench-string.h"
+
+IMPL (memcpy, 1)
+#endif
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+do_one_test (impl_t *impl, char *dst, const char *src,
+ size_t len)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
+ if (CALL (impl, dst, src, len) != MEMCPY_RESULT (dst, len))
+ {
+ error (0, 0, "Wrong result in function %s %p %p", impl->name,
+ CALL (impl, dst, src, len), MEMCPY_RESULT (dst, len));
+ ret = 1;
+ return;
+ }
+
+ if (memcmp (dst, src, len) != 0)
+ {
+ error (0, 0, "Wrong result in function %s dst \"%s\" src \"%s\"",
+ impl->name, dst, src);
+ ret = 1;
+ return;
+ }
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, dst, src, len);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static void
+do_test (size_t align1, size_t align2, size_t len)
+{
+ size_t i, j;
+ char *s1, *s2;
+
+ align1 &= 63;
+ if (align1 + len >= page_size)
+ return;
+
+ align2 &= 63;
+ if (align2 + len >= page_size)
+ return;
+
+ s1 = (char *) (buf1 + align1);
+ s2 = (char *) (buf2 + align2);
+
+ for (i = 0, j = 1; i < len; i++, j += 23)
+ s1[i] = j;
+
+ printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, s2, s1, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+
+ test_init ();
+
+ printf ("%23s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, 0, i + 7);
+ do_test (0, 3, i + 15);
+ do_test (3, 0, i + 31);
+ do_test (3, 5, i + 63);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-memmove-large.c b/benchtests/bench-memmove-large.c
new file mode 100644
index 0000000..1a3fc6e
--- /dev/null
+++ b/benchtests/bench-memmove-large.c
@@ -0,0 +1,125 @@
+/* Measure memmove functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define BASE_PAGE_SIZE (1024 * 1024)
+#define START_SIZE (4 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 16 * 1024 * 1024)
+#define TEST_MAIN
+#define TEST_NAME "memmove"
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+IMPL (memmove, 1)
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+do_one_test (impl_t *impl, char *dst, char *src, const char *orig_src,
+ size_t len)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
+ memcpy (src, orig_src, len);
+
+ char *res = CALL (impl, dst, src, len);
+ if (res != dst)
+ {
+ error (0, 0, "Wrong result in function %s %p %p", impl->name,
+ res, dst);
+ ret = 1;
+ return;
+ }
+
+ if (memcmp (dst, orig_src, len) != 0)
+ {
+ error (0, 0, "Wrong result in function %s dst \"%s\" src \"%s\"",
+ impl->name, dst, src);
+ ret = 1;
+ return;
+ }
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, dst, src, len);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static void
+do_test (size_t align1, size_t align2, size_t len)
+{
+ size_t i, j;
+ char *s1, *s2;
+
+ align1 &= 127;
+ if (align1 + len >= page_size)
+ return;
+
+ align2 &= 127;
+ if (align2 + len >= page_size)
+ return;
+
+ s1 = (char *) (buf1 + align1);
+ s2 = (char *) (buf2 + align2);
+
+ for (i = 0, j = 1; i < len; i++, j += 23)
+ s1[i] = j;
+
+ printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, s2, (char *) (buf2 + align1), s1, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+
+ test_init ();
+
+ printf ("%23s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, 64, i + 7);
+ do_test (0, 3, i + 15);
+ do_test (3, 0, i + 31);
+ do_test (3, 7, i + 63);
+ do_test (9, 5, i + 127);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-memset-large.c b/benchtests/bench-memset-large.c
new file mode 100644
index 0000000..fd3972d
--- /dev/null
+++ b/benchtests/bench-memset-large.c
@@ -0,0 +1,134 @@
+/* Measure memset functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#ifndef WIDE
+# define TEST_NAME "memset"
+#else
+# define TEST_NAME "wmemset"
+#endif /* WIDE */
+#define START_SIZE (128 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#ifndef WIDE
+# define MEMSET memset
+# define CHAR char
+# define SIMPLE_MEMSET simple_memset
+# define MEMCMP memcmp
+#else
+# include <wchar.h>
+# define MEMSET wmemset
+# define CHAR wchar_t
+# define SIMPLE_MEMSET simple_wmemset
+# define MEMCMP wmemcmp
+#endif /* WIDE */
+
+#include <assert.h>
+
+IMPL (MEMSET, 1)
+
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
+
+CHAR *
+inhibit_loop_to_libcall
+SIMPLE_MEMSET (CHAR *s, int c, size_t n)
+{
+ CHAR *r = s, *end = s + n;
+ while (r < end)
+ *r++ = c;
+ return s;
+}
+
+static void
+do_one_test (impl_t *impl, CHAR *s, int c __attribute ((unused)), size_t n)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+ CHAR *tstbuf = malloc (n * sizeof (*s));
+ assert (tstbuf != NULL);
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < n; i++)
+ s[i] = 0;
+
+ CHAR *res = CALL (impl, s, c, n);
+ if (res != s
+ || SIMPLE_MEMSET (tstbuf, c, n) != tstbuf
+ || MEMCMP (s, tstbuf, n) != 0)
+ {
+ error (0, 0, "Wrong result in function %s", impl->name);
+ ret = 1;
+ free (tstbuf);
+ return;
+ }
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, s, c, n);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+
+ free (tstbuf);
+}
+
+static void
+do_test (size_t align, int c, size_t len)
+{
+ align &= 63;
+ if ((align + len) * sizeof (CHAR) > page_size)
+ return;
+
+ printf ("Length %4zd, alignment %2zd, c %2d:", len, align, c);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, (CHAR *) (buf1) + align, c, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+ int c;
+
+ test_init ();
+
+ printf ("%24s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ c = 65;
+ for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, c, i);
+ do_test (3, c, i);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-string.h b/benchtests/bench-string.h
index be4c618..9c5371e 100644
--- a/benchtests/bench-string.h
+++ b/benchtests/bench-string.h
@@ -56,7 +56,9 @@ extern impl_t __start_impls[], __stop_impls[];
# define TEST_FUNCTION test_main ()
-# define TIMEOUT (4 * 60)
+# ifndef TIMEOUT
+# define TIMEOUT (4 * 60)
+# endif
# define OPT_ITERATIONS 10000
# define OPT_RANDOM 10001
# define OPT_SEED 10002
diff --git a/config.make.in b/config.make.in
index 95c6f36..b9a4dbb 100644
--- a/config.make.in
+++ b/config.make.in
@@ -89,6 +89,7 @@ link-obsolete-rpc = @link_obsolete_rpc@
build-nscd = @build_nscd@
use-nscd = @use_nscd@
build-hardcoded-path-in-tests= @hardcoded_path_in_tests@
+run-large-benchtests = @large_benchtests@
build-pt-chown = @build_pt_chown@
enable-lock-elision = @enable_lock_elision@
diff --git a/configure b/configure
index 8fe5937..42bde65 100755
--- a/configure
+++ b/configure
@@ -668,6 +668,7 @@ all_warnings
force_install
bindnow
enable_lock_elision
+large_benchtests
hardcoded_path_in_tests
enable_timezone_tools
use_default_link
@@ -755,6 +756,7 @@ enable_shared
enable_profile
enable_timezone_tools
enable_hardcoded_path_in_tests
+enable_large_benchtests
enable_stackguard_randomization
enable_lock_elision
enable_add_ons
@@ -1411,6 +1413,8 @@ Optional Features:
--enable-hardcoded-path-in-tests
hardcode newly built glibc path in tests
[default=no]
+ --enable-large-benchtests
+ run benchtests with large data size [default=no]
--enable-stackguard-randomization
initialize __stack_chk_guard canary with a random
number at program start
@@ -3363,6 +3367,15 @@ fi
+# Check whether --enable-large-benchtests was given.
+if test "${enable_large_benchtests+set}" = set; then :
+ enableval=$enable_large_benchtests; large_benchtests=$enableval
+else
+ large_benchtests=no
+fi
+
+
+
# Check whether --enable-stackguard-randomization was given.
if test "${enable_stackguard_randomization+set}" = set; then :
enableval=$enable_stackguard_randomization; enable_stackguard_randomize=$enableval
diff --git a/configure.ac b/configure.ac
index 3c766b7..8fb93d9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -190,6 +190,13 @@ AC_ARG_ENABLE([hardcoded-path-in-tests],
[hardcoded_path_in_tests=no])
AC_SUBST(hardcoded_path_in_tests)
+AC_ARG_ENABLE([large-benchtests],
+ AC_HELP_STRING([--enable-large-benchtests],
+ [run benchtests with large data size @<:@default=no@:>@]),
+ [large_benchtests=$enableval],
+ [large_benchtests=no])
+AC_SUBST(large_benchtests)
+
AC_ARG_ENABLE([stackguard-randomization],
AC_HELP_STRING([--enable-stackguard-randomization],
[initialize __stack_chk_guard canary with a random number at program start]),
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources