This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/benchtests/master created. glibc-2.23-158-g1505553
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 5 Apr 2016 04:48:02 -0000
- Subject: GNU C Library master sources branch hjl/benchtests/master created. glibc-2.23-158-g1505553
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/benchtests/master has been created
at 1505553cb19fd7c5f4170303aa11cad17c012de8 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=1505553cb19fd7c5f4170303aa11cad17c012de8
commit 1505553cb19fd7c5f4170303aa11cad17c012de8
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 18:41:53 2016 -0700
Remove L(loop)
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 100d7b6..0eba85d 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -186,93 +186,6 @@ L(movsb):
L(nop):
ret
- .p2align 4
-L(more_2x_vec):
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- jae L(large_data)
- jmp L(start_more_2x_vec)
-
- .p2align 4
-L(movsb_more_2x_vec):
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- jae L(large_data)
- cmpq $REP_MOVSB_THRESHOLD, %rdx
- ja L(movsb)
-L(start_more_2x_vec):
- /* More than 2 * VEC. */
- cmpq %rsi, %rdi
- jb L(copy_forward)
- /* Source == destination is less common. */
- je L(nop)
- leaq (%rsi,%rdx), %rcx
- cmpq %rcx, %rdi
- jb L(more_2x_vec_overlap)
-L(copy_forward):
- leaq (%rdi,%rdx), %rcx
- cmpq %rcx, %rsi
- jb L(more_2x_vec_overlap)
-# if VEC_SIZE == 64
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- VMOVU.d32 (%rsi), %VEC(0)
-# else
- VMOVU (%rsi), %VEC(0)
-# endif
- VMOVU VEC_SIZE(%rsi), %VEC(1)
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
- cmpq $(VEC_SIZE * 4), %rdx
- jbe L(return)
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3)
- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(1), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
- VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
- cmpq $(VEC_SIZE * 8), %rdx
- jbe L(return)
- leaq (VEC_SIZE * 4)(%rdi), %rcx
- addq %rdi, %rdx
- andq $-(VEC_SIZE * 4), %rdx
- andq $-(VEC_SIZE * 4), %rcx
- movq %rcx, %r11
- subq %rdi, %r11
- addq %r11, %rsi
- cmpq %rdx, %rcx
- je L(return)
- movq %rsi, %r10
- subq %rcx, %r10
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- leaq.d32 VEC_SIZE(%r10), %r9
-# if VEC_SIZE == 16
- leaq.d32 (VEC_SIZE * 2)(%r10), %r8
- leaq.d32 (VEC_SIZE * 3)(%r10), %r11
-# else
- leaq (VEC_SIZE * 2)(%r10), %r8
- leaq (VEC_SIZE * 3)(%r10), %r11
-# endif
- .p2align 4
-L(loop):
- VMOVU (%rcx,%r10), %VEC(0)
- VMOVU (%rcx,%r9), %VEC(1)
- VMOVU (%rcx,%r8), %VEC(2)
- VMOVU (%rcx,%r11), %VEC(3)
- VMOVA %VEC(0), (%rcx)
- VMOVA %VEC(1), VEC_SIZE(%rcx)
- VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
- VMOVA %VEC(3), (VEC_SIZE * 3)(%rcx)
- addq $(VEC_SIZE * 4), %rcx
- cmpq %rcx, %rdx
- jne L(loop)
- VZEROUPPER
- ret
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -338,11 +251,19 @@ L(between_2_3):
movw %si, (%rdi)
ret
-# if VEC_SIZE > 16
- /* Align to 16 bytes to avoid long nop between instructions. */
.p2align 4
-# endif
-L(more_2x_vec_overlap):
+L(more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
+ jmp L(start_more_2x_vec)
+
+ .p2align 4
+L(movsb_more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
+ cmpq $REP_MOVSB_THRESHOLD, %rdx
+ ja L(movsb)
+L(start_more_2x_vec):
	/* More than 2 * VEC and there is overlap between destination
and source. */
cmpq $(VEC_SIZE * 8), %rdx
@@ -416,14 +337,15 @@ L(more_8x_vec):
/* Force 32-bit displacement to avoid long nop between
instructions. */
VMOVU.d32 (%rsi), %VEC(4)
-# if VEC_SIZE == 16
VMOVU.d32 -VEC_SIZE(%rsi, %rdx), %VEC(5)
+ VMOVU.d32 -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+# if VEC_SIZE == 16
+ VMOVU.d32 -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+ VMOVU.d32 -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
# else
- VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
-# endif
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+# endif
/* Adjust source. */
subq %r8, %rsi
/* Adjust destination which should be aligned now. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cdc0fde157da2f5b33dbd8e402cb3bb4c9b4a625
commit cdc0fde157da2f5b33dbd8e402cb3bb4c9b4a625
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Apr 3 17:21:45 2016 -0700
Copy very large data with non-temporal stores
Add __x86_shared_non_temporal_threshold
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index 96463df..cae9907 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -464,6 +464,10 @@ long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
/* Similar to __x86_shared_cache_size, but not rounded. */
long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
+/* Threshold to use non temporal stores. */
+long int __x86_shared_non_temporal_threshold attribute_hidden
+ = 1024 * 1024 * 4;
+
#ifndef DISABLE_PREFETCHW
/* PREFETCHW support flag for use in memory and string routines. */
int __x86_prefetchw attribute_hidden;
@@ -661,5 +665,6 @@ init_cacheinfo (void)
shared = shared & ~255L;
__x86_shared_cache_size_half = shared / 2;
__x86_shared_cache_size = shared;
+ __x86_shared_non_temporal_threshold = shared * 4;
}
}
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index 44711c3..94201b3 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -1,6 +1,8 @@
#if IS_IN (libc)
# define VEC_SIZE 32
# define VEC(i) ymm##i
+# define PREFETCHNT prefetchnta
+# define VMOVNT vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index c2c5293..1ebb9a3 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,6 +1,8 @@
#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
# define VEC_SIZE 64
# define VEC(i) zmm##i
+# define PREFETCHNT prefetchnta
+# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 85214fe..84b1263 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -1,8 +1,11 @@
#if IS_IN (libc)
# define VEC_SIZE 16
# define VEC(i) xmm##i
-# define VMOVU movdqu
-# define VMOVA movdqa
+# define PREFETCHNT prefetchnta
+# define VMOVNT movntdq
+/* Use movups and movaps for smaller code sizes. */
+# define VMOVU movups
+# define VMOVA movaps
# define SECTION(p) p
# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 66779a3..100d7b6 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -187,13 +187,18 @@ L(nop):
ret
.p2align 4
+L(more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
+ jmp L(start_more_2x_vec)
+
+ .p2align 4
L(movsb_more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
cmpq $REP_MOVSB_THRESHOLD, %rdx
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- ja.d32 L(movsb)
- .p2align 4
-L(more_2x_vec):
+ ja L(movsb)
+L(start_more_2x_vec):
/* More than 2 * VEC. */
cmpq %rsi, %rdi
jb L(copy_forward)
@@ -206,7 +211,13 @@ L(copy_forward):
leaq (%rdi,%rdx), %rcx
cmpq %rcx, %rsi
jb L(more_2x_vec_overlap)
+# if VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(0)
+# else
VMOVU (%rsi), %VEC(0)
+# endif
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
@@ -215,9 +226,7 @@ L(copy_forward):
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
cmpq $(VEC_SIZE * 4), %rdx
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- jbe.d32 L(return)
+ jbe L(return)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
@@ -227,13 +236,7 @@ L(copy_forward):
VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
cmpq $(VEC_SIZE * 8), %rdx
-# if VEC_SIZE == 16
jbe L(return)
-# else
- /* Use 8-bit displacement to avoid long nop between
- instructions. */
- jbe L(return_disp8)
-# endif
leaq (VEC_SIZE * 4)(%rdi), %rcx
addq %rdi, %rdx
andq $-(VEC_SIZE * 4), %rdx
@@ -242,14 +245,19 @@ L(copy_forward):
subq %rdi, %r11
addq %r11, %rsi
cmpq %rdx, %rcx
- /* Use 8-bit displacement to avoid long nop between
- instructions. */
- je L(return_disp8)
+ je L(return)
movq %rsi, %r10
subq %rcx, %r10
- leaq VEC_SIZE(%r10), %r9
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ leaq.d32 VEC_SIZE(%r10), %r9
+# if VEC_SIZE == 16
+ leaq.d32 (VEC_SIZE * 2)(%r10), %r8
+ leaq.d32 (VEC_SIZE * 3)(%r10), %r11
+# else
leaq (VEC_SIZE * 2)(%r10), %r8
leaq (VEC_SIZE * 3)(%r10), %r11
+# endif
.p2align 4
L(loop):
VMOVU (%rcx,%r10), %VEC(0)
@@ -263,7 +271,6 @@ L(loop):
addq $(VEC_SIZE * 4), %rcx
cmpq %rcx, %rdx
jne L(loop)
-L(return_disp8):
VZEROUPPER
ret
L(less_vec):
@@ -387,71 +394,237 @@ L(between_0_and_4x_vec):
L(more_8x_vec):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
-
+ /* Source == destination is less common. */
+ je L(nop)
+ /* Save start and stop of the destination buffer. */
+ movq %rdi, %r10
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+# if VEC_SIZE == 32
+ leaq -VEC_SIZE(%rdi, %rdx), %rcx
+# else
+ leaq.d32 -VEC_SIZE(%rdi, %rdx), %rcx
+# endif
+ /* Align destination for aligned stores in the loop. Compute
+ how much destination is misaligned. */
+ movq %rdi, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %r8
+ /* Load the first VEC and last 4 * VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+# if VEC_SIZE == 16
+ VMOVU.d32 -VEC_SIZE(%rsi, %rdx), %VEC(5)
+# else
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
+# endif
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+ /* Adjust source. */
+ subq %r8, %rsi
+ /* Adjust destination which should be aligned now. */
+ subq %r8, %rdi
+ /* Adjust length. */
+ addq %r8, %rdx
.p2align 4
-L(loop_8x_vec_forward):
- /* Copy 8 * VEC a time forward. */
+L(loop_4x_vec_forward):
+ /* Copy 4 * VEC a time forward. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- VMOVU (VEC_SIZE * 4)(%rsi), %VEC(4)
- VMOVU (VEC_SIZE * 5)(%rsi), %VEC(5)
- VMOVU (VEC_SIZE * 6)(%rsi), %VEC(6)
- VMOVU (VEC_SIZE * 7)(%rsi), %VEC(7)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(4), (VEC_SIZE * 4)(%rdi)
- VMOVU %VEC(5), (VEC_SIZE * 5)(%rdi)
- VMOVU %VEC(6), (VEC_SIZE * 6)(%rdi)
- VMOVU %VEC(7), (VEC_SIZE * 7)(%rdi)
- addq $(VEC_SIZE * 8), %rdi
- addq $(VEC_SIZE * 8), %rsi
- subq $(VEC_SIZE * 8), %rdx
- cmpq $(VEC_SIZE * 8), %rdx
- je L(between_4x_vec_and_8x_vec)
- ja L(loop_8x_vec_forward)
- /* Less than 8 * VEC to copy. */
+ addq $(VEC_SIZE * 4), %rsi
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVA %VEC(0), (%rdi)
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
+ addq $(VEC_SIZE * 4), %rdi
cmpq $(VEC_SIZE * 4), %rdx
- jb L(between_0_and_4x_vec)
- jmp L(between_4x_vec_and_8x_vec)
+ ja L(loop_4x_vec_forward)
+ /* Store the last 4 * VEC. */
+ VMOVU %VEC(5), (%rcx)
+ VMOVU %VEC(6), -VEC_SIZE(%rcx)
+ VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
+ VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
+ /* Store the first VEC. */
+ VMOVU %VEC(4), (%r10)
+ VZEROUPPER
+ ret
- .p2align 4
L(more_8x_vec_backward):
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
- leaq -VEC_SIZE(%rdi, %rdx), %r9
+ /* Save stop of the destination buffer. */
+ leaq -VEC_SIZE(%rdi, %rdx), %r11
+ /* Load the first 4 * VEC and last VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+# if VEC_SIZE == 32
+ VMOVU.d32 VEC_SIZE(%rsi), %VEC(5)
+# else
+ VMOVU VEC_SIZE(%rsi), %VEC(5)
+# endif
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
+ /* Align destination end for aligned stores in the loop. Compute
+ how much destination end is misaligned. */
+ leaq.d32 -VEC_SIZE(%rsi, %rdx), %rcx
+ movq %r11, %r9
+ movq %r11, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Adjust source. */
+ subq %r8, %rcx
+ /* Adjust the end of destination which should be aligned now. */
+ subq %r8, %r9
+ /* Adjust length. */
+ subq %r8, %rdx
+ .p2align 4
+L(loop_4x_vec_backward):
+ /* Copy 4 * VEC a time backward. */
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
+ VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
+ subq $(VEC_SIZE * 4), %rcx
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVA %VEC(0), (%r9)
+ VMOVA %VEC(1), -VEC_SIZE(%r9)
+ VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
+ subq $(VEC_SIZE * 4), %r9
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec_backward)
+ /* Store the first 4 * VEC. */
+ VMOVU %VEC(4), (%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
+ /* Store the last VEC. */
+ VMOVU %VEC(8), (%r11)
+ VZEROUPPER
+ ret
+L(large_data):
+ /* Copy very large data with non-temporal stores. */
+ cmpq %rsi, %rdi
+ ja L(copy_large_backward)
+ /* Source == destination is less common. */
+ je L(nop)
+ /* Save start and stop of the destination buffer. */
+ movq %rdi, %r10
+ leaq -VEC_SIZE(%rdi, %rdx), %rcx
+ /* Align destination for non-temporal stores in the loop.
+ Compute how much destination is misaligned. */
+ movq %rdi, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %r8
+ /* Load the first VEC and last 4 * VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
+ /* Adjust source. */
+ subq %r8, %rsi
+ /* Adjust destination which should be aligned now. */
+ subq %r8, %rdi
+ /* Adjust length. */
+ addq %r8, %rdx
.p2align 4
-L(loop_8x_vec_backward):
- /* Copy 8 * VEC a time backward. */
+L(loop_large_forward):
+ /* Copy 4 * VEC a time forward with non-temporal stores. */
+ PREFETCHNT (VEC_SIZE * 14)(%rsi)
+ PREFETCHNT (VEC_SIZE * 20)(%rsi)
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ addq $(VEC_SIZE * 4), %rsi
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVNT %VEC(0), (%rdi)
+ VMOVNT %VEC(1), VEC_SIZE(%rdi)
+ VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
+ addq $(VEC_SIZE * 4), %rdi
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(loop_large_forward)
+ sfence
+ /* Store the last 4 * VEC. */
+ VMOVU %VEC(5), (%rcx)
+ VMOVU %VEC(6), -VEC_SIZE(%rcx)
+ VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
+ VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
+ /* Store the first VEC. */
+ VMOVU %VEC(4), (%r10)
+ VZEROUPPER
+ ret
+
+L(copy_large_backward):
+ /* Save stop of the destination buffer. */
+ leaq -VEC_SIZE(%rdi, %rdx), %r11
+ /* Align the destination end for non-temporal stores in the loop.
+ Compute how much destination end is misaligned. */
+ leaq -VEC_SIZE(%rsi, %rdx), %rcx
+ movq %r11, %r9
+ movq %r11, %r8
+ andq $(VEC_SIZE - 1), %r8
+ /* Adjust source. */
+ subq %r8, %rcx
+ /* Adjust the end of destination which should be aligned now. */
+ subq %r8, %r9
+ /* Load the first 4 * VEC and last VEC to support overlapping
+ addresses. */
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+# if VEC_SIZE == 32
+ VMOVU.d32 VEC_SIZE(%rsi), %VEC(5)
+# else
+ VMOVU VEC_SIZE(%rsi), %VEC(5)
+# endif
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
+ VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
+ /* Adjust length. */
+ subq %r8, %rdx
+ .p2align 4
+L(loop_large_backward):
+ /* Copy 4 * VEC a time backward with non-temporal stores. */
+ PREFETCHNT -(VEC_SIZE * 14)(%rcx)
+ PREFETCHNT -(VEC_SIZE * 20)(%rcx)
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- VMOVU -(VEC_SIZE * 4)(%rcx), %VEC(4)
- VMOVU -(VEC_SIZE * 5)(%rcx), %VEC(5)
- VMOVU -(VEC_SIZE * 6)(%rcx), %VEC(6)
- VMOVU -(VEC_SIZE * 7)(%rcx), %VEC(7)
- VMOVU %VEC(0), (%r9)
- VMOVU %VEC(1), -VEC_SIZE(%r9)
- VMOVU %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVU %VEC(3), -(VEC_SIZE * 3)(%r9)
- VMOVU %VEC(4), -(VEC_SIZE * 4)(%r9)
- VMOVU %VEC(5), -(VEC_SIZE * 5)(%r9)
- VMOVU %VEC(6), -(VEC_SIZE * 6)(%r9)
- VMOVU %VEC(7), -(VEC_SIZE * 7)(%r9)
- subq $(VEC_SIZE * 8), %rcx
- subq $(VEC_SIZE * 8), %r9
- subq $(VEC_SIZE * 8), %rdx
- cmpq $(VEC_SIZE * 8), %rdx
- je L(between_4x_vec_and_8x_vec)
- ja L(loop_8x_vec_backward)
- /* Less than 8 * VEC to copy. */
+ subq $(VEC_SIZE * 4), %rcx
+ subq $(VEC_SIZE * 4), %rdx
+ VMOVNT %VEC(0), (%r9)
+ VMOVNT %VEC(1), -VEC_SIZE(%r9)
+ VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
+ subq $(VEC_SIZE * 4), %r9
cmpq $(VEC_SIZE * 4), %rdx
- jb L(between_0_and_4x_vec)
- jmp L(between_4x_vec_and_8x_vec)
+ ja L(loop_large_backward)
+ sfence
+ /* Store the first 4 * VEC. */
+ VMOVU %VEC(4), (%rdi)
+ VMOVU %VEC(5), VEC_SIZE(%rdi)
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
+ /* Store the last VEC. */
+ VMOVU %VEC(8), (%r11)
+ VZEROUPPER
+ ret
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
# ifdef SHARED
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=97d258a1abde09ad8bb6ff5afe38e4ff34e0f344
commit 97d258a1abde09ad8bb6ff5afe38e4ff34e0f344
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 05:54:44 2016 -0700
Force 32-bit displacement in memset-vec-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9383517..1745a71 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -159,9 +159,21 @@ L(return):
.p2align 4
L(loop_start):
leaq (VEC_SIZE * 4)(%rdi), %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 %VEC(0), (%rdi)
+# else
VMOVU %VEC(0), (%rdi)
+# endif
andq $-(VEC_SIZE * 4), %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 %VEC(0), -VEC_SIZE(%rdi,%rdx)
+# else
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+# endif
VMOVU %VEC(0), VEC_SIZE(%rdi)
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4dcc8a09a3002cc351967c4c7153ccc7ac6831b2
commit 4dcc8a09a3002cc351967c4c7153ccc7ac6831b2
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 04:52:36 2016 -0700
Add a comment in memset-sse2-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 2deba42..4bf3d36 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -1,6 +1,8 @@
#if IS_IN (libc)
# define VEC_SIZE 16
# define VEC(i) xmm##i
+/* Don't use movups and movaps since it will get larger nop paddings
+ for alignment. */
# define VMOVU movdqu
# define VMOVA movdqa
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=263a18c229afe223581b6d964a106eb10399d028
commit 263a18c229afe223581b6d964a106eb10399d028
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 09:38:30 2016 -0700
Clear destination buffer set by the previous run
Clear the destination buffer set by the previous run in test-memcpy.c
and test-memmove.c to catch the error when the following implementations
don't copy anything.
PR string/19907
* string/test-memcpy.c (do_one_test): Clear the destination
buffer set by the previous run.
* string/test-memmove.c (do_one_test): Likewise.
diff --git a/benchtests/bench-memcpy.c b/benchtests/bench-memcpy.c
index c5a7192..9d9e7b6 100644
--- a/benchtests/bench-memcpy.c
+++ b/benchtests/bench-memcpy.c
@@ -55,6 +55,10 @@ do_one_test (impl_t *impl, char *dst, const char *src,
size_t i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
if (CALL (impl, dst, src, len) != MEMCPY_RESULT (dst, len))
{
error (0, 0, "Wrong result in function %s %p %p", impl->name,
diff --git a/benchtests/bench-memmove.c b/benchtests/bench-memmove.c
index c38596b..3858f2a 100644
--- a/benchtests/bench-memmove.c
+++ b/benchtests/bench-memmove.c
@@ -70,6 +70,7 @@ do_one_test (impl_t *impl, char *dst, char *src, const char *orig_src,
size_t i, iters = INNER_LOOP_ITERS;
timing_t start, stop, cur;
+ /* This also clears the destination buffer set by the previous run. */
memcpy (src, orig_src, len);
#ifdef TEST_BCOPY
CALL (impl, src, dst, len);
diff --git a/string/test-memcpy.c b/string/test-memcpy.c
index 2a0994c..cb072f8 100644
--- a/string/test-memcpy.c
+++ b/string/test-memcpy.c
@@ -53,6 +53,12 @@ static void
do_one_test (impl_t *impl, char *dst, const char *src,
size_t len)
{
+ size_t i;
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
if (CALL (impl, dst, src, len) != MEMCPY_RESULT (dst, len))
{
error (0, 0, "Wrong result in function %s %p %p", impl->name,
diff --git a/string/test-memmove.c b/string/test-memmove.c
index d2ab3f3..4343329 100644
--- a/string/test-memmove.c
+++ b/string/test-memmove.c
@@ -68,6 +68,7 @@ static void
do_one_test (impl_t *impl, char *dst, char *src, const char *orig_src,
size_t len)
{
+ /* This also clears the destination buffer set by the previous run. */
memcpy (src, orig_src, len);
#ifdef TEST_BCOPY
CALL (impl, src, dst, len);
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c62191f2c468d506bf2b169f8589ffca90ff8bcc
commit c62191f2c468d506bf2b169f8589ffca90ff8bcc
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Mar 30 09:18:27 2016 -0700
Add --enable-large-benchtests for large data benchmarks
We like to run memcpy memmove and memset benchmarks with large data sizes.
This patch adds --enable-large-benchtests to enable benchmarks with very
large data.
* benchtests/Makefile (string-benchset): Add memcpy-large,
memmove-large and memset-large for --enable-large-benchtests.
* benchtests/bench-memcpy-large.c: New file.
* benchtests/bench-memmove-large.c: Likewise.
* benchtests/bench-memset-large.c: Likewise.
* benchtests/bench-string.h (TIMEOUT): Don't redefine.
* config.make.in (run-large-benchtests): New.
* configure.ac: Add --enable-large-benchtests.
* configure: Regenerated.
diff --git a/benchtests/Makefile b/benchtests/Makefile
index a37d666..7f8ae02 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -39,6 +39,9 @@ string-benchset := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
strcoll
+ifeq (yes,$(run-large-benchtests))
+string-benchset += memcpy-large memmove-large memset-large
+endif
wcsmbs-benchset := wcslen wcsnlen wcscpy wcpcpy wcsncpy wcpncpy wcscat wcsncat \
wcscmp wcsncmp wcschr wcschrnul wcsrchr wcsspn wcspbrk wcscspn \
wmemchr wmemset wmemcmp
diff --git a/benchtests/bench-memcpy-large.c b/benchtests/bench-memcpy-large.c
new file mode 100644
index 0000000..1a9b25c
--- /dev/null
+++ b/benchtests/bench-memcpy-large.c
@@ -0,0 +1,123 @@
+/* Measure memcpy functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef MEMCPY_RESULT
+# define MEMCPY_RESULT(dst, len) dst
+# define START_SIZE (64 * 1024)
+# define MIN_PAGE_SIZE (getpagesize () + 32 * 1024 * 1024)
+# define TEST_MAIN
+# define TEST_NAME "memcpy"
+# define TIMEOUT (20 * 60)
+# include "bench-string.h"
+
+IMPL (memcpy, 1)
+#endif
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+do_one_test (impl_t *impl, char *dst, const char *src,
+ size_t len)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
+ if (CALL (impl, dst, src, len) != MEMCPY_RESULT (dst, len))
+ {
+ error (0, 0, "Wrong result in function %s %p %p", impl->name,
+ CALL (impl, dst, src, len), MEMCPY_RESULT (dst, len));
+ ret = 1;
+ return;
+ }
+
+ if (memcmp (dst, src, len) != 0)
+ {
+ error (0, 0, "Wrong result in function %s dst \"%s\" src \"%s\"",
+ impl->name, dst, src);
+ ret = 1;
+ return;
+ }
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, dst, src, len);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static void
+do_test (size_t align1, size_t align2, size_t len)
+{
+ size_t i, j;
+ char *s1, *s2;
+
+ align1 &= 63;
+ if (align1 + len >= page_size)
+ return;
+
+ align2 &= 63;
+ if (align2 + len >= page_size)
+ return;
+
+ s1 = (char *) (buf1 + align1);
+ s2 = (char *) (buf2 + align2);
+
+ for (i = 0, j = 1; i < len; i++, j += 23)
+ s1[i] = j;
+
+ printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, s2, s1, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+
+ test_init ();
+
+ printf ("%23s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, 0, i + 7);
+ do_test (0, 3, i + 15);
+ do_test (3, 0, i + 31);
+ do_test (3, 5, i + 63);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-memmove-large.c b/benchtests/bench-memmove-large.c
new file mode 100644
index 0000000..1a3fc6e
--- /dev/null
+++ b/benchtests/bench-memmove-large.c
@@ -0,0 +1,125 @@
+/* Measure memmove functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define BASE_PAGE_SIZE (1024 * 1024)
+#define START_SIZE (4 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 16 * 1024 * 1024)
+#define TEST_MAIN
+#define TEST_NAME "memmove"
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+IMPL (memmove, 1)
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+do_one_test (impl_t *impl, char *dst, char *src, const char *orig_src,
+ size_t len)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < len; i++)
+ dst[i] = 0;
+
+ memcpy (src, orig_src, len);
+
+ char *res = CALL (impl, dst, src, len);
+ if (res != dst)
+ {
+ error (0, 0, "Wrong result in function %s %p %p", impl->name,
+ res, dst);
+ ret = 1;
+ return;
+ }
+
+ if (memcmp (dst, orig_src, len) != 0)
+ {
+ error (0, 0, "Wrong result in function %s dst \"%s\" src \"%s\"",
+ impl->name, dst, src);
+ ret = 1;
+ return;
+ }
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, dst, src, len);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static void
+do_test (size_t align1, size_t align2, size_t len)
+{
+ size_t i, j;
+ char *s1, *s2;
+
+ align1 &= 127;
+ if (align1 + len >= page_size)
+ return;
+
+ align2 &= 127;
+ if (align2 + len >= page_size)
+ return;
+
+ s1 = (char *) (buf1 + align1);
+ s2 = (char *) (buf2 + align2);
+
+ for (i = 0, j = 1; i < len; i++, j += 23)
+ s1[i] = j;
+
+ printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, s2, (char *) (buf2 + align1), s1, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+
+ test_init ();
+
+ printf ("%23s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, 64, i + 7);
+ do_test (0, 3, i + 15);
+ do_test (3, 0, i + 31);
+ do_test (3, 7, i + 63);
+ do_test (9, 5, i + 127);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-memset-large.c b/benchtests/bench-memset-large.c
new file mode 100644
index 0000000..fd3972d
--- /dev/null
+++ b/benchtests/bench-memset-large.c
@@ -0,0 +1,134 @@
+/* Measure memset functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#ifndef WIDE
+# define TEST_NAME "memset"
+#else
+# define TEST_NAME "wmemset"
+#endif /* WIDE */
+#define START_SIZE (128 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 64 * 1024 * 1024)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#ifndef WIDE
+# define MEMSET memset
+# define CHAR char
+# define SIMPLE_MEMSET simple_memset
+# define MEMCMP memcmp
+#else
+# include <wchar.h>
+# define MEMSET wmemset
+# define CHAR wchar_t
+# define SIMPLE_MEMSET simple_wmemset
+# define MEMCMP wmemcmp
+#endif /* WIDE */
+
+#include <assert.h>
+
+IMPL (MEMSET, 1)
+
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
+
+CHAR *
+inhibit_loop_to_libcall
+SIMPLE_MEMSET (CHAR *s, int c, size_t n)
+{
+ CHAR *r = s, *end = s + n;
+ while (r < end)
+ *r++ = c;
+ return s;
+}
+
+static void
+do_one_test (impl_t *impl, CHAR *s, int c __attribute ((unused)), size_t n)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+ CHAR *tstbuf = malloc (n * sizeof (*s));
+ assert (tstbuf != NULL);
+
+ /* Must clear the destination buffer set by the previous run. */
+ for (i = 0; i < n; i++)
+ s[i] = 0;
+
+ CHAR *res = CALL (impl, s, c, n);
+ if (res != s
+ || SIMPLE_MEMSET (tstbuf, c, n) != tstbuf
+ || MEMCMP (s, tstbuf, n) != 0)
+ {
+ error (0, 0, "Wrong result in function %s", impl->name);
+ ret = 1;
+ free (tstbuf);
+ return;
+ }
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, s, c, n);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+
+ free (tstbuf);
+}
+
+static void
+do_test (size_t align, int c, size_t len)
+{
+ align &= 63;
+ if ((align + len) * sizeof (CHAR) > page_size)
+ return;
+
+ printf ("Length %4zd, alignment %2zd, c %2d:", len, align, c);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, (CHAR *) (buf1) + align, c, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+ int c;
+
+ test_init ();
+
+ printf ("%24s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ c = 65;
+ for (i = START_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, c, i);
+ do_test (3, c, i);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-string.h b/benchtests/bench-string.h
index be4c618..9c5371e 100644
--- a/benchtests/bench-string.h
+++ b/benchtests/bench-string.h
@@ -56,7 +56,9 @@ extern impl_t __start_impls[], __stop_impls[];
# define TEST_FUNCTION test_main ()
-# define TIMEOUT (4 * 60)
+# ifndef TIMEOUT
+# define TIMEOUT (4 * 60)
+# endif
# define OPT_ITERATIONS 10000
# define OPT_RANDOM 10001
# define OPT_SEED 10002
diff --git a/config.make.in b/config.make.in
index 95c6f36..b9a4dbb 100644
--- a/config.make.in
+++ b/config.make.in
@@ -89,6 +89,7 @@ link-obsolete-rpc = @link_obsolete_rpc@
build-nscd = @build_nscd@
use-nscd = @use_nscd@
build-hardcoded-path-in-tests= @hardcoded_path_in_tests@
+run-large-benchtests = @large_benchtests@
build-pt-chown = @build_pt_chown@
enable-lock-elision = @enable_lock_elision@
diff --git a/configure b/configure
index 8fe5937..42bde65 100755
--- a/configure
+++ b/configure
@@ -668,6 +668,7 @@ all_warnings
force_install
bindnow
enable_lock_elision
+large_benchtests
hardcoded_path_in_tests
enable_timezone_tools
use_default_link
@@ -755,6 +756,7 @@ enable_shared
enable_profile
enable_timezone_tools
enable_hardcoded_path_in_tests
+enable_large_benchtests
enable_stackguard_randomization
enable_lock_elision
enable_add_ons
@@ -1411,6 +1413,8 @@ Optional Features:
--enable-hardcoded-path-in-tests
hardcode newly built glibc path in tests
[default=no]
+ --enable-large-benchtests
+ run benchtests with large data size [default=no]
--enable-stackguard-randomization
initialize __stack_chk_guard canary with a random
number at program start
@@ -3363,6 +3367,15 @@ fi
+# Check whether --enable-large-benchtests was given.
+if test "${enable_large_benchtests+set}" = set; then :
+ enableval=$enable_large_benchtests; large_benchtests=$enableval
+else
+ large_benchtests=no
+fi
+
+
+
# Check whether --enable-stackguard-randomization was given.
if test "${enable_stackguard_randomization+set}" = set; then :
enableval=$enable_stackguard_randomization; enable_stackguard_randomize=$enableval
diff --git a/configure.ac b/configure.ac
index 3c766b7..8fb93d9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -190,6 +190,13 @@ AC_ARG_ENABLE([hardcoded-path-in-tests],
[hardcoded_path_in_tests=no])
AC_SUBST(hardcoded_path_in_tests)
+AC_ARG_ENABLE([large-benchtests],
+ AC_HELP_STRING([--enable-large-benchtests],
+ [run benchtests with large data size @<:@default=no@:>@]),
+ [large_benchtests=$enableval],
+ [large_benchtests=no])
+AC_SUBST(large_benchtests)
+
AC_ARG_ENABLE([stackguard-randomization],
AC_HELP_STRING([--enable-stackguard-randomization],
[initialize __stack_chk_guard canary with a random number at program start]),
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources