This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/benchtests/master created. glibc-2.23-159-g2e12444
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 4 Apr 2016 14:02:21 -0000
- Subject: GNU C Library master sources branch hjl/benchtests/master created. glibc-2.23-159-g2e12444
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/benchtests/master has been created
at 2e124448475bebf7a180cdb03ce27e0461bb02e4 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2e124448475bebf7a180cdb03ce27e0461bb02e4
commit 2e124448475bebf7a180cdb03ce27e0461bb02e4
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 05:45:31 2016 -0700
Force 32-bit displacement in memmove-vec-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 4589c24..2bc61d1 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -84,8 +84,6 @@ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
L(start):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- jae L(large_data)
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
@@ -158,8 +156,6 @@ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
L(start_erms):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
- jae L(large_data)
cmpq $(VEC_SIZE * 2), %rdx
ja L(movsb_more_2x_vec)
L(last_2x_vec):
@@ -191,13 +187,18 @@ L(nop):
ret
.p2align 4
+L(more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
+ jmp L(start_more_2x_vec)
+
+ .p2align 4
L(movsb_more_2x_vec):
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ jae L(large_data)
cmpq $REP_MOVSB_THRESHOLD, %rdx
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- ja.d32 L(movsb)
- .p2align 4
-L(more_2x_vec):
+ ja L(movsb)
+L(start_more_2x_vec):
/* More than 2 * VEC. */
cmpq %rsi, %rdi
jb L(copy_forward)
@@ -210,7 +211,13 @@ L(copy_forward):
leaq (%rdi,%rdx), %rcx
cmpq %rcx, %rsi
jb L(more_2x_vec_overlap)
+# if VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 (%rsi), %VEC(0)
+# else
VMOVU (%rsi), %VEC(0)
+# endif
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
@@ -219,9 +226,7 @@ L(copy_forward):
VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
cmpq $(VEC_SIZE * 4), %rdx
- /* Force 32-bit displacement to avoid long nop between
- instructions. */
- jbe.d32 L(return)
+ jbe L(return)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2)
@@ -231,13 +236,7 @@ L(copy_forward):
VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx)
cmpq $(VEC_SIZE * 8), %rdx
-# if VEC_SIZE == 16
jbe L(return)
-# else
- /* Use 8-bit displacement to avoid long nop between
- instructions. */
- jbe L(return_disp8)
-# endif
leaq (VEC_SIZE * 4)(%rdi), %rcx
addq %rdi, %rdx
andq $-(VEC_SIZE * 4), %rdx
@@ -246,14 +245,19 @@ L(copy_forward):
subq %rdi, %r11
addq %r11, %rsi
cmpq %rdx, %rcx
- /* Use 8-bit displacement to avoid long nop between
- instructions. */
- je L(return_disp8)
+ je L(return)
movq %rsi, %r10
subq %rcx, %r10
- leaq VEC_SIZE(%r10), %r9
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ leaq.d32 VEC_SIZE(%r10), %r9
+# if VEC_SIZE == 16
+ leaq.d32 (VEC_SIZE * 2)(%r10), %r8
+ leaq.d32 (VEC_SIZE * 3)(%r10), %r11
+# else
leaq (VEC_SIZE * 2)(%r10), %r8
leaq (VEC_SIZE * 3)(%r10), %r11
+# endif
.p2align 4
L(loop):
VMOVU (%rcx,%r10), %VEC(0)
@@ -267,7 +271,6 @@ L(loop):
addq $(VEC_SIZE * 4), %rcx
cmpq %rcx, %rdx
jne L(loop)
-L(return_disp8):
VZEROUPPER
ret
L(less_vec):
@@ -424,8 +427,10 @@ L(loop_8x_vec_forward):
.p2align 4
L(more_8x_vec_backward):
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
- leaq -VEC_SIZE(%rdi, %rdx), %r9
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ leaq.d32 -VEC_SIZE(%rsi, %rdx), %rcx
+ leaq.d32 -VEC_SIZE(%rdi, %rdx), %r9
.p2align 4
L(loop_8x_vec_backward):
@@ -466,13 +471,15 @@ L(large_data):
je L(nop)
/* Align destination for access with non-temporal stores in the
loop. Compute how much destination is misaligned. */
- movl %edi, %ecx
+ movq %rdi, %rcx
movq %rdi, %r9
- andl $(VEC_SIZE - 1), %ecx
- /* Load the first VEC and store it at the end. */
- VMOVU (%rsi), %VEC(4)
- /* Skip if destination is aligned. */
- jz 1f
+ andq $(VEC_SIZE - 1), %rcx
+ /* Load the first VEC and store it at the end. Force 32-bit
+ displacement to avoid long nop between instructions. */
+ VMOVU.d32 (%rsi), %VEC(4)
+ /* Skip if destination is aligned. Force 32-bit displacement
+ to avoid long nop between instructions. */
+ jz.d32 1f
/* Get the negative of offset for alignment. */
subq $VEC_SIZE, %rcx
/* Adjust source. */
@@ -482,7 +489,9 @@ L(large_data):
/* Adjust length. */
addq %rcx, %rdx
1:
- leaq (%rdi,%rdx), %rcx
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ leaq.d32 (%rdi,%rdx), %rcx
.p2align 4
L(loop_large_forward):
/* Copy 4 * VEC a time forward with non-temporal stores. */
@@ -514,16 +523,25 @@ L(loop_large_forward):
.p2align 4
L(copy_large_backward):
- leaq -VEC_SIZE(%rdi, %rdx), %r9
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ leaq.d32 -VEC_SIZE(%rdi, %rdx), %r9
+ leaq.d32 -VEC_SIZE(%rsi, %rdx), %rcx
movl %r9d, %r10d
movq %r9, %r11
/* Compute how much the end of destination is misaligned. */
andl $(VEC_SIZE - 1), %r10d
- /* Load the last VEC and store it at the end. */
- VMOVU (%rcx), %VEC(4)
+ /* Load the last VEC and store it at the end. Force 32-bit
+ displacement to avoid long nop between instructions. */
+ VMOVU.d32 (%rcx), %VEC(4)
/* Skip if the end of destination is aligned. */
+# if VEC_SIZE == 16
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ jz.d32 L(loop_large_backward)
+# else
jz L(loop_large_backward)
+# endif
/* Adjust source. */
subq %r10, %rcx
/* Adjust the end of destination which should be aligned now. */
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=0b0bd5485f738c2d6b9dfec43a0c33df62c96f64
commit 0b0bd5485f738c2d6b9dfec43a0c33df62c96f64
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 05:54:44 2016 -0700
Force 32-bit displacement in memset-vec-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9383517..1745a71 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -159,9 +159,21 @@ L(return):
.p2align 4
L(loop_start):
leaq (VEC_SIZE * 4)(%rdi), %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 %VEC(0), (%rdi)
+# else
VMOVU %VEC(0), (%rdi)
+# endif
andq $-(VEC_SIZE * 4), %rcx
+# if VEC_SIZE == 32 || VEC_SIZE == 64
+ /* Force 32-bit displacement to avoid long nop between
+ instructions. */
+ VMOVU.d32 %VEC(0), -VEC_SIZE(%rdi,%rdx)
+# else
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+# endif
VMOVU %VEC(0), VEC_SIZE(%rdi)
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5c201f7928a8557b8d9127f91a5e319c37738228
commit 5c201f7928a8557b8d9127f91a5e319c37738228
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Apr 4 04:52:36 2016 -0700
Add a comment in memset-sse2-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 2deba42..4bf3d36 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -1,6 +1,8 @@
#if IS_IN (libc)
# define VEC_SIZE 16
# define VEC(i) xmm##i
+/* Don't use movups and movaps since it will get larger nop paddings
+ for alignment. */
# define VMOVU movdqu
# define VMOVA movdqa
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3b2e3a74f6eeda71339c0feb18a15c54b97913b1
commit 3b2e3a74f6eeda71339c0feb18a15c54b97913b1
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Apr 3 21:19:45 2016 -0700
Add __x86_shared_non_temporal_threshold
diff --git a/sysdeps/x86_64/cacheinfo.c b/sysdeps/x86_64/cacheinfo.c
index 96463df..cae9907 100644
--- a/sysdeps/x86_64/cacheinfo.c
+++ b/sysdeps/x86_64/cacheinfo.c
@@ -464,6 +464,10 @@ long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
/* Similar to __x86_shared_cache_size, but not rounded. */
long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
+/* Threshold to use non temporal stores. */
+long int __x86_shared_non_temporal_threshold attribute_hidden
+ = 1024 * 1024 * 4;
+
#ifndef DISABLE_PREFETCHW
/* PREFETCHW support flag for use in memory and string routines. */
int __x86_prefetchw attribute_hidden;
@@ -661,5 +665,6 @@ init_cacheinfo (void)
shared = shared & ~255L;
__x86_shared_cache_size_half = shared / 2;
__x86_shared_cache_size = shared;
+ __x86_shared_non_temporal_threshold = shared * 4;
}
}
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 41e8232..4589c24 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -84,7 +84,7 @@ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
L(start):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
- cmpq __x86_shared_cache_size_half(%rip), %rdx
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
jae L(large_data)
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
@@ -158,7 +158,7 @@ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
L(start_erms):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
- cmpq __x86_shared_cache_size_half(%rip), %rdx
+ cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
jae L(large_data)
cmpq $(VEC_SIZE * 2), %rdx
ja L(movsb_more_2x_vec)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=710404b401127acc5647b77ce7981ff62fbbaf1c
commit 710404b401127acc5647b77ce7981ff62fbbaf1c
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Apr 3 21:06:30 2016 -0700
Copy very large data with non-temporal stores
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
index 44711c3..126716e 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
@@ -1,6 +1,8 @@
#if IS_IN (libc)
# define VEC_SIZE 32
# define VEC(i) ymm##i
+# define PREFETCHNT prefetchnta
+# define VMOVNTU vmovntdq
# define VMOVU vmovdqu
# define VMOVA vmovdqa
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index c2c5293..6590144 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,6 +1,8 @@
#if defined HAVE_AVX512_ASM_SUPPORT && IS_IN (libc)
# define VEC_SIZE 64
# define VEC(i) zmm##i
+# define PREFETCHNT prefetchnta
+# define VMOVNTU vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
index 85214fe..f7b8013 100644
--- a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S
@@ -1,8 +1,11 @@
#if IS_IN (libc)
# define VEC_SIZE 16
# define VEC(i) xmm##i
-# define VMOVU movdqu
-# define VMOVA movdqa
+# define PREFETCHNT prefetchnta
+# define VMOVNTU movntdq
+/* Use movups and movaps for smaller code sizes. */
+# define VMOVU movups
+# define VMOVA movaps
# define SECTION(p) p
# define MEMMOVE_SYMBOL(p,s) p##_sse2_##s
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index bb9773f..41e8232 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -84,11 +84,7 @@ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
L(start):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
-#if 0
cmpq __x86_shared_cache_size_half(%rip), %rdx
-#else
- cmpq $512, %rdx
-#endif
jae L(large_data)
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
@@ -407,6 +403,8 @@ L(loop_8x_vec_forward):
VMOVU (VEC_SIZE * 5)(%rsi), %VEC(5)
VMOVU (VEC_SIZE * 6)(%rsi), %VEC(6)
VMOVU (VEC_SIZE * 7)(%rsi), %VEC(7)
+ addq $(VEC_SIZE * 8), %rsi
+ subq $(VEC_SIZE * 8), %rdx
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(1), VEC_SIZE(%rdi)
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
@@ -416,8 +414,6 @@ L(loop_8x_vec_forward):
VMOVU %VEC(6), (VEC_SIZE * 6)(%rdi)
VMOVU %VEC(7), (VEC_SIZE * 7)(%rdi)
addq $(VEC_SIZE * 8), %rdi
- addq $(VEC_SIZE * 8), %rsi
- subq $(VEC_SIZE * 8), %rdx
cmpq $(VEC_SIZE * 8), %rdx
je L(between_4x_vec_and_8x_vec)
ja L(loop_8x_vec_forward)
@@ -442,6 +438,8 @@ L(loop_8x_vec_backward):
VMOVU -(VEC_SIZE * 5)(%rcx), %VEC(5)
VMOVU -(VEC_SIZE * 6)(%rcx), %VEC(6)
VMOVU -(VEC_SIZE * 7)(%rcx), %VEC(7)
+ subq $(VEC_SIZE * 8), %rcx
+ subq $(VEC_SIZE * 8), %rdx
VMOVU %VEC(0), (%r9)
VMOVU %VEC(1), -VEC_SIZE(%r9)
VMOVU %VEC(2), -(VEC_SIZE * 2)(%r9)
@@ -450,9 +448,7 @@ L(loop_8x_vec_backward):
VMOVU %VEC(5), -(VEC_SIZE * 5)(%r9)
VMOVU %VEC(6), -(VEC_SIZE * 6)(%r9)
VMOVU %VEC(7), -(VEC_SIZE * 7)(%r9)
- subq $(VEC_SIZE * 8), %rcx
subq $(VEC_SIZE * 8), %r9
- subq $(VEC_SIZE * 8), %rdx
cmpq $(VEC_SIZE * 8), %rdx
je L(between_4x_vec_and_8x_vec)
ja L(loop_8x_vec_backward)
@@ -464,29 +460,51 @@ L(loop_8x_vec_backward):
.p2align 4
L(large_data):
/* Copy very large data with non-temporal stores. */
- leaq (%rdi,%rdx), %rcx
cmpq %rsi, %rdi
ja L(copy_large_backward)
/* Source == destination is less common. */
je L(nop)
+ /* Align destination for access with non-temporal stores in the
+ loop. Compute how much destination is misaligned. */
+ movl %edi, %ecx
+ movq %rdi, %r9
+ andl $(VEC_SIZE - 1), %ecx
+ /* Load the first VEC and store it at the end. */
+ VMOVU (%rsi), %VEC(4)
+ /* Skip if destination is aligned. */
+ jz 1f
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %rcx
+ /* Adjust source. */
+ subq %rcx, %rsi
+ /* Adjust destination which should be aligned now. */
+ subq %rcx, %rdi
+ /* Adjust length. */
+ addq %rcx, %rdx
+1:
+ leaq (%rdi,%rdx), %rcx
+ .p2align 4
L(loop_large_forward):
/* Copy 4 * VEC a time forward with non-temporal stores. */
- prefetchnta (VEC_SIZE * 14)(%rsi)
- prefetchnta (VEC_SIZE * 20)(%rsi)
+ PREFETCHNT (VEC_SIZE * 14)(%rsi)
+ PREFETCHNT (VEC_SIZE * 20)(%rsi)
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(1), VEC_SIZE(%rdi)
- VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
- addq $(VEC_SIZE * 4), %rdi
addq $(VEC_SIZE * 4), %rsi
subq $(VEC_SIZE * 4), %rdx
+ VMOVNTU %VEC(0), (%rdi)
+ VMOVNTU %VEC(1), VEC_SIZE(%rdi)
+ VMOVNTU %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVNTU %VEC(3), (VEC_SIZE * 3)(%rdi)
+ addq $(VEC_SIZE * 4), %rdi
cmpq $(VEC_SIZE * 4), %rdx
- je L(last_4x_vec)
ja L(loop_large_forward)
+ sfence
+ /* Store the first VEC. */
+ VMOVU %VEC(4), (%r9)
+ je L(last_4x_vec)
/* Less than 4 * VEC to copy. */
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
@@ -496,27 +514,44 @@ L(loop_large_forward):
.p2align 4
L(copy_large_backward):
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
leaq -VEC_SIZE(%rdi, %rdx), %r9
+ leaq -VEC_SIZE(%rsi, %rdx), %rcx
+ movl %r9d, %r10d
+ movq %r9, %r11
+ /* Compute how much the end of destination is misaligned. */
+ andl $(VEC_SIZE - 1), %r10d
+ /* Load the last VEC and store it at the end. */
+ VMOVU (%rcx), %VEC(4)
+ /* Skip if the end of destination is aligned. */
+ jz L(loop_large_backward)
+ /* Adjust source. */
+ subq %r10, %rcx
+ /* Adjust the end of destination which should be aligned now. */
+ subq %r10, %r9
+ /* Adjust length. */
+ subq %r10, %rdx
.p2align 4
L(loop_large_backward):
/* Copy 4 * VEC a time backward with non-temporal stores. */
- prefetchnta -(VEC_SIZE * 14)(%rsi)
- prefetchnta -(VEC_SIZE * 20)(%rsi)
+ PREFETCHNT -(VEC_SIZE * 14)(%rcx)
+ PREFETCHNT -(VEC_SIZE * 20)(%rcx)
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- VMOVU %VEC(0), (%r9)
- VMOVU %VEC(1), -VEC_SIZE(%r9)
- VMOVU %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVU %VEC(3), -(VEC_SIZE * 3)(%r9)
subq $(VEC_SIZE * 4), %rcx
- subq $(VEC_SIZE * 4), %r9
subq $(VEC_SIZE * 4), %rdx
+ VMOVNTU %VEC(0), (%r9)
+ VMOVNTU %VEC(1), -VEC_SIZE(%r9)
+ VMOVNTU %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVNTU %VEC(3), -(VEC_SIZE * 3)(%r9)
+ subq $(VEC_SIZE * 4), %r9
cmpq $(VEC_SIZE * 4), %rdx
- je L(last_4x_vec)
ja L(loop_large_backward)
+ sfence
+ /* Store the last VEC. */
+ VMOVU %VEC(4), (%r11)
+ je L(last_4x_vec)
/* Less than 4 * VEC to copy. */
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=e4b3d75451a764184f4f256e71dcf2f49907b181
commit e4b3d75451a764184f4f256e71dcf2f49907b181
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Sun Apr 3 17:21:45 2016 -0700
Add large_data
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 66779a3..bb9773f 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -84,6 +84,12 @@ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2))
L(start):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
+#if 0
+ cmpq __x86_shared_cache_size_half(%rip), %rdx
+#else
+ cmpq $512, %rdx
+#endif
+ jae L(large_data)
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
@@ -156,6 +162,8 @@ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
L(start_erms):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
+ cmpq __x86_shared_cache_size_half(%rip), %rdx
+ jae L(large_data)
cmpq $(VEC_SIZE * 2), %rdx
ja L(movsb_more_2x_vec)
L(last_2x_vec):
@@ -452,6 +460,69 @@ L(loop_8x_vec_backward):
cmpq $(VEC_SIZE * 4), %rdx
jb L(between_0_and_4x_vec)
jmp L(between_4x_vec_and_8x_vec)
+
+ .p2align 4
+L(large_data):
+ /* Copy very large data with non-temporal stores. */
+ leaq (%rdi,%rdx), %rcx
+ cmpq %rsi, %rdi
+ ja L(copy_large_backward)
+ /* Source == destination is less common. */
+ je L(nop)
+L(loop_large_forward):
+ /* Copy 4 * VEC a time forward with non-temporal stores. */
+ prefetchnta (VEC_SIZE * 14)(%rsi)
+ prefetchnta (VEC_SIZE * 20)(%rsi)
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rsi
+ subq $(VEC_SIZE * 4), %rdx
+ cmpq $(VEC_SIZE * 4), %rdx
+ je L(last_4x_vec)
+ ja L(loop_large_forward)
+ /* Less than 4 * VEC to copy. */
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_2x_vec)
+ jmp L(last_4x_vec)
+
+ .p2align 4
+L(copy_large_backward):
+ leaq -VEC_SIZE(%rsi, %rdx), %rcx
+ leaq -VEC_SIZE(%rdi, %rdx), %r9
+ .p2align 4
+L(loop_large_backward):
+ /* Copy 4 * VEC a time backward with non-temporal stores. */
+ prefetchnta -(VEC_SIZE * 14)(%rsi)
+ prefetchnta -(VEC_SIZE * 20)(%rsi)
+ VMOVU (%rcx), %VEC(0)
+ VMOVU -VEC_SIZE(%rcx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
+ VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
+ VMOVU %VEC(0), (%r9)
+ VMOVU %VEC(1), -VEC_SIZE(%r9)
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%r9)
+ VMOVU %VEC(3), -(VEC_SIZE * 3)(%r9)
+ subq $(VEC_SIZE * 4), %rcx
+ subq $(VEC_SIZE * 4), %r9
+ subq $(VEC_SIZE * 4), %rdx
+ cmpq $(VEC_SIZE * 4), %rdx
+ je L(last_4x_vec)
+ ja L(loop_large_backward)
+ /* Less than 4 * VEC to copy. */
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_2x_vec)
+ jmp L(last_4x_vec)
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
# ifdef SHARED
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=4c12d12e316f92a9c5d28529f4a566417375a512
commit 4c12d12e316f92a9c5d28529f4a566417375a512
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Mar 30 09:18:27 2016 -0700
Add --enable-large-benchtests for large data benchmarks
We like to run memcpy memmove and memset benchmarks with large data sizes.
This patch adds --enable-large-benchtests to enable benchmarks with very
large data.
* benchtests/Makefile (string-benchset): Add memcpy-large,
memmove-large and memset-large for --enable-large-benchtests.
* benchtests/bench-memcpy-large.c: New file.
* benchtests/bench-memmove-large.c: Likewise.
* benchtests/bench-memset-large.c: Likewise.
* benchtests/bench-string.h (TIMEOUT): Don't redefine.
* config.make.in (run-large-benchtests): New.
* configure.ac: Add --enable-large-benchtests.
* configure: Regenerated.
diff --git a/benchtests/Makefile b/benchtests/Makefile
index a37d666..7f8ae02 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -39,6 +39,9 @@ string-benchset := bcopy bzero memccpy memchr memcmp memcpy memmem memmove \
strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
strcoll
+ifeq (yes,$(run-large-benchtests))
+string-benchset += memcpy-large memmove-large memset-large
+endif
wcsmbs-benchset := wcslen wcsnlen wcscpy wcpcpy wcsncpy wcpncpy wcscat wcsncat \
wcscmp wcsncmp wcschr wcschrnul wcsrchr wcsspn wcspbrk wcscspn \
wmemchr wmemset wmemcmp
diff --git a/benchtests/bench-memcpy-large.c b/benchtests/bench-memcpy-large.c
new file mode 100644
index 0000000..470000f
--- /dev/null
+++ b/benchtests/bench-memcpy-large.c
@@ -0,0 +1,105 @@
+/* Measure memcpy functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef MEMCPY_RESULT
+# define MEMCPY_RESULT(dst, len) dst
+# define BASE_PAGE_SIZE (1024 * 1024)
+# define MIN_PAGE_SIZE (getpagesize () + 256 * BASE_PAGE_SIZE)
+# define TEST_MAIN
+# define TEST_NAME "memcpy"
+# define TIMEOUT (20 * 60)
+# include "bench-string.h"
+
+IMPL (memcpy, 1)
+#endif
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+do_one_test (impl_t *impl, char *dst, const char *src,
+ size_t len)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+
+ /* It is too slow to check result with a simple implementation. */
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, dst, src, len);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static void
+do_test (size_t align1, size_t align2, size_t len)
+{
+ size_t i, j;
+ char *s1, *s2;
+
+ align1 &= 63;
+ if (align1 + len >= page_size)
+ return;
+
+ align2 &= 63;
+ if (align2 + len >= page_size)
+ return;
+
+ s1 = (char *) (buf1 + align1);
+ s2 = (char *) (buf2 + align2);
+
+ for (i = 0, j = 1; i < len; i++, j += 23)
+ s1[i] = j;
+
+ printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, s2, s1, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+
+ test_init ();
+
+ printf ("%23s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ for (i = BASE_PAGE_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, 0, i);
+ do_test (0, 1, i);
+ do_test (1, 0, i);
+ do_test (1, 1, i);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-memmove-large.c b/benchtests/bench-memmove-large.c
new file mode 100644
index 0000000..585b94b
--- /dev/null
+++ b/benchtests/bench-memmove-large.c
@@ -0,0 +1,103 @@
+/* Measure memmove functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define BASE_PAGE_SIZE (1024 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 256 * BASE_PAGE_SIZE)
+#define TEST_MAIN
+#define TEST_NAME "memmove"
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+IMPL (memmove, 1)
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+
+static void
+do_one_test (impl_t *impl, char *dst, char *src, const char *orig_src,
+ size_t len)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+
+ /* It is too slow to check result with a simple implementation. */
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, dst, src, len);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static void
+do_test (size_t align1, size_t align2, size_t len)
+{
+ size_t i, j;
+ char *s1, *s2;
+
+ align1 &= 127;
+ if (align1 + len >= page_size)
+ return;
+
+ align2 &= 127;
+ if (align2 + len >= page_size)
+ return;
+
+ s1 = (char *) (buf1 + align1);
+ s2 = (char *) (buf2 + align2);
+
+ for (i = 0, j = 1; i < len; i++, j += 23)
+ s1[i] = j;
+
+ printf ("Length %4zd, alignment %2zd/%2zd:", len, align1, align2);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, s2, (char *) (buf2 + align1), s1, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+
+ test_init ();
+
+ printf ("%23s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ for (i = BASE_PAGE_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, 64, i);
+ do_test (0, 1, i);
+ do_test (1, 0, i);
+ do_test (1, 2, i);
+ do_test (6, 3, i);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-memset-large.c b/benchtests/bench-memset-large.c
new file mode 100644
index 0000000..acb920e
--- /dev/null
+++ b/benchtests/bench-memset-large.c
@@ -0,0 +1,103 @@
+/* Measure memset functions with large data sizes.
+ Copyright (C) 2016 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#ifndef WIDE
+# define TEST_NAME "memset"
+#else
+# define TEST_NAME "wmemset"
+#endif /* WIDE */
+#define BASE_PAGE_SIZE (1024 * 1024)
+#define MIN_PAGE_SIZE (getpagesize () + 256 * BASE_PAGE_SIZE)
+#define TIMEOUT (20 * 60)
+#include "bench-string.h"
+
+#ifndef WIDE
+# define MEMSET memset
+# define CHAR char
+# define MEMCMP memcmp
+#else
+# include <wchar.h>
+# define MEMSET wmemset
+# define CHAR wchar_t
+# define MEMCMP wmemcmp
+#endif /* WIDE */
+
+IMPL (MEMSET, 1)
+
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
+
+static void
+do_one_test (impl_t *impl, CHAR *s, int c __attribute ((unused)), size_t n)
+{
+ size_t i, iters = 16;
+ timing_t start, stop, cur;
+
+ /* It is too slow to check result with a simple implementation. */
+
+ TIMING_NOW (start);
+ for (i = 0; i < iters; ++i)
+ {
+ CALL (impl, s, c, n);
+ }
+ TIMING_NOW (stop);
+
+ TIMING_DIFF (cur, start, stop);
+
+ TIMING_PRINT_MEAN ((double) cur, (double) iters);
+}
+
+static void
+do_test (size_t align, int c, size_t len)
+{
+ align &= 63;
+ if ((align + len) * sizeof (CHAR) > page_size)
+ return;
+
+ printf ("Length %4zd, alignment %2zd, c %2d:", len, align, c);
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, (CHAR *) (buf1) + align, c, len);
+
+ putchar ('\n');
+}
+
+int
+test_main (void)
+{
+ size_t i;
+ int c;
+
+ test_init ();
+
+ printf ("%24s", "");
+ FOR_EACH_IMPL (impl, 0)
+ printf ("\t%s", impl->name);
+ putchar ('\n');
+
+ c = 65;
+ for (i = BASE_PAGE_SIZE; i <= MIN_PAGE_SIZE; i <<= 1)
+ {
+ do_test (0, c, i);
+ do_test (1, c, i);
+ }
+
+ return ret;
+}
+
+#include "../test-skeleton.c"
diff --git a/benchtests/bench-string.h b/benchtests/bench-string.h
index be4c618..9c5371e 100644
--- a/benchtests/bench-string.h
+++ b/benchtests/bench-string.h
@@ -56,7 +56,9 @@ extern impl_t __start_impls[], __stop_impls[];
# define TEST_FUNCTION test_main ()
-# define TIMEOUT (4 * 60)
+# ifndef TIMEOUT
+# define TIMEOUT (4 * 60)
+# endif
# define OPT_ITERATIONS 10000
# define OPT_RANDOM 10001
# define OPT_SEED 10002
diff --git a/config.make.in b/config.make.in
index 95c6f36..b9a4dbb 100644
--- a/config.make.in
+++ b/config.make.in
@@ -89,6 +89,7 @@ link-obsolete-rpc = @link_obsolete_rpc@
build-nscd = @build_nscd@
use-nscd = @use_nscd@
build-hardcoded-path-in-tests= @hardcoded_path_in_tests@
+run-large-benchtests = @large_benchtests@
build-pt-chown = @build_pt_chown@
enable-lock-elision = @enable_lock_elision@
diff --git a/configure b/configure
index 8fe5937..42bde65 100755
--- a/configure
+++ b/configure
@@ -668,6 +668,7 @@ all_warnings
force_install
bindnow
enable_lock_elision
+large_benchtests
hardcoded_path_in_tests
enable_timezone_tools
use_default_link
@@ -755,6 +756,7 @@ enable_shared
enable_profile
enable_timezone_tools
enable_hardcoded_path_in_tests
+enable_large_benchtests
enable_stackguard_randomization
enable_lock_elision
enable_add_ons
@@ -1411,6 +1413,8 @@ Optional Features:
--enable-hardcoded-path-in-tests
hardcode newly built glibc path in tests
[default=no]
+ --enable-large-benchtests
+ run benchtests with large data size [default=no]
--enable-stackguard-randomization
initialize __stack_chk_guard canary with a random
number at program start
@@ -3363,6 +3367,15 @@ fi
+# Check whether --enable-large-benchtests was given.
+if test "${enable_large_benchtests+set}" = set; then :
+ enableval=$enable_large_benchtests; large_benchtests=$enableval
+else
+ large_benchtests=no
+fi
+
+
+
# Check whether --enable-stackguard-randomization was given.
if test "${enable_stackguard_randomization+set}" = set; then :
enableval=$enable_stackguard_randomization; enable_stackguard_randomize=$enableval
diff --git a/configure.ac b/configure.ac
index 3c766b7..8fb93d9 100644
--- a/configure.ac
+++ b/configure.ac
@@ -190,6 +190,13 @@ AC_ARG_ENABLE([hardcoded-path-in-tests],
[hardcoded_path_in_tests=no])
AC_SUBST(hardcoded_path_in_tests)
+AC_ARG_ENABLE([large-benchtests],
+ AC_HELP_STRING([--enable-large-benchtests],
+ [run benchtests with large data size @<:@default=no@:>@]),
+ [large_benchtests=$enableval],
+ [large_benchtests=no])
+AC_SUBST(large_benchtests)
+
AC_ARG_ENABLE([stackguard-randomization],
AC_HELP_STRING([--enable-stackguard-randomization],
[initialize __stack_chk_guard canary with a random number at program start]),
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources