This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch hjl/x86/optimize created. glibc-2.25-369-g3e31bc4
- From: hjl at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 2 Jun 2017 23:15:36 -0000
- Subject: GNU C Library master sources branch hjl/x86/optimize created. glibc-2.25-369-g3e31bc4
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, hjl/x86/optimize has been created
at 3e31bc4a930e7b32924befe762014f85d5408692 (commit)
- Log -----------------------------------------------------------------
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3e31bc4a930e7b32924befe762014f85d5408692
commit 3e31bc4a930e7b32924befe762014f85d5408692
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon May 22 12:00:43 2017 -0700
Add x86_cache.non_temporal_threshold to GLIBC_TUNABLES
Add support for "glibc.x86_cache.non_temporal_threshold=number" to
GLIBC_TUNABLES.
* elf/dl-tunables.list (x86_cache): New name space.
* sysdeps/x86/cacheinfo.c [HAVE_TUNABLES] (TUNABLE_NAMESPACE):
New.
[HAVE_TUNABLES]: Include <elf/dl-tunables.h>.
[HAVE_TUNABLES] (DL_TUNABLE_CALLBACK (set_non_temporal_threshold)):
New.
[HAVE_TUNABLES] (init_cacheinfo): Call TUNABLE_SET_VAL_WITH_CALLBACK
with set_non_temporal_threshold.
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index b9f1488..2c899fe 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -77,4 +77,10 @@ glibc {
security_level: SXID_IGNORE
}
}
+ x86_cache {
+ non_temporal_threshold {
+ type: SIZE_T
+ security_level: SXID_IGNORE
+ }
+ }
}
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 3434d97..1b195eb 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -23,6 +23,20 @@
#include <cpuid.h>
#include <init-arch.h>
+/* Threshold to use non temporal store. */
+long int __x86_shared_non_temporal_threshold attribute_hidden;
+
+#if HAVE_TUNABLES
+# define TUNABLE_NAMESPACE x86_cache
+# include <elf/dl-tunables.h>
+
+void
+DL_TUNABLE_CALLBACK (set_non_temporal_threshold) (tunable_val_t *valp)
+{
+ __x86_shared_non_temporal_threshold = (long int) valp->numval;
+}
+#endif
+
#define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
#define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
@@ -466,9 +480,6 @@ long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
/* Similar to __x86_shared_cache_size, but not rounded. */
long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
-/* Threshold to use non temporal store. */
-long int __x86_shared_non_temporal_threshold attribute_hidden;
-
#ifndef DISABLE_PREFETCHW
/* PREFETCHW support flag for use in memory and string routines. */
int __x86_prefetchw attribute_hidden;
@@ -770,4 +781,9 @@ intel_bug_no_cache_info:
total shared cache size. */
__x86_shared_non_temporal_threshold
= __x86_shared_cache_size * threads * 3 / 4;
+
+#if HAVE_TUNABLES
+ TUNABLE_SET_VAL_WITH_CALLBACK (non_temporal_threshold, NULL,
+ set_non_temporal_threshold);
+#endif
}
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=bfb716e07b77f0ed8e0c2689d5cd01e2c8251fc5
commit bfb716e07b77f0ed8e0c2689d5cd01e2c8251fc5
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 12 13:38:04 2017 -0700
x86: Update __x86_shared_non_temporal_threshold
__x86_shared_non_temporal_threshold was set to 6 times the per-core
shared cache size, based on the large memcpy micro benchmark in glibc
on an 8-core processor. For a processor with more than 8 cores, the
threshold is too low. Set __x86_shared_non_temporal_threshold to
3/4 of the total shared cache size so that it is unchanged on 8-core
processors. On processors with fewer than 8 cores, the threshold is
lower.
* sysdeps/x86/cacheinfo.c (__x86_shared_non_temporal_threshold):
Set to 3/4 of the total shared cache size.
diff --git a/sysdeps/x86/cacheinfo.c b/sysdeps/x86/cacheinfo.c
index 1ccbe41..3434d97 100644
--- a/sysdeps/x86/cacheinfo.c
+++ b/sysdeps/x86/cacheinfo.c
@@ -766,6 +766,8 @@ intel_bug_no_cache_info:
/* The large memcpy micro benchmark in glibc shows that 6 times of
shared cache size is the approximate value above which non-temporal
- store becomes faster. */
- __x86_shared_non_temporal_threshold = __x86_shared_cache_size * 6;
+ store becomes faster on a 8-core processor. This is the 3/4 of the
+ total shared cache size. */
+ __x86_shared_non_temporal_threshold
+ = __x86_shared_cache_size * threads * 3 / 4;
}
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=3b4afb94b84bba83751f861d5267a74b698caa6c
commit 3b4afb94b84bba83751f861d5267a74b698caa6c
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 19 11:14:19 2017 -0700
Test only a subset of memcpy implementations
diff --git a/benchtests/memcpy_benchmark.cc b/benchtests/memcpy_benchmark.cc
index 42d534e..8c5ccf9 100644
--- a/benchtests/memcpy_benchmark.cc
+++ b/benchtests/memcpy_benchmark.cc
@@ -122,11 +122,34 @@ std::map<std::string, std::function<void(impl_t *, int, int)>> schemes =
{"No Cache", BM_memcpy_nocache},
{"Read Cache", BM_memcpy_readcache}};
+const char *checks[]=
+{
+ "__memcpy_avx_unaligned",
+ "__memcpy_avx_unaligned_erms",
+ "__memcpy_sse2_unaligned",
+ "__memcpy_sse2_unaligned_erms",
+ "__memcpy_sse2_unaligned_2_19",
+ "__memcpy_erms",
+ NULL,
+};
+
+bool
+match (const char *name)
+{
+ int i;
+ for (i = 0; checks[i] != NULL; i++)
+ if (strcmp (checks[i], name) == 0)
+ return true;
+ return false;
+}
+
void test() {
std::cout << "Size(bytes) Alignment(src/dest) BW(Gbytes/sec)" << std::endl;
bool first = true;
FOR_EACH_IMPL (impl, 0)
{
+ if (!match (impl->name))
+ continue;
if (!first)
std::cout << " ";
std::cout << impl->name;
@@ -139,6 +162,8 @@ void test() {
first = true;
FOR_EACH_IMPL (impl, 0)
{
+ if (!match (impl->name))
+ continue;
int time = do_timing(scheme.second, impl, size);
if (first)
{
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=f47a8ff13ba2f4e039f028453573c1489ab33186
commit f47a8ff13ba2f4e039f028453573c1489ab33186
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 10 16:05:09 2017 -0700
Integrate memcpy_benchmark.cc with glibc benchtests
diff --git a/benchtests/memcpy_benchmark.cc b/benchtests/memcpy_benchmark.cc
index 51dff26..42d534e 100644
--- a/benchtests/memcpy_benchmark.cc
+++ b/benchtests/memcpy_benchmark.cc
@@ -24,10 +24,21 @@
#include <map>
#include <string>
+#define TEST_MAIN
+#define TEST_NAME "memcpy"
+#define TIMEOUT (60 * 60)
+#include "bench-string.h"
+
+typedef char *(*proto_t) (char *, const char *, size_t);
+IMPL (memcpy, 1)
+
std::chrono::time_point<std::chrono::high_resolution_clock> start;
std::chrono::time_point<std::chrono::high_resolution_clock> stop;
size_t bytes;
+#define MAX_ALIGN 128
+int src_align, dest_align;
+
void start_timing() { start = std::chrono::high_resolution_clock::now(); }
void stop_timing() { stop = std::chrono::high_resolution_clock::now(); }
@@ -35,31 +46,37 @@ int size_list[] = {1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19,
1 << 20, 1 << 21, 1 << 22, 1 << 23, 1 << 24, 1 << 25, 1 << 26};
size_t buffer_size = 1 << 28;
-void BM_memcpy_readwritecache(int iters, int size) {
+void BM_memcpy_readwritecache(impl_t *impl, int iters, int size) {
unsigned char * buf1 = new unsigned char [size];
unsigned char * buf2 = new unsigned char [size];
+ src_align = ((uintptr_t) buf1) & (MAX_ALIGN - 1);
+ dest_align = ((uintptr_t) buf2) & (MAX_ALIGN - 1);
+
memset (buf1, 0xa5, size); memset (buf2, 0x5a, size);
start_timing();
for (int i = 0; i < iters; ++i) {
- memcpy(buf2, buf1, size);
+ CALL(impl, buf2, buf1, size);
}
stop_timing();
delete[] buf1; delete[] buf2;
}
-void BM_memcpy_nocache(int iters, int size) {
+void BM_memcpy_nocache(impl_t *impl, int iters, int size) {
unsigned char * buf1 = new unsigned char [buffer_size];
unsigned char * buf2 = new unsigned char [buffer_size];
+ src_align = ((uintptr_t) buf1) & (MAX_ALIGN - 1);
+ dest_align = ((uintptr_t) buf2) & (MAX_ALIGN - 1);
+
memset (buf1, 0xa5, buffer_size); memset (buf2, 0x5a, buffer_size);
size_t offset = 0;
start_timing();
for (int i = 0; i < iters; ++i) {
- memcpy(buf2 + offset, buf1 + offset, size);
+ CALL(impl, buf2 + offset, buf1 + offset, size);
offset += std::max(4097, size + 1);
if (offset >= buffer_size - size) offset = 0;
}
@@ -68,16 +85,19 @@ void BM_memcpy_nocache(int iters, int size) {
delete[] buf1; delete[] buf2;
}
-void BM_memcpy_readcache(int iters, int size) {
+void BM_memcpy_readcache(impl_t *impl, int iters, int size) {
unsigned char * buf1 = new unsigned char [size];
unsigned char * buf2 = new unsigned char [buffer_size];
+ src_align = ((uintptr_t) buf1) & (MAX_ALIGN - 1);
+ dest_align = ((uintptr_t) buf2) & (MAX_ALIGN - 1);
+
memset (buf1, 0xa5, size); memset (buf2, 0x5a, buffer_size);
size_t offset = 0;
start_timing();
for (int i = 0; i < iters; ++i) {
- memcpy(buf2 + offset, buf1, size);
+ CALL(impl, buf2 + offset, buf1, size);
offset += std::max(4097, size + 1);
if (offset >= buffer_size - size) offset = 0;
}
@@ -86,30 +106,60 @@ void BM_memcpy_readcache(int iters, int size) {
delete[] buf1; delete[] buf2;
}
-double do_timing(std::function<void(int, int)> &fn, int size) {
+double do_timing(std::function<void(impl_t *, int, int)> &fn, impl_t *impl, int size) {
int iters = 2; double time = 0;
while (time < 500) {
iters *= 3;
- fn(iters, size);
+ fn(impl, iters, size);
time = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
bytes = (2UL * iters * size);
}
return time;
}
-std::map<std::string, std::function<void(int, int)>> schemes =
+std::map<std::string, std::function<void(impl_t *, int, int)>> schemes =
{{"Read and Write Cache", BM_memcpy_readwritecache},
{"No Cache", BM_memcpy_nocache},
{"Read Cache", BM_memcpy_readcache}};
-int main(void) {
- std::cout << " Size (bytes) Time (msec) BW (Gbytes/sec)" << std::endl;
+void test() {
+ std::cout << "Size(bytes) Alignment(src/dest) BW(Gbytes/sec)" << std::endl;
+ bool first = true;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ if (!first)
+ std::cout << " ";
+ std::cout << impl->name;
+ first = false;
+ }
+ std::cout << std::endl;
for (auto scheme : schemes) {
std::cout << scheme.first << std::endl;
for (auto size : size_list) {
- int time = do_timing(scheme.second, size);
- printf("%12d %10d %10.2f\n", size, time, (bytes * 1000L / time) / 1e9);
+ first = true;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ int time = do_timing(scheme.second, impl, size);
+ if (first)
+ {
+ first = false;
+ printf("%d %d/%-d %.2f",
+ size, src_align, dest_align,
+ (bytes * 1000L / time) / 1e9);
+ }
+ else
+ printf(" %.2f",
+ (bytes * 1000L / time) / 1e9);
+ }
+ printf ("\n");
}
std::cout << "----------------\n";
}
-}
\ No newline at end of file
+}
+
+int test_main(void) {
+ test_init ();
+ test ();
+ return 0;
+}
+#include <support/test-driver.c>
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=c91f98af397a5f07f48c464178e140f41e74a8e7
commit c91f98af397a5f07f48c464178e140f41e74a8e7
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 10 15:25:54 2017 -0700
Build memcpy_benchmark in benchtests
Compile memcpy_benchmark.cc with -fpermissive -Wno-error -std=c++11 to
silence GCC diagnostics that would otherwise fail the build.
diff --git a/benchtests/Makefile b/benchtests/Makefile
index 7f5fda5..d3cba28 100644
--- a/benchtests/Makefile
+++ b/benchtests/Makefile
@@ -99,6 +99,12 @@ binaries-bench := $(addprefix $(objpfx)bench-,$(bench))
binaries-benchset := $(addprefix $(objpfx)bench-,$(benchset))
binaries-bench-malloc := $(addprefix $(objpfx)bench-,$(bench-malloc))
+ifneq (,$(CXX))
+binaries-bench += $(objpfx)memcpy_benchmark
+CFLAGS-memcpy_benchmark.cc = -fpermissive -Wno-error -std=c++11
+LDLIBS-memcpy_benchmark = -lstdc++
+endif
+
# The default duration: 10 seconds.
ifndef BENCH_DURATION
BENCH_DURATION := 10
@@ -122,6 +128,9 @@ endif
# for all these modules.
cpp-srcs-left := $(binaries-benchset:=.c) $(binaries-bench:=.c) \
$(binaries-bench-malloc:=.c)
+ifneq (,$(CXX))
+cpp-srcs-left += memcpy_benchmark.cc
+endif
lib := nonlib
include $(patsubst %,$(..)libof-iterator.mk,$(cpp-srcs-left))
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=84158fda529ff223af2b3ba5bb293863034233e8
commit 84158fda529ff223af2b3ba5bb293863034233e8
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 10 14:30:08 2017 -0700
Import memcpy_benchmark.cc
From
https://gist.github.com/ekelsen/b66cc085eb39f0495b57679cdb1874fa
diff --git a/benchtests/memcpy_benchmark.cc b/benchtests/memcpy_benchmark.cc
new file mode 100644
index 0000000..51dff26
--- /dev/null
+++ b/benchtests/memcpy_benchmark.cc
@@ -0,0 +1,115 @@
+/* Copyright 2017 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ======================================================================*/
+
+#if !defined DO_STRING_INLINES
+#undef __USE_STRING_INLINES
+#endif
+
+#include <string.h>
+#include <chrono>
+#include <iostream>
+#include <functional>
+#include <map>
+#include <string>
+
+std::chrono::time_point<std::chrono::high_resolution_clock> start;
+std::chrono::time_point<std::chrono::high_resolution_clock> stop;
+size_t bytes;
+
+void start_timing() { start = std::chrono::high_resolution_clock::now(); }
+void stop_timing() { stop = std::chrono::high_resolution_clock::now(); }
+
+int size_list[] = {1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19,
+ 1 << 20, 1 << 21, 1 << 22, 1 << 23, 1 << 24, 1 << 25, 1 << 26};
+size_t buffer_size = 1 << 28;
+
+void BM_memcpy_readwritecache(int iters, int size) {
+ unsigned char * buf1 = new unsigned char [size];
+ unsigned char * buf2 = new unsigned char [size];
+
+ memset (buf1, 0xa5, size); memset (buf2, 0x5a, size);
+
+ start_timing();
+ for (int i = 0; i < iters; ++i) {
+ memcpy(buf2, buf1, size);
+ }
+ stop_timing();
+
+ delete[] buf1; delete[] buf2;
+}
+
+void BM_memcpy_nocache(int iters, int size) {
+ unsigned char * buf1 = new unsigned char [buffer_size];
+ unsigned char * buf2 = new unsigned char [buffer_size];
+
+ memset (buf1, 0xa5, buffer_size); memset (buf2, 0x5a, buffer_size);
+
+ size_t offset = 0;
+ start_timing();
+ for (int i = 0; i < iters; ++i) {
+ memcpy(buf2 + offset, buf1 + offset, size);
+ offset += std::max(4097, size + 1);
+ if (offset >= buffer_size - size) offset = 0;
+ }
+ stop_timing();
+
+ delete[] buf1; delete[] buf2;
+}
+
+void BM_memcpy_readcache(int iters, int size) {
+ unsigned char * buf1 = new unsigned char [size];
+ unsigned char * buf2 = new unsigned char [buffer_size];
+
+ memset (buf1, 0xa5, size); memset (buf2, 0x5a, buffer_size);
+
+ size_t offset = 0;
+ start_timing();
+ for (int i = 0; i < iters; ++i) {
+ memcpy(buf2 + offset, buf1, size);
+ offset += std::max(4097, size + 1);
+ if (offset >= buffer_size - size) offset = 0;
+ }
+ stop_timing();
+
+ delete[] buf1; delete[] buf2;
+}
+
+double do_timing(std::function<void(int, int)> &fn, int size) {
+ int iters = 2; double time = 0;
+ while (time < 500) {
+ iters *= 3;
+ fn(iters, size);
+ time = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start).count();
+ bytes = (2UL * iters * size);
+ }
+ return time;
+}
+
+std::map<std::string, std::function<void(int, int)>> schemes =
+ {{"Read and Write Cache", BM_memcpy_readwritecache},
+ {"No Cache", BM_memcpy_nocache},
+ {"Read Cache", BM_memcpy_readcache}};
+
+int main(void) {
+ std::cout << " Size (bytes) Time (msec) BW (Gbytes/sec)" << std::endl;
+ for (auto scheme : schemes) {
+ std::cout << scheme.first << std::endl;
+ for (auto size : size_list) {
+ int time = do_timing(scheme.second, size);
+ printf("%12d %10d %10.2f\n", size, time, (bytes * 1000L / time) / 1e9);
+ }
+ std::cout << "----------------\n";
+ }
+}
\ No newline at end of file
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=006859c5be0cfc7797aaa91399ed32efc335df23
commit 006859c5be0cfc7797aaa91399ed32efc335df23
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 10 10:21:08 2017 -0700
x86-64: Restore memcpy-sse2-unaligned.S from glibc 2.19
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2a30538..5ed4e74 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -23,7 +23,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-avx512-unaligned-erms \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
- strlen-sse4
+ strlen-sse4 memcpy-sse2-unaligned
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 1604678..653716e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -353,6 +353,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1,
__memcpy_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memcpy, 1,
+ __memcpy_sse2_unaligned_2_19)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms))
/* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. */
diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
new file mode 100644
index 0000000..1d05c2c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S
@@ -0,0 +1,171 @@
+/* memcpy with unaliged loads
+ Copyright (C) 2013-2014 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#include "asm-syntax.h"
+
+
+ENTRY(__memcpy_sse2_unaligned_2_19)
+ movq %rsi, %rax
+ leaq (%rdx,%rdx), %rcx
+ subq %rdi, %rax
+ subq %rdx, %rax
+ cmpq %rcx, %rax
+ jb L(overlapping)
+ cmpq $16, %rdx
+ jbe L(less_16)
+ movdqu (%rsi), %xmm8
+ cmpq $32, %rdx
+ movdqu %xmm8, (%rdi)
+ movdqu -16(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -16(%rdi,%rdx)
+ ja .L31
+L(return):
+ movq %rdi, %rax
+ ret
+ .p2align 4,,10
+ .p2align 4
+.L31:
+ movdqu 16(%rsi), %xmm8
+ cmpq $64, %rdx
+ movdqu %xmm8, 16(%rdi)
+ movdqu -32(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -32(%rdi,%rdx)
+ jbe L(return)
+ movdqu 32(%rsi), %xmm8
+ cmpq $128, %rdx
+ movdqu %xmm8, 32(%rdi)
+ movdqu -48(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -48(%rdi,%rdx)
+ movdqu 48(%rsi), %xmm8
+ movdqu %xmm8, 48(%rdi)
+ movdqu -64(%rsi,%rdx), %xmm8
+ movdqu %xmm8, -64(%rdi,%rdx)
+ jbe L(return)
+ leaq 64(%rdi), %rcx
+ addq %rdi, %rdx
+ andq $-64, %rdx
+ andq $-64, %rcx
+ movq %rcx, %rax
+ subq %rdi, %rax
+ addq %rax, %rsi
+ cmpq %rdx, %rcx
+ je L(return)
+ movq %rsi, %r10
+ subq %rcx, %r10
+ leaq 16(%r10), %r9
+ leaq 32(%r10), %r8
+ leaq 48(%r10), %rax
+ .p2align 4,,10
+ .p2align 4
+L(loop):
+ movdqu (%rcx,%r10), %xmm8
+ movdqa %xmm8, (%rcx)
+ movdqu (%rcx,%r9), %xmm8
+ movdqa %xmm8, 16(%rcx)
+ movdqu (%rcx,%r8), %xmm8
+ movdqa %xmm8, 32(%rcx)
+ movdqu (%rcx,%rax), %xmm8
+ movdqa %xmm8, 48(%rcx)
+ addq $64, %rcx
+ cmpq %rcx, %rdx
+ jne L(loop)
+ jmp L(return)
+L(overlapping):
+ cmpq %rsi, %rdi
+ jae .L3
+ testq %rdx, %rdx
+ .p2align 4,,5
+ je L(return)
+ movq %rdx, %r9
+ leaq 16(%rsi), %rcx
+ leaq 16(%rdi), %r8
+ shrq $4, %r9
+ movq %r9, %rax
+ salq $4, %rax
+ cmpq %rcx, %rdi
+ setae %cl
+ cmpq %r8, %rsi
+ setae %r8b
+ orl %r8d, %ecx
+ cmpq $15, %rdx
+ seta %r8b
+ testb %r8b, %cl
+ je .L16
+ testq %rax, %rax
+ je .L16
+ xorl %ecx, %ecx
+ xorl %r8d, %r8d
+.L7:
+ movdqu (%rsi,%rcx), %xmm8
+ addq $1, %r8
+ movdqu %xmm8, (%rdi,%rcx)
+ addq $16, %rcx
+ cmpq %r8, %r9
+ ja .L7
+ cmpq %rax, %rdx
+ je L(return)
+.L21:
+ movzbl (%rsi,%rax), %ecx
+ movb %cl, (%rdi,%rax)
+ addq $1, %rax
+ cmpq %rax, %rdx
+ ja .L21
+ jmp L(return)
+L(less_16):
+ testb $24, %dl
+ jne L(between_9_16)
+ testb $4, %dl
+ .p2align 4,,5
+ jne L(between_5_8)
+ testq %rdx, %rdx
+ .p2align 4,,2
+ je L(return)
+ movzbl (%rsi), %eax
+ testb $2, %dl
+ movb %al, (%rdi)
+ je L(return)
+ movzwl -2(%rsi,%rdx), %eax
+ movw %ax, -2(%rdi,%rdx)
+ jmp L(return)
+.L3:
+ leaq -1(%rdx), %rax
+ .p2align 4,,10
+ .p2align 4
+.L11:
+ movzbl (%rsi,%rax), %edx
+ movb %dl, (%rdi,%rax)
+ subq $1, %rax
+ jmp .L11
+L(between_9_16):
+ movq (%rsi), %rax
+ movq %rax, (%rdi)
+ movq -8(%rsi,%rdx), %rax
+ movq %rax, -8(%rdi,%rdx)
+ jmp L(return)
+.L16:
+ xorl %eax, %eax
+ jmp .L21
+L(between_5_8):
+ movl (%rsi), %eax
+ movl %eax, (%rdi)
+ movl -4(%rsi,%rdx), %eax
+ movl %eax, -4(%rdi,%rdx)
+ jmp L(return)
+END(__memcpy_sse2_unaligned_2_19)
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=41ab67bceb481e4721fcbfe4c3787315303fc6a2
commit 41ab67bceb481e4721fcbfe4c3787315303fc6a2
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon May 1 08:32:22 2017 -0700
x86-64: Restore the old SSE4 strlen
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 3736f54..2a30538 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -22,7 +22,8 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memmove-avx-unaligned-erms \
memmove-avx512-unaligned-erms \
memset-avx2-unaligned-erms \
- memset-avx512-unaligned-erms
+ memset-avx512-unaligned-erms \
+ strlen-sse4
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 06d9a9d..1604678 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -410,6 +410,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_sse2_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms))
+ /* Support sysdeps/x86_64/multiarch/strlen.S. */
+ IFUNC_IMPL (i, name, strlen,
+ IFUNC_IMPL_ADD (array, i, strlen, HAS_CPU_FEATURE (SSE4_2),
+ __strlen_sse42)
+ IFUNC_IMPL_ADD (array, i, strlen, 1, strlen))
+
/* Support sysdeps/x86_64/multiarch/strncmp.S. */
IFUNC_IMPL (i, name, strncmp,
IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
diff --git a/sysdeps/x86_64/multiarch/strlen-sse4.S b/sysdeps/x86_64/multiarch/strlen-sse4.S
new file mode 100644
index 0000000..8d685df
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-sse4.S
@@ -0,0 +1,84 @@
+/* strlen with SSE4
+ Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ Contributed by Ulrich Drepper <drepper@redhat.com>.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if defined SHARED && !defined NOT_IN_libc
+
+#include <sysdep.h>
+
+ .section .text.sse4.2,"ax",@progbits
+ENTRY (__strlen_sse42)
+ pxor %xmm1, %xmm1
+ movl %edi, %ecx
+ movq %rdi, %r8
+ andq $~15, %rdi
+ xor %edi, %ecx
+ pcmpeqb (%rdi), %xmm1
+ pmovmskb %xmm1, %edx
+ shrl %cl, %edx
+ shll %cl, %edx
+ andl %edx, %edx
+ jnz L(less16bytes)
+ pxor %xmm1, %xmm1
+
+ .p2align 4
+L(more64bytes_loop):
+ pcmpistri $0x08, 16(%rdi), %xmm1
+ jz L(more32bytes)
+
+ pcmpistri $0x08, 32(%rdi), %xmm1
+ jz L(more48bytes)
+
+ pcmpistri $0x08, 48(%rdi), %xmm1
+ jz L(more64bytes)
+
+ add $64, %rdi
+ pcmpistri $0x08, (%rdi), %xmm1
+ jnz L(more64bytes_loop)
+ leaq (%rdi,%rcx), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more32bytes):
+ leaq 16(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more48bytes):
+ leaq 32(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(more64bytes):
+ leaq 48(%rdi,%rcx, 1), %rax
+ subq %r8, %rax
+ ret
+
+ .p2align 4
+L(less16bytes):
+ subq %r8, %rdi
+ bsfl %edx, %eax
+ addq %rdi, %rax
+ ret
+
+END (__strlen_sse42)
+
+#endif
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=cbb5ed76f4899c3052ec533cbc5879bdc60af8eb
commit cbb5ed76f4899c3052ec533cbc5879bdc60af8eb
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri May 19 10:59:53 2017 -0700
Include bench-timing.h only if _ISOMAC isn't defined
bench-timing.h includes hp-timing.h for high precision timing functions.
However, hp-timing.h is an internal header file, which can't be included
when _ISOMAC is defined, and _ISOMAC is defined for C++ benchmark programs
via libc-symbols.h. Therefore, don't include bench-timing.h if _ISOMAC is
defined. If high precision timing functions are needed in C++ benchmark
programs, we need to revisit this issue.
* benchtests/bench-string.h: Include bench-timing.h only if
_ISOMAC isn't defined.
diff --git a/benchtests/bench-string.h b/benchtests/bench-string.h
index d76724d..a6fe200 100644
--- a/benchtests/bench-string.h
+++ b/benchtests/bench-string.h
@@ -53,7 +53,9 @@ extern impl_t __start_impls[], __stop_impls[];
# include <ifunc-impl-list.h>
# define GL(x) _##x
# define GLRO(x) _##x
-# include "bench-timing.h"
+# ifndef _ISOMAC
+# include "bench-timing.h"
+# endif
# define TEST_FUNCTION test_main
http://sourceware.org/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=2b04ad441adde9f5299ec69df8815542deab1add
commit 2b04ad441adde9f5299ec69df8815542deab1add
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed May 10 16:02:56 2017 -0700
Add __BEGIN_DECLS and __END_DECLS for C++
Add __BEGIN_DECLS and __END_DECLS to support C++. IFUNC_IMPL_ADD and
IFUNC_IMPL are used internally in libc. They shouldn't be used in any
programs.
* include/ifunc-impl-list.h: Add __BEGIN_DECLS and __END_DECLS.
(IFUNC_IMPL_ADD, IFUNC_IMPL): Define only if __cplusplus isn't
defined.
diff --git a/include/ifunc-impl-list.h b/include/ifunc-impl-list.h
index 22ca05f..7d53f11 100644
--- a/include/ifunc-impl-list.h
+++ b/include/ifunc-impl-list.h
@@ -22,6 +22,8 @@
#include <stdbool.h>
#include <stddef.h>
+__BEGIN_DECLS
+
struct libc_ifunc_impl
{
/* The name of function to be tested. */
@@ -32,20 +34,25 @@ struct libc_ifunc_impl
bool usable;
};
+#ifndef __cplusplus
+/* NB: IFUNC_IMPL_ADD and IFUNC_IMPL are used internally in libc. They
+ shouldn't be used in any programs. */
+
/* Add an IFUNC implementation, IMPL, for function FUNC, to ARRAY with
USABLE at index I and advance I by one. */
-#define IFUNC_IMPL_ADD(array, i, func, usable, impl) \
+# define IFUNC_IMPL_ADD(array, i, func, usable, impl) \
extern __typeof (func) impl attribute_hidden; \
(array)[i++] = (struct libc_ifunc_impl) { #impl, (void (*) (void)) impl, (usable) };
/* Return the number of IFUNC implementations, N, for function FUNC if
string NAME matches FUNC. */
-#define IFUNC_IMPL(n, name, func, ...) \
+# define IFUNC_IMPL(n, name, func, ...) \
if (strcmp (name, #func) == 0) \
{ \
__VA_ARGS__; \
return n; \
}
+#endif /* __cplusplus */
/* Fill ARRAY of MAX elements with IFUNC implementations for function
NAME and return the number of valid entries. */
@@ -53,4 +60,6 @@ extern size_t __libc_ifunc_impl_list (const char *name,
struct libc_ifunc_impl *array,
size_t max);
+__END_DECLS
+
#endif /* ifunc-impl-list.h */
-----------------------------------------------------------------------
hooks/post-receive
--
GNU C Library master sources