This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



[PATCH RFC] Improve 64-bit memset for Core i7 with AVX2 instructions


From: Ma Ling <ling.ml@alibaba-inc.com>

In this patch we use an approach similar to memcpy: we avoid branch
instructions and force the destination to be aligned, using AVX2
instructions.  In the gcc.403 benchmark we find that memset spends
5~10 times more time than memcpy.  The benchmark also indicates this
patch improves performance by 30% to 100% compared with the original
__memset_sse2.
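
For reference, the strategy looks roughly like this in C with AVX2
intrinsics (an illustrative sketch only, not part of the patch; the
function name is made up, and the prefetch, rep stosb and non-temporal
paths for large buffers are omitted): small lengths are filled with
overlapping stores from both ends, so no copy loop is needed, and
lengths of 32 bytes and up use a broadcast ymm value with unaligned
head/tail stores plus aligned 32-byte stores in between.

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

static void *
memset_avx2_sketch (void *dst, int c, size_t n)
{
  unsigned char *d = dst;
  uint64_t v8 = 0x0101010101010101ULL * (unsigned char) c;

  if (n >= 32)
    {
      __m256i v32 = _mm256_set1_epi8 ((char) c);
      /* Unaligned head and tail stores; they may overlap the loop below.  */
      _mm256_storeu_si256 ((__m256i *) d, v32);
      _mm256_storeu_si256 ((__m256i *) (d + n - 32), v32);
      /* Fill the middle with aligned 32-byte stores, starting at the
         next 32-byte boundary (already covered by the head store).  */
      unsigned char *p = (unsigned char *) (((uintptr_t) d | 31) + 1);
      unsigned char *end = d + n - 32;
      while (p < end)
        {
          _mm256_store_si256 ((__m256i *) p, v32);
          p += 32;
        }
    }
  else if (n >= 16)
    {
      __m128i v16 = _mm_set1_epi8 ((char) c);
      _mm_storeu_si128 ((__m128i *) d, v16);
      _mm_storeu_si128 ((__m128i *) (d + n - 16), v16);
    }
  else if (n >= 8)
    {
      memcpy (d, &v8, 8);
      memcpy (d + n - 8, &v8, 8);
    }
  else if (n >= 4)
    {
      memcpy (d, &v8, 4);
      memcpy (d + n - 4, &v8, 4);
    }
  else if (n >= 1)
    {
      /* 1 to 3 bytes.  */
      d[0] = d[n / 2] = d[n - 1] = (unsigned char) c;
    }
  return dst;
}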

Thanks
Ling
---
 sysdeps/x86_64/multiarch/Makefile          |   2 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   2 +
 sysdeps/x86_64/multiarch/memset-avx2.S     | 258 +++++++++++++++++++++++++++++
 3 files changed, 261 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/x86_64/multiarch/memset-avx2.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 02c0a2a..5435682 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -9,7 +9,7 @@ ifeq ($(subdir),string)
 sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
 		   strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
 		   memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back memmove-ssse3-back \
-		   memcpy-avx2-unaligned mempcpy-avx2-unaligned memmove-avx2-unaligned \
+		   memcpy-avx2-unaligned mempcpy-avx2-unaligned memmove-avx2-unaligned memset-avx2 \
 		   strcasestr-nonascii strcasecmp_l-ssse3 \
 		   strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 5fb5663..a4938b4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -67,12 +67,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/memset_chk.S.  */
   IFUNC_IMPL (i, name, __memset_chk,
+	      IFUNC_IMPL_ADD (array, i, __memset_chk, HAS_AVX2, __memset_chk_avx2)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1, __memset_chk_sse2)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk, 1,
 			      __memset_chk_x86_64))
 
   /* Support sysdeps/x86_64/multiarch/memset.S.  */
   IFUNC_IMPL (i, name, memset,
+	      IFUNC_IMPL_ADD (array, i, memset, HAS_AVX2, __memset_avx2)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_sse2)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_x86_64))
 
diff --git a/sysdeps/x86_64/multiarch/memset-avx2.S b/sysdeps/x86_64/multiarch/memset-avx2.S
new file mode 100644
index 0000000..d8ff506
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-avx2.S
@@ -0,0 +1,258 @@
+/* memset with AVX2
+   Copyright (C) 2010 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc
+
+#include "asm-syntax.h"
+
+#ifndef MEMSET
+# define MEMSET	__memset_avx2
+# define MEMSET_CHK	__memset_chk_avx2
+#endif
+
+#ifndef L
+# define L(label)	.L##label
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n)	.p2align n
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc	.cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc	.cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name)	\
+	.type name,  @function;	\
+	.globl name;	\
+	ALIGN(4);	\
+name:	\
+	cfi_startproc
+#endif
+
+#ifndef END
+# define END(name)	\
+	cfi_endproc;	\
+	.size name, .-name
+#endif
+
+	.section .text.avx2,"ax",@progbits
+
+#if defined PIC
+ENTRY (MEMSET_CHK)
+	cmpq	%rdx, %rcx
+	jb	HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMSET_CHK)
+#endif
+
+ENTRY (MEMSET)
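+	/* rdi = destination, sil = fill byte, rdx = length; the
+	   destination pointer is returned in rax.  */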
+	vzeroupper
+	mov	$0x01010101, %ecx
+	mov	%rdi, %rax
+	cmp	$256, %rdx
+	jae	L(256bytesormore)
+	cmp	$1, %rdx
+	ja	L(more_1_bytes)
+	jb	L(less_1_bytes)
+	mov	%sil, (%rdi)
+L(less_1_bytes):
+	ret
+	ALIGN(4)
+L(more_1_bytes):
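+	/* Replicate the byte into cx; lengths of 2 to 4 bytes use two
+	   overlapping word stores.  */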
+	movzbl	%sil, %ecx
+	movb	%cl, %ch
+	cmp	$4, %edx
+	ja	L(more_3bytes)
+	mov	%cx, (%rdi)
+	mov	%cx, -0x02(%rdi, %rdx)
+	ret
+	ALIGN(4)
+L(more_3bytes):
+	mov	%ecx, %r9d
+	shl	$16, %ecx
+	or	%r9d, %ecx
+	cmp	$8, %edx
+	ja	L(more_8bytes)
+	mov %ecx, (%rdi)
+	mov %ecx, -0x04(%rdi, %rdx)
+	ret
+	ALIGN(4)
+L(more_8bytes):
+	mov	%ecx, %r9d
+	shl	$32, %r9
+	or	%r9, %rcx
+	cmp	$16, %edx
+	ja	L(more_16bytes)
+	mov %rcx, (%rdi)
+	mov %rcx, -0x08(%rdi, %rdx)
+	ret
+	ALIGN(4)
+L(more_16bytes):
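+	/* 17 to 255 bytes: broadcast the pattern into xmm0 and use
+	   overlapping 16-byte stores from both ends of the buffer.  */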
+	lea	(%rdi, %rdx), %r8
+#ifdef USE_AS_BZERO
+	pxor	%xmm0, %xmm0
+#else
+	vmovd	%ecx, %xmm0
+	vpshufd	$0, %xmm0, %xmm0
+#endif
+	cmp	$32, %edx
+	ja	L(more_32bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(more_32bytes):
+	cmp	$64, %edx
+	ja	L(more_64bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(more_64bytes):
+	cmp	$128, %edx
+	ja	L(more_128bytes)
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, 0x20(%rdi)
+	vmovups %xmm0, 0x30(%rdi)
+	vmovups %xmm0, -0x40(%r8)
+	vmovups %xmm0, -0x30(%r8)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+	ALIGN(4)
+L(more_128bytes):
+	vmovups %xmm0, (%rdi)
+	vmovups %xmm0, 0x10(%rdi)
+	vmovups %xmm0, 0x20(%rdi)
+	vmovups %xmm0, 0x30(%rdi)
+	vmovups %xmm0, 0x40(%rdi)
+	vmovups %xmm0, 0x50(%rdi)
+	vmovups %xmm0, 0x60(%rdi)
+	vmovups %xmm0, 0x70(%rdi)
+	vmovups %xmm0, -0x80(%r8)
+	vmovups %xmm0, -0x70(%r8)
+	vmovups %xmm0, -0x60(%r8)
+	vmovups %xmm0, -0x50(%r8)
+	vmovups %xmm0, -0x40(%r8)
+	vmovups %xmm0, -0x30(%r8)
+	vmovups %xmm0, -0x20(%r8)
+	vmovups %xmm0, -0x10(%r8)
+	ret
+
+	ALIGN(4)
+L(256bytesormore):
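+	/* At least 256 bytes: broadcast the pattern into ymm0, store the
+	   first 32 bytes unaligned, then align rdi to 32 bytes.  */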
+	lea	(%rdi, %rdx), %r8
+
+#ifndef USE_AS_BZERO
+	and	$0xff, %esi
+	imul %esi, %ecx
+	vmovd %ecx, %xmm0
+	vpshufd	$0, %xmm0, %xmm0
+	vzeroupper
+	vinserti128 $1, %xmm0, %ymm0, %ymm0
+#else
+	vpxor %xmm0, %xmm0, %xmm0
+#endif
+	vmovups	%ymm0, (%rdi)
+	mov	%rdi, %r9
+	and	$-0x20, %rdi
+	add	$32, %rdi
+	sub	%rdi, %r9
+	add	%r9, %rdx
+	cmp	$4096, %rdx
+	ja	L(gobble_data)
+
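+	/* Up to 4096 bytes: fill 128 bytes per iteration with aligned
+	   32-byte stores, prefetching ahead of the stores.  */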
+	sub	$0x80, %rdx
+L(gobble_128_loop):
+	prefetcht0 0x1c0(%rdi)
+	vmovaps	%ymm0, (%rdi)
+	prefetcht0 0x280(%rdi)
+	vmovaps	%ymm0, 0x20(%rdi)
+	vmovaps	%ymm0, 0x40(%rdi)
+	vmovaps	%ymm0, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_128_loop)
+L(gobble_exit):
+	vmovups	%ymm0, -0x80(%r8)
+	vmovups	%ymm0, -0x60(%r8)
+	vmovups	%ymm0, -0x40(%r8)
+	vmovups	%ymm0, -0x20(%r8)
+	ret
+
+	ALIGN(4)
+L(gobble_data):
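+	/* More than 4096 bytes: use rep stosb unless the length exceeds
+	   several times the shared cache size.  */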
+#ifdef SHARED_CACHE_SIZE_HALF
+	mov	$SHARED_CACHE_SIZE_HALF, %r9
+#else
+	mov	__x86_64_shared_cache_size_half(%rip), %r9
+#endif
+	shl	$4, %r9
+	cmp	%r9, %rdx
+	ja	L(gobble_big_data)
+	mov	%rax, %r9
+	mov	%ecx, %eax
+	mov	%rdx, %rcx
+	rep	stosb
+	mov	%r9, %rax
+	ret
+
+	ALIGN(4)
+L(gobble_big_data):
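+	/* Very large buffers: bypass the cache with non-temporal stores.  */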
+	sub	$0x80, %rdx
+L(gobble_big_data_loop):
+	vmovntdq	%ymm0, (%rdi)
+	vmovntdq	%ymm0, 0x20(%rdi)
+	vmovntdq	%ymm0, 0x40(%rdi)
+	vmovntdq	%ymm0, 0x60(%rdi)
+	lea	0x80(%rdi), %rdi
+	sub	$0x80, %rdx
+	jae	L(gobble_big_data_loop)
+	vmovups	%ymm0, -0x80(%r8)
+	vmovups	%ymm0, -0x60(%r8)
+	vmovups	%ymm0, -0x40(%r8)
+	vmovups	%ymm0, -0x20(%r8)
+	sfence
+	ret
+
+END (MEMSET)
+#endif
-- 
1.8.1.4

