This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

GNU C Library master sources branch, master, updated. glibc-2.14-247-g5025581


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".

The branch, master has been updated
       via  5025581e1c66a184a587ab1bd99cd168e8fb7770 (commit)
      from  a450513e1d51cb8fe46ba5ebd92399247060b980 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=5025581e1c66a184a587ab1bd99cd168e8fb7770

commit 5025581e1c66a184a587ab1bd99cd168e8fb7770
Author: Will Schmidt <will_schmidt@vnet.ibm.com>
Date:   Wed Sep 7 21:54:41 2011 -0400

    power7 memcpy VSX optimizations

diff --git a/ChangeLog b/ChangeLog
index c90f2c7..429767d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2011-07-28  Will Schmidt  <will_schmidt@vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Optimize the
+	aligned copy for power7 with vector-scalar instructions.
+	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
+
 2011-07-24  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* sysdeps/x86_64/dl-trampoline.S (_dl_runtime_profile): Simplify
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index f0c332f..ec70557 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -1,5 +1,5 @@
 /* Optimized memcpy implementation for PowerPC32/POWER7.
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Luis Machado <luisgpm@br.ibm.com>.
    This file is part of the GNU C Library.
 
@@ -116,24 +116,82 @@ L(copy_GE_32_aligned_cont):
 	stfd    6,0(3)
 	addi    10,3,8
 
+L(aligned_copy):
+	/* Main aligned copy loop. Copies up to 128-bytes at a time. */
 	.align  4
-4:	/* Main aligned copy loop. Copies 32-bytes at a time.  */
-	lfd	6,0(11)
-	lfd     7,8(11)
-	lfd     8,16(11)
-	lfd     0,24(11)
-	addi    11,11,32
+4:
+	/* check for any 32-byte or 64-byte lumps that are outside of a
+	   nice 128-byte range.  R8 contains the number of 32-byte
+	   lumps, so drop this into the CR, and use the SO/EQ bits to help
+	   handle the 32- or 64- byte lumps.  Then handle the rest with an
+	   unrolled 128-bytes-at-a-time copy loop. */
+	mtocrf	1,8
+	li	6,16	# 16() index
+	li	7,32	# 32() index
+	li	8,48	# 48() index
+
+L(aligned_32byte):
+	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
+	bns	cr7,L(aligned_64byte)
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	addi	11,11,32
+	stxvd2x	6,0,10
+	stxvd2x	7,10,6
+	addi	10,10,32
+
+L(aligned_64byte):
+	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
+	bne	cr7,L(aligned_128setup)
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	addi	11,11,64
+	stxvd2x	6,0,10
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	addi	10,10,64
+
+L(aligned_128setup):
+	/* Set up for the 128-byte at a time copy loop.  */
+	srwi	8,31,7
+	cmpwi	8,0	# Any 4x lumps left?
+	beq	3f	# if not, move along.
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	mtctr	8	# otherwise, load the ctr and begin.
+	li	8,48	# 48() index
+	b	L(aligned_128loop)
+
+L(aligned_128head):
+	/* for the 2nd + iteration of this loop. */
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+L(aligned_128loop):
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	stxvd2x	6,0,10
+	addi	11,11,64
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	addi	10,10,64
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	addi	11,11,64
+	stxvd2x	6,0,10
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	addi	10,10,64
+	bdnz	L(aligned_128head)
 
-	stfd    6,0(10)
-	stfd    7,8(10)
-	stfd    8,16(10)
-	stfd    0,24(10)
-	addi    10,10,32
-	bdnz    4b
 3:
-
 	/* Check for tail bytes.  */
-
 	clrrwi  0,31,3
 	mtcrf   0x01,31
 	beq	cr6,0f
diff --git a/sysdeps/powerpc/powerpc64/power7/memcpy.S b/sysdeps/powerpc/powerpc64/power7/memcpy.S
index 2e5beed..8aaef97 100644
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -1,5 +1,5 @@
 /* Optimized memcpy implementation for PowerPC64/POWER7.
-   Copyright (C) 2010 Free Software Foundation, Inc.
+   Copyright (C) 2010, 2011 Free Software Foundation, Inc.
    Contributed by Luis Machado <luisgpm@br.ibm.com>.
    This file is part of the GNU C Library.
 
@@ -115,23 +115,81 @@ L(copy_GE_32_aligned_cont):
 	std     6,0(3)
 	addi    10,3,8
 
-	/* Main aligned copy loop. Copies 32-bytes at a time.  */
+L(aligned_copy):
+	/* Main aligned copy loop. Copies up to 128-bytes at a time. */
 	.align  4
 4:
-	ld	6,0(11)
-	ld      7,8(11)
-	ld      8,16(11)
-	ld      0,24(11)
-	addi    11,11,32
+	/* check for any 32-byte or 64-byte lumps that are outside of a
+	   nice 128-byte range.  R8 contains the number of 32-byte
+	   lumps, so drop this into the CR, and use the SO/EQ bits to help
+	   handle the 32- or 64- byte lumps.  Then handle the rest with an
+	   unrolled 128-bytes-at-a-time copy loop. */
+	mtocrf	1,8
+	li	6,16	# 16() index
+	li	7,32	# 32() index
+	li	8,48	# 48() index
+
+L(aligned_32byte):
+	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
+	bns	cr7,L(aligned_64byte)
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	addi	11,11,32
+	stxvd2x	6,0,10
+	stxvd2x	7,10,6
+	addi	10,10,32
+
+L(aligned_64byte):
+	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
+	bne	cr7,L(aligned_128setup)
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	addi	11,11,64
+	stxvd2x	6,0,10
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	addi	10,10,64
+
+L(aligned_128setup):
+	/* Set up for the 128-byte at a time copy loop.  */
+	srdi	8,31,7
+	cmpdi	8,0	# Any 4x lumps left?
+	beq	3f	# if not, move along.
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	mtctr	8	# otherwise, load the ctr and begin.
+	li	8,48	# 48() index
+	b	L(aligned_128loop)
+
+L(aligned_128head):
+	/* for the 2nd + iteration of this loop. */
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+L(aligned_128loop):
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	stxvd2x	6,0,10
+	addi	11,11,64
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	lxvd2x	6,0,11
+	lxvd2x	7,11,6
+	addi	10,10,64
+	lxvd2x	8,11,7
+	lxvd2x	9,11,8
+	addi	11,11,64
+	stxvd2x	6,0,10
+	stxvd2x	7,10,6
+	stxvd2x	8,10,7
+	stxvd2x	9,10,8
+	addi	10,10,64
+	bdnz	L(aligned_128head)
 
-	std     6,0(10)
-	std     7,8(10)
-	std     8,16(10)
-	std     0,24(10)
-	addi    10,10,32
-	bdnz    4b
 3:
-
 	/* Check for tail bytes.  */
 	rldicr  0,31,0,60
 	mtcrf   0x01,31

-----------------------------------------------------------------------

Summary of changes:
 ChangeLog                                 |    6 ++
 sysdeps/powerpc/powerpc32/power7/memcpy.S |   90 +++++++++++++++++++++++-----
 sysdeps/powerpc/powerpc64/power7/memcpy.S |   88 +++++++++++++++++++++++-----
 3 files changed, 153 insertions(+), 31 deletions(-)


hooks/post-receive
-- 
GNU C Library master sources


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]