[Patch, MIPS] Improve memcpy performance on MIPS

Steve Ellcey sellcey@imgtec.com
Wed Oct 21 19:36:00 GMT 2015


This patch improves the MIPS memcpy performance on small copies of aligned
data.  It is identical to a patch I checked in to the GLIBC library.  The
original issue was that someone noticed that the MIPS N32 (and N64) memcpy
was slower than the MIPS O32 memcpy for small (less than 16 byte) aligned
copies.  This is because for sizes of 8 to 15 bytes the O32 memcpy would
do two or three word copies followed by byte copies, but the N32 version
would do all byte copies.  Basically, the N32 version did not 'fall back'
to doing word copies when it could not do double-word copies.
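
As a rough illustration (C sketches only, with made-up names; the real
code is the assembly in the patch below), the two small-copy behaviours
look something like this for aligned data:

#include <stddef.h>
#include <stdint.h>

/* Sketch of the O32 behaviour for a small aligned copy: word copies
   first, then byte copies for whatever is left.  The casts model the
   lw/sw instructions and ignore strict-aliasing concerns.  */
void
o32_small_copy (unsigned char *d, const unsigned char *s, size_t n)
{
  while (n >= 4)
    {
      *(uint32_t *) d = *(const uint32_t *) s;
      d += 4; s += 4; n -= 4;
    }
  while (n--)
    *d++ = *s++;
}

/* Sketch of the old N32/N64 behaviour: anything below 16 bytes (two
   double words) went straight to byte copies.  */
void
n32_small_copy_old (unsigned char *d, const unsigned char *s, size_t n)
{
  while (n--)
    *d++ = *s++;
}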

This patch addresses the problem with two changes.  One is actually for
large memcpys on N32: after doing as many double-word copies as possible,
the N32 version will now try to do at least one word copy before going to
byte copies.
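
In C terms the new tail handling corresponds roughly to the following
sketch (names are mine; it maps onto the L(lastw) label added in the
patch):

#include <stddef.h>
#include <stdint.h>

/* Sketch of the new N32/N64 tail handling after the double-word loop:
   fewer than 8 bytes remain, so at most one word can be copied before
   finishing with byte copies.  */
void
n64_tail_copy (unsigned char *d, const unsigned char *s, size_t n)
{
  if (n >= 4)
    {
      *(uint32_t *) d = *(const uint32_t *) s;
      d += 4; s += 4; n -= 4;
    }
  while (n--)
    *d++ = *s++;
}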

The other change is that after determining that a memcpy is small (less
than 8 bytes for the O32 ABI, less than 16 bytes for the N32 or N64 ABI),
instead of just doing byte copies it will check the size and alignment of
the inputs and, if possible, do word copies (followed by byte copies if
needed).  If it is not possible to do word copies due to size or alignment,
it drops back to byte copies as before.
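
Roughly, the new small-copy path (the L(lasts) label in the patch)
behaves like this C sketch (again, illustrative names only):

#include <stddef.h>
#include <stdint.h>

/* Sketch of the new small-copy path: if at least one whole word remains
   and both pointers are word aligned, copy words first and finish with
   byte copies; otherwise copy byte by byte as before.  */
void
small_copy_new (unsigned char *d, const unsigned char *s, size_t n)
{
  if (n >= 4
      && ((uintptr_t) d & 3) == 0
      && ((uintptr_t) s & 3) == 0)
    {
      do
        {
          *(uint32_t *) d = *(const uint32_t *) s;
          d += 4; s += 4; n -= 4;
        }
      while (n >= 4);
    }
  while (n--)
    *d++ = *s++;
}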

I did some performance testing, including the glibc performance tests,
and for small copies it showed that aligned data was being copied faster
but unaligned data was being copied slower (due to the extra alignment
checks).

See https://sourceware.org/ml/libc-alpha/2015-10/msg00597.html for
the actual numbers.

OK to check in?

Steve Ellcey
sellcey@imgtec.com



2015-10-21  Steve Ellcey  <sellcey@imgtec.com>

	* libc/machine/mips/memcpy.S (memcpy): Add word copies for small
	aligned data.


diff --git a/newlib/libc/machine/mips/memcpy.S b/newlib/libc/machine/mips/memcpy.S
index ed0edb4..3130f6e 100644
--- a/newlib/libc/machine/mips/memcpy.S
+++ b/newlib/libc/machine/mips/memcpy.S
@@ -311,7 +311,7 @@ L(memcpy):
  * size, copy dst pointer to v0 for the return value.
  */
 	slti	t2,a2,(2 * NSIZE)
-	bne	t2,zero,L(lastb)
+	bne	t2,zero,L(lasts)
 #if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH)
 	move	v0,zero
 #else
@@ -562,7 +562,7 @@ L(chkw):
  */
 L(chk1w):
 	andi	a2,t8,(NSIZE-1)	/* a2 is the reminder past one (d)word chunks */
-	beq	a2,t8,L(lastb)
+	beq	a2,t8,L(lastw)
 	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
 	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
 
@@ -574,6 +574,20 @@ L(wordCopy_loop):
 	bne	a0,a3,L(wordCopy_loop)
 	C_ST	REG3,UNIT(-1)(a0)
 
+/* If we have been copying double words, see if we can copy a single word
+   before doing byte copies.  We can have, at most, one word to copy.  */
+
+L(lastw):
+#ifdef USE_DOUBLE
+	andi    t8,a2,3		/* a2 is the remainder past 4 byte chunks.  */
+	beq	t8,a2,L(lastb)
+	lw	REG3,0(a1)
+	sw	REG3,0(a0)
+	PTR_ADDIU a0,a0,4
+	PTR_ADDIU a1,a1,4
+	move	a2,t8
+#endif
+
 /* Copy the last 8 (or 16) bytes */
 L(lastb):
 	blez	a2,L(leave)
@@ -588,6 +602,33 @@ L(leave):
 	j	ra
 	nop
 
+/* We jump here with a memcpy of less than 8 or 16 bytes, depending on
+   whether or not USE_DOUBLE is defined.  Instead of just doing byte
+   copies, check the alignment and size and use lw/sw if possible.
+   Otherwise, do byte copies.  */
+
+L(lasts):
+	andi	t8,a2,3
+	beq	t8,a2,L(lastb)
+
+	andi	t9,a0,3
+	bne	t9,zero,L(lastb)
+	andi	t9,a1,3
+	bne	t9,zero,L(lastb)
+
+	PTR_SUBU a3,a2,t8
+	PTR_ADDU a3,a0,a3
+
+L(wcopy_loop):
+	lw	REG3,0(a1)
+	PTR_ADDIU a0,a0,4
+	PTR_ADDIU a1,a1,4
+	bne	a0,a3,L(wcopy_loop)
+	sw	REG3,-4(a0)
+
+	b	L(lastb)
+	move	a2,t8
+
 #ifndef R6_CODE
 /*
  * UNALIGNED case, got here with a3 = "negu a0"


