This is the mail archive of the
glibc-cvs@sourceware.org
mailing list for the glibc project.
GNU C Library master sources branch, master, updated. glibc-2.11-154-gd6ac932
- From: drepper at sourceware dot org
- To: glibc-cvs at sourceware dot org
- Date: 18 Jan 2010 20:44:03 -0000
- Subject: GNU C Library master sources branch, master, updated. glibc-2.11-154-gd6ac932
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "GNU C Library master sources".
The branch, master has been updated
via d6ac9329b3baf72e1f7a6dfd10ff5236668c2d10 (commit)
via 057edf90e015117bcb7c7cf2e895359e7244dbf8 (commit)
from f87d0dac8b79920b34f0a7878d2be711a7cdf537 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=d6ac9329b3baf72e1f7a6dfd10ff5236668c2d10
commit d6ac9329b3baf72e1f7a6dfd10ff5236668c2d10
Author: Ulrich Drepper <drepper@redhat.com>
Date: Mon Jan 18 12:43:47 2010 -0800
Fix whitespace issues.
diff --git a/sysdeps/powerpc/powerpc32/cell/memcpy.S b/sysdeps/powerpc/powerpc32/cell/memcpy.S
index e6c076c..cc1da99 100644
--- a/sysdeps/powerpc/powerpc32/cell/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/cell/memcpy.S
@@ -43,16 +43,16 @@
.align 7
EALIGN (BP_SYM (memcpy), 5, 0)
- CALL_MCOUNT
+ CALL_MCOUNT
dcbt 0,r4 /* Prefetch ONE SRC cacheline */
cmplwi cr1,r5,16 /* is size < 16 ? */
- mr r6,r3
+ mr r6,r3
blt+ cr1,.Lshortcopy
.Lbigcopy:
neg r8,r3 /* LS 4 bits = # bytes to 16-byte dest bdry */
- clrlwi r8,r8,32-4 /* align to 16byte boundary */
+ clrlwi r8,r8,32-4 /* align to 16byte boundary */
sub r7,r4,r3
cmplwi cr0,r8,0
beq+ .Ldst_aligned
@@ -112,8 +112,8 @@ EALIGN (BP_SYM (memcpy), 5, 0)
.LprefetchSRC:
dcbt r12,r4
- addi r12,r12,128
- bdnz .LprefetchSRC
+ addi r12,r12,128
+ bdnz .LprefetchSRC
.Lnocacheprefetch:
mtctr r7
@@ -122,7 +122,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
beq cr6,.Lcachelinealigned
.Laligntocacheline:
- lfd fp9,0x08(r4)
+ lfd fp9,0x08(r4)
lfdu fp10,0x10(r4)
stfd fp9,0x08(r6)
stfdu fp10,0x10(r6)
@@ -131,10 +131,10 @@ EALIGN (BP_SYM (memcpy), 5, 0)
.Lcachelinealigned: /* copy while cache lines */
- blt- cr1,.Llessthancacheline /* size <128 */
+ blt- cr1,.Llessthancacheline /* size <128 */
.Louterloop:
- cmpwi r11,0
+ cmpwi r11,0
mtctr r11
beq- .Lendloop
@@ -142,7 +142,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
.align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
-.Lloop: /* Copy aligned body */
+.Lloop: /* Copy aligned body */
dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
lfd fp9, 0x08(r4)
dcbz r11,r6
@@ -186,7 +186,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
beq- .Lendloop2
mtctr r10
-.Lloop2: /* Copy aligned body */
+.Lloop2: /* Copy aligned body */
lfd fp9, 0x08(r4)
lfd fp10, 0x10(r4)
lfd fp11, 0x18(r4)
@@ -206,7 +206,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
mtctr r7
.Lcopy_remaining:
- lfd fp9,0x08(r4)
+ lfd fp9,0x08(r4)
lfdu fp10,0x10(r4)
stfd fp9,0x08(r6)
stfdu fp10,0x10(r6)
@@ -214,7 +214,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
.Ldo_lt16: /* less than 16 ? */
cmplwi cr0,r5,0 /* copy remaining bytes (0-15) */
- beqlr+ /* no rest to copy */
+ beqlr+ /* no rest to copy */
addi r4,r4,8
addi r6,r6,8
diff --git a/sysdeps/powerpc/powerpc64/cell/memcpy.S b/sysdeps/powerpc/powerpc64/cell/memcpy.S
index 2a00a6e..c6ee730 100644
--- a/sysdeps/powerpc/powerpc64/cell/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/cell/memcpy.S
@@ -43,16 +43,16 @@
.align 7
EALIGN (BP_SYM (memcpy), 5, 0)
- CALL_MCOUNT 3
+ CALL_MCOUNT 3
dcbt 0,r4 /* Prefetch ONE SRC cacheline */
cmpldi cr1,r5,16 /* is size < 16 ? */
- mr r6,r3
+ mr r6,r3
blt+ cr1,.Lshortcopy
.Lbigcopy:
neg r8,r3 /* LS 4 bits = # bytes to 16-byte dest bdry */
- clrldi r8,r8,64-4 /* align to 16byte boundary */
+ clrldi r8,r8,64-4 /* align to 16byte boundary */
sub r7,r4,r3
cmpldi cr0,r8,0
beq+ .Ldst_aligned
@@ -112,8 +112,8 @@ EALIGN (BP_SYM (memcpy), 5, 0)
.LprefetchSRC:
dcbt r12,r4
- addi r12,r12,128
- bdnz .LprefetchSRC
+ addi r12,r12,128
+ bdnz .LprefetchSRC
.Lnocacheprefetch:
mtctr r7
@@ -122,7 +122,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
beq cr6,.Lcachelinealigned
.Laligntocacheline:
- ld r9,0x08(r4)
+ ld r9,0x08(r4)
ldu r7,0x10(r4)
std r9,0x08(r6)
stdu r7,0x10(r6)
@@ -131,10 +131,10 @@ EALIGN (BP_SYM (memcpy), 5, 0)
.Lcachelinealigned: /* copy while cache lines */
- blt- cr1,.Llessthancacheline /* size <128 */
+ blt- cr1,.Llessthancacheline /* size <128 */
.Louterloop:
- cmpdi r11,0
+ cmpdi r11,0
mtctr r11
beq- .Lendloop
@@ -142,7 +142,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
.align 4
/* Copy whole cachelines, optimized by prefetching SRC cacheline */
-.Lloop: /* Copy aligned body */
+.Lloop: /* Copy aligned body */
dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
ld r9, 0x08(r4)
dcbz r11,r6
@@ -186,7 +186,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
beq- .Lendloop2
mtctr r10
-.Lloop2: /* Copy aligned body */
+.Lloop2: /* Copy aligned body */
ld r9, 0x08(r4)
ld r7, 0x10(r4)
ld r8, 0x18(r4)
@@ -206,7 +206,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
mtctr r7
.Lcopy_remaining:
- ld r8,0x08(r4)
+ ld r8,0x08(r4)
ldu r7,0x10(r4)
std r8,0x08(r6)
stdu r7,0x10(r6)
@@ -214,7 +214,7 @@ EALIGN (BP_SYM (memcpy), 5, 0)
.Ldo_lt16: /* less than 16 ? */
cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */
- beqlr+ /* no rest to copy */
+ beqlr+ /* no rest to copy */
addi r4,r4,8
addi r6,r6,8
http://sources.redhat.com/git/gitweb.cgi?p=glibc.git;a=commitdiff;h=057edf90e015117bcb7c7cf2e895359e7244dbf8
commit 057edf90e015117bcb7c7cf2e895359e7244dbf8
Author: Ulrich Drepper <drepper@redhat.com>
Date: Mon Jan 18 12:40:29 2010 -0800
memcpy for ppc/cell.
diff --git a/ChangeLog b/ChangeLog
index 92ed81c..8f6695b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2010-01-14 Ryan S. Arnold <rsa@us.ibm.com>
+
+ * sysdeps/powerpc/powerpc32/cell/memcpy.S: New file.
+ * sysdeps/powerpc/powerpc64/cell/memcpy.S: New file.
+ * sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies: New file.
+ * sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies: New file.
+
2010-01-18 Andreas Schwab <schwab@redhat.com>
* sysdeps/unix/sysv/linux/sparc/bits/fcntl.h: Remove duplicate
diff --git a/sysdeps/powerpc/powerpc32/cell/memcpy.S b/sysdeps/powerpc/powerpc32/cell/memcpy.S
new file mode 100644
index 0000000..e6c076c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc32/cell/memcpy.S
@@ -0,0 +1,245 @@
+/* Optimized memcpy implementation for CELL BE PowerPC.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+#define PREFETCH_AHEAD 6 /* number of SRC cache lines to prefetch ahead */
+#define ZERO_AHEAD 4 /* number of DST cache lines to zero ahead */
+
+/* memcpy routine optimized for CELL-BE-PPC v2.0
+ *
+ * The CELL PPC core has 1 integer unit and 1 load/store unit
+ * CELL:
+ * 1st level data cache = 32K
+ * 2nd level data cache = 512K
+ * 3rd level data cache = 0K
+ * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
+ * latency to memory is >400 clocks
+ * To improve copy performance we need to prefetch source data
+ * far ahead to hide this latency
+ * For best performance instruction forms ending in "." like "andi."
+ * should be avoided as they are implemented in microcode on CELL.
+ * The below code is loop unrolled for the CELL cache line of 128 bytes
+ */
+
+.align 7
+
+EALIGN (BP_SYM (memcpy), 5, 0)
+ CALL_MCOUNT
+
+ dcbt 0,r4 /* Prefetch ONE SRC cacheline */
+ cmplwi cr1,r5,16 /* is size < 16 ? */
+ mr r6,r3
+ blt+ cr1,.Lshortcopy
+
+.Lbigcopy:
+ neg r8,r3 /* LS 4 bits = # bytes to 16-byte dest bdry */
+ clrlwi r8,r8,32-4 /* align to 16byte boundary */
+ sub r7,r4,r3
+ cmplwi cr0,r8,0
+ beq+ .Ldst_aligned
+
+.Ldst_unaligned:
+ mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
+ subf r5,r8,r5
+
+ bf cr7*4+3,1f
+ lbzx r0,r7,r6 /* copy 1 byte */
+ stb r0,0(r6)
+ addi r6,r6,1
+1: bf cr7*4+2,2f
+ lhzx r0,r7,r6 /* copy 2 byte */
+ sth r0,0(r6)
+ addi r6,r6,2
+2: bf cr7*4+1,4f
+ lwzx r0,r7,r6 /* copy 4 byte */
+ stw r0,0(r6)
+ addi r6,r6,4
+4: bf cr7*4+0,8f
+ lfdx fp9,r7,r6 /* copy 8 byte */
+ stfd fp9,0(r6)
+ addi r6,r6,8
+8:
+ add r4,r7,r6
+
+.Ldst_aligned:
+
+ cmpwi cr5,r5,128-1
+
+ neg r7,r6
+ addi r6,r6,-8 /* prepare for stfdu */
+ addi r4,r4,-8 /* prepare for lfdu */
+
+ clrlwi r7,r7,32-7 /* align to cacheline boundary */
+ ble+ cr5,.Llessthancacheline
+
+ cmplwi cr6,r7,0
+ subf r5,r7,r5
+ srwi r7,r7,4 /* divide size by 16 */
+ srwi r10,r5,7 /* number of cache lines to copy */
+
+ cmplwi r10,0
+ li r11,0 /* number cachelines to copy with prefetch */
+ beq .Lnocacheprefetch
+
+ cmplwi r10,PREFETCH_AHEAD
+ li r12,128+8 /* prefetch distance */
+ ble .Llessthanmaxprefetch
+
+ subi r11,r10,PREFETCH_AHEAD
+ li r10,PREFETCH_AHEAD
+
+.Llessthanmaxprefetch:
+ mtctr r10
+
+.LprefetchSRC:
+ dcbt r12,r4
+ addi r12,r12,128
+ bdnz .LprefetchSRC
+
+.Lnocacheprefetch:
+ mtctr r7
+ cmplwi cr1,r5,128
+ clrlwi r5,r5,32-7
+ beq cr6,.Lcachelinealigned
+
+.Laligntocacheline:
+ lfd fp9,0x08(r4)
+ lfdu fp10,0x10(r4)
+ stfd fp9,0x08(r6)
+ stfdu fp10,0x10(r6)
+ bdnz .Laligntocacheline
+
+
+.Lcachelinealigned: /* copy while cache lines */
+
+ blt- cr1,.Llessthancacheline /* size <128 */
+
+.Louterloop:
+ cmpwi r11,0
+ mtctr r11
+ beq- .Lendloop
+
+ li r11,128*ZERO_AHEAD +8 /* DCBZ dist */
+
+.align 4
+ /* Copy whole cachelines, optimized by prefetching SRC cacheline */
+.Lloop: /* Copy aligned body */
+ dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
+ lfd fp9, 0x08(r4)
+ dcbz r11,r6
+ lfd fp10, 0x10(r4) /* 4 register stride copy is optimal */
+ lfd fp11, 0x18(r4) /* to hide 1st level cache latency. */
+ lfd fp12, 0x20(r4)
+ stfd fp9, 0x08(r6)
+ stfd fp10, 0x10(r6)
+ stfd fp11, 0x18(r6)
+ stfd fp12, 0x20(r6)
+ lfd fp9, 0x28(r4)
+ lfd fp10, 0x30(r4)
+ lfd fp11, 0x38(r4)
+ lfd fp12, 0x40(r4)
+ stfd fp9, 0x28(r6)
+ stfd fp10, 0x30(r6)
+ stfd fp11, 0x38(r6)
+ stfd fp12, 0x40(r6)
+ lfd fp9, 0x48(r4)
+ lfd fp10, 0x50(r4)
+ lfd fp11, 0x58(r4)
+ lfd fp12, 0x60(r4)
+ stfd fp9, 0x48(r6)
+ stfd fp10, 0x50(r6)
+ stfd fp11, 0x58(r6)
+ stfd fp12, 0x60(r6)
+ lfd fp9, 0x68(r4)
+ lfd fp10, 0x70(r4)
+ lfd fp11, 0x78(r4)
+ lfdu fp12, 0x80(r4)
+ stfd fp9, 0x68(r6)
+ stfd fp10, 0x70(r6)
+ stfd fp11, 0x78(r6)
+ stfdu fp12, 0x80(r6)
+
+ bdnz .Lloop
+
+.Lendloop:
+ cmpwi r10,0
+ slwi r10,r10,2 /* adjust from 128 to 32 byte stride */
+ beq- .Lendloop2
+ mtctr r10
+
+.Lloop2: /* Copy aligned body */
+ lfd fp9, 0x08(r4)
+ lfd fp10, 0x10(r4)
+ lfd fp11, 0x18(r4)
+ lfdu fp12, 0x20(r4)
+ stfd fp9, 0x08(r6)
+ stfd fp10, 0x10(r6)
+ stfd fp11, 0x18(r6)
+ stfdu fp12, 0x20(r6)
+
+ bdnz .Lloop2
+.Lendloop2:
+
+.Llessthancacheline: /* less than cache to do ? */
+ cmplwi cr0,r5,16
+ srwi r7,r5,4 /* divide size by 16 */
+ blt- .Ldo_lt16
+ mtctr r7
+
+.Lcopy_remaining:
+ lfd fp9,0x08(r4)
+ lfdu fp10,0x10(r4)
+ stfd fp9,0x08(r6)
+ stfdu fp10,0x10(r6)
+ bdnz .Lcopy_remaining
+
+.Ldo_lt16: /* less than 16 ? */
+ cmplwi cr0,r5,0 /* copy remaining bytes (0-15) */
+ beqlr+ /* no rest to copy */
+ addi r4,r4,8
+ addi r6,r6,8
+
+.Lshortcopy: /* SIMPLE COPY to handle size =< 15 bytes */
+ mtcrf 0x01,r5
+ sub r7,r4,r6
+ bf- cr7*4+0,8f
+ lfdx fp9,r7,r6 /* copy 8 byte */
+ stfd fp9,0(r6)
+ addi r6,r6,8
+8:
+ bf cr7*4+1,4f
+ lwzx r0,r7,r6 /* copy 4 byte */
+ stw r0,0(r6)
+ addi r6,r6,4
+4:
+ bf cr7*4+2,2f
+ lhzx r0,r7,r6 /* copy 2 byte */
+ sth r0,0(r6)
+ addi r6,r6,2
+2:
+ bf cr7*4+3,1f
+ lbzx r0,r7,r6 /* copy 1 byte */
+ stb r0,0(r6)
+1: blr
+
+END (BP_SYM (memcpy))
+libc_hidden_builtin_def (memcpy)
diff --git a/sysdeps/powerpc/powerpc64/cell/memcpy.S b/sysdeps/powerpc/powerpc64/cell/memcpy.S
new file mode 100644
index 0000000..2a00a6e
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/cell/memcpy.S
@@ -0,0 +1,245 @@
+/* Optimized memcpy implementation for CELL BE PowerPC.
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+#include <bp-sym.h>
+#include <bp-asm.h>
+
+#define PREFETCH_AHEAD 6 /* number of SRC cache lines to prefetch ahead */
+#define ZERO_AHEAD 4 /* number of DST cache lines to zero ahead */
+
+/* memcpy routine optimized for CELL-BE-PPC v2.0
+ *
+ * The CELL PPC core has 1 integer unit and 1 load/store unit
+ * CELL:
+ * 1st level data cache = 32K
+ * 2nd level data cache = 512K
+ * 3rd level data cache = 0K
+ * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks,
+ * latency to memory is >400 clocks
+ * To improve copy performance we need to prefetch source data
+ * far ahead to hide this latency
+ * For best performance instruction forms ending in "." like "andi."
+ * should be avoided as they are implemented in microcode on CELL.
+ * The below code is loop unrolled for the CELL cache line of 128 bytes
+ */
+
+.align 7
+
+EALIGN (BP_SYM (memcpy), 5, 0)
+ CALL_MCOUNT 3
+
+ dcbt 0,r4 /* Prefetch ONE SRC cacheline */
+ cmpldi cr1,r5,16 /* is size < 16 ? */
+ mr r6,r3
+ blt+ cr1,.Lshortcopy
+
+.Lbigcopy:
+ neg r8,r3 /* LS 4 bits = # bytes to 16-byte dest bdry */
+ clrldi r8,r8,64-4 /* align to 16byte boundary */
+ sub r7,r4,r3
+ cmpldi cr0,r8,0
+ beq+ .Ldst_aligned
+
+.Ldst_unaligned:
+ mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */
+ subf r5,r8,r5
+
+ bf cr7*4+3,1f
+ lbzx r0,r7,r6 /* copy 1 byte */
+ stb r0,0(r6)
+ addi r6,r6,1
+1: bf cr7*4+2,2f
+ lhzx r0,r7,r6 /* copy 2 byte */
+ sth r0,0(r6)
+ addi r6,r6,2
+2: bf cr7*4+1,4f
+ lwzx r0,r7,r6 /* copy 4 byte */
+ stw r0,0(r6)
+ addi r6,r6,4
+4: bf cr7*4+0,8f
+ ldx r0,r7,r6 /* copy 8 byte */
+ std r0,0(r6)
+ addi r6,r6,8
+8:
+ add r4,r7,r6
+
+.Ldst_aligned:
+
+ cmpdi cr5,r5,128-1
+
+ neg r7,r6
+ addi r6,r6,-8 /* prepare for stdu */
+ addi r4,r4,-8 /* prepare for ldu */
+
+ clrldi r7,r7,64-7 /* align to cacheline boundary */
+ ble+ cr5,.Llessthancacheline
+
+ cmpldi cr6,r7,0
+ subf r5,r7,r5
+ srdi r7,r7,4 /* divide size by 16 */
+ srdi r10,r5,7 /* number of cache lines to copy */
+
+ cmpldi r10,0
+ li r11,0 /* number cachelines to copy with prefetch */
+ beq .Lnocacheprefetch
+
+ cmpldi r10,PREFETCH_AHEAD
+ li r12,128+8 /* prefetch distance */
+ ble .Llessthanmaxprefetch
+
+ subi r11,r10,PREFETCH_AHEAD
+ li r10,PREFETCH_AHEAD
+
+.Llessthanmaxprefetch:
+ mtctr r10
+
+.LprefetchSRC:
+ dcbt r12,r4
+ addi r12,r12,128
+ bdnz .LprefetchSRC
+
+.Lnocacheprefetch:
+ mtctr r7
+ cmpldi cr1,r5,128
+ clrldi r5,r5,64-7
+ beq cr6,.Lcachelinealigned
+
+.Laligntocacheline:
+ ld r9,0x08(r4)
+ ldu r7,0x10(r4)
+ std r9,0x08(r6)
+ stdu r7,0x10(r6)
+ bdnz .Laligntocacheline
+
+
+.Lcachelinealigned: /* copy while cache lines */
+
+ blt- cr1,.Llessthancacheline /* size <128 */
+
+.Louterloop:
+ cmpdi r11,0
+ mtctr r11
+ beq- .Lendloop
+
+ li r11,128*ZERO_AHEAD +8 /* DCBZ dist */
+
+.align 4
+ /* Copy whole cachelines, optimized by prefetching SRC cacheline */
+.Lloop: /* Copy aligned body */
+ dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */
+ ld r9, 0x08(r4)
+ dcbz r11,r6
+ ld r7, 0x10(r4) /* 4 register stride copy is optimal */
+ ld r8, 0x18(r4) /* to hide 1st level cache latency. */
+ ld r0, 0x20(r4)
+ std r9, 0x08(r6)
+ std r7, 0x10(r6)
+ std r8, 0x18(r6)
+ std r0, 0x20(r6)
+ ld r9, 0x28(r4)
+ ld r7, 0x30(r4)
+ ld r8, 0x38(r4)
+ ld r0, 0x40(r4)
+ std r9, 0x28(r6)
+ std r7, 0x30(r6)
+ std r8, 0x38(r6)
+ std r0, 0x40(r6)
+ ld r9, 0x48(r4)
+ ld r7, 0x50(r4)
+ ld r8, 0x58(r4)
+ ld r0, 0x60(r4)
+ std r9, 0x48(r6)
+ std r7, 0x50(r6)
+ std r8, 0x58(r6)
+ std r0, 0x60(r6)
+ ld r9, 0x68(r4)
+ ld r7, 0x70(r4)
+ ld r8, 0x78(r4)
+ ldu r0, 0x80(r4)
+ std r9, 0x68(r6)
+ std r7, 0x70(r6)
+ std r8, 0x78(r6)
+ stdu r0, 0x80(r6)
+
+ bdnz .Lloop
+
+.Lendloop:
+ cmpdi r10,0
+ sldi r10,r10,2 /* adjust from 128 to 32 byte stride */
+ beq- .Lendloop2
+ mtctr r10
+
+.Lloop2: /* Copy aligned body */
+ ld r9, 0x08(r4)
+ ld r7, 0x10(r4)
+ ld r8, 0x18(r4)
+ ldu r0, 0x20(r4)
+ std r9, 0x08(r6)
+ std r7, 0x10(r6)
+ std r8, 0x18(r6)
+ stdu r0, 0x20(r6)
+
+ bdnz .Lloop2
+.Lendloop2:
+
+.Llessthancacheline: /* less than cache to do ? */
+ cmpldi cr0,r5,16
+ srdi r7,r5,4 /* divide size by 16 */
+ blt- .Ldo_lt16
+ mtctr r7
+
+.Lcopy_remaining:
+ ld r8,0x08(r4)
+ ldu r7,0x10(r4)
+ std r8,0x08(r6)
+ stdu r7,0x10(r6)
+ bdnz .Lcopy_remaining
+
+.Ldo_lt16: /* less than 16 ? */
+ cmpldi cr0,r5,0 /* copy remaining bytes (0-15) */
+ beqlr+ /* no rest to copy */
+ addi r4,r4,8
+ addi r6,r6,8
+
+.Lshortcopy: /* SIMPLE COPY to handle size =< 15 bytes */
+ mtcrf 0x01,r5
+ sub r7,r4,r6
+ bf- cr7*4+0,8f
+ ldx r0,r7,r6 /* copy 8 byte */
+ std r0,0(r6)
+ addi r6,r6,8
+8:
+ bf cr7*4+1,4f
+ lwzx r0,r7,r6 /* copy 4 byte */
+ stw r0,0(r6)
+ addi r6,r6,4
+4:
+ bf cr7*4+2,2f
+ lhzx r0,r7,r6 /* copy 2 byte */
+ sth r0,0(r6)
+ addi r6,r6,2
+2:
+ bf cr7*4+3,1f
+ lbzx r0,r7,r6 /* copy 1 byte */
+ stb r0,0(r6)
+1: blr
+
+END_GEN_TB (BP_SYM (memcpy),TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies
new file mode 100644
index 0000000..7c381f0
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies
@@ -0,0 +1,3 @@
+# Make sure this comes before the powerpc/powerpc32/fpu that's
+# listed in unix/sysv/linux/powerpc/powerpc32/fpu/Implies.
+powerpc/powerpc32/cell/fpu
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies b/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies
new file mode 100644
index 0000000..b6720ec
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies
@@ -0,0 +1 @@
+powerpc/powerpc64/cell/fpu
-----------------------------------------------------------------------
Summary of changes:
ChangeLog | 7 +
sysdeps/powerpc/powerpc32/cell/memcpy.S | 245 ++++++++++++++++++++
sysdeps/powerpc/powerpc64/cell/memcpy.S | 245 ++++++++++++++++++++
.../sysv/linux/powerpc/powerpc32/cell/fpu/Implies | 3 +
.../sysv/linux/powerpc/powerpc64/cell/fpu/Implies | 1 +
5 files changed, 501 insertions(+), 0 deletions(-)
create mode 100644 sysdeps/powerpc/powerpc32/cell/memcpy.S
create mode 100644 sysdeps/powerpc/powerpc64/cell/memcpy.S
create mode 100644 sysdeps/unix/sysv/linux/powerpc/powerpc32/cell/fpu/Implies
create mode 100644 sysdeps/unix/sysv/linux/powerpc/powerpc64/cell/fpu/Implies
hooks/post-receive
--
GNU C Library master sources