This is the mail archive of the glibc-cvs@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Community source repository for glibc add-on ports branch, rth/testing, updated. glibc-2.12-2-g10ae74b


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "Community source repository for glibc add-on ports".

The branch, rth/testing has been updated
       via  10ae74bdc2c8e9f37e14e5f9de1c47cfdc7def00 (commit)
      from  7ffd2bd725c3e4d77e6bfe36b76500d20427929d (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
http://sources.redhat.com/git/gitweb.cgi?p=glibc-ports.git;a=commitdiff;h=10ae74bdc2c8e9f37e14e5f9de1c47cfdc7def00

commit 10ae74bdc2c8e9f37e14e5f9de1c47cfdc7def00
Author: Richard Henderson <rth@twiddle.net>
Date:   Wed Sep 15 10:41:43 2010 -0700

    BZ 12019: rewrite alpha memchr.
    
    The new implementation should not read too much data.

diff --git a/sysdeps/alpha/alphaev6/memchr.S b/sysdeps/alpha/alphaev6/memchr.S
deleted file mode 100644
index fe77cd8..0000000
--- a/sysdeps/alpha/alphaev6/memchr.S
+++ /dev/null
@@ -1,193 +0,0 @@
-/* Copyright (C) 2000, 2003 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by David Mosberger (davidm@cs.arizona.edu).
-   EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-#include <sysdep.h>
-
-	.arch ev6
-        .set noreorder
-        .set noat
-
-ENTRY(__memchr)
-#ifdef PROF
-	ldgp	gp, 0(pv)
-	lda	AT, _mcount
-	jsr	AT, (AT), _mcount
-	.prologue 1
-#else
-	.prologue 0
-#endif
-
-	# Hack -- if someone passes in (size_t)-1, hoping to just
-	# search til the end of the address space, we will overflow
-	# below when we find the address of the last byte.  Given
-	# that we will never have a 56-bit address space, cropping
-	# the length is the easiest way to avoid trouble.
-	zap	$18, 0x80, $5	# U : Bound length
-	beq	$18, $not_found	# U :
-        ldq_u   $1, 0($16)	# L : load first quadword Latency=3
-	and	$17, 0xff, $17	# E : L L U U : 00000000000000ch
-
-	insbl	$17, 1, $2	# U : 000000000000ch00
-	cmpult	$18, 9, $4	# E : small (< 1 quad) string?
-	or	$2, $17, $17	# E : 000000000000chch
-        lda     $3, -1($31)	# E : U L L U
-
-	sll	$17, 16, $2	# U : 00000000chch0000
-	addq	$16, $5, $5	# E : Max search address
-	or	$2, $17, $17	# E : 00000000chchchch
-	sll	$17, 32, $2	# U : U L L U : chchchch00000000
-
-	or	$2, $17, $17	# E : chchchchchchchch
-	extql	$1, $16, $7	# U : $7 is upper bits
-	beq	$4, $first_quad	# U :
-	ldq_u	$6, -1($5)	# L : L U U L : eight or less bytes to search Latency=3
-
-	extqh	$6, $16, $6	# U : 2 cycle stall for $6
-	mov	$16, $0		# E :
-	nop			# E :
-	or	$7, $6, $1	# E : L U L U $1 = quadword starting at $16
-
-	# Deal with the case where at most 8 bytes remain to be searched
-	# in $1.  E.g.:
-	#	$18 = 6
-	#	$1 = ????c6c5c4c3c2c1
-$last_quad:
-	negq	$18, $6		# E :
-        xor	$17, $1, $1	# E :
-	srl	$3, $6, $6	# U : $6 = mask of $18 bits set
-        cmpbge  $31, $1, $2	# E : L U L U
-
-	nop
-	nop
-	and	$2, $6, $2	# E :
-        beq     $2, $not_found	# U : U L U L
-
-$found_it:
-#if defined(__alpha_fix__) && defined(__alpha_cix__)
-	/*
-	 * Since we are guaranteed to have set one of the bits, we don't
-	 * have to worry about coming back with a 0x40 out of cttz...
-	 */
-	cttz	$2, $3		# U0 :
-	addq	$0, $3, $0	# E : All done
-	nop			# E :
-	ret			# L0 : L U L U
-#else
-	/*
-	 * Slow and clunky.  It can probably be improved.
-	 * An exercise left for others.
-	 */
-        negq    $2, $3		# E :
-        and     $2, $3, $2	# E :
-        and     $2, 0x0f, $1	# E :
-        addq    $0, 4, $3	# E :
-
-        cmoveq  $1, $3, $0	# E : Latency 2, extra map cycle
-	nop			# E : keep with cmov
-        and     $2, 0x33, $1	# E :
-        addq    $0, 2, $3	# E : U L U L : 2 cycle stall on $0
-
-        cmoveq  $1, $3, $0	# E : Latency 2, extra map cycle
-	nop			# E : keep with cmov
-        and     $2, 0x55, $1	# E :
-        addq    $0, 1, $3	# E : U L U L : 2 cycle stall on $0
-
-        cmoveq  $1, $3, $0	# E : Latency 2, extra map cycle
-	nop
-	nop
-	ret			# L0 : L U L U
-#endif
-
-	# Deal with the case where $18 > 8 bytes remain to be
-	# searched.  $16 may not be aligned.
-	.align 4
-$first_quad:
-	andnot	$16, 0x7, $0	# E :
-        insqh   $3, $16, $2	# U : $2 = 0000ffffffffffff ($16<0:2> ff)
-        xor	$1, $17, $1	# E :
-	or	$1, $2, $1	# E : U L U L $1 = ====ffffffffffff
-
-        cmpbge  $31, $1, $2	# E :
-        bne     $2, $found_it	# U :
-	# At least one byte left to process.
-	ldq	$31, 8($0)	# L :
-	subq	$5, 1, $18	# E : U L U L
-
-	addq	$0, 8, $0	# E :
-	# Make $18 point to last quad to be accessed (the
-	# last quad may or may not be partial).
-	andnot	$18, 0x7, $18	# E :
-	cmpult	$0, $18, $2	# E :
-	beq	$2, $final	# U : U L U L
-
-	# At least two quads remain to be accessed.
-
-	subq	$18, $0, $4	# E : $4 <- nr quads to be processed
-	and	$4, 8, $4	# E : odd number of quads?
-	bne	$4, $odd_quad_count # U :
-	# At least three quads remain to be accessed
-	nop			# E : L U L U : move prefetched value to correct reg
-
-	.align	4
-$unrolled_loop:
-	ldq	$1, 0($0)	# L : load quad
-	xor	$17, $1, $2	# E :
-	ldq	$31, 8($0)	# L : prefetch next quad
-	cmpbge	$31, $2, $2	# E : U L U L
-
-	bne	$2, $found_it	# U :
-	addq	$0, 8, $0	# E :
-	nop			# E :
-	nop			# E :
-
-$odd_quad_count:
-	ldq	$1, 0($0)	# L : load quad
-	xor	$17, $1, $2	# E :
-	ldq	$31, 8($0)	# L : prefetch $4
-	cmpbge	$31, $2, $2	# E :
-
-	addq	$0, 8, $6	# E :
-	bne	$2, $found_it	# U :
-	cmpult	$6, $18, $6	# E :
-	addq	$0, 8, $0	# E :
-
-	bne	$6, $unrolled_loop # U :
-	nop			# E :
-	nop			# E :
-	nop			# E :
-
-$final:	ldq	$1, 0($0)	# L : load last quad
-	subq	$5, $0, $18	# E : $18 <- number of bytes left to do
-	nop			# E :
-	bne	$18, $last_quad	# U :
-
-$not_found:
-	mov	$31, $0		# E :
-	nop			# E :
-	nop			# E :
-	ret			# L0 :
-
-	END(__memchr)
-
-weak_alias (__memchr, memchr)
-#if !__BOUNDED_POINTERS__
-weak_alias (__memchr, __ubp_memchr)
-#endif
-libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/alpha/memchr.S b/sysdeps/alpha/memchr.S
deleted file mode 100644
index 87c7fb1..0000000
--- a/sysdeps/alpha/memchr.S
+++ /dev/null
@@ -1,176 +0,0 @@
-/* Copyright (C) 1996, 2000, 2003 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by David Mosberger (davidm@cs.arizona.edu).
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, write to the Free
-   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-   02111-1307 USA.  */
-
-/* Finds characters in a memory area.  Optimized for the Alpha:
-
-      - memory accessed as aligned quadwords only
-      - uses cmpbge to compare 8 bytes in parallel
-      - does binary search to find 0 byte in last
-        quadword (HAKMEM needed 12 instructions to
-        do this instead of the 9 instructions that
-        binary search needs).
-
-For correctness consider that:
-
-      - only minimum number of quadwords may be accessed
-      - the third argument is an unsigned long
-*/
-
-#include <sysdep.h>
-
-        .set noreorder
-        .set noat
-
-ENTRY(__memchr)
-#ifdef PROF
-	ldgp	gp, 0(pv)
-	lda	AT, _mcount
-	jsr	AT, (AT), _mcount
-	.prologue 1
-#else
-	.prologue 0
-#endif
-
-	# Hack -- if someone passes in (size_t)-1, hoping to just
-	# search til the end of the address space, we will overflow
-	# below when we find the address of the last byte.  Given
-	# that we will never have a 56-bit address space, cropping
-	# the length is the easiest way to avoid trouble.
-	zap	a2, 0x80, t4	#-e0	:
-
-	beq	a2, $not_found	# .. e1 :
-        ldq_u   t0, 0(a0)       # e1	: load first quadword
-	insbl	a1, 1, t1	# .. e0 : t1 = 000000000000ch00
-	and	a1, 0xff, a1	#-e0    : a1 = 00000000000000ch
-	cmpult	a2, 9, t3	# .. e1 :
-	or	t1, a1, a1	# e0    : a1 = 000000000000chch
-        lda     t2, -1(zero)	# .. e1 :
-	sll	a1, 16, t1	#-e0    : t1 = 00000000chch0000
-	addq	a0, t4, t4	# .. e1 :
-	or	t1, a1, a1	# e1    : a1 = 00000000chchchch
-	unop			#	:
-	sll	a1, 32, t1	#-e0    : t1 = chchchch00000000
-	or	t1, a1, a1	# e1	: a1 = chchchchchchchch
-	extql	t0, a0, t6	# e0    :
-	beq	t3, $first_quad	# .. e1 :
-
-	ldq_u	t5, -1(t4)	#-e1	: eight or less bytes to search
-	extqh	t5, a0, t5	# .. e0 :
-	mov	a0, v0		# e0	:
-	or	t6, t5, t0	# .. e1 : t0 = quadword starting at a0
-
-	# Deal with the case where at most 8 bytes remain to be searched
-	# in t0.  E.g.:
-	#	a2 = 6
-	#	t0 = ????c6c5c4c3c2c1
-$last_quad:
-	negq	a2, t5		#-e0	:
-        xor	a1, t0, t0	# .. e1 :
-	srl	t2, t5, t5	# e0    : t5 = mask of a2 bits set
-        cmpbge  zero, t0, t1	# .. e1 :
-	and	t1, t5, t1	#-e0	:
-        beq     t1, $not_found	# .. e1 :
-
-$found_it:
-	# Now, determine which byte matched:
-        negq    t1, t2		# e0	:
-        and     t1, t2, t1	# e1	:
-
-        and     t1, 0x0f, t0	#-e0	:
-        addq    v0, 4, t2	# .. e1 :
-        cmoveq  t0, t2, v0	# e0	:
-
-        addq    v0, 2, t2	# .. e1 :
-        and     t1, 0x33, t0	#-e0	:
-        cmoveq  t0, t2, v0	# .. e1 :
-
-        and     t1, 0x55, t0	# e0	:
-        addq    v0, 1, t2	# .. e1 :
-        cmoveq  t0, t2, v0	#-e0	:
-
-$done:	ret			# .. e1 :
-
-	# Deal with the case where a2 > 8 bytes remain to be
-	# searched.  a0 may not be aligned.
-	.align 4
-$first_quad:
-	andnot	a0, 0x7, v0	#-e1	:
-        insqh   t2, a0, t1	# .. e0	: t1 = 0000ffffffffffff (a0<0:2> ff)
-        xor	t0, a1, t0	# e0	:
-	or	t0, t1, t0	# e1	: t0 = ====ffffffffffff
-        cmpbge  zero, t0, t1	#-e0	:
-        bne     t1, $found_it	# .. e1 :
-
-	# At least one byte left to process.
-
-	ldq	zero, 8(v0)	# e0	: prefetch next quad
-	subq	t4, 1, a2	# .. e1 :
-	addq	v0, 8, v0	#-e0	:
-
-	# Make a2 point to last quad to be accessed (the
-	# last quad may or may not be partial).
-
-	andnot	a2, 0x7, a2	# .. e1 :
-	cmpult	v0, a2, t1	# e0	:
-	beq	t1, $final	# .. e1 :
-
-	# At least two quads remain to be accessed.
-
-	subq	a2, v0, t3	#-e0	: t3 <- nr quads to be processed
-	and	t3, 8, t3	# e1	: odd number of quads?
-	bne	t3, $odd_quad_count # e1 :
-
-	# At least three quads remain to be accessed
-
-	.align	4
-$unrolled_loop:
-	ldq	t0, 0(v0)	# e0	: load quad
-	xor	a1, t0, t1	# .. e1 :
-	ldq	zero, 8(v0)	# e0	: prefetch next quad
-	cmpbge	zero, t1, t1	# .. e1:
-	bne	t1, $found_it	# e0    :
-
-	addq	v0, 8, v0	#    e1	:
-$odd_quad_count:
-	ldq	t0, 0(v0)	# e0	: load quad
-	xor	a1, t0, t1	# .. e1 :
-	ldq	zero, 8(v0)	# e0	: prefetch next quad
-	cmpbge	zero, t1, t1	# .. e1 :
-	addq	v0, 8, t5	#-e0	:
-	bne	t1, $found_it	# .. e1	:
-
-	cmpult	t5, a2, t5	# e0	:
-	addq	v0, 8, v0	# .. e1 :
-	bne	t5, $unrolled_loop #-e1 :
-
-$final:	ldq	t0, 0(v0)	# e0	: load last quad
-	subq	t4, v0, a2	# .. e1	: a2 <- number of bytes left to do
-	bne	a2, $last_quad	# e1	:
-
-$not_found:
-	mov	zero, v0	#-e0	:
-	ret			# .. e1 :
-
-        END(__memchr)
-
-weak_alias (__memchr, memchr)
-#if !__BOUNDED_POINTERS__
-weak_alias (__memchr, __ubp_memchr)
-#endif
-libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/alpha/memchr.c b/sysdeps/alpha/memchr.c
new file mode 100644
index 0000000..c52841b
--- /dev/null
+++ b/sysdeps/alpha/memchr.c
@@ -0,0 +1,175 @@
+/* Copyright (C) 2010 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#include <string.h>
+
+typedef unsigned long word;
+
+static inline word
+ldq_u(const void *s)
+{
+  return *(const word *)((word)s & -8);
+}
+
+#define unlikely(X)	__builtin_expect ((X), 0)
+#define prefetch(X)	__builtin_prefetch ((void *)(X), 0)
+
+#define cmpbeq0(X)	__builtin_alpha_cmpbge(0, (X))
+#define find(X, Y)	cmpbeq0 ((X) ^ (Y))
+
+/* Search no more than N bytes of S for C.  */
+
+void *
+__memchr (const void *s, int xc, size_t n)
+{
+  const word *s_align;
+  word t, current, found, mask, offset;
+
+  if (unlikely (n == 0))
+    return 0;
+
+  current = ldq_u (s);
+
+  /* Replicate low byte of XC into all bytes of C.  */
+  t = xc & 0xff;			/* 0000000c */
+  t = (t << 8) | t;			/* 000000cc */
+  t = (t << 16) | t;			/* 0000cccc */
+  const word c = (t << 32) | t;		/* cccccccc */
+
+  /* Align the source, and decrement the count by the number
+     of bytes searched in the first word.  */
+  s_align = (const word *)(s & -8);
+  n += (s & 7);
+
+  /* Deal with misalignment in the first word for the comparison.  */
+  mask = (1ul << (s & 7)) - 1;
+
+  /* If the entire string fits within one word, we may need masking
+     at both the front and the back of the string.  */
+  if (unlikely (n <= 8))
+    {
+      mask |= -1ul << n;
+      goto last_quad;
+    }
+
+  found = find (current, c) & ~mask;
+  if (unlikely (found))
+    goto found_it;
+
+  s_align++;
+  n -= 8;
+
+  /* If the block is sufficiently large, align to cacheline and prefetch.  */
+  if (unlikely (n >= 256))
+    {
+      /* Prefetch 3 cache lines beyond the one we're working on.  */
+      prefetch (s_align + 8);
+      prefetch (s_align + 16);
+      prefetch (s_align + 24);
+
+      while ((word)s_align & 63)
+	{
+	  current = *s_align;
+	  found = find (current, c);
+	  if (found)
+	    goto found_it;
+	  s_align++;
+	  n -= 8;
+	}
+
+	/* Within each cacheline, advance the load for the next word
+	   before the test for the previous word is complete.  This
+	   allows us to hide the 3 cycle L1 cache load latency.  We
+	   only perform this advance load within a cacheline to prevent
+	   reading across page boundary.  */
+#define CACHELINE_LOOP				\
+	do {					\
+	  word i, next = s_align[0];		\
+	  for (i = 0; i < 7; ++i)		\
+	    {					\
+	      current = next;			\
+	      next = s_align[1];		\
+	      found = find (current, c);	\
+	      if (unlikely (found))		\
+		goto found_it;			\
+	      s_align++;			\
+	    }					\
+	  current = next;			\
+	  found = find (current, c);		\
+	  if (unlikely (found))			\
+	    goto found_it;			\
+	  s_align++;				\
+	  n -= 64;				\
+	} while (0)
+      
+      /* While there's still lots more data to potentially be read,
+	 continue issuing prefetches for the 4th cacheline out.  */
+      while (n >= 256)
+	{
+	  prefetch (s_align + 24);
+	  CACHELINE_LOOP;
+	}
+
+      /* Up to 3 cache lines remaining.  Continue issuing advanced
+	 loads, but stop prefetching.  */
+      while (n >= 64)
+	CACHELINE_LOOP;
+
+      /* We may have exhausted the buffer.  */
+      if (n == 0)
+	return NULL;
+    }
+
+  /* Quadword aligned loop.  */
+  current = *s_align;
+  while (n > 8)
+    {
+      found = find (current, c);
+      if (unlikely (found))
+	goto found_it;
+      current = *++s_align;
+      n -= 8;
+    }
+
+  /* The last word may need masking at the tail of the compare.  */
+  mask = -1ul << n;
+ last_quad:
+  found = find (current, c) & ~mask;
+  if (found == 0)
+    return NULL;
+
+ found_it:
+#ifdef __alpha_cix__
+  offset = __builtin_alpha_cttz (found);
+#else
+  /* Extract LSB.  */
+  found &= -found;
+
+  /* Binary search for the LSB.  */
+  offset  = (found & 0x0f ? 0 : 4);
+  offset += (found & 0x33 ? 0 : 2);
+  offset += (found & 0x55 ? 0 : 1);
+#endif
+
+  return (void *)((word)s_align + offset);
+}
+
+#ifdef weak_alias
+weak_alias (__memchr, BP_SYM (memchr))
+#endif
+libc_hidden_builtin_def (memchr)

-----------------------------------------------------------------------

Summary of changes:
 sysdeps/alpha/alphaev6/memchr.S |  193 ---------------------------------------
 sysdeps/alpha/memchr.S          |  176 -----------------------------------
 sysdeps/alpha/memchr.c          |  175 +++++++++++++++++++++++++++++++++++
 3 files changed, 175 insertions(+), 369 deletions(-)
 delete mode 100644 sysdeps/alpha/alphaev6/memchr.S
 delete mode 100644 sysdeps/alpha/memchr.S
 create mode 100644 sysdeps/alpha/memchr.c


hooks/post-receive
-- 
Community source repository for glibc add-on ports


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]