This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

An improved merge-sort algorithm


Hi there.  When I added a merge-sort algorithm to rsync, I improved upon
the glibc version to reduce the amount of copying it does and to halve
the needed memory.  I thought you might be interested in updating the
version in glibc, so I'm attaching a patch that applies to the CVS
version.

The algorithm is simple:

When dividing a buffer, b, into b1 and b2 halves and getting each half
sorted (recursing into this algorithm), we join the lists by first
comparing the starting item(s) in the b1 section to find one that is
larger than the first item in b2.  If none are found, no copying is
needed.  If one or more are found, the remaining b1 items are copied
into temporary-memory (making the maximum needed tmp memory the size of
b1), and then the remaining items are merged from tmp and b2 back into
the b buffer starting at the spot where the b1 items were copied away.
Any b2 items that are already in place at the end of the buffer are not
copied at all (which is the same as the current algorithm).

Compared to the old merge-sort algorithm, here is the amount of copying
needed for various situations:

 - If sections b1 and b2 are in sequence as a group, the new algorithm
   does no copying.  The old algorithm copied b1 to tmp and tmp back to
   b1, so this saves 2 useless b1-sized copies with no increase in
   comparisons.

 - If sections b1 and b2 are in the opposite order, the new algorithm
   copies b1 to tmp, and then copies b2 followed by tmp back to the b
   buffer.  The old copied b2 + b1 to tmp, and then copied them both
   back again, so the new code does 25% less copying in a worst-case-
   order situation.

 - In a case where 1/2 the leading and 1/2 the trailing data is already
   in the right spot, the new algorithm results in 50% less copying.

 - Etc.

Since the algorithm is a little more complex than the old one, I decided
to use a macro as a function template.  I also created a separate static
function for each of the separate copy-size cases instead of using a
switch inside a single function.  A small number of inline copy-one-item
functions makes the template easy to instantiate for all the types.

I also moved the (n <= 1) exclusion at the start of the msort function
into the calling functions (one spot in qsort() and when recursing).
This saves 1-2 useless recursion calls when we reach 1-item lists.

This version has received light testing to ensure all the msort_TYPE()
functions work properly for a simple test case.

..wayne..
--- msort.c-1.23	2007-12-09 15:38:04.000000000 -0800
+++ msort.c	2007-12-09 16:56:33.000000000 -0800
@@ -29,148 +29,140 @@
 struct msort_param
 {
   size_t s;
-  size_t var;
   __compar_d_fn_t cmp;
   void *arg;
   char *t;
 };
-static void msort_with_tmp (const struct msort_param *p, void *b, size_t n);
+
+/* This macro is a template function for the msort_TYPE() functions. */
+
+#define MSORT(param, buf, cnt, this_func, copy_func, item_len, cmp_cast)\
+  size_t n1 = cnt / 2;							\
+  size_t n2 = cnt - n1;							\
+  char *b1 = buf;							\
+  char *b2 = (char *) buf + (n1 * param->s);				\
+									\
+  if (n1 > 1)								\
+    this_func (param, b1, n1);						\
+  if (n2 > 1)								\
+    this_func (param, b2, n2);						\
+									\
+  char *tmp = param->t;							\
+  const size_t s = param->s;						\
+  __compar_d_fn_t cmp = param->cmp;					\
+  void *arg = param->arg;						\
+									\
+  /* Check for starting items that are already in the right spot. */	\
+  while ((*cmp) (cmp_cast b1, cmp_cast b2, arg) <= 0)			\
+    {									\
+      if (!--n1)							\
+	return; /* The two sections need no changes. */			\
+      b1 += item_len;							\
+    }									\
+  /* Copy what's left of b1 into the tmp area. */			\
+  memcpy (tmp, b1, n1 * item_len);					\
+  /* Move the b2 item that was compared above into its place. */	\
+  b1 = copy_func (b1, b2, item_len), b2 += item_len, --n2;		\
+  /* Merge while we have both tmp (b1) and b2 items. */			\
+  while (n1 > 0 && n2 > 0)						\
+    {									\
+      if ((*cmp) (cmp_cast tmp, cmp_cast b2, arg) <= 0)			\
+	b1 = copy_func (b1, tmp, item_len), tmp += item_len, --n1;	\
+      else								\
+	b1 = copy_func (b1, b2, item_len), b2 += item_len, --n2;	\
+    }									\
+  /* Any remaining b2 items are already in the right spot at the end */	\
+  /* OR any remaining tmp (b1) items get copied to the end. */		\
+  if (n1 > 0)								\
+    memcpy (b1, tmp, n1 * s);						\
+									\
+  return
+
+/* These inline copy routines work like __mempcpy() for other types. */
+
+static inline char *
+copy_uint32_t (char *dst, char *src, size_t item_len)
+{
+  (void)item_len;
+  *(uint32_t *) dst = *(uint32_t *) src;
+  return dst + sizeof (uint32_t);
+}
+
+static inline char *
+copy_uint64_t (char *dst, char *src, size_t item_len)
+{
+  (void)item_len;
+  *(uint64_t *) dst = *(uint64_t *) src;
+  return dst + sizeof (uint64_t);
+}
+
+static inline char *
+copy_ulongs (char *dst, char *src, size_t item_len)
+{
+  unsigned long *dl = (unsigned long *) dst;
+  unsigned long *dl_end = (unsigned long *) (dst + item_len);
+  unsigned long *sl = (unsigned long *) src;
+  while (dl < dl_end)
+    *dl++ = *sl++;
+  return (char *)dl;
+}
+
+static inline char *
+copy_void_ptr (char *dst, char *src, size_t item_len)
+{
+  (void)item_len;
+  *(void **) dst = *(void **) src;
+  return dst + sizeof (void *);
+}
 
 static void
-msort_with_tmp (const struct msort_param *p, void *b, size_t n)
+msort_uint32_t (const struct msort_param *p, void *b, size_t n)
 {
-  char *b1, *b2;
-  size_t n1, n2;
+  MSORT(p, b, n, msort_uint32_t, copy_uint32_t, sizeof (uint32_t), (void *));
+}
 
-  if (n <= 1)
-    return;
+static void
+msort_uint64_t (const struct msort_param *p, void *b, size_t n)
+{
+  MSORT(p, b, n, msort_uint64_t, copy_uint64_t, sizeof (uint64_t), (void *));
+}
 
-  n1 = n / 2;
-  n2 = n - n1;
-  b1 = b;
-  b2 = (char *) b + (n1 * p->s);
-
-  msort_with_tmp (p, b1, n1);
-  msort_with_tmp (p, b2, n2);
-
-  char *tmp = p->t;
-  const size_t s = p->s;
-  __compar_d_fn_t cmp = p->cmp;
-  void *arg = p->arg;
-  switch (p->var)
-    {
-    case 0:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      *(uint32_t *) tmp = *(uint32_t *) b1;
-	      b1 += sizeof (uint32_t);
-	      --n1;
-	    }
-	  else
-	    {
-	      *(uint32_t *) tmp = *(uint32_t *) b2;
-	      b2 += sizeof (uint32_t);
-	      --n2;
-	    }
-	  tmp += sizeof (uint32_t);
-	}
-      break;
-    case 1:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      *(uint64_t *) tmp = *(uint64_t *) b1;
-	      b1 += sizeof (uint64_t);
-	      --n1;
-	    }
-	  else
-	    {
-	      *(uint64_t *) tmp = *(uint64_t *) b2;
-	      b2 += sizeof (uint64_t);
-	      --n2;
-	    }
-	  tmp += sizeof (uint64_t);
-	}
-      break;
-    case 2:
-      while (n1 > 0 && n2 > 0)
-	{
-	  unsigned long *tmpl = (unsigned long *) tmp;
-	  unsigned long *bl;
+static void
+msort_ulongs (const struct msort_param *p, void *b, size_t n)
+{
+  /* The s variable is the p->s value in the template. */
+  MSORT(p, b, n, msort_ulongs, copy_ulongs, s, (void *));
+}
 
-	  tmp += s;
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      bl = (unsigned long *) b1;
-	      b1 += s;
-	      --n1;
-	    }
-	  else
-	    {
-	      bl = (unsigned long *) b2;
-	      b2 += s;
-	      --n2;
-	    }
-	  while (tmpl < (unsigned long *) tmp)
-	    *tmpl++ = *bl++;
-	}
-      break;
-    case 3:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (*(const void **) b1, *(const void **) b2, arg) <= 0)
-	    {
-	      *(void **) tmp = *(void **) b1;
-	      b1 += sizeof (void *);
-	      --n1;
-	    }
-	  else
-	    {
-	      *(void **) tmp = *(void **) b2;
-	      b2 += sizeof (void *);
-	      --n2;
-	    }
-	  tmp += sizeof (void *);
-	}
-      break;
-    default:
-      while (n1 > 0 && n2 > 0)
-	{
-	  if ((*cmp) (b1, b2, arg) <= 0)
-	    {
-	      tmp = (char *) __mempcpy (tmp, b1, s);
-	      b1 += s;
-	      --n1;
-	    }
-	  else
-	    {
-	      tmp = (char *) __mempcpy (tmp, b2, s);
-	      b2 += s;
-	      --n2;
-	    }
-	}
-      break;
-    }
+static void
+msort_void_ptrs (const struct msort_param *p, void *b, size_t n)
+{
+  MSORT(p, b, n, msort_void_ptrs, copy_void_ptr, sizeof (void *), *(void **));
+}
 
-  if (n1 > 0)
-    memcpy (tmp, b1, n1 * s);
-  memcpy (b, p->t, (n - n2) * s);
+static void
+msort_bytes (const struct msort_param *p, void *b, size_t n)
+{
+  /* The s variable is the p->s value in the template. */
+  MSORT(p, b, n, msort_bytes, (char *) __mempcpy, s, (void *));
 }
 
 
 void
 qsort_r (void *b, size_t n, size_t s, __compar_d_fn_t cmp, void *arg)
 {
-  size_t size = n * s;
+  size_t size;
   char *tmp = NULL;
   struct msort_param p;
 
+  if (n <= 1)
+    return;
+
   /* For large object sizes use indirect sorting.  */
   if (s > 32)
-    size = 2 * n * sizeof (void *) + s;
+    size = (n + n/2) * sizeof (void *) + s;
+  else
+    size = n/2 * s;
 
   if (size < 1024)
     /* The temporary array is small, so put it on the stack.  */
@@ -229,7 +221,6 @@
     }
 
   p.s = s;
-  p.var = 4;
   p.cmp = cmp;
   p.arg = arg;
 
@@ -237,7 +228,7 @@
     {
       /* Indirect sorting.  */
       char *ip = (char *) b;
-      void **tp = (void **) (p.t + n * sizeof (void *));
+      void **tp = (void **) (p.t + n/2 * sizeof (void *));
       void **t = tp;
       void *tmp_storage = (void *) (tp + n);
 
@@ -247,8 +238,7 @@
 	  ip += s;
 	}
       p.s = sizeof (void *);
-      p.var = 3;
-      msort_with_tmp (&p, p.t + n * sizeof (void *), n);
+      msort_void_ptrs (&p, tp, n);
 
       /* tp[0] .. tp[n - 1] is now sorted, copy around entries of
 	 the original array.  Knuth vol. 3 (2nd ed.) exercise 5.2-10.  */
@@ -282,16 +272,19 @@
 	  && ((char *) b - (char *) 0) % __alignof__ (uint32_t) == 0)
 	{
 	  if (s == sizeof (uint32_t))
-	    p.var = 0;
+	    msort_uint32_t (&p, b, n);
 	  else if (s == sizeof (uint64_t)
 		   && ((char *) b - (char *) 0) % __alignof__ (uint64_t) == 0)
-	    p.var = 1;
+	    msort_uint64_t (&p, b, n);
 	  else if ((s & (sizeof (unsigned long) - 1)) == 0
 		   && ((char *) b - (char *) 0)
 		      % __alignof__ (unsigned long) == 0)
-	    p.var = 2;
+	    msort_ulongs (&p, b, n);
+	  else
+	    msort_bytes (&p, b, n);
 	}
-      msort_with_tmp (&p, b, n);
+      else
+	msort_bytes (&p, b, n);
     }
   free (tmp);
 }

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]