This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.



[PATCH][malloc] Speedup fastbin paths


This patch speeds up the fastbin paths.  When an application is
single-threaded, we can execute a simpler, faster fastbin path.
Doing this check at a high level means a single branch bypasses
multiple locks, atomic instructions and related extra complexity.
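
As a rough sketch of the idea (illustrative C only, not the actual
glibc code: single_thread_p, node and push are made-up stand-ins for
SINGLE_THREAD_P, malloc chunks and the fastbin link code), the
pattern is:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct node { struct node *next; };

static _Atomic (struct node *) top;    /* like a fastbin head */
static bool single_thread_p = true;    /* stand-in for SINGLE_THREAD_P */

static void
push (struct node *n)
{
  if (single_thread_p)
    {
      /* Fast path: plain loads/stores, no atomic RMW needed.  */
      n->next = atomic_load_explicit (&top, memory_order_relaxed);
      atomic_store_explicit (&top, n, memory_order_relaxed);
    }
  else
    {
      /* Slow path: keep the CAS loop for concurrent frees.  */
      struct node *old = atomic_load (&top);
      do
        n->next = old;
      while (!atomic_compare_exchange_weak (&top, &old, n));
    }
}

int
main (void)
{
  struct node a, b;
  push (&a);
  push (&b);
  printf ("top == &b: %d\n", atomic_load (&top) == &b);
  return 0;
}

One up-front test selects the whole non-atomic path, which is why a
single check can pay for several avoided locks and atomics.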

Fastbin allocations on AArch64 are about 2.4 times faster, and the
single-threaded paths now execute zero load/store exclusive
instructions.

Also inline tcache_get and tcache_put, for a 16% performance gain in
the tcache fast paths.
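
Both helpers are only a few instructions, so call overhead was a
large fraction of their cost.  A hypothetical sketch of the pattern
(a single per-thread bin rather than glibc's per-size-class array;
cache_put/cache_get are made-up names, not the glibc implementation):

#include <stddef.h>

typedef struct entry { struct entry *next; } entry;

static __thread entry *bin;        /* one bin; glibc keeps an array */
static __thread size_t bin_count;

static inline void
cache_put (entry *e)
{
  /* Push onto the per-thread singly-linked list.  */
  e->next = bin;
  bin = e;
  ++bin_count;
}

static inline entry *
cache_get (void)
{
  /* Pop the most recently freed entry.  */
  entry *e = bin;
  bin = e->next;
  --bin_count;
  return e;
}

int
main (void)
{
  static entry chunk;   /* stand-in for a freed chunk */
  cache_put (&chunk);
  return cache_get () == &chunk ? 0 : 1;
}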

Bench-malloc-thread speeds up by a further ~5% on top of the previous
patch in the single-threaded case and ~2.5% in the multi-threaded case
(in total ~9% for 1 thread and ~6% for 32 threads, including the
have_fastchunk improvement).

Passes GLIBC tests. OK to commit?

ChangeLog:
2017-09-28  Wilco Dijkstra  <wdijkstr@arm.com>

	* malloc/malloc.c (sysdep-cancel.h): Add include.
	(tcache_put): Inline.
	(tcache_get): Inline.
	(__libc_malloc): Add SINGLE_THREAD_P path.
	(_int_malloc): Likewise.
	(_int_free): Likewise.

--

diff --git a/malloc/malloc.c b/malloc/malloc.c
index 082c2b927727bff441cf48744265628d0bc40add..88cfd25766eba6787faeb7195d95b73d7a4637ab 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -243,6 +243,10 @@
 
 #include <malloc/malloc-internal.h>
 
+/* For SINGLE_THREAD_P.  */
+#include <sysdep-cancel.h>
+
+
 /*
   Debugging:
 
@@ -2909,7 +2913,7 @@ static __thread tcache_perthread_struct *tcache = NULL;
 
 /* Caller must ensure that we know tc_idx is valid and there's room
    for more chunks.  */
-static void
+static inline void
 tcache_put (mchunkptr chunk, size_t tc_idx)
 {
   tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
@@ -2921,7 +2925,7 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
 
 /* Caller must ensure that we know tc_idx is valid and there's
    available chunks to remove.  */
-static void *
+static inline void *
 tcache_get (size_t tc_idx)
 {
   tcache_entry *e = tcache->entries[tc_idx];
@@ -3030,24 +3034,34 @@ __libc_malloc (size_t bytes)
   DIAG_POP_NEEDS_COMMENT;
 #endif
 
-  arena_get (ar_ptr, bytes);
-
-  victim = _int_malloc (ar_ptr, bytes);
-  /* Retry with another arena only if we were able to find a usable arena
-     before.  */
-  if (!victim && ar_ptr != NULL)
+  if (SINGLE_THREAD_P)
     {
-      LIBC_PROBE (memory_malloc_retry, 1, bytes);
-      ar_ptr = arena_get_retry (ar_ptr, bytes);
-      victim = _int_malloc (ar_ptr, bytes);
+      victim = _int_malloc (&main_arena, bytes);
+      assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
+	      &main_arena == arena_for_chunk (mem2chunk (victim)));
+      return victim;
     }
+  else
+    {
+      arena_get (ar_ptr, bytes);
 
-  if (ar_ptr != NULL)
-    __libc_lock_unlock (ar_ptr->mutex);
+      victim = _int_malloc (ar_ptr, bytes);
+      /* Retry with another arena only if we were able to find a usable arena
+	 before.  */
+      if (!victim && ar_ptr != NULL)
+	{
+	  LIBC_PROBE (memory_malloc_retry, 1, bytes);
+	  ar_ptr = arena_get_retry (ar_ptr, bytes);
+	  victim = _int_malloc (ar_ptr, bytes);
+	}
+
+      if (ar_ptr != NULL)
+	__libc_lock_unlock (ar_ptr->mutex);
 
-  assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
-          ar_ptr == arena_for_chunk (mem2chunk (victim)));
-  return victim;
+      assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
+	      ar_ptr == arena_for_chunk (mem2chunk (victim)));
+      return victim;
+    }
 }
 libc_hidden_def (__libc_malloc)
 
@@ -3526,39 +3540,81 @@ _int_malloc (mstate av, size_t bytes)
 
   if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
     {
-      idx = fastbin_index (nb);
-      mfastbinptr *fb = &fastbin (av, idx);
-      mchunkptr pp = *fb;
-      REMOVE_FB (fb, victim, pp);
-      if (victim != 0)
-        {
-          if (__builtin_expect (fastbin_index (chunksize (victim)) != idx, 0))
-	    malloc_printerr ("malloc(): memory corruption (fast)");
-          check_remalloced_chunk (av, victim, nb);
-#if USE_TCACHE
-	  /* While we're here, if we see other chunks of the same size,
-	     stash them in the tcache.  */
-	  size_t tc_idx = csize2tidx (nb);
-	  if (tcache && tc_idx < mp_.tcache_bins)
+      if (SINGLE_THREAD_P)
+	{
+	  idx = fastbin_index (nb);
+	  mfastbinptr *fb = &fastbin (av, idx);
+	  mchunkptr next;
+	  victim = *fb;
+	  if (victim != NULL)
 	    {
-	      mchunkptr tc_victim;
-
-	      /* While bin not empty and tcache not full, copy chunks over.  */
-	      while (tcache->counts[tc_idx] < mp_.tcache_count
-		     && (pp = *fb) != NULL)
+	      size_t victim_idx = fastbin_index (chunksize (victim));
+	      next = victim->fd;
+	      if (__builtin_expect (victim_idx != idx, 0))
+		malloc_printerr ("malloc(): memory corruption (fast)");
+	      check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+	      /* While we're here, if we see other chunks of the same size,
+		 stash them in the tcache.  */
+	      size_t tc_idx = csize2tidx (nb);
+	      if (next != NULL && tcache && tc_idx < mp_.tcache_bins)
 		{
-		  REMOVE_FB (fb, tc_victim, pp);
-		  if (tc_victim != 0)
+		  mchunkptr tc_victim;
+
+		  /* While bin not empty and tcache not full, copy chunks.  */
+		  while (tcache->counts[tc_idx] < mp_.tcache_count)
 		    {
+		      tc_victim = next;
+		      next = tc_victim->fd;
 		      tcache_put (tc_victim, tc_idx);
-	            }
+		      if (next == NULL)
+			break;
+		    }
 		}
-	    }
 #endif
-          void *p = chunk2mem (victim);
-          alloc_perturb (p, bytes);
-          return p;
+	      *fb = next;
+	      void *p = chunk2mem (victim);
+	      alloc_perturb (p, bytes);
+	      return p;
+	    }
         }
+      else
+	{
+	  idx = fastbin_index (nb);
+	  mfastbinptr *fb = &fastbin (av, idx);
+	  mchunkptr pp = *fb;
+	  REMOVE_FB (fb, victim, pp);
+	  if (victim != 0)
+	    {
+	      size_t victim_idx = fastbin_index (chunksize (victim));
+	      if (__builtin_expect (victim_idx != idx, 0))
+		malloc_printerr ("malloc(): memory corruption (fast)");
+	      check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+	      /* While we're here, if we see other chunks of the same size,
+		 stash them in the tcache.  */
+	      size_t tc_idx = csize2tidx (nb);
+	      if (tcache && tc_idx < mp_.tcache_bins)
+		{
+		  mchunkptr tc_victim;
+
+		  /* While bin not empty and tcache not full, copy chunks.  */
+		  while (tcache->counts[tc_idx] < mp_.tcache_count
+			 && (pp = *fb) != NULL)
+		    {
+		      REMOVE_FB (fb, tc_victim, pp);
+		      if (tc_victim != 0)
+			{
+			  tcache_put (tc_victim, tc_idx);
+			}
+		    }
+		}
+#endif
+	      void *p = chunk2mem (victim);
+	      alloc_perturb (p, bytes);
+	      return p;
+	    }
+	}
     }
 
   /*
@@ -4158,24 +4214,36 @@ _int_free (mstate av, mchunkptr p, int have_lock)
     /* Atomically link P to its fastbin: P->FD = *FB; *FB = P;  */
     mchunkptr old = *fb, old2;
     unsigned int old_idx = ~0u;
-    do
+
+    if (SINGLE_THREAD_P && !have_lock)
       {
-	/* Check that the top of the bin is not the record we are going to add
-	   (i.e., double free).  */
 	if (__builtin_expect (old == p, 0))
 	  malloc_printerr ("double free or corruption (fasttop)");
-	/* Check that size of fastbin chunk at the top is the same as
-	   size of the chunk that we are adding.  We can dereference OLD
-	   only if we have the lock, otherwise it might have already been
-	   deallocated.  See use of OLD_IDX below for the actual check.  */
-	if (have_lock && old != NULL)
-	  old_idx = fastbin_index(chunksize(old));
-	p->fd = old2 = old;
+	p->fd = old;
+	*fb = p;
       }
-    while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2)) != old2);
+    else
+      {
+	do
+	  {
+	    /* Check that the top of the bin is not the record we are going to
+	       add (i.e., double free).  */
+	    if (__builtin_expect (old == p, 0))
+	      malloc_printerr ("double free or corruption (fasttop)");
+	    /* Check that size of fastbin chunk at the top is the same as
+	       size of the chunk that we are adding.  We can dereference OLD
+	       only if we have the lock, otherwise it might have already been
+	       deallocated.  See use of OLD_IDX below for the actual check.  */
+	    if (have_lock && old != NULL)
+	      old_idx = fastbin_index (chunksize (old));
+	    p->fd = old2 = old;
+	  }
+	while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
+		!= old2);
 
-    if (have_lock && old != NULL && __builtin_expect (old_idx != idx, 0))
-      malloc_printerr ("invalid fastbin entry (free)");
+	if (have_lock && old != NULL && __builtin_expect (old_idx != idx, 0))
+	  malloc_printerr ("invalid fastbin entry (free)");
+      }
   }
 
   /*

