[PATCH][malloc] Speedup fastbin paths
- From: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
- To: libc-alpha@sourceware.org, dj@redhat.com
- Cc: nd <nd@arm.com>
- Date: Thu, 28 Sep 2017 15:28:14 +0000
- Subject: [PATCH][malloc] Speedup fastbin paths
This patch speeds up the fastbin paths. When an application is
single-threaded, we can execute a simpler, faster fastbin path.
Doing the check at this high level means a single branch bypasses
multiple locks, atomic instructions and related extra complexity.
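As an illustration, here is a minimal standalone sketch of the idea (not
the patch itself; glibc's real check is the SINGLE_THREAD_P macro from
sysdep-cancel.h, and the helper names below are made up):

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-in for SINGLE_THREAD_P; the real macro is a cheap
   flag test that becomes false once a second thread is created.  */
static bool single_threaded = true;

static void *alloc_fast (size_t n)   { puts ("fast path");   return NULL; }
static void *alloc_locked (size_t n) { puts ("locked path"); return NULL; }

static void *my_malloc (size_t n)
{
  /* One branch at the top level bypasses all locks and atomics below.  */
  if (single_threaded)
    return alloc_fast (n);
  return alloc_locked (n);
}

int main (void)
{
  my_malloc (32);            /* prints "fast path"  */
  single_threaded = false;   /* e.g. after pthread_create  */
  my_malloc (32);            /* prints "locked path"  */
  return 0;
}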
The speedup for fastbin allocations on AArch64 is about 2.4 times.
The number of load/store exclusive instructions is now zero in
single-threaded scenarios.
Also inline tcache_get and tcache_put for a 16% performance gain
on the tcache fast paths.
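These functions are only a few loads and stores each, so the call
overhead is comparable to the work they do. A miniature model of the
structure (not glibc's actual tcache; simplified, with no bounds or
count checks):

#include <stddef.h>

struct entry { struct entry *next; };

/* Per-thread cache: a singly-linked LIFO list plus a count.  */
static __thread struct entry *cache_head;
static __thread size_t cache_count;

static inline void
cache_put (struct entry *e)
{
  /* Push onto the per-thread list; no locks or atomics needed.  */
  e->next = cache_head;
  cache_head = e;
  ++cache_count;
}

static inline struct entry *
cache_get (void)
{
  /* Pop from the per-thread list; caller ensures it is non-empty.  */
  struct entry *e = cache_head;
  cache_head = e->next;
  --cache_count;
  return e;
}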
Bench-malloc-thread speeds up by an extra ~5% on top of the previous
patch for the single-threaded case and ~2.5% for the multi-threaded
case (in total ~9% for 1 thread and ~6% for 32 threads with the
have_fastchunk improvement).
Passes glibc tests. OK to commit?
ChangeLog:
2017-09-28 Wilco Dijkstra <wdijkstr@arm.com>
* malloc/malloc.c (sysdep-cancel.h): Add include.
(tcache_put): Inline.
(tcache_get): Inline.
(__libc_malloc): Add SINGLE_THREAD_P path.
(_int_malloc): Likewise.
(_int_free): Likewise.
--
diff --git a/malloc/malloc.c b/malloc/malloc.c
index 082c2b927727bff441cf48744265628d0bc40add..88cfd25766eba6787faeb7195d95b73d7a4637ab 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -243,6 +243,10 @@
#include <malloc/malloc-internal.h>
+/* For SINGLE_THREAD_P. */
+#include <sysdep-cancel.h>
+
+
/*
Debugging:
@@ -2909,7 +2913,7 @@ static __thread tcache_perthread_struct *tcache = NULL;
/* Caller must ensure that we know tc_idx is valid and there's room
for more chunks. */
-static void
+static inline void
tcache_put (mchunkptr chunk, size_t tc_idx)
{
tcache_entry *e = (tcache_entry *) chunk2mem (chunk);
@@ -2921,7 +2925,7 @@ tcache_put (mchunkptr chunk, size_t tc_idx)
/* Caller must ensure that we know tc_idx is valid and there's
available chunks to remove. */
-static void *
+static inline void *
tcache_get (size_t tc_idx)
{
tcache_entry *e = tcache->entries[tc_idx];
@@ -3030,24 +3034,34 @@ __libc_malloc (size_t bytes)
DIAG_POP_NEEDS_COMMENT;
#endif
- arena_get (ar_ptr, bytes);
-
- victim = _int_malloc (ar_ptr, bytes);
- /* Retry with another arena only if we were able to find a usable arena
- before. */
- if (!victim && ar_ptr != NULL)
+ if (SINGLE_THREAD_P)
{
- LIBC_PROBE (memory_malloc_retry, 1, bytes);
- ar_ptr = arena_get_retry (ar_ptr, bytes);
- victim = _int_malloc (ar_ptr, bytes);
+ victim = _int_malloc (&main_arena, bytes);
+ assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
+ &main_arena == arena_for_chunk (mem2chunk (victim)));
+ return victim;
}
+ else
+ {
+ arena_get (ar_ptr, bytes);
- if (ar_ptr != NULL)
- __libc_lock_unlock (ar_ptr->mutex);
+ victim = _int_malloc (ar_ptr, bytes);
+ /* Retry with another arena only if we were able to find a usable arena
+ before. */
+ if (!victim && ar_ptr != NULL)
+ {
+ LIBC_PROBE (memory_malloc_retry, 1, bytes);
+ ar_ptr = arena_get_retry (ar_ptr, bytes);
+ victim = _int_malloc (ar_ptr, bytes);
+ }
+
+ if (ar_ptr != NULL)
+ __libc_lock_unlock (ar_ptr->mutex);
- assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
- ar_ptr == arena_for_chunk (mem2chunk (victim)));
- return victim;
+ assert (!victim || chunk_is_mmapped (mem2chunk (victim)) ||
+ ar_ptr == arena_for_chunk (mem2chunk (victim)));
+ return victim;
+ }
}
libc_hidden_def (__libc_malloc)
@@ -3526,39 +3540,81 @@ _int_malloc (mstate av, size_t bytes)
if ((unsigned long) (nb) <= (unsigned long) (get_max_fast ()))
{
- idx = fastbin_index (nb);
- mfastbinptr *fb = &fastbin (av, idx);
- mchunkptr pp = *fb;
- REMOVE_FB (fb, victim, pp);
- if (victim != 0)
- {
- if (__builtin_expect (fastbin_index (chunksize (victim)) != idx, 0))
- malloc_printerr ("malloc(): memory corruption (fast)");
- check_remalloced_chunk (av, victim, nb);
-#if USE_TCACHE
- /* While we're here, if we see other chunks of the same size,
- stash them in the tcache. */
- size_t tc_idx = csize2tidx (nb);
- if (tcache && tc_idx < mp_.tcache_bins)
+ if (SINGLE_THREAD_P)
+ {
+ idx = fastbin_index (nb);
+ mfastbinptr *fb = &fastbin (av, idx);
+ mchunkptr next;
+ victim = *fb;
+ if (victim != NULL)
{
- mchunkptr tc_victim;
-
- /* While bin not empty and tcache not full, copy chunks over. */
- while (tcache->counts[tc_idx] < mp_.tcache_count
- && (pp = *fb) != NULL)
+ size_t victim_idx = fastbin_index (chunksize (victim));
+ next = victim->fd;
+ if (__builtin_expect (victim_idx != idx, 0))
+ malloc_printerr ("malloc(): memory corruption (fast)");
+ check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+ /* While we're here, if we see other chunks of the same size,
+ stash them in the tcache. */
+ size_t tc_idx = csize2tidx (nb);
+ if (next != NULL && tcache && tc_idx < mp_.tcache_bins)
{
- REMOVE_FB (fb, tc_victim, pp);
- if (tc_victim != 0)
+ mchunkptr tc_victim;
+
+ /* While bin not empty and tcache not full, copy chunks. */
+ while (tcache->counts[tc_idx] < mp_.tcache_count)
{
+ tc_victim = next;
+ next = tc_victim->fd;
tcache_put (tc_victim, tc_idx);
- }
+ if (next == NULL)
+ break;
+ }
}
- }
#endif
- void *p = chunk2mem (victim);
- alloc_perturb (p, bytes);
- return p;
+ *fb = next;
+ void *p = chunk2mem (victim);
+ alloc_perturb (p, bytes);
+ return p;
+ }
}
+ else
+ {
+ idx = fastbin_index (nb);
+ mfastbinptr *fb = &fastbin (av, idx);
+ mchunkptr pp = *fb;
+ REMOVE_FB (fb, victim, pp);
+ if (victim != 0)
+ {
+ size_t victim_idx = fastbin_index (chunksize (victim));
+ if (__builtin_expect (victim_idx != idx, 0))
+ malloc_printerr ("malloc(): memory corruption (fast)");
+ check_remalloced_chunk (av, victim, nb);
+#if USE_TCACHE
+ /* While we're here, if we see other chunks of the same size,
+ stash them in the tcache. */
+ size_t tc_idx = csize2tidx (nb);
+ if (tcache && tc_idx < mp_.tcache_bins)
+ {
+ mchunkptr tc_victim;
+
+ /* While bin not empty and tcache not full, copy chunks. */
+ while (tcache->counts[tc_idx] < mp_.tcache_count
+ && (pp = *fb) != NULL)
+ {
+ REMOVE_FB (fb, tc_victim, pp);
+ if (tc_victim != 0)
+ {
+ tcache_put (tc_victim, tc_idx);
+ }
+ }
+ }
+#endif
+ void *p = chunk2mem (victim);
+ alloc_perturb (p, bytes);
+ return p;
+ }
+ }
}
/*
@@ -4158,24 +4214,36 @@ _int_free (mstate av, mchunkptr p, int have_lock)
/* Atomically link P to its fastbin: P->FD = *FB; *FB = P; */
mchunkptr old = *fb, old2;
unsigned int old_idx = ~0u;
- do
+
+ if (SINGLE_THREAD_P && !have_lock)
{
- /* Check that the top of the bin is not the record we are going to add
- (i.e., double free). */
if (__builtin_expect (old == p, 0))
malloc_printerr ("double free or corruption (fasttop)");
- /* Check that size of fastbin chunk at the top is the same as
- size of the chunk that we are adding. We can dereference OLD
- only if we have the lock, otherwise it might have already been
- deallocated. See use of OLD_IDX below for the actual check. */
- if (have_lock && old != NULL)
- old_idx = fastbin_index(chunksize(old));
- p->fd = old2 = old;
+ p->fd = old;
+ *fb = p;
}
- while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2)) != old2);
+ else
+ {
+ do
+ {
+ /* Check that the top of the bin is not the record we are going to
+ add (i.e., double free). */
+ if (__builtin_expect (old == p, 0))
+ malloc_printerr ("double free or corruption (fasttop)");
+ /* Check that size of fastbin chunk at the top is the same as
+ size of the chunk that we are adding. We can dereference OLD
+ only if we have the lock, otherwise it might have already been
+ deallocated. See use of OLD_IDX below for the actual check. */
+ if (have_lock && old != NULL)
+ old_idx = fastbin_index (chunksize (old));
+ p->fd = old2 = old;
+ }
+ while ((old = catomic_compare_and_exchange_val_rel (fb, p, old2))
+ != old2);
- if (have_lock && old != NULL && __builtin_expect (old_idx != idx, 0))
- malloc_printerr ("invalid fastbin entry (free)");
+ if (have_lock && old != NULL && __builtin_expect (old_idx != idx, 0))
+ malloc_printerr ("invalid fastbin entry (free)");
+ }
}
/*
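For reference, the _int_free hunk above contrasts a plain store with a
CAS loop. Here is a standalone sketch of the same push pattern (not
glibc code), written with C11 atomics rather than glibc's catomic_*
macros:

#include <stdatomic.h>
#include <stddef.h>

struct node { struct node *fd; };

static _Atomic (struct node *) top;

static void
push (struct node *p, int single_threaded)
{
  if (single_threaded)
    {
      /* No other thread can touch TOP: a plain load and store suffice,
         so no load/store-exclusive (or lock-prefixed) instructions.  */
      p->fd = atomic_load_explicit (&top, memory_order_relaxed);
      atomic_store_explicit (&top, p, memory_order_relaxed);
    }
  else
    {
      /* Another thread may push concurrently: link P to the current
         top and retry the CAS until no one raced with us.  */
      struct node *old = atomic_load (&top);
      do
        p->fd = old;
      while (!atomic_compare_exchange_weak (&top, &old, p));
    }
}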