Re: [updated patch] malloc per-thread cache ready for review
- From: Will Newton <will.newton@gmail.com>
- To: DJ Delorie <dj@redhat.com>
- Cc: GNU C Library <libc-alpha@sourceware.org>
- Date: Fri, 17 Mar 2017 10:49:02 +0000
- Subject: Re: [updated patch] malloc per-thread cache ready for review
- References: <CAFbHwiRF6J3SWZkYf+-j658BewpbHWNazz=1UjMZ=B+pC837cw@mail.gmail.com> <xna88tzpuu.fsf@greed.delorie.com>
On Fri, Mar 10, 2017 at 5:02 AM, DJ Delorie <dj@redhat.com> wrote:
>
> Branch updated...
>
> Fix whitespace around operators.
>
> Define MAYBE_INIT_TCACHE to empty when disabled; remove wrapper
>
> Add helper functions for tcache_get() and tcache_put() to collect
> common code in one spot.
>
> Updated patch attached. (git diff origin/master...origin/dj/malloc-tcache)
This looks OK to me from a basic review - the code looks reasonable,
and it adds no testsuite regressions or new warnings.
It could do with a ChangeLog and NEWS entry though.
It would be good to get wider performance testing on the
performance-critical workloads that other people may have.
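For a quick local sanity number, a tight same-size loop is pretty
much the best case for a per-thread cache. Something like the
following (illustrative only; the volatile is there to stop GCC
folding the malloc/free pair away):

    #include <stdio.h>
    #include <stdlib.h>
    #include <time.h>

    int main (void)
    {
      enum { N = 10 * 1000 * 1000 };
      struct timespec t0, t1;

      clock_gettime (CLOCK_MONOTONIC, &t0);
      for (int i = 0; i < N; i++)
        {
          /* volatile keeps the compiler from eliding the pair.  */
          void *volatile p = malloc (64);
          free (p);
        }
      clock_gettime (CLOCK_MONOTONIC, &t1);

      double ns = (t1.tv_sec - t0.tv_sec) * 1e9
                  + (t1.tv_nsec - t0.tv_nsec);
      printf ("%.1f ns per malloc/free pair\n", ns / N);
      return 0;
    }

Real mixed-size, multi-threaded workloads are what actually matter,
of course.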
> I also put a copy of the oocalc workload online, as a sample of the
> workloads I've been using to benchmark. It's 16M compressed, and you
> have to decompress it (to 65M) to use it with the simulator.
>
> http://www.delorie.com/tmp/oocalc.wl.bz2
>
> (if you use wget, use "-U malloc" ;)
>
> diff --git a/config.make.in b/config.make.in
> index 5836b32..0290d83 100644
> --- a/config.make.in
> +++ b/config.make.in
> @@ -77,6 +77,8 @@ multi-arch = @multi_arch@
>
> mach-interface-list = @mach_interface_list@
>
> +experimental-malloc = @experimental_malloc@
> +
> nss-crypt = @libc_cv_nss_crypt@
> static-nss-crypt = @libc_cv_static_nss_crypt@
>
> diff --git a/configure.ac b/configure.ac
> index 4a77411..b929012 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -313,6 +313,13 @@ AC_ARG_ENABLE([multi-arch],
> [multi_arch=$enableval],
> [multi_arch=default])
>
> +AC_ARG_ENABLE([experimental-malloc],
> + AC_HELP_STRING([--disable-experimental-malloc],
> + [disable experimental malloc features]),
> + [experimental_malloc=$enableval],
> + [experimental_malloc=yes])
> +AC_SUBST(experimental_malloc)
> +
> AC_ARG_ENABLE([nss-crypt],
> AC_HELP_STRING([--enable-nss-crypt],
> [enable libcrypt to use nss]),
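So the tcache defaults to enabled and can be opted out of at build
time:

    ./configure --disable-experimental-malloc

That seems like the right default for getting the feature wider
exposure.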
> diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
> index cb9e8f1..37620c8 100644
> --- a/elf/dl-tunables.list
> +++ b/elf/dl-tunables.list
> @@ -76,5 +76,17 @@ glibc {
> minval: 1
> security_level: SXID_IGNORE
> }
> + tcache_max {
> + type: SIZE_T
> + env_alias: MALLOC_TCACHE_MAX
> + }
> + tcache_count {
> + type: SIZE_T
> + env_alias: MALLOC_TCACHE_COUNT
> + }
> + tcache_unsorted_limit {
> + type: SIZE_T
> + env_alias: MALLOC_TCACHE_UNSORTED_LIMIT
> + }
> }
> }
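Nice that these get env aliases; it should make per-process
experiments easy without a rebuild, e.g.:

    MALLOC_TCACHE_COUNT=32 ./my-benchmark

(./my-benchmark being whatever you want to measure), or via
GLIBC_TUNABLES, presumably as glibc.malloc.tcache_count and friends,
matching the neighbouring malloc entries.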
> diff --git a/malloc/Makefile b/malloc/Makefile
> index e93b83b..dd8a43a 100644
> --- a/malloc/Makefile
> +++ b/malloc/Makefile
> @@ -168,6 +168,9 @@ tst-malloc-usable-static-ENV = $(tst-malloc-usable-ENV)
> tst-malloc-usable-tunables-ENV = GLIBC_TUNABLES=glibc.malloc.check=3
> tst-malloc-usable-static-tunables-ENV = $(tst-malloc-usable-tunables-ENV)
>
> +ifeq ($(experimental-malloc),yes)
> +CPPFLAGS-malloc.c += -DUSE_TCACHE
> +endif
> # Uncomment this for test releases. For public releases it is too expensive.
> #CPPFLAGS-malloc.o += -DMALLOC_DEBUG=1
>
> diff --git a/malloc/arena.c b/malloc/arena.c
> index d49e4a2..79e918f 100644
> --- a/malloc/arena.c
> +++ b/malloc/arena.c
> @@ -236,6 +236,11 @@ DL_TUNABLE_CALLBACK_FNDECL (set_perturb_byte, int32_t)
> DL_TUNABLE_CALLBACK_FNDECL (set_trim_threshold, size_t)
> DL_TUNABLE_CALLBACK_FNDECL (set_arena_max, size_t)
> DL_TUNABLE_CALLBACK_FNDECL (set_arena_test, size_t)
> +#if USE_TCACHE
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_max, size_t)
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_count, size_t)
> +DL_TUNABLE_CALLBACK_FNDECL (set_tcache_unsorted_limit, size_t)
> +#endif
> #else
> /* Initialization routine. */
> #include <string.h>
> @@ -322,6 +327,11 @@ ptmalloc_init (void)
> TUNABLE_SET_VAL_WITH_CALLBACK (mmap_max, NULL, set_mmaps_max);
> TUNABLE_SET_VAL_WITH_CALLBACK (arena_max, NULL, set_arena_max);
> TUNABLE_SET_VAL_WITH_CALLBACK (arena_test, NULL, set_arena_test);
> +#if USE_TCACHE
> + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_max, NULL, set_tcache_max);
> + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_count, NULL, set_tcache_count);
> + TUNABLE_SET_VAL_WITH_CALLBACK (tcache_unsorted_limit, NULL, set_tcache_unsorted_limit);
> +#endif
> __libc_lock_unlock (main_arena.mutex);
> #else
> const char *s = NULL;
> @@ -371,7 +381,23 @@ ptmalloc_init (void)
> if (memcmp (envline, "ARENA_TEST", 10) == 0)
> __libc_mallopt (M_ARENA_TEST, atoi (&envline[11]));
> }
> +#if USE_TCACHE
> + if (!__builtin_expect (__libc_enable_secure, 0))
> + {
> + if (memcmp (envline, "TCACHE_MAX", 10) == 0)
> + __libc_mallopt (M_TCACHE_MAX, atoi (&envline[11]));
> + }
> +#endif
> break;
> +#if USE_TCACHE
> + case 12:
> + if (!__builtin_expect (__libc_enable_secure, 0))
> + {
> + if (memcmp (envline, "TCACHE_COUNT", 12) == 0)
> + __libc_mallopt (M_TCACHE_COUNT, atoi (&envline[13]));
> + }
> + break;
> +#endif
> case 15:
> if (!__builtin_expect (__libc_enable_secure, 0))
> {
> @@ -381,6 +407,15 @@ ptmalloc_init (void)
> __libc_mallopt (M_MMAP_THRESHOLD, atoi (&envline[16]));
> }
> break;
> +#if USE_TCACHE
> + case 21:
> + if (!__builtin_expect (__libc_enable_secure, 0))
> + {
> + if (memcmp (envline, "TCACHE_UNSORTED_LIMIT", 21) == 0)
> + __libc_mallopt (M_TCACHE_UNSORTED_LIMIT, atoi (&envline[22]));
> + }
> + break;
> +#endif
> default:
> break;
> }
> diff --git a/malloc/malloc.c b/malloc/malloc.c
> index e29105c..8cd03d8 100644
> --- a/malloc/malloc.c
> +++ b/malloc/malloc.c
> @@ -297,6 +297,33 @@ __malloc_assert (const char *assertion, const char *file, unsigned int line,
> }
> #endif
>
> +#ifndef USE_TCACHE
> +# define USE_TCACHE 0
> +#endif
> +#if USE_TCACHE
> +/* We want 64 entries. This is an arbitrary limit, which tunables can reduce. */
> +# define MAX_TCACHE_SIZE (MALLOC_ALIGNMENT * 63)
> +# define TCACHE_IDX ((MAX_TCACHE_SIZE / MALLOC_ALIGNMENT) + 1)
> +# define size2tidx_(bytes) (((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
> +
> +# define tidx2csize(idx) ((idx) * MALLOC_ALIGNMENT + SIZE_SZ)
> +# define tidx2usize(idx) ((idx) * MALLOC_ALIGNMENT)
> +
> +/* When "x" is a user-provided size. */
> +# define usize2tidx(x) size2tidx_ (x)
> +/* When "x" is from chunksize(). */
> +# define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ)
> +
> +/* Rounds up, so...
> + idx 0 bytes 0
> + idx 1 bytes 1..8
> + idx 2 bytes 9..16
> + etc. */
> +
> +/* This is another arbitrary limit, which tunables can change. */
> +# define TCACHE_FILL_COUNT 7
> +#endif
> +
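To check my understanding of the index math: with the comment's
MALLOC_ALIGNMENT == 8 case (and SIZE_SZ == 4), the mapping comes out
as advertised. A throwaway standalone check, with the constants
hardcoded purely for illustration:

    #include <stdio.h>
    #include <stddef.h>

    #define MALLOC_ALIGNMENT 8   /* illustrative 32-bit values */
    #define SIZE_SZ 4
    #define size2tidx_(bytes) (((bytes) + MALLOC_ALIGNMENT - 1) / MALLOC_ALIGNMENT)
    #define usize2tidx(x) size2tidx_ (x)
    #define csize2tidx(x) size2tidx_ ((x) - SIZE_SZ)

    int main (void)
    {
      /* Expect: 0 -> 0, 1..8 -> 1, 9..16 -> 2, 17 -> 3.  */
      size_t sizes[] = { 0, 1, 8, 9, 16, 17 };
      for (size_t i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
        printf ("user size %2zu -> bin %zu, chunk size %2zu -> bin %zu\n",
                sizes[i], usize2tidx (sizes[i]),
                sizes[i] + SIZE_SZ, csize2tidx (sizes[i] + SIZE_SZ));
      return 0;
    }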
>
> /*
> REALLOC_ZERO_BYTES_FREES should be set if a call to
> @@ -1711,6 +1738,17 @@ struct malloc_par
>
> /* First address handed out by MORECORE/sbrk. */
> char *sbrk_base;
> +
> +#if USE_TCACHE
> + /* Maximum number of buckets to use. */
> + size_t tcache_max;
> + size_t tcache_max_bytes;
> + /* Maximum number of chunks in each bucket. */
> + size_t tcache_count;
> +  /* Maximum number of non-matching chunks to remove from the
> +     unsorted list while filling the cache.  */
> + size_t tcache_unsorted_limit;
> +#endif
> };
>
> /* There are several instances of this struct ("arenas") in this
> @@ -1749,8 +1787,22 @@ static struct malloc_par mp_ =
> .trim_threshold = DEFAULT_TRIM_THRESHOLD,
> #define NARENAS_FROM_NCORES(n) ((n) * (sizeof (long) == 4 ? 2 : 8))
> .arena_test = NARENAS_FROM_NCORES (1)
> +#if USE_TCACHE
> + ,
> + .tcache_count = TCACHE_FILL_COUNT,
> + .tcache_max = TCACHE_IDX,
> + .tcache_max_bytes = tidx2usize (TCACHE_IDX-1),
> + .tcache_unsorted_limit = 0 /* No limit */
> +#endif
> };
>
> +/* Non public mallopt parameters. */
> +#if USE_TCACHE
> +# define M_TCACHE_COUNT -9
> +# define M_TCACHE_MAX -10
> +# define M_TCACHE_UNSORTED_LIMIT -11
> +#endif
> +
> /* Maximum size of memory handled in fastbins. */
> static INTERNAL_SIZE_T global_max_fast;
>
> @@ -2874,6 +2926,106 @@ mremap_chunk (mchunkptr p, size_t new_size)
>
> /*------------------------ Public wrappers. --------------------------------*/
>
> +#if USE_TCACHE
> +
> +typedef struct TCacheEntry {
> + struct TCacheEntry *next;
> +} TCacheEntry;
> +
> +/* There is one of these for each thread, which contains the
> +   per-thread cache (hence "TCache").  Keeping overall size low is
> +   mildly important.  Note that COUNTS and ENTRIES are redundant;
> +   both are kept purely for performance reasons.  */
> +typedef struct TCache {
> + char counts[TCACHE_IDX];
> + TCacheEntry *entries[TCACHE_IDX];
> +} TCache;
> +
> +static __thread char tcache_shutting_down = 0;
> +static __thread TCache *tcache = NULL;
> +
> +static void
> +tcache_put (mchunkptr chunk, size_t tc_idx)
> +{
> + TCacheEntry *e = (TCacheEntry *) chunk2mem (chunk);
> + e->next = tcache->entries[tc_idx];
> + tcache->entries[tc_idx] = e;
> + ++(tcache->counts[tc_idx]);
> +}
> +
> +static void *
> +tcache_get (size_t tc_idx)
> +{
> + TCacheEntry *e = tcache->entries[tc_idx];
> + tcache->entries[tc_idx] = e->next;
> + --(tcache->counts[tc_idx]);
> + return (void *) e;
> +}
> +
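These helpers are pleasantly minimal. For anyone following along,
each bin is just a counted LIFO; a toy standalone model (names mine,
not from the patch):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct entry { struct entry *next; } entry;

    static entry *entries[64];
    static char counts[64];

    static void put (void *mem, size_t idx)
    {
      entry *e = mem;
      e->next = entries[idx];
      entries[idx] = e;
      ++counts[idx];
    }

    static void *get (size_t idx)
    {
      entry *e = entries[idx];
      entries[idx] = e->next;
      --counts[idx];
      return e;
    }

    int main (void)
    {
      void *a = malloc (32), *b = malloc (32);
      put (a, 2);
      put (b, 2);
      /* LIFO: the most recently cached block comes back first.  */
      printf ("%s\n", get (2) == b && get (2) == a ? "LIFO ok" : "?");
      free (a);
      free (b);
      return 0;
    }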
> +static void __attribute__ ((section ("__libc_thread_freeres_fn")))
> +tcache_thread_freeres (void)
> +{
> + int i;
> + TCache *tcache_tmp = tcache;
> +
> + if (!tcache)
> + return;
> +
> + tcache = NULL;
> +
> +  for (i = 0; i < TCACHE_IDX; ++i)
> +    {
> +      while (tcache_tmp->entries[i])
> +        {
> +          TCacheEntry *e = tcache_tmp->entries[i];
> +          tcache_tmp->entries[i] = e->next;
> +          __libc_free (e);
> +        }
> +    }
> +
> + __libc_free (tcache_tmp);
> +
> + tcache_shutting_down = 1;
> +}
> +text_set_element (__libc_thread_subfreeres, tcache_thread_freeres);
> +
> +static void
> +tcache_init(void)
> +{
> + mstate ar_ptr;
> + void *victim = 0;
> + const size_t bytes = sizeof (TCache);
> +
> + if (tcache_shutting_down)
> + return;
> +
> + arena_get (ar_ptr, bytes);
> + victim = _int_malloc (ar_ptr, bytes);
> + if (!victim && ar_ptr != NULL)
> + {
> + ar_ptr = arena_get_retry (ar_ptr, bytes);
> + victim = _int_malloc (ar_ptr, bytes);
> + }
> +
> + if (ar_ptr != NULL)
> + __libc_lock_unlock (ar_ptr->mutex);
> +
> + if (victim)
> + {
> + tcache = (TCache *) victim;
> + memset (tcache, 0, sizeof (TCache));
> + }
> +}
> +
> +#define MAYBE_INIT_TCACHE() \
> + if (__glibc_unlikely (tcache == NULL)) \
> + tcache_init();
> +
> +#else
> +#define MAYBE_INIT_TCACHE()
> +#endif
> +
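One nit on the macro: as an unbraced if, a call site sitting under an
if/else could mis-parse. The usual do/while wrapper might be safer,
something like:

    #define MAYBE_INIT_TCACHE()                    \
      do                                           \
        {                                          \
          if (__glibc_unlikely (tcache == NULL))   \
            tcache_init ();                        \
        }                                          \
      while (0)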
> void *
> __libc_malloc (size_t bytes)
> {
> @@ -2884,6 +3036,21 @@ __libc_malloc (size_t bytes)
> = atomic_forced_read (__malloc_hook);
> if (__builtin_expect (hook != NULL, 0))
> return (*hook)(bytes, RETURN_ADDRESS (0));
> +#if USE_TCACHE
> +  /* _int_free also calls request2size; be careful not to pad twice.  */
> + size_t tbytes = request2size (bytes);
> + size_t tc_idx = csize2tidx (tbytes);
> +
> + MAYBE_INIT_TCACHE ();
> +
> + if (tc_idx < mp_.tcache_max
> + && tc_idx < TCACHE_IDX /* to appease gcc */
> + && tcache
> + && tcache->entries[tc_idx] != NULL)
> + {
> + return tcache_get (tc_idx);
> + }
> +#endif
>
> arena_get (ar_ptr, bytes);
>
> @@ -2943,6 +3110,8 @@ __libc_free (void *mem)
> return;
> }
>
> + MAYBE_INIT_TCACHE ();
> +
> ar_ptr = arena_for_chunk (p);
> _int_free (ar_ptr, p, 0);
> }
> @@ -2980,7 +3149,10 @@ __libc_realloc (void *oldmem, size_t bytes)
> if (chunk_is_mmapped (oldp))
> ar_ptr = NULL;
> else
> - ar_ptr = arena_for_chunk (oldp);
> + {
> + MAYBE_INIT_TCACHE ();
> + ar_ptr = arena_for_chunk (oldp);
> + }
>
> /* Little security check which won't hurt performance: the allocator
> never wraps around at the end of the address space. Therefore
> @@ -3205,6 +3377,8 @@ __libc_calloc (size_t n, size_t elem_size)
>
> sz = bytes;
>
> + MAYBE_INIT_TCACHE ();
> +
> arena_get (av, sz);
> if (av)
> {
> @@ -3335,6 +3509,10 @@ _int_malloc (mstate av, size_t bytes)
> mchunkptr fwd; /* misc temp for linking */
> mchunkptr bck; /* misc temp for linking */
>
> +#if USE_TCACHE
> + size_t tcache_unsorted_count; /* count of unsorted chunks processed */
> +#endif
> +
> const char *errstr = NULL;
>
> /*
> @@ -3387,6 +3565,35 @@ _int_malloc (mstate av, size_t bytes)
> return NULL;
> }
> check_remalloced_chunk (av, victim, nb);
> +#if USE_TCACHE
> + /* While we're here, if we see other chunks of the same size,
> + stash them in the tcache. */
> + size_t tc_idx = csize2tidx (nb);
> + if (tcache && tc_idx < mp_.tcache_max)
> + {
> + mchunkptr tc_victim;
> + int found = 0;
> +
> + /* While bin not empty and tcache not full, copy chunks over. */
> + while (tcache->counts[tc_idx] < mp_.tcache_count
> + && (pp = *fb) != NULL)
> + {
> + do
> + {
> + tc_victim = pp;
> + if (tc_victim == NULL)
> + break;
> + }
> + while ((pp = catomic_compare_and_exchange_val_acq (fb, tc_victim->fd, tc_victim))
> + != tc_victim);
> + if (tc_victim != 0)
> + {
> + tcache_put (tc_victim, tc_idx);
> + ++found;
> + }
> + }
> + }
> +#endif
> void *p = chunk2mem (victim);
> alloc_perturb (p, bytes);
> return p;
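The CAS loop mirrors the usual fastbin pop, which looks right,
although `found' is incremented but never read, here and in the
small-bin copy below. For reference, the same pop pattern in
standalone form, using C11 atomics instead of the internal catomic_*
helpers (names mine, and this ignores the ABA hazards the real
fastbin code has to worry about):

    #include <stdatomic.h>
    #include <stddef.h>

    struct node { struct node *next; };

    struct node *
    pop (_Atomic (struct node *) *top)
    {
      struct node *head = atomic_load_explicit (top, memory_order_acquire);
      /* A failed CAS reloads HEAD; retry until we detach it or hit NULL.  */
      while (head != NULL
             && !atomic_compare_exchange_weak_explicit (top, &head, head->next,
                                                        memory_order_acquire,
                                                        memory_order_relaxed))
        ;
      return head;  /* NULL when the list is empty.  */
    }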
> @@ -3425,6 +3632,34 @@ _int_malloc (mstate av, size_t bytes)
> if (av != &main_arena)
> set_non_main_arena (victim);
> check_malloced_chunk (av, victim, nb);
> +#if USE_TCACHE
> + /* While we're here, if we see other chunks of the same size,
> + stash them in the tcache. */
> + size_t tc_idx = csize2tidx (nb);
> + if (tcache && tc_idx < mp_.tcache_max)
> + {
> + mchunkptr tc_victim;
> + int found = 0;
> +
> + /* While bin not empty and tcache not full, copy chunks over. */
> + while (tcache->counts[tc_idx] < mp_.tcache_count
> + && (tc_victim = last (bin)) != bin)
> + {
> + if (tc_victim != 0)
> + {
> + bck = tc_victim->bk;
> + set_inuse_bit_at_offset (tc_victim, nb);
> + if (av != &main_arena)
> + set_non_main_arena (tc_victim);
> + bin->bk = bck;
> + bck->fd = bin;
> +
> + tcache_put (tc_victim, tc_idx);
> + ++found;
> + }
> + }
> + }
> +#endif
> void *p = chunk2mem (victim);
> alloc_perturb (p, bytes);
> return p;
> @@ -3463,6 +3698,16 @@ _int_malloc (mstate av, size_t bytes)
> otherwise need to expand memory to service a "small" request.
> */
>
> +#if USE_TCACHE
> + INTERNAL_SIZE_T tcache_nb = 0;
> + size_t tc_idx = csize2tidx (nb);
> + if (tcache && tc_idx < mp_.tcache_max)
> + tcache_nb = nb;
> + int return_cached = 0;
> +
> + tcache_unsorted_count = 0;
> +#endif
> +
> for (;; )
> {
> int iters = 0;
> @@ -3523,10 +3768,26 @@ _int_malloc (mstate av, size_t bytes)
> set_inuse_bit_at_offset (victim, size);
> if (av != &main_arena)
> set_non_main_arena (victim);
> +#if USE_TCACHE
> + /* Fill the cache first; hand a chunk directly to the user only
> +    once the cache is full.  We may return one of these cached
> +    chunks later.  */
> + if (tcache_nb
> + && tcache->counts[tc_idx] < mp_.tcache_count)
> + {
> + tcache_put (victim, tc_idx);
> + return_cached = 1;
> + continue;
> + }
> + else
> + {
> +#endif
> check_malloced_chunk (av, victim, nb);
> void *p = chunk2mem (victim);
> alloc_perturb (p, bytes);
> return p;
> +#if USE_TCACHE
> + }
> +#endif
> }
>
> /* place chunk in bin */
> @@ -3593,11 +3854,31 @@ _int_malloc (mstate av, size_t bytes)
> fwd->bk = victim;
> bck->fd = victim;
>
> +#if USE_TCACHE
> + /* If we've processed as many chunks as we're allowed while
> + filling the cache, return one of the cached ones. */
> + ++tcache_unsorted_count;
> + if (return_cached
> + && mp_.tcache_unsorted_limit > 0
> + && tcache_unsorted_count > mp_.tcache_unsorted_limit)
> + {
> + return tcache_get (tc_idx);
> + }
> +#endif
> +
> #define MAX_ITERS 10000
> if (++iters >= MAX_ITERS)
> break;
> }
>
> +#if USE_TCACHE
> + /* If all the small chunks we found ended up cached, return one now. */
> + if (return_cached)
> + {
> + return tcache_get (tc_idx);
> + }
> +#endif
> +
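So the net effect: exact fits found while sweeping the unsorted list
are hoarded in the cache rather than returned immediately, the
request is served from the cache afterwards, and
tcache_unsorted_limit bounds how many non-matching chunks we are
willing to sort along the way. A toy model of that control flow (all
names and numbers illustrative):

    #include <stdio.h>
    #include <stddef.h>

    enum { CACHE_MAX = 7 };

    int main (void)
    {
      int unsorted[] = { 48, 32, 96, 32, 32, 64 };  /* chunk sizes */
      int want = 32, cache[CACHE_MAX], cached = 0;
      size_t limit = 4, sorted = 0;

      for (size_t i = 0; i < sizeof unsorted / sizeof unsorted[0]; i++)
        {
          if (unsorted[i] == want && cached < CACHE_MAX)
            {
              cache[cached++] = unsorted[i];  /* exact fit: stash it */
              continue;
            }
          ++sorted;  /* non-matching chunk: it would be binned here */
          if (cached > 0 && limit > 0 && sorted > limit)
            break;  /* enough sorting work done; serve from cache */
        }
      if (cached > 0)
        printf ("serving a cached %d, %d still cached\n",
                cache[--cached], cached);
      return 0;
    }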
> /*
> If a large request, scan through the chunks of current bin in
> sorted order to find smallest that fits. Use the skip list for this.
> @@ -3883,6 +4164,20 @@ _int_free (mstate av, mchunkptr p, int have_lock)
>
> check_inuse_chunk(av, p);
>
> +#if USE_TCACHE
> + {
> + size_t tc_idx = csize2tidx (size);
> +
> + if (tcache
> + && tc_idx < mp_.tcache_max
> + && tcache->counts[tc_idx] < mp_.tcache_count)
> + {
> + tcache_put (p, tc_idx);
> + return;
> + }
> + }
> +#endif
> +
> /*
> If eligible, place chunk on a fastbin so it can be found
> and used quickly in malloc.
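And this keeps the common free path very short. One observable
consequence that might be worth a test case: a small free followed by
a same-size malloc on the same thread should now hand the same block
straight back, e.g. (comparing a pointer after free is technically
undefined, so this is illustration rather than a portable test):

    #include <stdio.h>
    #include <stdlib.h>

    int main (void)
    {
      void *p = malloc (40);
      free (p);               /* should land in the tcache */
      void *q = malloc (40);  /* and come straight back out */
      printf ("same block returned: %s\n", p == q ? "yes" : "no");
      free (q);
      return 0;
    }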
> @@ -4844,6 +5139,38 @@ do_set_arena_max (size_t value)
> return 1;
> }
>
> +#if USE_TCACHE
> +static inline int
> +__always_inline
> +do_set_tcache_max (size_t value)
> +{
> + LIBC_PROBE (memory_mallopt_tcache_max_bytes, 2, value, mp_.tcache_max_bytes);
> + if (value >= 0 && value <= MAX_TCACHE_SIZE)
> + {
> + mp_.tcache_max_bytes = value;
> + mp_.tcache_max = usize2tidx (value) + 1;
> + }
> + return 1;
> +}
> +
> +static inline int
> +__always_inline
> +do_set_tcache_count (size_t value)
> +{
> + LIBC_PROBE (memory_mallopt_tcache_count, 2, value, mp_.tcache_count);
> + mp_.tcache_count = value;
> + return 1;
> +}
> +
> +static inline int
> +__always_inline
> +do_set_tcache_unsorted_limit (size_t value)
> +{
> + LIBC_PROBE (memory_mallopt_tcache_unsorted_limit, 2, value, mp_.tcache_unsorted_limit);
> + mp_.tcache_unsorted_limit = value;
> + return 1;
> +}
> +#endif
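Minor nit: VALUE is a size_t in do_set_tcache_max, so the value >= 0
half of that check is always true; GCC's -Wtype-limits will complain
about it. (The value >= 0 checks down in __libc_mallopt are fine,
since value is an int there.)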
>
> int
> __libc_mallopt (int param_number, int value)
> @@ -4904,6 +5231,20 @@ __libc_mallopt (int param_number, int value)
> if (value > 0)
> do_set_arena_test (value);
> break;
> +#if USE_TCACHE
> + case M_TCACHE_COUNT:
> + if (value >= 0)
> + do_set_tcache_count (value);
> + break;
> + case M_TCACHE_MAX:
> + if (value >= 0)
> + do_set_tcache_max (value);
> + break;
> + case M_TCACHE_UNSORTED_LIMIT:
> + if (value >= 0)
> + do_set_tcache_unsorted_limit (value);
> + break;
> +#endif
> }
> __libc_lock_unlock (av->mutex);
> return res;