This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
[PATCH] malloc: use bitmap to conserve hot bins
- From: Joern Engel <joern@purestorage.com>
- To: "GNU C. Library" <libc-alpha@sourceware.org>
- Cc: Siddhesh Poyarekar <siddhesh.poyarekar@gmail.com>, Joern Engel <joern@purestorage.org>
- Date: Mon, 25 Jan 2016 16:25:09 -0800
- Subject: [PATCH] malloc: use bitmap to conserve hot bins
- References: <1453767942-19369-1-git-send-email-joern@purestorage.com>
From: Joern Engel <joern@purestorage.org>
tcache_gc needs a heuristic to decide which memory will remain useful
in the future and which memory can be returned to the allocator. This
patch uses the same heuristic as LRU: bins that have been used in the
recent past are preserved entirely, while bins that haven't been used
are emptied.
JIRA: PURE-27597
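
To make the heuristic concrete, here is a minimal standalone sketch of the
scheme the patch implements (an illustration, not part of the patch): one
accessed bit per bin, set on allocation; the GC sweep frees only bins whose
bit is still clear, then clears all bits so the next interval starts fresh.
The per-bin chunk lists are stood in by plain counters, and all arena
locking is omitted.

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

#define NO_BINS 97
#define WORDSIZE (sizeof(unsigned long) * CHAR_BIT)
#define BITMAP_WORDS ((NO_BINS + WORDSIZE - 1) / WORDSIZE)

struct cache {
        unsigned long accessed_map[BITMAP_WORDS];
        int count[NO_BINS];     /* stand-in for the per-bin chunk lists */
};

static void set_accessed(struct cache *c, int bin)
{
        c->accessed_map[bin / WORDSIZE] |= 1UL << (bin % WORDSIZE);
}

static int is_accessed(struct cache *c, int bin)
{
        return !!(c->accessed_map[bin / WORDSIZE] & (1UL << (bin % WORDSIZE)));
}

/* Free every bin not touched since the last GC, then clear all
 * accessed bits so the next cycle starts with a clean slate. */
static void gc(struct cache *c)
{
        for (int i = 0; i < NO_BINS; i++) {
                if (is_accessed(c, i))
                        continue;       /* hot bin - preserved entirely */
                c->count[i] = 0;        /* cold bin - returned to the arena */
        }
        memset(c->accessed_map, 0, sizeof(c->accessed_map));
}

int main(void)
{
        struct cache c = { .count = { [3] = 5, [40] = 2 } };

        set_accessed(&c, 3);            /* bin 3 was used this interval */
        gc(&c);
        assert(c.count[3] == 5);        /* hot bin kept */
        assert(c.count[40] == 0);       /* cold bin emptied */
        printf("hot bin kept, cold bin drained\n");
        return 0;
}

This is the clock/second-chance approximation of LRU: one bit of recency per
bin is enough to separate hot bins from cold ones between GC runs, without
tracking timestamps or maintaining an ordered list.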
---
tpc/malloc2.13/tcache.h | 88 +++++++++++++++++++++++++++++++++++++++----------
1 file changed, 71 insertions(+), 17 deletions(-)
diff --git a/tpc/malloc2.13/tcache.h b/tpc/malloc2.13/tcache.h
index 7cf6b316456f..1829cd4ebb9a 100644
--- a/tpc/malloc2.13/tcache.h
+++ b/tpc/malloc2.13/tcache.h
@@ -24,9 +24,12 @@ static inline int fls(int x)
* arena. On free we keep the freed object in hope of reusing it in
* future allocations.
*/
-#define CACHE_SIZE (1 << 16)
-#define MAX_CACHED_SIZE (CACHE_SIZE >> 3)
-#define MAX_PREFETCH_SIZE (CACHE_SIZE >> 6)
+#define CACHE_SIZE_BITS (16)
+#define CACHE_SIZE (1 << CACHE_SIZE_BITS)
+#define MAX_CACHED_SIZE_BITS (CACHE_SIZE_BITS - 3)
+#define MAX_CACHED_SIZE (1 << MAX_CACHED_SIZE_BITS)
+#define MAX_PREFETCH_SIZE_BITS (CACHE_SIZE_BITS - 6)
+#define MAX_PREFETCH_SIZE (1 << MAX_PREFETCH_SIZE_BITS)
#define NO_PREFETCH (1 << 3)
/*
@@ -40,8 +43,8 @@ static inline int fls(int x)
*/
#define ALIGN_BITS (4)
#define ALIGNMENT (1 << ALIGN_BITS)
-#define ALIGN_DOWN(x) ((x) & ~(ALIGNMENT - 1))
-#define ALIGN(x) (((x) + (ALIGNMENT - 1)) & ~(ALIGNMENT - 1))
+#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
+#define ALIGN(x, a) (((x) + ((a) - 1)) & ~((a) - 1))
#define SUBBIN_BITS (4)
#define SUBBINS (1 << SUBBIN_BITS)
@@ -57,7 +60,27 @@ static int cache_bin(int size)
return SUBBINS * (power - SUBBIN_BITS - 1) + subbin;
}
-#define CACHE_NO_BINS 97
+#define CACHE_NO_BINS (SUBBINS * (MAX_CACHED_SIZE_BITS - ALIGN_BITS - SUBBIN_BITS + 1) + 1)
+
+#define CACHE_BITMAP_SIZE (ALIGN(CACHE_NO_BINS, __WORDSIZE) / __WORDSIZE)
+
+#define BITMAP_WORD(i) ((i) / __WORDSIZE)
+#define BITMAP_BIT(i) (1UL << ((i) & (__WORDSIZE - 1)))
+
+static inline void clear_bit(unsigned long *bitmap, int i)
+{
+ bitmap[BITMAP_WORD(i)] &= ~BITMAP_BIT(i);
+}
+
+static inline void set_bit(unsigned long *bitmap, int i)
+{
+ bitmap[BITMAP_WORD(i)] |= BITMAP_BIT(i);
+}
+
+static inline int get_bit(unsigned long *bitmap, int i)
+{
+ return !!(bitmap[BITMAP_WORD(i)] & BITMAP_BIT(i));
+}
struct thread_cache {
/* Bytes in cache */
@@ -66,21 +89,32 @@ struct thread_cache {
/* Objects in cache */
uint32_t tc_count;
+ unsigned long accessed_map[CACHE_BITMAP_SIZE];
+
struct malloc_chunk *tc_bin[CACHE_NO_BINS];
};
-/*
- * XXX Completely unoptimized - lots of low-hanging fruit
- */
-static void cache_gc(struct thread_cache *cache)
+static inline void set_accessed(struct thread_cache *cache, int bin)
+{
+ set_bit(cache->accessed_map, bin);
+}
+
+static inline int is_accessed(struct thread_cache *cache, int bin)
+{
+ return get_bit(cache->accessed_map, bin);
+}
+
+static void tcache_gc(struct thread_cache *cache)
{
struct malloc_chunk *victim, *next;
struct malloc_state *arena;
- int i;
+ int i, did_repeat = 0;
+repeat:
for (i = 0; i < CACHE_NO_BINS; i++) {
victim = cache->tc_bin[i];
- if (!victim)
+ /* accessed bins get skipped - they are useful */
+ if (is_accessed(cache, i) || !victim)
continue;
cache->tc_bin[i] = NULL;
while (victim) {
@@ -88,14 +122,33 @@ static void cache_gc(struct thread_cache *cache)
cache->tc_size -= chunksize(victim);
cache->tc_count--;
arena = arena_for_chunk(victim);
+ /* TODO: use atomic bins instead */
mutex_lock(&arena->mutex);
_int_free(arena, victim);
mutex_unlock(&arena->mutex);
victim = next;
}
}
- assert(cache->tc_size == 0);
- assert(cache->tc_count == 0);
+ memset(cache->accessed_map, 0, sizeof(cache->accessed_map));
+
+ if (cache->tc_size > CACHE_SIZE) {
+ /*
+ * Since we skip accessed bins we can run into
+ * pathological cases where all bins are empty or
+ * accessed and we made no progress. In those cases
+ * we retry after clearing the accessed bits, freeing
+ * the entire cache. Should be rare.
+ */
+ did_repeat = 1;
+ goto repeat;
+ } else if (did_repeat) {
+ /*
+ * Since we freed the entire cache, we can verify the
+ * counters are consistent.
+ */
+ assert(cache->tc_size == 0);
+ assert(cache->tc_count == 0);
+ }
}
static void *tcache_malloc(size_t size)
@@ -127,6 +180,7 @@ static void *tcache_malloc(size_t size)
bin_no = cache_bin(nb);
assert(bin_no < CACHE_NO_BINS);
+ set_accessed(cache, bin_no);
bin = &cache->tc_bin[bin_no];
victim = *bin;
@@ -150,11 +204,11 @@ static void *tcache_malloc(size_t size)
/*
* GC the cache before prefetching, not after. The last thing
* we want is to spend effort prefetching, then release all
- * those objects via cache_gc. Also do it before taking the
+ * those objects via tcache_gc. Also do it before taking the
* lock, to minimize hold times.
*/
if (nb <= MAX_PREFETCH_SIZE && (cache->tc_size + nb * 8) > CACHE_SIZE )
- cache_gc(cache);
+ tcache_gc(cache);
arena = arena_get(size);
if (!arena)
@@ -226,6 +280,6 @@ static int tcache_free(mchunkptr p)
p->fd = *bin;
*bin = p;
if (cache->tc_size > CACHE_SIZE)
- cache_gc(cache);
+ tcache_gc(cache);
return 1;
}
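
A note on the derived constant: the patch replaces the hardcoded
CACHE_NO_BINS of 97 with a formula over the sizing macros. A standalone
check (values copied from the patch; the _Static_assert itself is
illustrative, not part of the patch) confirms the formula reduces to the
old constant:

#include <stdio.h>

#define CACHE_SIZE_BITS         16
#define MAX_CACHED_SIZE_BITS    (CACHE_SIZE_BITS - 3)   /* 13: up to 8 KiB objects */
#define ALIGN_BITS              4
#define SUBBIN_BITS             4
#define SUBBINS                 (1 << SUBBIN_BITS)      /* 16 subbins per power of two */

#define CACHE_NO_BINS \
        (SUBBINS * (MAX_CACHED_SIZE_BITS - ALIGN_BITS - SUBBIN_BITS + 1) + 1)

/* 16 * (13 - 4 - 4 + 1) + 1 == 97, matching the constant the patch removes */
_Static_assert(CACHE_NO_BINS == 97, "formula must match the old hardcoded value");

int main(void)
{
        printf("CACHE_NO_BINS = %d\n", CACHE_NO_BINS);
        return 0;
}

Deriving the bin count this way keeps it consistent if CACHE_SIZE_BITS or
the bin granularity is ever tuned, instead of silently diverging from a
magic number.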
--
2.7.0.rc3