This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


[PATCH] malloc: brain-dead thread cache


From: Joern Engel <joern@purestorage.org>

No prefetching yet, cache_gc takes a lock for every single object,
and the entire cache gets flushed on gc, ...

JIRA: PURE-27597
---
 tpc/malloc2.13/arena.h  |   2 +
 tpc/malloc2.13/malloc.c |  11 +++
 tpc/malloc2.13/tcache.h | 183 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 196 insertions(+)
 create mode 100644 tpc/malloc2.13/tcache.h

diff --git a/tpc/malloc2.13/arena.h b/tpc/malloc2.13/arena.h
index 0804ecfe3a26..d66b4e7029a2 100644
--- a/tpc/malloc2.13/arena.h
+++ b/tpc/malloc2.13/arena.h
@@ -72,6 +72,7 @@ extern int sanity_check_heap_info_alignment[(sizeof (heap_info)
 
 /* Thread specific data */
 
+static tsd_key_t cache_key;
 static tsd_key_t arena_key;
 static mutex_t list_lock;
 
@@ -362,6 +363,7 @@ static void ptmalloc_init(void)
 	}
 
 	mutex_init(&list_lock);
+	tsd_key_create(&cache_key, NULL);
 	tsd_key_create(&arena_key, NULL);
 	tsd_setspecific(arena_key, (Void_t *) & main_arena);
 	thread_atfork(ptmalloc_lock_all, ptmalloc_unlock_all, ptmalloc_unlock_all2);
diff --git a/tpc/malloc2.13/malloc.c b/tpc/malloc2.13/malloc.c
index dca97ef553c0..40f6aa578c6f 100644
--- a/tpc/malloc2.13/malloc.c
+++ b/tpc/malloc2.13/malloc.c
@@ -3254,6 +3254,8 @@ static struct malloc_state *get_backup_arena(struct malloc_state *arena, size_t
 
 /*------------------------ Public wrappers. --------------------------------*/
 
+#include "tcache.h"
+
 Void_t *public_mALLOc(size_t bytes)
 {
 	struct malloc_state *ar_ptr;
@@ -3263,6 +3265,10 @@ Void_t *public_mALLOc(size_t bytes)
 	if (hook != NULL)
 		return (*hook) (bytes, RETURN_ADDRESS(0));
 
+	victim = tcache_malloc(bytes);
+	if (victim)
+		return victim;
+
 	ar_ptr = arena_get(bytes);
 	if (!ar_ptr)
 		return 0;
@@ -3297,6 +3303,11 @@ void public_fREe(Void_t * mem)
 		return;
 	}
 
+	if (tcache_free(p)) {
+		/* Object could be freed on fast path */
+		return;
+	}
+
 	ar_ptr = arena_for_chunk(p);
 	arena_lock(ar_ptr);
 	_int_free(ar_ptr, p);
diff --git a/tpc/malloc2.13/tcache.h b/tpc/malloc2.13/tcache.h
new file mode 100644
index 000000000000..62d58cc77475
--- /dev/null
+++ b/tpc/malloc2.13/tcache.h
@@ -0,0 +1,183 @@
+/*
+ * Thread-cache for malloc.
+ */
+
+#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
+static inline int fls(int x)
+{
+	int r;
+
+	asm("bsrl %1,%0\n\t"
+	"jnz 1f\n\t"
+	"movl $-1,%0\n"
+	"1:" : "=r" (r) : "rm" (x));
+	return r + 1;
+}
+#else
+#error Must define fls()
+#endif
+
+/*
+ * Per-thread cache is supposed to reduce lock contention on arenas.
+ * When doing allocations we prefetch several identical objects and
+ * can return the surplus on future allocations without going to an
+ * arena.  On free we keep the freed object in hope of reusing it in
+ * future allocations.
+ */
+#define CACHE_SIZE		(1 << 16)
+#define MAX_CACHED_SIZE		(CACHE_SIZE >> 3)
+#define MAX_PREFETCH_SIZE	(CACHE_SIZE >> 6)
+#define NO_PREFETCH		(1 << 3)
+
+/*
+ * Binning is done as a subdivided buddy allocator.  A buddy allocator
+ * has one bin per power of two.  We use 16 bins per power of two,
+ * yielding a worst-case fragmentation of 6%.
+ *
+ * Worst-case fragmentation is nearly impossible to reach.  Even in
+ * bad real-world workloads there is usually a single dominant
+ * allocation size, so most objects get a perfectly fitting bin.
+ */
+#define ALIGN_BITS	(4)
+#define ALIGNMENT	(1 << ALIGN_BITS)
+#define ALIGN_DOWN(x)   ((x) & ~(ALIGNMENT - 1))
+#define ALIGN(x)	(((x) + (ALIGNMENT - 1)) & ~(ALIGNMENT - 1))
+#define SUBBIN_BITS	(4)
+#define SUBBINS		(1 << SUBBIN_BITS)
+
+static int cache_bin(int size)
+{
+	int power, subbin;
+
+	size >>= ALIGN_BITS;
+	if (size < SUBBINS)
+		return size;
+	power = fls(size);
+	subbin = size >> (power - SUBBIN_BITS - 1);
+	return SUBBINS * (power - SUBBIN_BITS - 1) + subbin;
+}
+
+#define CACHE_NO_BINS 97
+
+struct thread_cache {
+	/* Bytes in cache */
+	uint32_t tc_size;
+
+	/* Objects in cache */
+	uint32_t tc_count;
+
+	struct malloc_chunk *tc_bin[CACHE_NO_BINS];
+};
+
+/*
+ * XXX Completely unoptimized - lots of low-hanging fruit
+ */
+static void cache_gc(struct thread_cache *cache)
+{
+	struct malloc_chunk *victim, *next;
+	struct malloc_state *arena;
+	int i;
+
+	for (i = 0; i < CACHE_NO_BINS; i++) {
+		victim = cache->tc_bin[i];
+		if (!victim)
+			continue;
+		cache->tc_bin[i] = NULL;
+		while (victim) {
+			next = victim->fd;
+			cache->tc_size -= chunksize(victim);
+			cache->tc_count--;
+			arena = arena_for_chunk(victim);
+			mutex_lock(&arena->mutex);
+			_int_free(arena, victim);
+			mutex_unlock(&arena->mutex);
+			victim = next;
+		}
+	}
+	assert(cache->tc_size == 0);
+	assert(cache->tc_count == 0);
+}
+
+static void *tcache_malloc(size_t size)
+{
+	struct thread_cache *cache;
+	struct malloc_state *arena;
+	struct malloc_chunk **bin, *victim;
+	size_t nb;
+	int bin_no;
+
+	checked_request2size(size, nb);
+	if (nb > MAX_CACHED_SIZE)
+		return NULL;
+
+	tsd_getspecific(cache_key, cache);
+	if (!cache) {
+		arena = arena_get(sizeof(*cache));
+		cache = _int_malloc(arena, sizeof(*cache));
+		arena_unlock(arena);
+		if (!cache)
+			return NULL;
+		memset(cache, 0, sizeof(*cache));
+		tsd_setspecific(cache_key, cache);
+	}
+
+	bin_no = cache_bin(nb);
+	assert(bin_no < CACHE_NO_BINS);
+
+	bin = &cache->tc_bin[bin_no];
+	victim = *bin;
+	if (victim) {
+		if (chunksize(victim) < nb)
+			return NULL;
+		if (cache_bin(chunksize(*bin)) != bin_no) {
+			malloc_printerr(check_action, "invalid tcache entry", victim);
+			return NULL;
+		}
+		*bin = victim->fd;
+		void *p = chunk2mem(victim);
+		if (perturb_byte)
+			alloc_perturb(p, size);
+		cache->tc_size -= chunksize(victim);
+		cache->tc_count--;
+		return p;
+	}
+	/* TODO: prefetch objects */
+	return NULL;
+}
+
+/*
+ * returns 1 if object was freed
+ */
+static int tcache_free(mchunkptr p)
+{
+	struct thread_cache *cache;
+	struct malloc_chunk **bin;
+	size_t size;
+	int bin_no;
+
+	tsd_getspecific(cache_key, cache);
+	if (!cache)
+		return 0;
+	size = chunksize(p);
+	if (size > MAX_CACHED_SIZE)
+		return 0;
+	bin_no = cache_bin(size);
+	assert(bin_no < CACHE_NO_BINS);
+
+	bin = &cache->tc_bin[bin_no];
+	if (*bin == p) {
+		malloc_printerr(check_action, "double free or corruption (tcache)", chunk2mem(p));
+		return 0;
+	}
+	if (*bin && cache_bin(chunksize(*bin)) != bin_no) {
+		malloc_printerr(check_action, "invalid tcache entry", chunk2mem(p));
+		return 0;
+	}
+	cache->tc_size += size;
+	cache->tc_count++;
+	p->fd = *bin;
+	*bin = p;
+	if (cache->tc_size > CACHE_SIZE)
+		cache_gc(cache);
+	return 1;
+}
-- 
2.7.0.rc3
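
As a reading aid, here is a minimal standalone sketch of the fast path the
patch puts in front of the arenas: a lazily created per-thread cache reached
through thread-specific data, holding one singly linked free list per bin.
This is an illustration only, not the patch's code: pthread_key_create()
stands in for glibc's internal tsd_key_create(), all demo_* names are
invented here, and the tc_size/tc_count bookkeeping and safety checks of the
real tcache_malloc()/tcache_free() are left out.

#include <pthread.h>
#include <stdlib.h>

#define DEMO_BINS 97			/* same bin count as CACHE_NO_BINS */

struct demo_node {
	struct demo_node *fd;		/* next object in this bin's free list */
};

/* The real struct thread_cache also tracks tc_size/tc_count so it knows
 * when to flush; that accounting is omitted here for brevity. */
struct demo_cache {
	struct demo_node *bin[DEMO_BINS];
};

static pthread_key_t demo_key;
static pthread_once_t demo_once = PTHREAD_ONCE_INIT;

static void demo_key_init(void)
{
	/* As in the patch, no destructor is registered for the key. */
	pthread_key_create(&demo_key, NULL);
}

static struct demo_cache *demo_get_cache(void)
{
	struct demo_cache *cache;

	pthread_once(&demo_once, demo_key_init);
	cache = pthread_getspecific(demo_key);
	if (!cache) {
		/* First use on this thread: lazily set up an empty cache. */
		cache = calloc(1, sizeof(*cache));
		if (cache)
			pthread_setspecific(demo_key, cache);
	}
	return cache;
}

/* Fast-path allocation: pop the head of the bin. */
static void *demo_cache_pop(int bin_no)
{
	struct demo_cache *cache = demo_get_cache();
	struct demo_node *victim;

	if (!cache || !(victim = cache->bin[bin_no]))
		return NULL;		/* miss: fall back to the arena path */
	cache->bin[bin_no] = victim->fd;
	return victim;
}

/* Fast-path free: push the object onto its bin's list. */
static void demo_cache_push(int bin_no, void *obj)
{
	struct demo_cache *cache = demo_get_cache();
	struct demo_node *node = obj;

	if (!cache)
		return;
	node->fd = cache->bin[bin_no];
	cache->bin[bin_no] = node;
}

int main(void)
{
	/* Simulate a free followed by a malloc of the same size class. */
	void *obj = malloc(64);

	demo_cache_push(3, obj);
	return demo_cache_pop(3) == obj ? 0 : 1;
}

Because the cache is reachable only through the calling thread's TSD slot,
neither the pop nor the push needs a lock; the arena mutex is only taken on
a miss, or when the cache grows past CACHE_SIZE and cache_gc() flushes it.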


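The subdivided-buddy binning is the part most worth convincing yourself of.
The sketch below repeats cache_bin()'s arithmetic with invented illus_*
names and the portable __builtin_clz in place of the inline-asm fls(); it is
not taken from the patch.  The assert reproduces why 97 bins
(CACHE_NO_BINS) cover everything up to MAX_CACHED_SIZE = 8192 bytes, and the
printed pair shows that neighbouring bins inside one power of two sit a
sixteenth of the size apart, which is where the roughly 6% worst-case
rounding loss quoted in tcache.h comes from.

#include <assert.h>
#include <stdio.h>

#define ILLUS_ALIGN_BITS	4	/* 16-byte granularity, as in the patch */
#define ILLUS_SUBBIN_BITS	4	/* 16 sub-bins per power of two */
#define ILLUS_SUBBINS		(1 << ILLUS_SUBBIN_BITS)

/* fls(): 1-based index of the most significant set bit, 0 for x == 0. */
static int illus_fls(int x)
{
	return x ? 32 - __builtin_clz((unsigned int) x) : 0;
}

/* Same arithmetic as cache_bin() in tcache.h above. */
static int illus_bin(int size)
{
	int power, subbin;

	size >>= ILLUS_ALIGN_BITS;
	if (size < ILLUS_SUBBINS)
		return size;		/* sizes below 256 bytes map one bin per 16 bytes */
	power = illus_fls(size);
	subbin = size >> (power - ILLUS_SUBBIN_BITS - 1);
	return ILLUS_SUBBINS * (power - ILLUS_SUBBIN_BITS - 1) + subbin;
}

int main(void)
{
	/* MAX_CACHED_SIZE is 8192 bytes (CACHE_SIZE >> 3) and lands in
	 * bin 96, which is why the patch needs CACHE_NO_BINS == 97. */
	assert(illus_bin(8192) == 96);

	/* Inside one power of two the bin boundaries advance in steps of
	 * one sixteenth: 512 and 544 already fall into different bins. */
	printf("512 -> bin %d, 544 -> bin %d\n", illus_bin(512), illus_bin(544));
	return 0;
}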