This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |
Other format: | [Raw text] |
On Thu, Jun 19, 2014 at 05:46:08PM +0100, Will Newton wrote: > Add a microbenchmark for measuring malloc and free performance with > varying numbers of threads. The benchmark allocates and frees buffers > of random sizes in a random order and measures the overall execution > time and RSS. Variants of the benchmark are run with 1, 4, 8 and > 16 threads. > > The random block sizes used follow an inverse square distribution > which is intended to mimic the behaviour of real applications which > tend to allocate many more small blocks than large ones. > > ChangeLog: > > 2014-06-19 Will Newton <will.newton@linaro.org> > > * benchtests/Makefile: (bench-malloc): Add malloc thread > scalability benchmark. > * benchtests/bench-malloc-threads.c: New file. > --- > benchtests/Makefile | 20 ++- > benchtests/bench-malloc-thread.c | 299 +++++++++++++++++++++++++++++++++++++++ > 2 files changed, 316 insertions(+), 3 deletions(-) > create mode 100644 benchtests/bench-malloc-thread.c > > Changes in v3: > - Single executable that takes a parameter for thread count > - Run for a fixed duration rather than a fixed number of loops > - Other fixes in response to review suggestions > > Example of a plot of the results versus tcmalloc and jemalloc on > a 4 core i5: > > http://people.linaro.org/~will.newton/bench-malloc-threads.png > > diff --git a/benchtests/Makefile b/benchtests/Makefile > index dc9ee04..a3a4564 100644 > --- a/benchtests/Makefile > +++ b/benchtests/Makefile > @@ -44,8 +44,11 @@ benchset := $(string-bench-all) $(stdlib-bench) > CFLAGS-bench-ffs.c += -fno-builtin > CFLAGS-bench-ffsll.c += -fno-builtin > > +bench-malloc := malloc-thread > + > $(addprefix $(objpfx)bench-,$(bench-math)): $(libm) > $(addprefix $(objpfx)bench-,$(bench-pthread)): $(shared-thread-library) > +$(objpfx)bench-malloc-thread: $(shared-thread-library) > > > > @@ -60,6 +63,7 @@ include ../Rules > > binaries-bench := $(addprefix $(objpfx)bench-,$(bench)) > binaries-benchset := $(addprefix $(objpfx)bench-,$(benchset)) > +binaries-bench-malloc := $(addprefix $(objpfx)bench-,$(bench-malloc)) > > # The default duration: 10 seconds. > ifndef BENCH_DURATION > @@ -82,7 +86,8 @@ endif > > # This makes sure CPPFLAGS-nonlib and CFLAGS-nonlib are passed > # for all these modules. > -cpp-srcs-left := $(binaries-benchset:=.c) $(binaries-bench:=.c) > +cpp-srcs-left := $(binaries-benchset:=.c) $(binaries-bench:=.c) \ > + $(binaries-bench-malloc:=.c) > lib := nonlib > include $(patsubst %,$(..)cppflags-iterator.mk,$(cpp-srcs-left)) > > @@ -99,9 +104,10 @@ timing-type := $(objpfx)bench-timing-type > bench-clean: > rm -f $(binaries-bench) $(addsuffix .o,$(binaries-bench)) > rm -f $(binaries-benchset) $(addsuffix .o,$(binaries-benchset)) > + rm -f $(binaries-bench-malloc) $(addsuffix .o,$(binaries-bench-malloc)) > rm -f $(timing-type) $(addsuffix .o,$(timing-type)) > > -bench: $(timing-type) bench-set bench-func > +bench: $(timing-type) bench-set bench-func bench-malloc > > bench-set: $(binaries-benchset) > for run in $^; do \ > @@ -109,6 +115,13 @@ bench-set: $(binaries-benchset) > $(run-bench) > $${run}.out; \ > done > > +bench-malloc: $(binaries-bench-malloc) > + run=$(objpfx)bench-malloc-thread; \ > + for thr in 1 8 16 32; do \ > + echo "Running $${run} $${thr}"; \ > + $(run-bench) $${thr} > $${run}-$${thr}.out; \ Wouldn't printing out all the stats into a single json be nicer? Also, the right thing to do here would be to write the output to a tmp file and then move the tmp file to the final .out file. That would prevent a partial run from masquerading as a complete run. > + done > + > # Build and execute the benchmark functions. This target generates JSON > # formatted bench.out. Each of the programs produce independent JSON output, > # so one could even execute them individually and process it using any JSON > @@ -134,7 +147,8 @@ bench-func: $(binaries-bench) > scripts/validate_benchout.py $(objpfx)bench.out \ > scripts/benchout.schema.json > > -$(timing-type) $(binaries-bench) $(binaries-benchset): %: %.o $(objpfx)json-lib.o \ > +$(timing-type) $(binaries-bench) $(binaries-benchset) \ > + $(binaries-bench-malloc): %: %.o $(objpfx)json-lib.o \ > $(sort $(filter $(common-objpfx)lib%,$(link-libc))) \ > $(addprefix $(csu-objpfx),start.o) $(+preinit) $(+postinit) > $(+link) > diff --git a/benchtests/bench-malloc-thread.c b/benchtests/bench-malloc-thread.c > new file mode 100644 > index 0000000..b637c94 > --- /dev/null > +++ b/benchtests/bench-malloc-thread.c > @@ -0,0 +1,299 @@ > +/* Benchmark malloc and free functions. > + Copyright (C) 2013-2014 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <http://www.gnu.org/licenses/>. */ > + Add a brief description of the benchmark here. > +#include <errno.h> > +#include <math.h> > +#include <pthread.h> > +#include <signal.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <sys/time.h> > +#include <sys/resource.h> > +#include <unistd.h> > + > +#include "bench-timing.h" > +#include "json-lib.h" > + > +/* Benchmark duration in seconds. */ > +#define BENCHMARK_DURATION 60 > +#define RAND_SEED 88 > + > +#ifndef NUM_THREADS > +# define NUM_THREADS 1 > +#endif Not needed anymore. > + > +/* Maximum memory that can be allocated at any one time is: > + > + NUM_THREADS * WORKING_SET_SIZE * MAX_ALLOCATION_SIZE > + > + However due to the distribution of the random block sizes > + the typical amount allocated will be much smaller. */ > +#define WORKING_SET_SIZE 1024 > + > +#define MIN_ALLOCATION_SIZE 4 > +#define MAX_ALLOCATION_SIZE 32768 > + > +/* Get a random block size with an inverse square distribution. */ > +static unsigned int > +get_block_size (unsigned int rand_data) > +{ > + /* Inverse square. */ > + const float exponent = -2; > + /* Minimum value of distribution. */ > + const float dist_min = MIN_ALLOCATION_SIZE; > + /* Maximum value of distribution. */ > + const float dist_max = MAX_ALLOCATION_SIZE; > + > + float min_pow = powf (dist_min, exponent + 1); > + float max_pow = powf (dist_max, exponent + 1); > + > + float r = (float) rand_data / RAND_MAX; > + > + return (unsigned int) powf ((max_pow - min_pow) * r + min_pow, 1 / (exponent + 1)); > +} > + > +#define NUM_BLOCK_SIZES 8000 > +#define NUM_OFFSETS ((WORKING_SET_SIZE) * 4) > + > +static unsigned int random_block_sizes[NUM_BLOCK_SIZES]; > +static unsigned int random_offsets[NUM_OFFSETS]; > + > +static void > +init_random_values (void) > +{ > + for (size_t i = 0; i < NUM_BLOCK_SIZES; i++) > + random_block_sizes[i] = get_block_size (rand ()); > + > + for (size_t i = 0; i < NUM_OFFSETS; i++) > + random_offsets[i] = rand () % WORKING_SET_SIZE; > +} > + > +static unsigned int > +get_random_block_size (unsigned int *state) > +{ > + unsigned int idx = *state; > + > + if (idx >= NUM_BLOCK_SIZES - 1) > + idx = 0; > + else > + idx++; > + > + *state = idx; > + > + return random_block_sizes[idx]; > +} > + > +static unsigned int > +get_random_offset (unsigned int *state) > +{ > + unsigned int idx = *state; > + > + if (idx >= NUM_OFFSETS - 1) > + idx = 0; > + else > + idx++; > + > + *state = idx; > + > + return random_offsets[idx]; > +} > + > +static volatile bool timeout; > + > +static void alarm_handler (int signum) Newline after the void. > +{ > + timeout = true; > +} > + > +/* Allocate and free blocks in a random order. */ > +static size_t > +malloc_benchmark_loop (void **ptr_arr) > +{ > + unsigned int offset_state = 0, block_state = 0; > + size_t iters = 0; > + > + while (!timeout) > + { > + unsigned int next_idx = get_random_offset (&offset_state); > + unsigned int next_block = get_random_block_size (&block_state); > + > + free (ptr_arr[next_idx]); > + > + ptr_arr[next_idx] = malloc (next_block); > + > + iters++; > + } > + > + return iters; > +} > + > +struct thread_args > +{ > + size_t iters; > + void **working_set; > + timing_t elapsed; > +}; > + > +static void * > +benchmark_thread (void *arg) > +{ > + struct thread_args *args = (struct thread_args *) arg; > + size_t iters; > + void *thread_set = args->working_set; > + timing_t start, stop; > + > + TIMING_NOW (start); > + iters = malloc_benchmark_loop (thread_set); > + TIMING_NOW (stop); > + > + TIMING_DIFF (args->elapsed, start, stop); > + args->iters = iters; > + > + return NULL; > +} > + > +static timing_t > +do_benchmark (size_t num_threads, size_t *iters) > +{ > + timing_t elapsed = 0; > + > + if (num_threads == 1) > + { > + timing_t start, stop; > + void *working_set[WORKING_SET_SIZE]; > + > + memset (working_set, 0, sizeof (working_set)); > + > + TIMING_NOW (start); > + *iters = malloc_benchmark_loop (working_set); > + TIMING_NOW (stop); > + > + TIMING_DIFF (elapsed, start, stop); > + } > + else > + { > + struct thread_args args[num_threads]; > + void *working_set[num_threads][WORKING_SET_SIZE]; > + pthread_t threads[num_threads]; > + > + memset (working_set, 0, sizeof (working_set)); > + > + *iters = 0; > + > + for (size_t i = 0; i < num_threads; i++) > + { > + args[i].working_set = working_set[i]; > + pthread_create(&threads[i], NULL, benchmark_thread, &args[i]); > + } > + > + for (size_t i = 0; i < num_threads; i++) > + { > + pthread_join(threads[i], NULL); > + TIMING_ACCUM (elapsed, args[i].elapsed); > + *iters += args[i].iters; > + } > + } > + return elapsed; > +} > + > +static void usage(const char *name) Newline after the void. > +{ > + fprintf (stderr, "%s: <num_threads>\n", name); > + exit (1); > +} > + > +int > +main (int argc, char **argv) > +{ > + timing_t cur; > + size_t iters = 0, num_threads = 1; > + unsigned long res; > + json_ctx_t json_ctx; > + double d_total_s, d_total_i; > + struct sigaction act; > + > + if (argc == 1) > + num_threads = 1; > + else if (argc == 2) > + { > + long ret; > + > + errno = 0; > + ret = strtol(argv[1], NULL, 10); > + > + if (errno || ret == 0) > + usage(argv[0]); > + > + num_threads = ret; > + } > + else > + usage(argv[0]); > + > + init_random_values (); > + > + json_init (&json_ctx, 0, stdout); > + > + json_document_begin (&json_ctx); > + > + json_attr_string (&json_ctx, "timing_type", TIMING_TYPE); > + > + json_attr_object_begin (&json_ctx, "functions"); > + > + json_attr_object_begin (&json_ctx, "malloc"); > + > + json_attr_object_begin (&json_ctx, ""); > + > + TIMING_INIT (res); > + > + (void) res; > + > + memset (&act, 0, sizeof (act)); > + act.sa_handler = &alarm_handler; > + > + sigaction (SIGALRM, &act, NULL); > + > + alarm (BENCHMARK_DURATION); > + > + cur = do_benchmark (num_threads, &iters); > + > + struct rusage usage; > + getrusage(RUSAGE_SELF, &usage); > + > + d_total_s = cur; > + d_total_i = iters; > + > + json_attr_double (&json_ctx, "duration", d_total_s); > + json_attr_double (&json_ctx, "iterations", d_total_i); > + json_attr_double (&json_ctx, "time_per_iteration", d_total_s / d_total_i); > + json_attr_double (&json_ctx, "max_rss", usage.ru_maxrss); > + > + json_attr_double (&json_ctx, "threads", num_threads); > + json_attr_double (&json_ctx, "min_size", MIN_ALLOCATION_SIZE); > + json_attr_double (&json_ctx, "max_size", MAX_ALLOCATION_SIZE); > + json_attr_double (&json_ctx, "random_seed", RAND_SEED); > + > + json_attr_object_end (&json_ctx); > + > + json_attr_object_end (&json_ctx); > + > + json_attr_object_end (&json_ctx); > + > + json_document_end (&json_ctx); > + > + return 0; > +} > -- > 1.9.3 >
Attachment:
pgp8DSV8iwERf.pgp
Description: PGP signature
Index Nav: | [Date Index] [Subject Index] [Author Index] [Thread Index] | |
---|---|---|
Message Nav: | [Date Prev] [Date Next] | [Thread Prev] [Thread Next] |