This is the mail archive of the
libc-alpha@sourceware.org
mailing list for the glibc project.
Re: calloc() implementation question
- From: Olivier Langlois <olivier at trillion01 dot com>
- To: Ondřej Bílka <neleai at seznam dot cz>
- Cc: Carlos O'Donell <carlos at redhat dot com>, libc-alpha at sourceware dot org
- Date: Thu, 12 Dec 2013 22:44:48 -0500
- Subject: Re: calloc() implementation question
- Authentication-results: sourceware.org; auth=none
- References: <1386830712 dot 753 dot 25 dot camel at Wailaba2> <52A9F44F dot 8090708 at redhat dot com> <20131212184153 dot GA7378 at domone dot podge>
Ondřej
On Thu, 2013-12-12 at 19:41 +0100, Ondřej Bílka wrote:
> To see how big is speedup/loss write a benchmark that compares variant
> with memset and one with inline expansion.
>
> void *
> calloc2 (size_t n)
> {
> return memset (malloc (n), 0, n);
> }
>
> void *
> calloc3 (size_t n)
> {
> void *x = malloc (n);
> if (n < 9 * 16)
> ...
> }
That was an amazing suggestion, as it gave me some new
insight into glibc malloc/calloc performance.
First, the manually unrolled memset turned out to be faster than the
unconditional memset() call.
The other unexpected result was that malloc + memset() (calloc2()) is
always faster than the real calloc() for allocation size <= ~ 64KB.
That was unexpected, as I thought that because calloc may skip the memset
step, it would be faster and at worst be on par with malloc + memset().
I did my tests on Linux kernel 3.12.4 on a
Intel(R) Atom(TM) CPU N455 @ 1.66GHz
#include <stdlib.h>
#include <string.h>
/*
 * Baseline variant: hands both arguments straight to the C library's
 * calloc() and returns whatever it returns.
 */
void *calloc1(size_t nmemb, size_t size)
{
    void *block = calloc(nmemb, size);

    return block;
}
/*
 * calloc() variant built from malloc() followed by an unconditional
 * memset().
 *
 * nmemb: number of elements.
 * size:  size of each element in bytes.
 *
 * Returns a pointer to nmemb*size zeroed bytes, or NULL when the product
 * does not fit in size_t or malloc() fails.
 */
void *calloc2(size_t nmemb, size_t size)
{
    /* Reject products that would wrap around size_t. */
    if (size != 0 && nmemb > (size_t)-1 / size)
        return NULL;

    size_t total = nmemb * size;
    void *block = malloc(total);

    /* memset() on a null pointer is undefined behaviour. */
    if (block == NULL)
        return NULL;

    return memset(block, 0, total);
}
/*
 * calloc() variant that clears small, word-sized requests with unrolled
 * stores and falls back to memset() for everything else.
 *
 * nmemb: number of elements.
 * size:  size of each element in bytes.
 *
 * Returns a pointer to nmemb*size zeroed bytes, or NULL when the product
 * does not fit in size_t or malloc() fails.
 *
 * The unrolled path is taken only when the request is a whole number of
 * size_t words and at most 9 of them; it writes exactly that many words,
 * so it never stores past the requested size and never leaves a
 * requested word uncleared.
 */
void *calloc3(size_t nmemb, size_t size)
{
    /* Reject products that would wrap around size_t. */
    if (size != 0 && nmemb > (size_t)-1 / size)
        return NULL;

    size_t clearsize = nmemb * size;
    size_t *d = malloc(clearsize);

    if (d == NULL)
        return NULL;

    size_t nclears = clearsize / sizeof(size_t);

    /* Large requests, or ones with trailing bytes that do not fill a
       whole word, are cleared by memset() so that no byte is missed. */
    if (nclears > 9 || clearsize % sizeof(size_t) != 0) {
        memset(d, 0, clearsize);
        return d;
    }

    /* Enter at the highest requested word and fall through to word 0. */
    switch (nclears) {
    case 9: d[8] = 0; /* fallthrough */
    case 8: d[7] = 0; /* fallthrough */
    case 7: d[6] = 0; /* fallthrough */
    case 6: d[5] = 0; /* fallthrough */
    case 5: d[4] = 0; /* fallthrough */
    case 4: d[3] = 0; /* fallthrough */
    case 3: d[2] = 0; /* fallthrough */
    case 2: d[1] = 0; /* fallthrough */
    case 1: d[0] = 0; /* fallthrough */
    default:
        break;
    }
    return d;
}
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
#define CALLOC_FUNC calloc1
#define PAT_FUNC pat1_getsize
#define NUMITER 2000000
#define PAT1_SZ 11
void *calloc2(size_t nmemb, size_t size);
void *calloc3(size_t nmemb, size_t size);
/* Size pattern 1: every iteration requests PAT1_SZ elements. */
static size_t pat1_getsize(size_t iter)
{
    (void)iter; /* the pattern does not depend on the iteration number */
    return PAT1_SZ;
}
/* Size pattern 2: every iteration requests 9 elements. */
static size_t pat2_getsize(size_t iter)
{
    (void)iter; /* the pattern does not depend on the iteration number */
    return 9;
}
/*
 * Size pattern 3: cycles through a fixed table of element counts,
 * selecting the entry at (iter modulo table length).
 */
static size_t pat3_getsize(size_t iter)
{
    static const size_t cycle[] = { 3, 5, 7, 9, 11, 9, 7, 5 };
    const size_t count = sizeof cycle / sizeof cycle[0];
    size_t slot = iter % count;

    return cycle[slot];
}
/*
 * Returns a - b in nanoseconds.
 *
 * Neither argument is modified. The difference is computed in long long
 * and then converted to the long return type, so the intermediate
 * arithmetic does not overflow where long is 32 bits wide; the result
 * itself must still fit in a long.
 */
static long diff_timespec_ns( struct timespec *a, struct timespec *b )
{
    long long sec_delta = (long long)a->tv_sec - (long long)b->tv_sec;
    long long nsec_delta = (long long)a->tv_nsec - (long long)b->tv_nsec;

    /* A negative nsec_delta is absorbed by the sum; no borrow needed. */
    return (long)(sec_delta * 1000000000LL + nsec_delta);
}
int main(int argc, char *argv[])
{
struct timespec start,end;
size_t i;
clock_gettime(CLOCK_REALTIME,&start);
for( i = 0; i < NUMITER; ++i )
CALLOC_FUNC(PAT_FUNC(i),sizeof(size_t));
clock_gettime(CLOCK_REALTIME,&end);
printf("executed in %ld ns\n",diff_timespec_ns(&end,&start));
return 0;
}