This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH v2] add benchmark for strcoll


On Tue, Mar 10, 2015 at 07:20:15PM +0100, Leonhard Holz wrote:
> This is essentially the benchmark from last year
> (https://sourceware.org/ml/libc-alpha/2014-10/msg00345.html) plus an
> enchanced benchtests/Makefile to generate the needed locales. The attached
> input file have to be deployed to the benchtest directory.
> 
> 	* benchtests/bench-strcoll.c: New benchmark.
> 	* benchtests/Makefile: Generate locales and run benchmark.
> 	* localedata/gen-locale.sh (generate_locale): Make path to charmaps
> 	independent of current working directory.
> 	* benchtests/lorem_ipsum_ar_SA: New benchmark input file.
> 	* benchtests/lorem_ipsum_cs_CZ: Likewise.
> 	* benchtests/lorem_ipsum_da_DK: Likewise.
> 	* benchtests/lorem_ipsum_el_GR: Likewise.
> 	* benchtests/lorem_ipsum_en_GB: Likewise.
> 	* benchtests/lorem_ipsum_en_US: Likewise.
> 	* benchtests/lorem_ipsum_es_ES: Likewise.
> 	* benchtests/lorem_ipsum_fr_FR: Likewise.
> 	* benchtests/lorem_ipsum_hi_IN: Likewise.
> 	* benchtests/lorem_ipsum_hu_HU: Likewise.
> 	* benchtests/lorem_ipsum_is_IS: Likewise.
> 	* benchtests/lorem_ipsum_it_IT: Likewise.
> 	* benchtests/lorem_ipsum_iw_IL: Likewise.
> 	* benchtests/lorem_ipsum_ja_JP: Likewise.
> 	* benchtests/lorem_ipsum_pl_PL: Likewise.
> 	* benchtests/lorem_ipsum_pt_PT: Likewise.
> 	* benchtests/lorem_ipsum_ru_RU: Likewise.
> 	* benchtests/lorem_ipsum_sr_RS: Likewise.
> 	* benchtests/lorem_ipsum_sv_SE: Likewise.
> 	* benchtests/lorem_ipsum_tr_TR: Likewise.
> 	* benchtests/lorem_ipsum_vi_VN: Likewise.
> 	* benchtests/lorem_ipsum_zh_CN: Likewise.

Put the input files in a separate directory, say,
benchtests/strcoll-inputs/.

> diff --git a/benchtests/bench-strcoll.c b/benchtests/bench-strcoll.c
> index e69de29..6636e38 100644
> --- a/benchtests/bench-strcoll.c
> +++ b/benchtests/bench-strcoll.c
> @@ -0,0 +1,382 @@
> +/* Measure strcoll execution time in different locales.
> +   Copyright (C) 2015 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define TEST_MAIN
> +#define TEST_NAME "strcoll"
> +
> +#include <stdio.h>
> +#include <fcntl.h>
> +#include <assert.h>
> +#include <stdlib.h>
> +#include <locale.h>
> +#include <unistd.h>
> +#include <dirent.h>
> +#include "json-lib.h"
> +#include "bench-timing.h"
> +
> +/* many thanks to http://generator.lorem-ipsum.info/ */

Capitalize first letter and two spaces before comment end.

> +#define CHARSET "UTF-8"
> +#define INPUT_PREFIX "lorem_ipsum_"
> +
> +static const char *const locales[] = {
> +  "vi_VN",
> +  "en_US",
> +  "ar_SA",
> +  "en_US",
> +  "zh_CN",
> +  "cs_CZ",
> +  "en_GB",
> +  "da_DK",
> +  "pl_PL",
> +  "fr_FR",
> +  "pt_PT",
> +  "el_GR",
> +  "ru_RU",
> +  "iw_IL",
> +  "es_ES",
> +  "hi_IN",
> +  "sv_SE",
> +  "hu_HU",
> +  "tr_TR",
> +  "is_IS",
> +  "it_IT",
> +  "sr_RS",
> +  "ja_JP"
> +};
> +
> +#define TEXTFILE_DELIMITER " \n\r\t.,?!"
> +#define FILENAMES_DELIMITER "\n\r"
> +
> +static int
> +is_dir (const char *filename)
> +{
> +  int is_dir = 0;
> +  struct stat stats;
> +
> +  if (stat (filename, &stats) == 0)
> +    is_dir = stats.st_mode & S_IFDIR;
> +
> +  return is_dir;
> +}
> +
> +static char *
> +concat_path (const char *dirname, const char *filename)
> +{
> +  char *path;
> +  int ret = asprintf (&path, "%s/%s", dirname, filename);
> +  assert (ret > 0);
> +  return path;
> +}
> +
> +static char *
> +read_file (const char *filename)
> +{
> +  struct stat stats;
> +  char *buffer = NULL;
> +  int fd = open (filename, O_CLOEXEC);
> +
> +  if (fd >= 0)
> +    {
> +      if (fstat (fd, &stats) == 0)
> +	{
> +	  buffer = malloc (stats.st_size + 1);
> +	  if (buffer)
> +	    {
> +	      if (read (fd, buffer, stats.st_size) == stats.st_size)
> +		buffer[stats.st_size] = '\0';
> +	      else
> +		{
> +		  free (buffer);
> +		  buffer = NULL;
> +		}
> +	    }
> +	}
> +      close (fd);
> +    }
> +
> +  return buffer;
> +}
> +
> +static size_t
> +count_words (const char *text, const char *delim)
> +{
> +  size_t wordcount = 0;
> +  char *tmp = strdup (text);
> +
> +  char *token = strtok (tmp, delim);
> +  while (token != NULL)
> +    {
> +      if (*token != '\0')
> +	wordcount++;
> +      token = strtok (NULL, delim);
> +    }
> +
> +  free (tmp);
> +  return wordcount;
> +}
> +
> +typedef struct
> +{
> +  size_t size;
> +  char **words;
> +} word_list;
> +
> +static word_list *
> +new_word_list (size_t size)
> +{
> +  word_list *list = malloc (sizeof (word_list));
> +  assert (list != NULL);
> +  list->size = size;
> +  list->words = malloc (size * sizeof (char *));
> +  assert (list->words != NULL);
> +  return list;
> +}
> +
> +static word_list *
> +merge_word_list (word_list * dst, word_list * src)
> +{
> +  size_t i, n = dst->size;
> +  dst->size += src->size;
> +
> +  char **new_words = malloc (dst->size * sizeof (char *));
> +  assert (new_words != NULL);
> +  memcpy (new_words, dst->words, n * sizeof (char *));
> +  free (dst->words);
> +  dst->words = new_words;
> +
> +  for (i = 0; i < src->size; i++)
> +    dst->words[i + n] = src->words[i];
> +
> +  free (src->words);
> +  free (src);
> +
> +  return dst;
> +}
> +
> +static word_list *
> +file_word_list (const char *dirname)
> +{
> +  DIR *dir;
> +  struct dirent *ent;
> +  word_list *list = NULL;
> +
> +  if ((dir = opendir (dirname)) != NULL)
> +    {
> +      size_t ent_cnt = 0, i = 0;
> +      word_list *sublist = new_word_list (0);
> +
> +      while ((ent = readdir (dir)) != NULL)
> +	if (ent->d_name[0] != '.')
> +	  {
> +	    char *subdirname = concat_path (dirname, ent->d_name);
> +	    if (is_dir (subdirname))
> +		merge_word_list (sublist, file_word_list (subdirname));
> +	    free (subdirname);
> +
> +	    ent_cnt++;
> +	  }
> +
> +      rewinddir (dir);
> +      list = new_word_list (ent_cnt);
> +      while ((ent = readdir (dir)) != NULL)
> +	if (ent->d_name[0] != '.')
> +	  {
> +	    list->words[i] = strdup (ent->d_name);
> +	    i++;
> +	  }
> +
> +      merge_word_list (list, sublist);
> +      closedir (dir);
> +    }
> +
> +  return list;
> +}
> +
> +static word_list *
> +str_word_list (const char *str, const char *delim)
> +{
> +  size_t n = 0;
> +  word_list *list = new_word_list (count_words (str, delim));
> +
> +  char *toks = strdup (str);
> +  char *word = strtok (toks, delim);
> +  while (word != NULL && n < list->size)
> +    {
> +      if (*word != '\0')
> +	list->words[n++] = strdup (word);
> +      word = strtok (NULL, delim);
> +    }
> +
> +  free (toks);
> +  return list;
> +}
> +
> +static word_list *
> +copy_word_list (const word_list *list)
> +{
> +  size_t i;
> +  word_list *copy = new_word_list (list->size);
> +
> +  for (i = 0; i < list->size; i++)
> +    copy->words[i] = strdup (list->words[i]);
> +
> +  return copy;
> +}
> +
> +static void
> +free_word_list (word_list *list)
> +{
> +  size_t i;
> +  for (i = 0; i < list->size; i++)
> +    free (list->words[i]);
> +
> +  free (list->words);
> +  free (list);
> +}
> +
> +static int
> +compare_words (const void *a, const void *b)
> +{
> +  const char *s1 = *(char **) a;
> +  const char *s2 = *(char **) b;
> +  return strcoll (s1, s2);
> +}
> +
> +#undef INNER_LOOP_ITERS
> +#define INNER_LOOP_ITERS 16
> +
> +static void
> +bench_list (json_ctx_t *json_ctx, word_list *list)
> +{
> +  size_t i;
> +  timing_t start, stop, cur;
> +
> +  word_list **tests = malloc (INNER_LOOP_ITERS * sizeof (word_list *));
> +  assert (tests != NULL);
> +  for (i = 0; i < INNER_LOOP_ITERS; i++)
> +    tests[i] = copy_word_list (list);
> +
> +  TIMING_NOW (start);
> +  for (i = 0; i < INNER_LOOP_ITERS; i++)
> +    qsort (tests[i]->words, tests[i]->size, sizeof (char *), compare_words);
> +  TIMING_NOW (stop);
> +
> +  TIMING_DIFF (cur, start, stop);
> +  setlocale (LC_ALL, "en_US.UTF-8");
> +  json_attr_double (json_ctx, "duration", cur);
> +  json_attr_double (json_ctx, "iterations", i);
> +  json_attr_double (json_ctx, "mean", cur / i);
> +
> +  for (i = 0; i < INNER_LOOP_ITERS; i++)
> +    free_word_list (tests[i]);
> +  free (tests);
> +}
> +
> +#define OK 0
> +#define ERROR_LOCALE 1
> +#define ERROR_IO 2

Use an enum instead.

> +
> +static int
> +bench_file (json_ctx_t *json_ctx, const char *filename, const char *delim,
> +  const char *locale)

Adjust this as recommended below.

> +{
> +  if (setlocale (LC_ALL, locale) == NULL)
> +    return ERROR_LOCALE;
> +
> +  char *text = read_file (filename);
> +  if (text == NULL)
> +    return ERROR_IO;
> +
> +  word_list *list = str_word_list (text, delim);
> +
> +  json_attr_object_begin (json_ctx, locale);
> +  bench_list (json_ctx, list);
> +  json_attr_object_end (json_ctx);
> +
> +  free_word_list (list);
> +  free (text);
> +  return OK;
> +}
> +
> +static int
> +bench_file_list (json_ctx_t *json_ctx, const char *dirname, const char *locale)

If you just create a file with this information once, you can collapse
this special case into the other cases and get rid of this function.

> +{
> +  if (setlocale (LC_ALL, locale) == NULL)
> +    return ERROR_LOCALE;
> +
> +  word_list *list = file_word_list (dirname);
> +  if (list == NULL)
> +    return ERROR_IO;
> +
> +  json_attr_object_begin (json_ctx, "");
> +  json_attr_string (json_ctx, "locale", locale);
> +  bench_list (json_ctx, list);
> +  json_attr_object_end (json_ctx);
> +
> +  free_word_list (list);
> +  return OK;
> +}
> +
> +static int
> +do_bench (void)
> +{
> +  timing_t res;
> +  TIMING_INIT (res);
> +
> +  json_ctx_t *json_ctx = malloc (sizeof (json_ctx_t));
> +  assert (json_ctx != NULL);
> +  json_init (json_ctx, 2, stdout);
> +  json_attr_object_begin (json_ctx, "strcoll");
> +
> +  int result = bench_file_list (json_ctx, "..", "C");
> +  if (result != OK) {
> +    error:
> +    if (result == ERROR_LOCALE)
> +      printf ("Failed to set locale en_US.UTF-8, aborting!\n");

s/en_US.UTF-8/C/

In fact, set a variable and use it to pass to bench_file_list and the
printf so as to avoid such an error.

> +    else if (result == ERROR_IO)
> +      printf ("Failed to read libc directory, aborting!\n");
> +    else
> +      printf ("Error, aborting!\n");

Print the value of the result.

> +    return result;
> +  }
> +  result = bench_file_list (json_ctx, "..", "en_US.UTF-8");
> +  if (result != OK)
> +    goto error;

Instead, split the error handling code into a function, set RESULT and
goto a label 'out' near the end of the function that just returns
RESULT.

> +
> +  size_t i;
> +  char filename[40], locale[15];
> +  for (i = 0; i < (sizeof (locales) / sizeof (locales[0])); i++)
> +    {
> +      sprintf (locale, "%s." CHARSET, locales[i]);
> +      sprintf (filename, INPUT_PREFIX "%s", locales[i]);
> +      result = bench_file (json_ctx, filename, TEXTFILE_DELIMITER, locale);

Just make bench_file accept the locale name and deduce the locale and
filename in the function.  You don't really need to pass the delimiter
either since you don't call bench_file with anything other than
TEXTFILE_DELIMITER.  That could simplify your code to:

	result = bench_file (json_ctx, locales[i]);
	if (result != OK)
	  goto out;

> +      if (result != OK)
> +	goto error;
> +    }
> +
> +  json_attr_object_end (json_ctx);
> +  free (json_ctx);

This is where the out: label goes and you return RESULT instead of OK.

> +  return OK;
> +}
> +
> +#define TEST_FUNCTION do_bench ()
> +
> +/* On slower platforms this test needs more than the default 2 seconds.  */
> +#define TIMEOUT 10
> +
> +#include "../test-skeleton.c"

Don't use the test-skeleton.  Make do_bench the main function.  The
other string benchmarks do this because they were directly ported from
the old test cases.  In fact, they need to be evaluated and improved.

> diff --git a/benchtests/Makefile b/benchtests/Makefile
> index 08603a2..4aaff73 100644
> --- a/benchtests/Makefile
> +++ b/benchtests/Makefile
> @@ -34,9 +34,22 @@ string-bench := bcopy bzero memccpy memchr memcmp memcpy
> memmem memmove \
>  		mempcpy memset rawmemchr stpcpy stpncpy strcasecmp strcasestr \
>  		strcat strchr strchrnul strcmp strcpy strcspn strlen \
>  		strncasecmp strncat strncmp strncpy strnlen strpbrk strrchr \
> -		strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok
> +		strspn strstr strcpy_chk stpcpy_chk memrchr strsep strtok \
> +		strcoll
>  string-bench-all := $(string-bench)
> 
> +# We have to generate locales
> +LOCALES := en_US.UTF-8 tr_TR.UTF-8 cs_CZ.UTF-8 fa_IR.UTF-8 fr_FR.UTF-8 \
> +	   ja_JP.UTF-8 si_LK.UTF-8 en_GB.UTF-8 vi_VN.UTF-8 ar_SA.UTF-8 \
> +	   da_DK.UTF-8 pl_PL.UTF-8 pt_PT.UTF-8 el_GR.UTF-8 ru_RU.UTF-8 \
> +	   iw_IL.UTF-8 is_IS.UTF-8 es_ES.UTF-8 hi_IN.UTF-8 sv_SE.UTF-8 \
> +	   hu_HU.UTF-8 it_IT.UTF-8 sr_RS.UTF-8 zh_CN.UTF-8
> +LOCALE_SRCS := $(shell echo "$(LOCALES)"|sed 's/\([^ .]*\)[^ ]*/\1/g')
> +CHARMAPS := $(shell echo "$(LOCALES)" | \
> +		    sed -e 's/[^ .]*[.]\([^ ]*\)/\1/g' -e s/SJIS/SHIFT_JIS/g)
> +CTYPE_FILES = $(addsuffix /LC_CTYPE,$(LOCALES))
> +gen-locales := $(addprefix $(common-objpfx)localedata/,$(CTYPE_FILES))
> +
>  stdlib-bench := strtod
> 
>  benchset := $(string-bench-all) $(stdlib-bench)
> @@ -65,6 +78,19 @@ binaries-bench := $(addprefix $(objpfx)bench-,$(bench))
>  binaries-benchset := $(addprefix $(objpfx)bench-,$(benchset))
>  binaries-bench-malloc := $(addprefix $(objpfx)bench-,$(bench-malloc))
> 
> +# Dependency for the locale files.  We actually make it depend only on
> +# one of the files.
> +$(addprefix $(common-objpfx)localedata/,$(CTYPE_FILES)): %: \
> +  ../localedata/gen-locale.sh \
> +  $(common-objpfx)locale/localedef \
> +  ../localedata/Makefile \
> +  $(addprefix ../localedata/charmaps/,$(CHARMAPS)) \
> +  $(addprefix ../localedata/locales/,$(LOCALE_SRCS))
> +	@$(SHELL) ../localedata/gen-locale.sh $(common-objpfx) \
> +		  '$(built-program-cmd-before-env)' '$(run-program-env)' \
> +		  '$(built-program-cmd-after-env)' $@; \
> +	$(evaluate-test)
> +

AFAICT all of this is copied from localedata/Makefile.  Instead, make
a common snippet that you can include in both these places.  This will
reduce the pain of remembering to update two locations when any of
this code needs to be fixed.

>  # The default duration: 10 seconds.
>  ifndef BENCH_DURATION
>  BENCH_DURATION := 10
> @@ -107,7 +133,7 @@ bench-clean:
>  	rm -f $(binaries-bench-malloc) $(addsuffix .o,$(binaries-bench-malloc))
>  	rm -f $(timing-type) $(addsuffix .o,$(timing-type))
> 
> -bench: $(timing-type) bench-set bench-func bench-malloc
> +bench: $(timing-type) $(gen-locales) bench-set bench-func bench-malloc
> 
>  bench-set: $(binaries-benchset)
>  	for run in $^; do \
> 
> diff --git a/localedata/gen-locale.sh b/localedata/gen-locale.sh
> index 4156f9e..2a48a34 100644
> --- a/localedata/gen-locale.sh
> +++ b/localedata/gen-locale.sh
> @@ -30,7 +30,8 @@ generate_locale ()
>     charmap=$1
>     input=$2
>     out=$3
> -  if ${localedef_before_env} ${run_program_env} I18NPATH=. \
> +  if ${localedef_before_env} ${run_program_env} \
> +     I18NPATH=${common_objpfx}../localedata \

Shouldn't this be I18NPATH=${common_objpfx}localedata?

>        ${localedef_after_env} --quiet -c -f $charmap -i $input \
>   			    ${common_objpfx}localedata/$out
>     then
> 


Attachment: pgpVSVKadt5T3.pgp
Description: PGP signature


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]