This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Speed up fnmatch.


On Wed, Oct 09, 2013 at 07:43:59PM +0200, OndÅej BÃlka wrote:
> Hi, fnmatch on multibyte string is slow for several reasons.
> Main reason is that we spend ages converting strings to wide
> strings.
> 
> Most of time this overhead is not needed as user does a simple search
> with no special characters. 
> 
> For utf8 these could be checked by strstr as there is only one way how
> encode ascii-only sequence. I could generalize this to any encoding for
> which this is true.
> 
> A strstr will quickly determine that there is no match because there is
> no match with start.
> 
> I excluded all special characters from documentation and it passes test.
>
Which were not complete as I missed FNM_CASEFOLD. 

As a benchmark I measured find on my home directory. This optimization
decreased user runtime by 30% and still has bit ineffective checks for
encoding.

$ gcc -O3 -ldl tst.c -fPIC -shared -o tst.so

# find -name "aeou*a" # cache entries

$ LD_PRELOAD=tst.so /usr/bin/time find -name "aeou*a"
Command exited with non-zero status 1
0.68user 1.11system 0:01.83elapsed 98%CPU (0avgtext+0avgdata 37900maxresident)k
0inputs+0outputs (0major+15574minor)pagefaults 0swaps

$  /usr/bin/time find -name "aeou*a"
Command exited with non-zero status 1
1.02user 1.06system 0:02.12elapsed 98%CPU (0avgtext+0avgdata 37868maxresident)k
0inputs+0outputs (0major+15561minor)pagefaults 0swaps 



#define _GNU_SOURCE
#include <dlfcn.h>
#include <fnmatch.h>
#include <locale.h>
#include <langinfo.h>
#include <string.h>
#include <stdlib.h>

int
fnmatch (pattern, string, flags)
     const char *pattern;
     const char *string;
     int flags;
{
  int multibyte_needed = 0;
  if (MB_CUR_MAX != 1)
    {
      char *encoding = nl_langinfo (CODESET);
      if (strcmp (encoding, "UTF-8"))
	multibyte_needed = 1;
    }

  /* ASCII with \+/.*?[{(@! excluded.  */
  static unsigned char normal[256] = {
 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

  if (!multibyte_needed && !(flags & FNM_CASEFOLD))
    {
      char start[16];
      size_t i;
      for (i = 0; i < 8 && normal[(unsigned char) pattern[i]]; i++)
        start[i] = pattern[i];
      start[i] = 0;
      char *string2 = strstr (string, start);
      if (!string2)
        return FNM_NOMATCH;
    }
  int (*fnm) (const char *,const char *,int) = dlsym (RTLD_NEXT, "fnmatch");
  return fnm (pattern, string, flags);
}


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]