This is the mail archive of the libc-hacker@sources.redhat.com mailing list for the glibc project.

Note that libc-hacker is a closed list. You may look at the archives of this list, but subscription and posting are not open.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Speed up REG_NOSUB or nested subexps


Hi!

The following patch speeds up REG_NOSUB searching, adds RE_NO_SUB
for the re_compile_pattern interface to get similar behaviour and
also speeds up cases when subexps are immediately nested.

It removes all OP_OPEN_SUBEXP/OP_CLOSE_SUBEXP
nodes that are not needed from the str_tree after parsing, but
before analyzing.
With REG_NOSUB, all these nodes are unneeded with the exception
of those used for backreferences.
Without REG_NOSUB, unneeded are just those that are immediately surrounded
by another () pair (and aren't used by backreferences).

Measurements, tst-regex2 with -DWHOLE_FILE_TIMING, is below.
With REG_NOSUB or RE_NO_SUB (test1 and test3), all 3 patterns
are searched with the same speed.  Without that, 3rd pattern
searching is as fast as 2nd pattern search.

Vanilla CVS:
test 0 pattern 0: 0.840423424s
test 0 pattern 1: 1.212836508s
test 0 pattern 2: 4.888927188s
test 1 pattern 0: 0.812340634s
test 1 pattern 1: 1.242682267s
test 1 pattern 2: 4.872612232s
test 2 pattern 0: 0.763208542s
test 2 pattern 1: 1.142578726s
test 2 pattern 2: 4.639178880s
test 3 pattern 0: 0.764673397s
test 3 pattern 1: 1.139890453s
test 3 pattern 2: 4.635312775s

With patch:
test 0 pattern 0: 0.820845810s
test 0 pattern 1: 1.261617889s
test 0 pattern 2: 1.263218611s
test 1 pattern 0: 0.833536480s
test 1 pattern 1: 0.821412026s
test 1 pattern 2: 0.822183978s
test 2 pattern 0: 0.778098488s
test 2 pattern 1: 1.191543473s
test 2 pattern 2: 1.194518191s
test 3 pattern 0: 0.777669430s
test 3 pattern 1: 0.790473658s
test 3 pattern 2: 0.780549884s

2004-11-18  Jakub Jelinek  <jakub@redhat.com>

	[BZ #544]
	* posix/regex.h (RE_NO_SUB): New define.
	* posix/regex_internal.h (OP_DELETED_SUBEXP): New.
	(re_dfa_t): Add subexp_map.
	* posix/regcomp.c (struct subexp_optimize): New type.
	(optimize_subexps): New routine.
	(re_compile_internal): Call it.
	(re_compile_pattern): Set preg->no_sub to 1 if RE_NO_SUB.
	(free_dfa_content): Free subexp_map.
	(calc_inveclosure, calc_eclosure): Skip OP_DELETED_SUBEXP
	nodes.
	* posix/regexec.c (re_search_internal): If subexp_map
	is not NULL, duplicate registers as needed.
	* posix/Makefile: Add rules to build and run tst-regex2.
	* posix/tst-regex2.c: New test.
	* posix/rxspencer/tests: Fix last two tests (\0 -> \1).
	Add some new tests for nested subexpressions.

--- libc/posix/Makefile.jj	2004-11-15 13:32:43.000000000 +0100
+++ libc/posix/Makefile	2004-11-18 18:01:02.077432232 +0100
@@ -80,7 +80,7 @@ tests		:= tstgetopt testfnm runtests run
 		   bug-regex13 bug-regex14 bug-regex15 bug-regex16 \
 		   bug-regex17 bug-regex18 bug-regex19 bug-regex20 \
 		   bug-regex21 bug-regex22 bug-regex23 bug-regex24 \
-		   tst-nice tst-nanosleep \
+		   tst-nice tst-nanosleep tst-regex2 \
 		   transbug tst-rxspencer tst-pcre tst-boost \
 		   bug-ga1 tst-vfork1 tst-vfork2 tst-waitid \
 		   tst-getaddrinfo2 bug-glob1 bug-glob2
@@ -160,6 +160,7 @@ tst-fnmatch-ENV = LOCPATH=$(common-objpf
 tst-regexloc-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex1-ENV = LOCPATH=$(common-objpfx)localedata
 tst-regex-ENV = LOCPATH=$(common-objpfx)localedata
+tst-regex2-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex5-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex6-ENV = LOCPATH=$(common-objpfx)localedata
 bug-regex17-ENV = LOCPATH=$(common-objpfx)localedata
@@ -244,8 +245,10 @@ $(objpfx)tst-getconf.out: tst-getconf.sh
 
 ifeq (yes,$(build-shared))
 $(objpfx)tst-regex: $(common-objpfx)rt/librt.so
+$(objpfx)tst-regex2: $(common-objpfx)rt/librt.so
 else
 $(objpfx)tst-regex: $(common-objpfx)rt/librt.a
+$(objpfx)tst-regex2: $(common-objpfx)rt/librt.a
 endif
 
 $(objpfx)bug-ga2-mem: $(objpfx)bug-ga2.out
--- libc/posix/regex_internal.h.jj	2004-11-12 13:57:49.000000000 +0100
+++ libc/posix/regex_internal.h	2004-11-18 13:18:13.000000000 +0100
@@ -189,6 +189,7 @@ typedef enum
   OP_DUP_PLUS = EPSILON_BIT | 4,
   OP_DUP_QUESTION = EPSILON_BIT | 5,
   ANCHOR = EPSILON_BIT | 6,
+  OP_DELETED_SUBEXP = EPSILON_BIT | 7,
 
   /* Tree type, these are used only by tree. */
   CONCAT = 16,
@@ -644,6 +645,7 @@ struct re_dfa_t
   int mb_cur_max;
   bitset word_char;
   reg_syntax_t syntax;
+  int *subexp_map;
 #ifdef DEBUG
   char* re_str;
 #endif
--- libc/posix/rxspencer/tests.jj	2004-11-10 10:36:48.000000000 +0100
+++ libc/posix/rxspencer/tests	2004-11-18 16:45:27.902282924 +0100
@@ -508,5 +508,21 @@ a*a*a*a*a*a*a*	&	aaaaaa	aaaaaa
 (\b){0}	-	x	@x	-
 \(\b\)\{0,0\}	b	abc	@abc	-
 a(\b){0}c	-	ac	ac	-
-a(.*)b(\0){0}c	-	abc	abc	@bc,-
-a(.*)b(\0){0}c	-	axbc	axbc	x,-
+a(.*)b(\1){0}c	-	abc	abc	@bc,-
+a(.*)b(\1){0}c	-	axbc	axbc	x,-
+
+a\(\(b*\)\)c\1d	b	abbcbbd	abbcbbd	bb,bb
+a\(\([bc]\)\)\2d	b	abcdabbd	abbd	b,b
+a\(\(\(\([bc]\)\)\3\)\)*d	b	abbccd	abbccd	cc,cc,c,c
+a(b)(c)d	-	abcd	abcd	b,c
+a(((b)))c	-	abc	abc	b,b,b
+a(((b|(((c))))))d	-	abd	abd	b,b,b,-,-,-
+a(((b*|c|e)))d	-	abbd	abbd	bb,bb,bb
+a((b|c)){0,0}d	-	ad	ad	-,-
+a((b|c)){0,1}d	-	abd	abd	b,b
+a((b|c)){0,2}d	-	abcd	abcd	c,c
+a((b+|((c)*)))+d	-	abd	abd	b,b,-,-
+a((b+|((c)*)))+d	-	abcd	abcd	c,c,c,c
+(((\b))){0}	-	x	@x	-,-,-
+a(((.*)))b((\2)){0}c	-	abc	abc	@bc,@bc,@bc,-,-
+a(((.*)))b((\1)){0}c	-	axbc	axbc	x,x,x,-,-
--- libc/posix/regcomp.c.jj	2004-11-15 13:39:35.000000000 +0100
+++ libc/posix/regcomp.c	2004-11-18 16:20:19.500142900 +0100
@@ -33,6 +33,14 @@ static reg_errcode_t create_initial_stat
 #ifdef RE_ENABLE_I18N
 static void optimize_utf8 (re_dfa_t *dfa);
 #endif
+struct subexp_optimize
+{
+  re_dfa_t *dfa;
+  re_token_t *nodes;
+  int no_sub, re_nsub;
+};
+static bin_tree_t *optimize_subexps (struct subexp_optimize *so,
+                                     bin_tree_t *node, int sidx, int depth);
 static reg_errcode_t analyze (re_dfa_t *dfa);
 static reg_errcode_t analyze_tree (re_dfa_t *dfa, bin_tree_t *node);
 static void calc_first (re_dfa_t *dfa, bin_tree_t *node);
@@ -238,8 +246,8 @@ re_compile_pattern (pattern, length, buf
 
   /* And GNU code determines whether or not to get register information
      by passing null for the REGS argument to re_match, etc., not by
-     setting no_sub.  */
-  bufp->no_sub = 0;
+     setting no_sub, unless RE_NO_SUB is set.  */
+  bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
 
   /* Match anchors at newline.  */
   bufp->newline_anchor = 1;
@@ -633,6 +641,7 @@ free_dfa_content (re_dfa_t *dfa)
   if (dfa->sb_char != utf8_sb_map)
     re_free (dfa->sb_char);
 #endif
+  re_free (dfa->subexp_map);
 #ifdef DEBUG
   re_free (dfa->re_str);
 #endif
@@ -810,6 +819,17 @@ re_compile_internal (preg, pattern, leng
     optimize_utf8 (dfa);
 #endif
 
+  if (preg->re_nsub > 0)
+    {
+      struct subexp_optimize so;
+
+      so.dfa = dfa;
+      so.nodes = dfa->nodes;
+      so.no_sub = preg->no_sub;
+      so.re_nsub = preg->re_nsub;
+      dfa->str_tree = optimize_subexps (&so, dfa->str_tree, -1, 0);
+    }
+
   /* Analyze the tree and collect information which is necessary to
      create the dfa.  */
   err = analyze (dfa);
@@ -1121,6 +1141,82 @@ optimize_utf8 (dfa)
 }
 #endif
 
+static bin_tree_t *
+optimize_subexps (so, node, sidx, depth)
+     struct subexp_optimize *so;
+     bin_tree_t *node;
+     int sidx, depth;
+{
+  int idx, new_depth, new_sidx;
+  bin_tree_t *ret;
+  if (node == NULL)
+    return NULL;
+
+  new_depth = 0;
+  new_sidx = sidx;
+  if ((depth & 1) && node->type == CONCAT
+      && node->right && node->right->type == 0
+      && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
+    {
+      new_depth = depth + 1;
+      if (new_depth == 2
+          || (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
+              && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx)))
+        new_sidx = so->nodes[idx].opr.idx;
+    }
+  node->left = optimize_subexps (so, node->left, new_sidx, new_depth);
+  new_depth = (depth & 1) == 0 && node->type == CONCAT
+              && node->left && node->left->type == 0
+              && so->nodes[node->left->node_idx].type == OP_OPEN_SUBEXP
+              ? depth + 1 : 0;
+  node->right = optimize_subexps (so, node->right, sidx, new_depth);
+                                     
+  if (node->type != CONCAT)
+    return node;
+  if ((depth & 1) == 0
+      && node->left
+      && node->left->type == 0
+      && so->nodes[idx = node->left->node_idx].type == OP_OPEN_SUBEXP)
+    ret = node->right;
+  else if ((depth & 1)
+           && node->right
+           && node->right->type == 0
+           && so->nodes[idx = node->right->node_idx].type == OP_CLOSE_SUBEXP)
+    ret = node->left;
+  else
+    return node;
+
+  if (so->nodes[idx].opr.idx < 8 * sizeof (so->dfa->used_bkref_map)
+      && so->dfa->used_bkref_map & (1 << so->nodes[idx].opr.idx))
+    return node;
+
+  if (!so->no_sub)
+    {
+      int i;
+
+      if (depth < 2)
+        return node;
+
+      if (so->dfa->subexp_map == NULL)
+        {
+          so->dfa->subexp_map = re_malloc (int, so->re_nsub);
+          if (so->dfa->subexp_map == NULL)
+            return node;
+
+          for (i = 0; i < so->re_nsub; i++)
+            so->dfa->subexp_map[i] = i;
+        }
+
+      i = so->nodes[idx].opr.idx;
+      assert (sidx < i);
+      so->dfa->subexp_map[i] = sidx;
+    }
+
+  so->nodes[idx].type = OP_DELETED_SUBEXP;
+  ret->parent = node->parent;
+  return ret;
+}
+
 /* Analyze the structure tree, and calculate "first", "next", "edest",
    "eclosure", and "inveclosure".  */
 
@@ -1525,6 +1621,8 @@ calc_inveclosure (dfa)
   int src, idx, dest;
   for (src = 0; src < dfa->nodes_len; ++src)
     {
+      if (dfa->nodes[src].type == OP_DELETED_SUBEXP)
+        continue;
       for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
 	{
 	  dest = dfa->eclosures[src].elems[idx];
@@ -1560,6 +1658,9 @@ calc_eclosure (dfa)
 #ifdef DEBUG
       assert (dfa->eclosures[node_idx].nelem != -1);
 #endif
+      if (dfa->nodes[node_idx].type == OP_DELETED_SUBEXP)
+        continue;
+
       /* If we have already calculated, skip it.  */
       if (dfa->eclosures[node_idx].nelem != 0)
 	continue;
--- libc/posix/regexec.c.jj	2004-11-12 13:57:49.000000000 +0100
+++ libc/posix/regexec.c	2004-11-18 15:48:40.891375714 +0100
@@ -882,6 +882,18 @@ re_search_internal (preg, string, length
 	    pmatch[reg_idx].rm_so += match_first;
 	    pmatch[reg_idx].rm_eo += match_first;
 	  }
+
+      if (dfa->subexp_map)
+        for (reg_idx = 0;
+             reg_idx + 1 < nmatch && reg_idx < preg->re_nsub;
+             reg_idx++)
+          if (dfa->subexp_map[reg_idx] != reg_idx)
+            {
+              pmatch[reg_idx + 1].rm_so
+                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_so;
+              pmatch[reg_idx + 1].rm_eo
+                = pmatch[dfa->subexp_map[reg_idx] + 1].rm_eo;
+            }
     }
 
  free_return:
--- libc/posix/tst-regex2.c.jj	2004-11-18 17:57:44.697602014 +0100
+++ libc/posix/tst-regex2.c	2004-11-18 18:00:13.697052790 +0100
@@ -0,0 +1,244 @@
+#include <fcntl.h>
+#include <locale.h>
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+#ifdef _POSIX_CPUTIME
+static clockid_t cl;
+static int use_clock;
+#endif
+
+static int
+do_test (void)
+{
+#ifdef _POSIX_CPUTIME
+  /* See whether we can use the CPU clock.  */
+  use_clock = clock_getcpuclockid (0, &cl) == 0;
+#endif
+
+  static const char *pat[] = {
+    ".?.?.?.?.?.?.?Log\\.13",
+    "(.?)(.?)(.?)(.?)(.?)(.?)(.?)Log\\.13",
+    "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
+    "((((((((((.?))))))))))((((((((((.?))))))))))((((((((((.?))))))))))"
+    "((((((((((.?))))))))))Log\\.13" };
+
+  int fd = open ("../ChangeLog.14", O_RDONLY);
+  if (fd < 0)
+    {
+      printf ("Couldn't open ChangeLog.14: %m\n");
+      return 1;
+    }
+
+  struct stat64 st;
+  if (fstat64 (fd, &st) < 0)
+    {
+      printf ("Couldn't fstat ChangeLog.14: %m\n");
+      return 1;
+    }
+
+  char *buf = malloc (st.st_size + 1);
+  if (buf == NULL)
+    {
+      printf ("Couldn't allocate buffer: %m\n");
+      return 1;
+    }
+
+  if (read (fd, buf, st.st_size) != (ssize_t) st.st_size)
+    {
+      puts ("Couldn't read ChangeLog.14");
+      return 1;
+    }
+
+  close (fd);
+  buf[st.st_size] = '\0';
+
+  setlocale (LC_ALL, "de_DE.UTF-8");
+
+  char *string = buf;
+  size_t len = st.st_size;
+
+#ifndef WHOLE_FILE_TIMING
+  /* Don't search the whole file normally, it takes too long.  */
+  if (len > 500000 + 64)
+    {
+      string += 500000;
+      len -= 500000;
+    }
+#endif
+
+  for (int testno = 0; testno < 4; ++testno)
+    for (int i = 0; i < sizeof (pat) / sizeof (pat[0]); ++i)
+      {
+	printf ("test %d pattern %d", testno, i);
+
+	regex_t rbuf;
+	struct re_pattern_buffer rpbuf;
+	int err;
+	if (testno < 2)
+	  {
+	    err = regcomp (&rbuf, pat[i],
+			   REG_EXTENDED | (testno ? REG_NOSUB : 0));
+	    if (err != 0)
+	      {
+		putchar ('\n');
+		char errstr[300];
+		regerror (err, &rbuf, errstr, sizeof (errstr));
+		puts (errstr);
+		return err;
+	      }
+	  }
+	else
+	  {
+	    re_set_syntax (RE_SYNTAX_POSIX_EGREP
+			   | (testno == 3 ? RE_NO_SUB : 0));
+
+	    memset (&rpbuf, 0, sizeof (rpbuf));
+	    const char *s = re_compile_pattern (pat[i], strlen (pat[i]),
+						&rpbuf);
+	    if (s != NULL)
+	      {
+		printf ("\n%s\n", s);
+		return 1;
+	      }
+
+	    /* Just so that this can be tested with earlier glibc as well.  */
+	    if (testno == 3)
+	      rpbuf.no_sub = 1;
+	  }
+
+#ifdef _POSIX_CPUTIME
+      struct timespec start, stop;
+      if (use_clock)
+	use_clock = clock_gettime (cl, &start) == 0;
+#endif
+
+      if (testno < 2)
+	{
+	  regmatch_t pmatch[71];
+	  err = regexec (&rbuf, string, 71, pmatch, 0);
+	  if (err == REG_NOMATCH)
+	    {
+	      puts ("\nregexec failed");
+	      return 1;
+	    }
+
+	  if (testno == 0)
+	    {
+	      if (pmatch[0].rm_eo != pmatch[0].rm_so + 13
+		  || pmatch[0].rm_eo > len
+		  || pmatch[0].rm_so < len - 100
+		  || strncmp (string + pmatch[0].rm_so,
+			      " ChangeLog.13 for earlier changes",
+			      sizeof " ChangeLog.13 for earlier changes" - 1)
+		     != 0)
+		{
+		  puts ("\nregexec without REG_NOSUB did not find the correct match");
+		  return 1;
+		}
+
+	      if (i > 0)
+		for (int j = 0, l = 1; j < 7; ++j)
+		  for (int k = 0; k < (i == 1 ? 1 : 10); ++k, ++l)
+		    if (pmatch[l].rm_so != pmatch[0].rm_so + j
+			|| pmatch[l].rm_eo != pmatch[l].rm_so + 1)
+		      {
+			printf ("\npmatch[%d] incorrect\n", l);
+			return 1;
+		      }
+	    }
+	}
+      else
+	{
+	  struct re_registers regs;
+
+	  memset (&regs, 0, sizeof (regs));
+	  int match = re_search (&rpbuf, string, len, 0, len,
+				 &regs);
+	  if (match < 0)
+	    {
+	      puts ("\nre_search failed");
+	      return 1;
+	    }
+
+	  if (match + 13 > len
+	      || match < len - 100
+	      || strncmp (string + match,
+			  " ChangeLog.13 for earlier changes",
+			  sizeof " ChangeLog.13 for earlier changes" - 1)
+		  != 0)
+	    {
+	      puts ("\nre_search did not find the correct match");
+	      return 1;
+	    }
+
+	  if (testno == 2)
+	    {
+	      if (regs.num_regs != 2 + (i == 0 ? 0 : i == 1 ? 7 : 70))
+		{
+		  printf ("\nincorrect num_regs %d\n", regs.num_regs);
+		  return 1;
+		}
+
+	      if (regs.start[0] != match || regs.end[0] != match + 13)
+		{
+		  printf ("\nincorrect regs.{start,end}[0] = { %d, %d}\n",
+			  regs.start[0], regs.end[0]);
+		  return 1;
+		}
+
+	      if (regs.start[regs.num_regs - 1] != -1
+		  || regs.end[regs.num_regs - 1] != -1)
+		{
+		  puts ("\nincorrect regs.{start,end}[num_regs - 1]");
+		  return 1;
+		}
+
+	      if (i > 0)
+		for (int j = 0, l = 1; j < 7; ++j)
+		  for (int k = 0; k < (i == 1 ? 1 : 10); ++k, ++l)
+		    if (regs.start[l] != match + j
+			|| regs.end[l] != regs.start[l] + 1)
+		      {
+			printf ("\nregs.{start,end}[%d] incorrect\n", l);
+			return 1;
+		      }
+	    }
+	}
+
+#ifdef _POSIX_CPUTIME
+      if (use_clock)
+	use_clock = clock_gettime (cl, &stop) == 0;
+      if (use_clock)
+	{
+	  stop.tv_sec -= start.tv_sec;
+	  if (stop.tv_nsec < start.tv_nsec)
+	    {
+	      stop.tv_sec--;
+	      stop.tv_nsec += 1000000000 - start.tv_nsec;
+	    }
+	  else
+	    stop.tv_nsec -= start.tv_nsec;
+	  printf (": %ld.%09lds\n", (long) stop.tv_sec, (long) stop.tv_nsec);
+	}
+      else
+#endif
+	putchar ('\n');
+
+      if (testno < 2)
+	regfree (&rbuf);
+      else
+	regfree (&rpbuf);
+    }
+
+  return 0;
+}
+
+#define TIMEOUT 20
+#define TEST_FUNCTION do_test ()
+#include "../test-skeleton.c"
--- libc/posix/regex.h.jj	2004-03-05 09:37:25.000000000 +0100
+++ libc/posix/regex.h	2004-11-18 16:14:57.477498572 +0100
@@ -179,6 +179,10 @@ typedef unsigned long int reg_syntax_t;
    immediately after an alternation or begin-group operator.  */
 #define RE_CONTEXT_INVALID_DUP (RE_CARET_ANCHORS_HERE << 1)
 
+/* If this bit is set, then no_sub will be set to 1 during
+   re_compile_pattern.  */
+#define RE_NO_SUB (RE_CONTEXT_INVALID_DUP << 1)
+
 /* This global variable defines the particular regexp syntax to use (for
    some interfaces).  When a regexp is compiled, the syntax used is
    stored in the pattern buffer, so changing this does not affect


	Jakub


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]