This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] More regex fixes and testcases


The first of these patches fixes the infinite loop on a regex
like ()\1*\1*.  While doing this, I decided to rewrite the
check_dst_limits_calc_pos routine which had some IMHO uselessly
complicated expressions.  The patch has no new testsuite
failures, also in the sed testsuite.

I still have some XFAILs in the sed testsuite, that look easy
but are not completely so.  The problem is in matching empty
backreferences, which are not put in the epsilon closures and
hence are sifted away by sift_states_backward.  However the
easy way of adding OP_BACK_REF to the epsilon closures causes
some problems in the handling of the fail stack.

       ()(b)\1c\2     fails to match the whole of `bcb'
       (b())\2\1      fails to match a<bb>bbc
       (bb())\2\1     fails to match a<bbbb>c with a sigsegv

The first patch adds test cases for the fail stack handling bug
that was caused by my first attempt at solving them.  The
second patch adds the failing test cases; I made it separately
because you may not like XFAILs.

Time permitting, I have a set of patches in mind to:
1) add debugging code to print out the DFA
2) mark some functions as pure
3) add more comments to the functions (most of the times,
  there is something at the call-site but not at the heading).
4) simplify/hoist some expression like I did in
  check_dst_limits_calc_pos

How much would these be appreciated, and in what order?

Paolo

2003-10-05  Paolo Bonzini  <bonzini@gnu.org>

        * posix/bug-regex11.c: Add more backreference-related
        test cases.
        (main): Show the failing regex in the error messages.
        * posix/regexec.c (check_dst_limits_calc_pos):
        Simplify some nested conditionals.  Replace if's with
        a switch statement.
        (check_dst_limits_calc_pos <TYPE_BKREF>): Rename
        parameter NODE to FROM_NODE, it shadows a local
        variable; don't recurse if FROM_NODE does not change
        in the recursive invocation, fixing an infinite loop
        in the ()\1*\1* regex.
	(sift_states_backward): Fix function comment.
	* posix/regcomp.c (calc_epsdest): Add an assertion.


diff -u -p -b -r1.6 bug-regex11.c
--- libc/posix/bug-regex11.c	2 Oct 2003 22:40:16 -0000	1.6
+++ libc/posix/bug-regex11.c	6 Oct 2003 09:53:38 -0000
@@ -35,6 +35,7 @@ struct
   /* Test for newline handling in regex.  */
   { "[^~]*~", "\nx~y", 0, 2, { { 0, 3 }, { -1, -1 } } },
   /* Other tests.  */
+  { "a(.*)b", "a b", REG_EXTENDED, 2, { { 0, 3 }, { 1, 2 } } },
   { ".*|\\([KIO]\\)\\([^|]*\\).*|?[KIO]", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
     { { 0, 21 }, { 15, 16 }, { 16, 18 } } },
   { ".*|\\([KIO]\\)\\([^|]*\\).*|?\\1", "10~.~|P|K0|I10|O16|?KSb", 0, 3,
@@ -52,6 +53,10 @@ struct
   /* Here ^ cannot be treated as an anchor according to POSIX.  */
   { "(^|foo)bar", "(^|foo)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
   { "(foo|^)bar", "(foo|^)bar", 0, 2, { { 0, 10 }, { -1, -1 } } },
+  /* More tests on backreferences.  */
+  { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
+  { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
+  { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
 };
 
 int
@@ -71,14 +76,14 @@ main (void)
 	{
 	  char buf[500];
 	  regerror (n, &re, buf, sizeof (buf));
-	  printf ("regcomp %zd failed: %s\n", i, buf);
+	  printf ("%s: regcomp %zd failed: %s\n", tests[i].pattern, buf, i);
 	  ret = 1;
 	  continue;
 	}
 
       if (regexec (&re, tests[i].string, tests[i].nmatch, rm, 0))
 	{
-	  printf ("regexec %zd failed\n", i);
+	  printf ("%s: regexec %zd failed\n", tests[i].pattern, i);
 	  ret = 1;
 	  regfree (&re);
 	  continue;
@@ -90,8 +95,8 @@ main (void)
 	  {
 	    if (tests[i].rm[n].rm_so == -1 && tests[i].rm[n].rm_eo == -1)
 	      break;
-	    printf ("regexec match failure rm[%d] %d..%d\n",
-		    n, rm[n].rm_so, rm[n].rm_eo);
+	    printf ("%s: regexec %zd match failure rm[%d] %d..%d\n",
+		    tests[i].pattern, i, n, rm[n].rm_so, rm[n].rm_eo);
 	    ret = 1;
 	    break;
 	  }
diff -u -p -b -r1.42 regcomp.c
--- libc/posix/regcomp.c	2 Oct 2003 22:39:46 -0000	1.42
+++ libc/posix/regcomp.c	6 Oct 2003 09:53:39 -0000
@@ -1149,6 +1149,8 @@ calc_epsdest (dfa, node)
 	       || dfa->nodes[idx].type == OP_CLOSE_SUBEXP
 	       || dfa->nodes[idx].type == OP_BACK_REF)
 	re_node_set_init_1 (dfa->edests + idx, node->next);
+      else
+        assert (!IS_EPSILON_NODE (dfa->nodes[idx].type));
     }
 }
 
diff -u -p -b -r1.27 regexec.c
--- libc/posix/regexec.c	23 Sep 2003 05:30:23 -0000	1.27
+++ libc/posix/regexec.c	6 Oct 2003 09:53:39 -0000
@@ -1387,7 +1387,7 @@ update_regs (dfa, pmatch, cur_node, cur_
 	   away the node `a'.
 	ii. If 'b' is in the STATE_LOG[STR_IDX+strlen('s')] but 'b' is
 	    throwed away, we throw away the node `a'.
-     3. When 0 <= STR_IDX < n and 'a' epsilon transit to 'b':
+     3. When 0 <= STR_IDX < MATCH_LAST and 'a' epsilon transit to 'b':
 	i. If 'b' isn't in the STATE_LOG[STR_IDX], we throw away the
 	   node `a'.
 	ii. If 'b' is in the STATE_LOG[STR_IDX] but 'b' is throwed away,
@@ -1724,60 +1724,98 @@ check_dst_limits (dfa, limits, mctx, dst
 }
 
 static int
-check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, node,
+check_dst_limits_calc_pos (dfa, mctx, limit, eclosures, subexp_idx, from_node,
 			   str_idx)
      re_dfa_t *dfa;
      re_match_context_t *mctx;
      re_node_set *eclosures;
-     int limit, subexp_idx, node, str_idx;
+     int limit, subexp_idx, from_node, str_idx;
 {
   struct re_backref_cache_entry *lim = mctx->bkref_ents + limit;
-  int pos = (str_idx < lim->subexp_from ? -1
-	     : (lim->subexp_to < str_idx ? 1 : 0));
-  if (pos == 0
-      && (str_idx == lim->subexp_from || str_idx == lim->subexp_to))
-    {
       int node_idx;
+
+  /* If we are outside the range of the subexpression, return -1 or 1.  */
+  if (str_idx < lim->subexp_from)
+    return -1;
+
+  if (lim->subexp_to < str_idx)
+    return 1;
+
+  /* If we are within the subexpression, return 0.  */
+  if (str_idx != lim->subexp_from && str_idx != lim->subexp_to)
+    return 0;
+
+  /* Else, we are on the boundary: examine the nodes on the epsilon
+     closure.  */
       for (node_idx = 0; node_idx < eclosures->nelem; ++node_idx)
 	{
 	  int node = eclosures->elems[node_idx];
-	  re_token_type_t type= dfa->nodes[node].type;
-	  if (type == OP_BACK_REF)
+      switch (dfa->nodes[node].type)
+	{
+	case OP_BACK_REF:
 	    {
 	      int bi = search_cur_bkref_entry (mctx, str_idx);
 	      for (; bi < mctx->nbkref_ents; ++bi)
 		{
 		  struct re_backref_cache_entry *ent = mctx->bkref_ents + bi;
+		int dst, cpos;
+		
+		/* If this backreference goes beyond the point we're
+		   examining, don't go any further.  */
 		  if (ent->str_idx > str_idx)
 		    break;
-		  if (ent->node == node && ent->subexp_from == ent->subexp_to)
-		    {
-		      int cpos, dst;
+
+		if (ent->node != node || ent->subexp_from != ent->subexp_to)
+		  continue;
+
+		/* Recurse trying to reach the OP_OPEN_SUBEXP and
+		   OP_CLOSE_SUBEXP cases below.  But, if the
+		   destination node is the same node as the source
+		   node, don't recurse because it would cause an
+		   infinite loop: a regex that exhibits this behavior
+		   is ()\1*\1*  */
 		      dst = dfa->edests[node].elems[0];
+		if (dst == from_node)
+		  {
+		    if (str_idx == lim->subexp_from)
+		      return -1;
+		    else /* if (str_idx == lim->subexp_to) */
+		      return 0;
+		  }
+
 		      cpos = check_dst_limits_calc_pos (dfa, mctx, limit,
 							dfa->eclosures + dst,
 							subexp_idx, dst,
 							str_idx);
-		      if ((str_idx == lim->subexp_from && cpos == -1)
-			  || (str_idx == lim->subexp_to && cpos == 0))
-			return cpos;
-		    }
-		}
+
+		if (cpos == -1 && str_idx == lim->subexp_from)
+		  return -1;
+
+		if (cpos == 0 /* && str_idx == lim->lim->subexp_to */)
+		  return 0;
 	    }
-	  if (type == OP_OPEN_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
-	      && str_idx == lim->subexp_from)
-	    {
-	      pos = -1;
 	      break;
 	    }
-	  if (type == OP_CLOSE_SUBEXP && subexp_idx == dfa->nodes[node].opr.idx
-	      && str_idx == lim->subexp_to)
+	  
+	case OP_OPEN_SUBEXP:
+	  if (str_idx == lim->subexp_from && subexp_idx == dfa->nodes[node].opr.idx)
+	    return -1;
+	  break;
+	  
+	case OP_CLOSE_SUBEXP:
+	  if (str_idx == lim->subexp_to && subexp_idx == dfa->nodes[node].opr.idx)
+	    return 0;
+	  break;
+
+	default:
 	    break;
 	}
-      if (node_idx == eclosures->nelem && str_idx == lim->subexp_to)
-	pos = 1;
     }
-  return pos;
+
+  if (str_idx == lim->subexp_to)
+    return 1;
+  else
+    return 0;
 }
 
 /* Check the limitations of sub expressions LIMITS, and remove the nodes
2003-10-05  Paolo Bonzini  <bonzini@gnu.org>

	* libc/posix/bug-regex11.c: add three
	failing testcases.

--- libc/posix/bug-regex11.c	2003-10-06 11:52:23.000000000 +0200
+++ bug-regex11.c	2003-10-06 12:06:23.000000000 +0200
@@ -57,8 +57,11 @@
   { "()\\1*\\1*", "", REG_EXTENDED, 2, { { 0, 0 }, { 0, 0 } } },
   { "([0-9]).*\\1(a*)", "7;7a6", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
   { "([0-9]).*\\1(a*)", "7;7a", REG_EXTENDED, 3, { { 0, 4 }, { 0, 1 }, { 3, 4 } } },
+  { "()(b)\\1c\\2", "bcb", REG_EXTENDED, 3, { { 0, 3 }, { 0, 0 }, { 1, 2 } } },
+  { "(b())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 2 }, { 0, 1 }, { 1, 1 } } },
+  { "(bb())\\2\\1", "bbbb", REG_EXTENDED, 3, { { 0, 4 }, { 0, 2 }, { 2, 2 } } },
 };
 
 int
 main (void)
 {

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]