This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH] Speed-up character range regexes by up to 2x


On single-byte character sets that have no collation elements (or on all SBCS outside libc), it is useless to create the COMPLEX_BRACKET node that tries to accept multi-byte chars for a character range. This way in those locales (including the C locale) transit_state_mb is never called and some more book-keeping disappears: I got a speed improvement over 40% matching [A-Z][0-9] against ABCDEFGHIJKLMNOPQRSTUVWXYZ.

This patch does this together with a few other cleanups that I found while reading the code.

Please apply also the patch http://sources.redhat.com/ml/libc-alpha/2004-01/msg00099.html which is needed to compile regex on many non-gcc hosts.

What follows the review of the "gawk guy"'s regex patch:

> +#ifdef RE_ENABLE_I18N
>    int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
> +#else
> +  int icase = (bufp->syntax & RE_ICASE);
> +#endif

This is unneeded.

> @@ -2558,8 +2564,8 @@
>                 ? __btowc (start_ch) : start_elem->opr.wch);
>      end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
>               ? __btowc (end_ch) : end_elem->opr.wch);
> -    cmp_buf[0] = start_wc;
> -    cmp_buf[4] = end_wc;
> +    cmp_buf[0] = start_wc != WEOF ? start_wc : start_ch;
> +    cmp_buf[4] = end_wc != WEOF ? end_wc : end_ch;
>      if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
>        return REG_ERANGE;

I am not sure this is the fix; maybe it is better not to include the character set if start_wc == WEOF || end_wc == WEOF, or to return REG_ERANGE?

> +#ifdef HAVE_CONFIG_H
> +#include "config.h"
> +#endif

The alloca patch does this at the very beginning of the file.

> +
> +#if defined (_MSC_VER)
> +#include <stdio.h> /* for size_t */
> +#endif
> +
> +#include <limits.h>

This is needed.

> +# elif defined __APPLE_CC__
> +#  define __restrict

This too.

> +#if 0
> +/* Don't include this here. On some systems it sets RE_DUP_MAX to a
> + * lower value than GNU regex allows.  Instead, include it in
> + * regex.c, before include of <regex.h>, which correctly
> + * #undefs RE_DUP_MAX and sets it to the right value.
> + */
>  #include <limits.h>
> +#endif

Can be completely removed?


> /* This is for other GNU distributions with internationalized messages. */
> -#if HAVE_LIBINTL_H || defined _LIBC
> +#if (HAVE_LIBINTL_H && ENABLE_NLS) || defined _LIBC


Also needed.

> +#if _LIBC || __GNUC__ >= 3
> +# define BE(expr, val) __builtin_expect (expr, val)
> +#else
> +# define BE(expr, val) (expr)
> +# define inline
> +#endif
> +

Isn't this already there? Also, shouldn't "#define inline" be taken care of in the configure script?

Paolo

2004-01-12  Paolo Bonzini  <bonzini@gnu.org>

	* posix/regcomp.c [_LIBC && !RE_ENABLE_I18N]:
	Drop code to support this, it is never true.
	(build_range_exp) [!_LIBC]: Do not create a range
	in MBCSET for a single-byte character set.
	(build_range_exp) [_LIBC]: Do not create a range
	in MBCSET for a single-byte character set without
	collation elements.
	(init_dfa): Do not conditionalize on _LIBC, it
	just makes the code less clear.
	(parse_bracket_exp): Use NON_MATCH variable in
	addition to "mbcset->non_match", not as an
	alternative.
	(build_charclass_op): rename NOT parameter to
	NON_MATCH, use it instead of declaring a variable. 
	(parse_bracket_exp) [!_LIBC]: Pass NULL for MBCSET
	if the character set is single-byte.

Index: regcomp.c
===================================================================
RCS file: /cvs/glibc/libc/posix/regcomp.c,v
retrieving revision 1.74
diff -u -p -r1.74 regcomp.c
--- regcomp.c	6 Jan 2004 21:59:24 -0000	1.74
+++ regcomp.c	12 Jan 2004 08:53:52 -0000
@@ -126,8 +118,8 @@ static reg_errcode_t build_charclass (un
 static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
 				       unsigned RE_TRANSLATE_TYPE trans,
 				       const unsigned char *class_name,
-				       const unsigned char *extra, int not,
-				       reg_errcode_t *err);
+				       const unsigned char *extra,
+				       int non_match, reg_errcode_t *err);
 static bin_tree_t *create_tree (re_dfa_t *dfa,
 				bin_tree_t *left, bin_tree_t *right,
 				re_token_type_t type, int index);
@@ -862,11 +854,9 @@ init_dfa (dfa, pat_len)
       dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset), 1);
       if (BE (dfa->sb_char == NULL, 0))
 	return REG_ESPACE;
-#ifdef _LIBC
       if (dfa->is_utf8)
 	memset (dfa->sb_char, 255, sizeof (unsigned int) * BITSET_UINTS / 2);
       else
-#endif
 	for (i = 0, ch = 0; i < BITSET_UINTS; ++i)
 	  for (j = 0; j < UINT_BITS; ++j, ++ch)
 	    if (btowc (ch) != WEOF)
@@ -2563,32 +2553,40 @@ build_range_exp (sbcset, start_elem, end
     if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
       return REG_ERANGE;
 
-    /* Check the space of the arrays.  */
-    if (BE (*range_alloc == mbcset->nranges, 0))
+    /* Got valid collation sequence values, add them as a new entry.
+       However, for !_LIBC we have no collation elements: if the
+       character set is single byte, the single byte character set
+       that we build below suffices.  parse_bracket_exp passes
+       no MBCSET if dfa->mb_cur_max == 1.  */
+    if (mbcset)
       {
-	/* There are not enough space, need realloc.  */
-	wchar_t *new_array_start, *new_array_end;
-	int new_nranges;
-
-	/* +1 in case of mbcset->nranges is 0.  */
-	new_nranges = 2 * mbcset->nranges + 1;
-	/* Use realloc since mbcset->range_starts and mbcset->range_ends
-	   are NULL if *range_alloc == 0.  */
-	new_array_start = re_realloc (mbcset->range_starts, wchar_t,
-				      new_nranges);
-	new_array_end = re_realloc (mbcset->range_ends, wchar_t,
-				    new_nranges);
-
-	if (BE (new_array_start == NULL || new_array_end == NULL, 0))
-	  return REG_ESPACE;
-
-	mbcset->range_starts = new_array_start;
-	mbcset->range_ends = new_array_end;
-	*range_alloc = new_nranges;
-      }
+        /* Check the space of the arrays.  */
+        if (BE (*range_alloc == mbcset->nranges, 0))
+          {
+	    /* There is not enough space, need realloc.  */
+	    wchar_t *new_array_start, *new_array_end;
+	    int new_nranges;
+
+	    /* +1 in case of mbcset->nranges is 0.  */
+	    new_nranges = 2 * mbcset->nranges + 1;
+	    /* Use realloc since mbcset->range_starts and mbcset->range_ends
+	       are NULL if *range_alloc == 0.  */
+	    new_array_start = re_realloc (mbcset->range_starts, wchar_t,
+				          new_nranges);
+	    new_array_end = re_realloc (mbcset->range_ends, wchar_t,
+				        new_nranges);
+
+	    if (BE (new_array_start == NULL || new_array_end == NULL, 0))
+	      return REG_ESPACE;
+
+	    mbcset->range_starts = new_array_start;
+	    mbcset->range_ends = new_array_end;
+	    *range_alloc = new_nranges;
+          }
 
-    mbcset->range_starts[mbcset->nranges] = start_wc;
-    mbcset->range_ends[mbcset->nranges++] = end_wc;
+        mbcset->range_starts[mbcset->nranges] = start_wc;
+        mbcset->range_ends[mbcset->nranges++] = end_wc;
+      }
 
     /* Build the table for single byte characters.  */
     for (wc = 0; wc <= SBC_MAX; ++wc)
@@ -2775,13 +2773,9 @@ parse_bracket_exp (regexp, dfa, token, s
 
   static inline reg_errcode_t
   __attribute ((always_inline))
-# ifdef RE_ENABLE_I18N
   build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
 	 re_charset_t *mbcset;
 	 int *range_alloc;
-# else /* not RE_ENABLE_I18N */
-  build_range_exp (sbcset, start_elem, end_elem)
-# endif /* not RE_ENABLE_I18N */
 	 re_bitset_ptr_t sbcset;
 	 bracket_elem_t *start_elem, *end_elem;
     {
@@ -2789,33 +2783,6 @@ parse_bracket_exp (regexp, dfa, token, s
       uint32_t start_collseq;
       uint32_t end_collseq;
 
-# ifdef RE_ENABLE_I18N
-      /* Check the space of the arrays.  */
-      if (BE (*range_alloc == mbcset->nranges, 0))
-	{
-	  /* There are not enough space, need realloc.  */
-	  uint32_t *new_array_start;
-	  uint32_t *new_array_end;
-	  int new_nranges;
-
-	  /* +1 in case of mbcset->nranges is 0.  */
-	  new_nranges = 2 * mbcset->nranges + 1;
-	  /* Use realloc since mbcset->range_starts and mbcset->range_ends
-	     are NULL if *range_alloc == 0.  */
-	  new_array_start = re_realloc (mbcset->range_starts, uint32_t,
-					new_nranges);
-	  new_array_end = re_realloc (mbcset->range_ends, uint32_t,
-				      new_nranges);
-
-	  if (BE (new_array_start == NULL || new_array_end == NULL, 0))
-	    return REG_ESPACE;
-
-	  mbcset->range_starts = new_array_start;
-	  mbcset->range_ends = new_array_end;
-	  *range_alloc = new_nranges;
-	}
-# endif /* RE_ENABLE_I18N */
-
       /* Equivalence Classes and Character Classes can't be a range
 	 start/end.  */
       if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
@@ -2831,11 +2798,38 @@ parse_bracket_exp (regexp, dfa, token, s
       if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
 	return REG_ERANGE;
 
-# ifdef RE_ENABLE_I18N
-      /* Got valid collation sequence values, add them as a new entry.  */
-      mbcset->range_starts[mbcset->nranges] = start_collseq;
-      mbcset->range_ends[mbcset->nranges++] = end_collseq;
-# endif /* RE_ENABLE_I18N */
+      /* Got valid collation sequence values, add them as a new entry.
+	 However, if we have no collation elements, and the character set
+	 is single byte, the single byte character set that we
+	 build below suffices. */
+      if (nrules > 0 || dfa->mb_cur_max > 1)
+	{
+          /* Check the space of the arrays.  */
+          if (BE (*range_alloc == mbcset->nranges, 0))
+	    {
+	      /* There is not enough space, need realloc.  */
+	      uint32_t *new_array_start;
+	      uint32_t *new_array_end;
+	      int new_nranges;
+
+	      /* +1 in case of mbcset->nranges is 0.  */
+	      new_nranges = 2 * mbcset->nranges + 1;
+	      new_array_start = re_realloc (mbcset->range_starts, uint32_t,
+					    new_nranges);
+	      new_array_end = re_realloc (mbcset->range_ends, uint32_t,
+				          new_nranges);
+
+	      if (BE (new_array_start == NULL || new_array_end == NULL, 0))
+	        return REG_ESPACE;
+
+	      mbcset->range_starts = new_array_start;
+	      mbcset->range_ends = new_array_end;
+	      *range_alloc = new_nranges;
+	    }
+
+          mbcset->range_starts[mbcset->nranges] = start_collseq;
+          mbcset->range_ends[mbcset->nranges++] = end_collseq;
+	}
 
       /* Build the table for single byte characters.  */
       for (ch = 0; ch <= SBC_MAX; ch++)
@@ -2862,13 +2856,9 @@ parse_bracket_exp (regexp, dfa, token, s
 
   static inline reg_errcode_t
   __attribute ((always_inline))
-# ifdef RE_ENABLE_I18N
   build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
 	 re_charset_t *mbcset;
 	 int *coll_sym_alloc;
-# else /* not RE_ENABLE_I18N */
-  build_collating_symbol (sbcset, name)
-# endif /* not RE_ENABLE_I18N */
 	 re_bitset_ptr_t sbcset;
 	 const unsigned char *name;
     {
@@ -2894,7 +2884,6 @@ parse_bracket_exp (regexp, dfa, token, s
 	  else
 	    return REG_ECOLLATE;
 
-# ifdef RE_ENABLE_I18N
 	  /* Got valid collation sequence, add it as a new entry.  */
 	  /* Check the space of the arrays.  */
 	  if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
@@ -2912,7 +2901,6 @@ parse_bracket_exp (regexp, dfa, token, s
 	      *coll_sym_alloc = new_coll_sym_alloc;
 	    }
 	  mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
-# endif /* RE_ENABLE_I18N */
 	  return REG_NOERROR;
 	}
       else
@@ -2934,9 +2922,8 @@ parse_bracket_exp (regexp, dfa, token, s
   re_charset_t *mbcset;
   int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
   int equiv_class_alloc = 0, char_class_alloc = 0;
-#else /* not RE_ENABLE_I18N */
-  int non_match = 0;
 #endif /* not RE_ENABLE_I18N */
+  int non_match = 0;
   bin_tree_t *work_tree;
   int token_len;
   int first_round = 1;
@@ -2981,9 +2968,8 @@ parse_bracket_exp (regexp, dfa, token, s
     {
 #ifdef RE_ENABLE_I18N
       mbcset->non_match = 1;
-#else /* not RE_ENABLE_I18N */
-      non_match = 1;
 #endif /* not RE_ENABLE_I18N */
+      non_match = 1;
       if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
 	bitset_set (sbcset, '\0');
       re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
@@ -3062,11 +3048,18 @@ parse_bracket_exp (regexp, dfa, token, s
 
 	  token_len = peek_token_bracket (token, regexp, syntax);
 
+#ifdef _LIBC
+	  *err = build_range_exp (sbcset, mbcset, &range_alloc,
+				  &start_elem, &end_elem);
+#else
+# ifdef RE_ENABLE_I18N
 	  *err = build_range_exp (sbcset,
-#ifdef RE_ENABLE_I18N
-				  mbcset, &range_alloc,
+				  dfa->mb_cur_max > 1 ? mbcset : NULL,
+				  &range_alloc, &start_elem, &end_elem);
+# else
+	  *err = build_range_exp (sbcset, &start_elem, &end_elem);
+# endif
 #endif /* RE_ENABLE_I18N */
-				  &start_elem, &end_elem);
 	  if (BE (*err != REG_NOERROR, 0))
 	    goto parse_bracket_exp_free_return;
 	}
@@ -3140,12 +3133,9 @@ parse_bracket_exp (regexp, dfa, token, s
   re_string_skip_bytes (regexp, token_len); /* Skip a token.  */
 
   /* If it is non-matching list.  */
-#ifdef RE_ENABLE_I18N
-  if (mbcset->non_match)
-#else /* not RE_ENABLE_I18N */
   if (non_match)
-#endif /* not RE_ENABLE_I18N */
     bitset_not (sbcset);
+
 #ifdef RE_ENABLE_I18N
   /* Ensure only single byte characters are set.  */
   if (dfa->mb_cur_max > 1)
@@ -3316,7 +3306,7 @@ build_equiv_class (sbcset, name)
      re_bitset_ptr_t sbcset;
      const unsigned char *name;
 {
-#if defined _LIBC && defined RE_ENABLE_I18N
+#if defined _LIBC
   uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
   if (nrules != 0)
     {
@@ -3385,7 +3375,7 @@ build_equiv_class (sbcset, name)
       mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
     }
   else
-#endif /* _LIBC && RE_ENABLE_I18N */
+#endif /* _LIBC */
     {
       if (BE (strlen ((const char *) name) != 1, 0))
 	return REG_ECOLLATE;
@@ -3481,20 +3471,18 @@ build_charclass (trans, sbcset, class_na
 }
 
 static bin_tree_t *
-build_charclass_op (dfa, trans, class_name, extra, not, err)
+build_charclass_op (dfa, trans, class_name, extra, non_match, err)
      re_dfa_t *dfa;
      unsigned RE_TRANSLATE_TYPE trans;
      const unsigned char *class_name;
      const unsigned char *extra;
-     int not;
+     int non_match;
      reg_errcode_t *err;
 {
   re_bitset_ptr_t sbcset;
 #ifdef RE_ENABLE_I18N
   re_charset_t *mbcset;
   int alloc = 0;
-#else /* not RE_ENABLE_I18N */
-  int non_match = 0;
 #endif /* not RE_ENABLE_I18N */
   reg_errcode_t ret;
   re_token_t br_token;
@@ -3515,7 +3503,7 @@ build_charclass_op (dfa, trans, class_na
       return NULL;
     }
 
-  if (not)
+  if (non_match)
     {
 #ifdef RE_ENABLE_I18N
       /*
@@ -3523,8 +3511,6 @@ build_charclass_op (dfa, trans, class_na
 	bitset_set(cset->sbcset, '\0');
       */
       mbcset->non_match = 1;
-#else /* not RE_ENABLE_I18N */
-      non_match = 1;
 #endif /* not RE_ENABLE_I18N */
     }
 
@@ -3549,11 +3535,7 @@ build_charclass_op (dfa, trans, class_na
     bitset_set (sbcset, *extra);
 
   /* If it is non-matching list.  */
-#ifdef RE_ENABLE_I18N
-  if (mbcset->non_match)
-#else /* not RE_ENABLE_I18N */
   if (non_match)
-#endif /* not RE_ENABLE_I18N */
     bitset_not (sbcset);
 
 #ifdef RE_ENABLE_I18N

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]