This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[regex] Speed up single-byte .


C locale searches spend a lot of time (up to 10-15% for bug-regex11) calling functions for multibyte matches, because ACCEPT_MB_NODE passes OP_PERIOD nodes even if dfa->mb_cur_max == 1.

The attached patch does not use ACCEPT_MB_NODE to discriminate such nodes, and instead relies on a bitfield within the token type.

This does not apply to UTF-8 searches yet, but I have another patch to speed up UTF-8 searches, lowering OP_PERIOD to character ranges instead of using OP_UTF8_PERIOD. After that one, many UTF-8 searches won't go through the slow multi-byte path anymore.

I wanted to ask if it is ok to use [\x00-\x7F]|[\xC2-\xFF][\x80-\xBF]* instead of the full UTF-8 format. This sequence is more efficient memory-wise, but it accepts invalid UTF-8 sequences, for example \xE0\xA0\x80.

Paolo
2004-01-12  Paolo Bonzini  <bonzini@gnu.org>

	* posix/regcomp.c (peek_token, parse_bracket_exp, build_charclass_op)
	[RE_ENABLE_I18N]: Initialize the tokens' accept_mb field.
	* posix/regex_internal.c (create_ci_newstate, create_cd_newstate)
	[RE_ENABLE_I18N]: Initialize the DFA state's accept_mb field
	from the nodes'.
	* posix/regex_internal.h (re_token_t) [RE_ENABLE_I18N]: Add the
	accept_mb bitfield.
	(ACCEPT_MB_NODE): Remove.
	* posix/regexec.c (transit_state_mb, check_arrival_add_next_nodes,
	proceed_next_node, build_sifted_states): Look at the nodes' accept_mb
	field.

--- orig/lib/regcomp.c
+++ mod/lib/regcomp.c
@@ -1788,6 +1788,7 @@ peek_token (token, input, syntax)
 
   token->word_char = 0;
 #ifdef RE_ENABLE_I18N
+  token->accept_mb = 0;
   token->mb_partial = 0;
   if (input->mb_cur_max > 1 &&
       !re_string_first_byte (input, re_string_cur_idx (input)))
@@ -2383,6 +2384,9 @@ parse_expression (regexp, preg, token, s
       fetch_token (token, regexp, syntax);
       return tree;
     case OP_PERIOD:
+#ifdef RE_ENABLE_I18N
+      token->accept_mb = dfa->mb_cur_max > 1;
+#endif
       tree = re_dfa_add_tree_node (dfa, NULL, NULL, token);
       if (BE (tree == NULL, 0))
 	{
@@ -3314,6 +3318,7 @@ parse_bracket_exp (regexp, dfa, token, s
 	  return work_tree;
 	}
       br_token.type = COMPLEX_BRACKET;
+      br_token.accept_mb = 1;
       br_token.opr.mbcset = mbcset;
       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);
       if (BE (mbc_tree == NULL, 0))
@@ -3704,6 +3709,7 @@ build_charclass_op (dfa, trans, class_na
       bin_tree_t *mbc_tree;
       /* Build a tree for complex bracket.  */
       br_token.type = COMPLEX_BRACKET;
+      br_token.accept_mb = 1;
       br_token.opr.mbcset = mbcset;
       dfa->has_mb_node = 1;
       mbc_tree = re_dfa_add_tree_node (dfa, NULL, NULL, &br_token);


--- orig/lib/regex_internal.c
+++ mod/lib/regex_internal.c
@@ -1560,16 +1560,13 @@ create_ci_newstate (dfa, nodes, hash)
       re_token_type_t type = node->type;
       if (type == CHARACTER && !node->constraint)
 	continue;
+#ifdef RE_ENABLE_I18N
+      newstate->accept_mb |= node->accept_mb;
+#endif /* RE_ENABLE_I18N */
 
       /* If the state has the halt node, the state is a halt state.  */
-      else if (type == END_OF_RE)
+      if (type == END_OF_RE)
 	newstate->halt = 1;
-#ifdef RE_ENABLE_I18N
-      else if (type == COMPLEX_BRACKET
-	       || type == OP_UTF8_PERIOD
-	       || (type == OP_PERIOD && dfa->mb_cur_max > 1))
-	newstate->accept_mb = 1;
-#endif /* RE_ENABLE_I18N */
       else if (type == OP_BACK_REF)
 	newstate->has_backref = 1;
       else if (type == ANCHOR || node->constraint)
@@ -1613,15 +1610,13 @@ create_cd_newstate (dfa, nodes, context,
 
       if (type == CHARACTER && !constraint)
 	continue;
-      /* If the state has the halt node, the state is a halt state.  */
-      else if (type == END_OF_RE)
-	newstate->halt = 1;
 #ifdef RE_ENABLE_I18N
-      else if (type == COMPLEX_BRACKET
-	       || type == OP_UTF8_PERIOD
-	       || (type == OP_PERIOD && dfa->mb_cur_max > 1))
-	newstate->accept_mb = 1;
+      newstate->accept_mb |= node->accept_mb;
 #endif /* RE_ENABLE_I18N */
+
+      /* If the state has the halt node, the state is a halt state.  */
+      if (type == END_OF_RE)
+	newstate->halt = 1;
       else if (type == OP_BACK_REF)
 	newstate->has_backref = 1;
       else if (type == ANCHOR)


--- orig/lib/regex_internal.h
+++ mod/lib/regex_internal.h
@@ -284,6 +284,7 @@ typedef struct
   unsigned int duplicated : 1;
   unsigned int opt_subexp : 1;
 #ifdef RE_ENABLE_I18N
+  unsigned int accept_mb : 1;
   /* These 2 bits can be moved into the union if needed (e.g. if running out
      of bits; move opr.c to opr.c.c and move the flags to opr.c.flags).  */
   unsigned int mb_partial : 1;
@@ -292,8 +293,6 @@ typedef struct
 } re_token_t;
 
 #define IS_EPSILON_NODE(type) ((type) & EPSILON_BIT)
-#define ACCEPT_MB_NODE(type) \
-  ((type) >= OP_PERIOD && (type) <= OP_UTF8_PERIOD)
 
 struct re_string_t
 {


--- orig/lib/regexec.c
+++ mod/lib/regexec.c
@@ -1254,7 +1254,7 @@ proceed_next_node (mctx, nregs, regs, pi
       re_token_type_t type = dfa->nodes[node].type;
 
 #ifdef RE_ENABLE_I18N
-      if (ACCEPT_MB_NODE (type))
+      if (dfa->nodes[node].accept_mb)
 	naccepted = check_node_accept_bytes (dfa, node, &mctx->input, *pidx);
       else
 #endif /* RE_ENABLE_I18N */
@@ -1620,7 +1620,7 @@ build_sifted_states (mctx, sctx, str_idx
 	continue;
 #ifdef RE_ENABLE_I18N
       /* If the node may accept `multi byte'.  */
-      if (ACCEPT_MB_NODE (type))
+      if (dfa->nodes[prev_node].accept_mb)
 	naccepted = sift_states_iter_mb (mctx, sctx, prev_node,
 					 str_idx, sctx->last_str_idx);
 #endif /* RE_ENABLE_I18N */
@@ -2476,7 +2476,7 @@ transit_state_mb (mctx, pstate)
 	}
 
       /* How many bytes the node can accept?  */
-      if (ACCEPT_MB_NODE (dfa->nodes[cur_node_idx].type))
+      if (dfa->nodes[cur_node_idx].accept_mb)
 	naccepted = check_node_accept_bytes (dfa, cur_node_idx, &mctx->input,
 					     re_string_cur_idx (&mctx->input));
       if (naccepted == 0)
@@ -3015,7 +3015,7 @@ check_arrival_add_next_nodes (mctx, str_
 	continue;
 #ifdef RE_ENABLE_I18N
       /* If the node may accept `multi byte'.  */
-      if (ACCEPT_MB_NODE (type))
+      if (dfa->nodes[cur_node].accept_mb)
 	{
 	  naccepted = check_node_accept_bytes (dfa, cur_node, &mctx->input,
 					       str_idx);




Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]