This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[regex] Speedup fastmap


This was the one with the bug.  It should be ok now.

It hoists several conditionals outside the "fastmap" loop, speeding up for example regexes that start with a period (or depending on the input, also things such as [A-Za-z]*).

Paolo

2004-10-27  Paolo Bonzini  <bonzini@gnu.org>

	* posix/regexec.c (acquire_init_state_context): Do not
	always inline.
	(transit_state): Remove the check for out-of-bounds buffers.
	(check_matching): Check here for out-of-bounds buffers.
	(re_search_internal): Store into match_kind a set of bits
	indicating which incantation of fastmap scanning must be
	used.  Use a switch statement instead of multiple ifs.
	Exit the final "for (;;)" with goto free_return unless
	the match succeeded, thus simplifying some conditionals.

diff -ru lib_mio/regexec.c lib/regexec.c
*** lib_mio/regexec.c	2004-10-27 13:03:14.000000000 +0200
--- lib/regexec.c	2004-10-27 12:56:46.000000000 +0200
***************
*** 57,63 ****
  			      int nregs, int regs_allocated) internal_function;
  static inline re_dfastate_t *acquire_init_state_context
       (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
!      __attribute ((always_inline)) internal_function;
  static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
       internal_function;
  static int check_matching (re_match_context_t *mctx, int fl_longest_match,
--- 55,61 ----
  			      int nregs, int regs_allocated) internal_function;
  static inline re_dfastate_t *acquire_init_state_context
       (reg_errcode_t *err, const re_match_context_t *mctx, int idx)
!      internal_function;
  static reg_errcode_t prune_impossible_nodes (re_match_context_t *mctx)
       internal_function;
  static int check_matching (re_match_context_t *mctx, int fl_longest_match,
***************
*** 603,617 ****
    reg_errcode_t err;
    re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
    int left_lim, right_lim, incr;
!   int fl_longest_match, match_first, match_last = -1;
!   int fast_translate, sb;
  #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
    re_match_context_t mctx = { .dfa = dfa };
  #else
    re_match_context_t mctx;
  #endif
!   char *fastmap = ((preg->fastmap != NULL && preg->fastmap_accurate
! 		    && range && !preg->can_be_null) ? preg->fastmap : NULL);
  
  #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
    memset (&mctx, '\0', sizeof (re_match_context_t));
--- 601,616 ----
    reg_errcode_t err;
    re_dfa_t *dfa = (re_dfa_t *)preg->buffer;
    int left_lim, right_lim, incr;
!   int fl_longest_match, match_first, match_kind, match_last = -1;
!   int fast_translate, sb, ch;
  #if defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L)
    re_match_context_t mctx = { .dfa = dfa };
  #else
    re_match_context_t mctx;
  #endif
!   char *fastmap = (preg->fastmap != NULL && preg->fastmap_accurate
! 		   && range && !preg->can_be_null) ? preg->fastmap : NULL;
!   unsigned RE_TRANSLATE_TYPE t = (unsigned RE_TRANSLATE_TYPE) preg->translate;
  
  #if !(defined _LIBC || (defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L))
    memset (&mctx, '\0', sizeof (re_match_context_t));
***************
*** 684,771 ****
    left_lim = (range < 0) ? start + range : start;
    right_lim = (range < 0) ? start : start + range;
    sb = dfa->mb_cur_max == 1;
!   fast_translate = sb || !(preg->syntax & RE_ICASE || preg->translate);
  
!   for (;;)
      {
!       /* At first get the current byte from input string.  */
!       if (fastmap)
! 	{
! 	  if (BE (fast_translate, 1))
! 	    {
! 	      unsigned RE_TRANSLATE_TYPE t
! 		= (unsigned RE_TRANSLATE_TYPE) preg->translate;
! 	      if (BE (range >= 0, 1))
! 		{
! 		  if (BE (t != NULL, 0))
! 		    {
! 		      while (BE (match_first < right_lim, 1)
! 			     && !fastmap[t[(unsigned char) string[match_first]]])
! 			++match_first;
! 		    }
! 		  else
! 		    {
! 		      while (BE (match_first < right_lim, 1)
! 			     && !fastmap[(unsigned char) string[match_first]])
! 			++match_first;
! 		    }
! 		  if (BE (match_first == right_lim, 0))
! 		    {
! 		      int ch = match_first >= length
! 			       ? 0 : (unsigned char) string[match_first];
! 		      if (!fastmap[t ? t[ch] : ch])
! 			break;
! 		    }
! 		}
! 	      else
! 		{
! 		  while (match_first >= left_lim)
! 		    {
! 		      int ch = match_first >= length
! 			       ? 0 : (unsigned char) string[match_first];
! 		      if (fastmap[t ? t[ch] : ch])
! 			break;
! 		      --match_first;
! 		    }
! 		  if (match_first < left_lim)
! 		    break;
! 		}
  	    }
! 	  else
! 	    {
! 	      int ch;
  
! 	      do
  		{
! 		  /* In this case, we can't determine easily the current byte,
! 		     since it might be a component byte of a multibyte
! 		     character.  Then we use the constructed buffer
! 		     instead.  */
! 		  /* If MATCH_FIRST is out of the valid range, reconstruct the
! 		     buffers.  */
! 		  if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len
! 		      <= match_first
! 		      || match_first < mctx.input.raw_mbs_idx)
! 		    {
! 		      err = re_string_reconstruct (&mctx.input, match_first,
! 						   eflags);
! 		      if (BE (err != REG_NOERROR, 0))
! 			goto free_return;
! 		    }
! 		  /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
! 		     Note that MATCH_FIRST must not be smaller than 0.  */
! 		  ch = ((match_first >= length) ? 0
! 		       : re_string_byte_at (&mctx.input,
! 					    match_first
! 					    - mctx.input.raw_mbs_idx));
! 		  if (fastmap[ch])
! 		    break;
! 		  match_first += incr;
  		}
! 	      while (match_first >= left_lim && match_first <= right_lim);
! 	      if (! fastmap[ch])
  		break;
  	    }
  	}
  
        /* Reconstruct the buffers so that the matcher can assume that
--- 683,781 ----
    left_lim = (range < 0) ? start + range : start;
    right_lim = (range < 0) ? start : start + range;
    sb = dfa->mb_cur_max == 1;
!   match_kind = 
!     (fastmap ? 8 : 0)
!     | (sb || !(preg->syntax & RE_ICASE || t) ? 4 : 0)
!     | (range >= 0 ? 2 : 0)
!     | (t != NULL ? 1 : 0);
  
!   for (;; match_first += incr)
      {
!       err = REG_NOMATCH;
!       if (match_first < left_lim || right_lim < match_first)
! 	goto free_return;
! 
!       /* Advance as rapidly as possible through the string, until we
! 	 find a plausible place to start matching.  This may be done
! 	 with varying efficiency, so there are various possibilities:
! 	 only the most common of them are specialized, in order to
! 	 save on code size.  We use a switch statement for speed.  */
!       switch (match_kind)
! 	{
! 	case 0: case 1: case 2: case 3:
! 	case 4: case 5: case 6: case 7:
! 	  /* No fastmap.  */
! 	  break;
! 
! 	case 15:
! 	  /* Fastmap with single-byte translation, match forward.  */
! 	  while (BE (match_first < right_lim, 1)
! 		 && !fastmap[t[(unsigned char) string[match_first]]])
! 	    ++match_first;
! 	  goto forward_match_found_start_or_reached_end;
! 
! 	case 14:
! 	  /* Fastmap without translation, match forward.  */
! 	  while (BE (match_first < right_lim, 1)
! 		 && !fastmap[(unsigned char) string[match_first]])
! 	    ++match_first;
! 
! 	forward_match_found_start_or_reached_end:
! 	  if (BE (match_first == right_lim, 0))
! 	    {
! 	      ch = match_first >= length
! 		       ? 0 : (unsigned char) string[match_first];
! 	      if (!fastmap[t ? t[ch] : ch])
! 		goto free_return;
  	    }
! 	  break;
  
! 	case 12:
! 	case 13:
! 	  /* Fastmap without multi-byte translation, match backwards.  */
! 	  while (match_first >= left_lim)
! 	    {
! 	      ch = match_first >= length
! 		       ? 0 : (unsigned char) string[match_first];
! 	      if (fastmap[t ? t[ch] : ch])
! 		break;
! 	      --match_first;
! 	    }
! 	  if (match_first < left_lim)
! 	    goto free_return;
! 	  break;
! 	  
! 	default:
! 	  /* In this case, we can't determine easily the current byte,
! 	     since it might be a component byte of a multibyte
! 	     character.  Then we use the constructed buffer instead.  */
! 	  for (;;)
! 	    {
! 	      /* If MATCH_FIRST is out of the valid range, reconstruct the
! 		 buffers.  */
! 	      if (mctx.input.raw_mbs_idx + mctx.input.valid_raw_len <= match_first
! 		  || match_first < mctx.input.raw_mbs_idx)
  		{
! 		  err = re_string_reconstruct (&mctx.input, match_first,
! 					       eflags);
! 		  if (BE (err != REG_NOERROR, 0))
! 		    goto free_return;
  		}
! 	      /* If MATCH_FIRST is out of the buffer, leave it as '\0'.
! 		 Note that MATCH_FIRST must not be smaller than 0.  */
! 	      ch = ((match_first >= length) ? 0
! 		    : re_string_byte_at (&mctx.input,
! 					 match_first - mctx.input.raw_mbs_idx));
! 	      if (fastmap[ch])
  		break;
+ 	      match_first += incr;
+ 	      if (match_first < left_lim || match_first > right_lim)
+ 	        {
+ 	          err = REG_NOMATCH;
+ 	          goto free_return;
+ 	        }
  	    }
+ 	  break;
  	}
  
        /* Reconstruct the buffers so that the matcher can assume that
***************
*** 773,829 ****
        err = re_string_reconstruct (&mctx.input, match_first, eflags);
        if (BE (err != REG_NOERROR, 0))
  	goto free_return;
  #ifdef RE_ENABLE_I18N
!      /* Eliminate it when it is a component of a multibyte character
! 	 and isn't the head of a multibyte character.  */
!       if (sb || re_string_first_byte (&mctx.input, 0))
  #endif
  	{
! 	  /* It seems to be appropriate one, then use the matcher.  */
! 	  /* We assume that the matching starts from 0.  */
! 	  mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
! 	  match_last = check_matching (&mctx, fl_longest_match,
! 				       range >= 0 ? &match_first : NULL);
! 	  if (match_last != -1)
  	    {
! 	      if (BE (match_last == -2, 0))
  		{
! 		  err = REG_ESPACE;
! 		  goto free_return;
  		}
! 	      else
  		{
! 		  mctx.match_last = match_last;
! 		  if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
! 		    {
! 		      re_dfastate_t *pstate = mctx.state_log[match_last];
! 		      mctx.last_node = check_halt_state_context (&mctx, pstate,
! 								 match_last);
! 		    }
! 		  if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
! 		      || dfa->nbackref)
! 		    {
! 		      err = prune_impossible_nodes (&mctx);
! 		      if (err == REG_NOERROR)
! 			break;
! 		      if (BE (err != REG_NOMATCH, 0))
! 			goto free_return;
! 		      match_last = -1;
! 		    }
! 		  else
! 		    break; /* We found a match.  */
  		}
  	    }
- 	  match_ctx_clean (&mctx);
  	}
!       /* Update counter.  */
!       match_first += incr;
!       if (match_first < left_lim || right_lim < match_first)
! 	break;
      }
  
    /* Set pmatch[] if we need.  */
!   if (match_last != -1 && nmatch > 0)
      {
        int reg_idx;
  
--- 783,842 ----
        err = re_string_reconstruct (&mctx.input, match_first, eflags);
        if (BE (err != REG_NOERROR, 0))
  	goto free_return;
+ 
  #ifdef RE_ENABLE_I18N
!      /* Don't consider this char as a possible match start if it part,
! 	yet isn't the head, of a multibyte character.  */
!       if (!sb && !re_string_first_byte (&mctx.input, 0))
! 	continue;
  #endif
+ 
+       /* It seems to be appropriate one, then use the matcher.  */
+       /* We assume that the matching starts from 0.  */
+       mctx.state_log_top = mctx.nbkref_ents = mctx.max_mb_elem_len = 0;
+       match_last = check_matching (&mctx, fl_longest_match,
+ 				   range >= 0 ? &match_first : NULL);
+       if (match_last != -1)
  	{
! 	  if (BE (match_last == -2, 0))
  	    {
! 	      err = REG_ESPACE;
! 	      goto free_return;
! 	    }
! 	  else
! 	    {
! 	      mctx.match_last = match_last;
! 	      if ((!preg->no_sub && nmatch > 1) || dfa->nbackref)
  		{
! 		  re_dfastate_t *pstate = mctx.state_log[match_last];
! 		  mctx.last_node = check_halt_state_context (&mctx, pstate,
! 							     match_last);
  		}
! 	      if ((!preg->no_sub && nmatch > 1 && dfa->has_plural_match)
! 		  || dfa->nbackref)
  		{
! 		  err = prune_impossible_nodes (&mctx);
! 		  if (err == REG_NOERROR)
! 		    break;
! 		  if (BE (err != REG_NOMATCH, 0))
! 		    goto free_return;
! 		  match_last = -1;
  		}
+ 	      else
+ 		break; /* We found a match.  */
  	    }
  	}
! 
!       match_ctx_clean (&mctx);
      }
  
+ #ifdef DEBUG
+   assert (match_last != -1);
+   assert (err == REG_NOERROR);
+ #endif
+ 
    /* Set pmatch[] if we need.  */
!   if (nmatch > 0)
      {
        int reg_idx;
  
***************
*** 868,874 ****
  	    pmatch[reg_idx].rm_eo += match_first;
  	  }
      }
!   err = (match_last == -1) ? REG_NOMATCH : REG_NOERROR;
   free_return:
    re_free (mctx.state_log);
    if (dfa->nbackref)
--- 881,887 ----
  	    pmatch[reg_idx].rm_eo += match_first;
  	  }
      }
! 
   free_return:
    re_free (mctx.state_log);
    if (dfa->nbackref)
***************
*** 1073,1078 ****
--- 1086,1105 ----
    while (!re_string_eoi (&mctx->input))
      {
        re_dfastate_t *old_state = cur_state;
+       int next_char_idx = re_string_cur_idx (&mctx->input) + 1;
+ 
+       if (BE (next_char_idx >= mctx->input.bufs_len, 0)
+           || (BE (next_char_idx >= mctx->input.valid_len, 0)
+               && mctx->input.valid_len < mctx->input.len))
+         {
+           err = extend_buffers (mctx);
+           if (BE (err != REG_NOERROR, 0))
+ 	    {
+ 	      assert (err == REG_ESPACE);
+ 	      return -2;
+ 	    }
+         }
+ 
        cur_state = transit_state (&err, mctx, cur_state);
        if (mctx->state_log != NULL)
  	cur_state = merge_state_with_log (&err, mctx, cur_state);
***************
*** 1091,1100 ****
  	    break;
  	}
  
!       if (at_init_state)
  	{
  	  if (old_state == cur_state)
! 	    next_start_idx = re_string_cur_idx (&mctx->input);
  	  else
  	    at_init_state = 0;
  	}
--- 1118,1127 ----
  	    break;
  	}
  
!       if (BE (at_init_state, 0))
  	{
  	  if (old_state == cur_state)
! 	    next_start_idx = next_char_idx;
  	  else
  	    at_init_state = 0;
  	}
***************
*** 1110,1122 ****
  	      /* We found an appropriate halt state.  */
  	      match_last = re_string_cur_idx (&mctx->input);
  	      match = 1;
  	      if (!fl_longest_match)
  		break;
  	    }
  	}
!    }
  
!   if (match_last == -1 && p_match_first)
      *p_match_first += next_start_idx;
  
    return match_last;
--- 1137,1152 ----
  	      /* We found an appropriate halt state.  */
  	      match_last = re_string_cur_idx (&mctx->input);
  	      match = 1;
+ 
+ 	      /* We found a match, do not modify match_first below.  */
+ 	      p_match_first = NULL;
  	      if (!fl_longest_match)
  		break;
  	    }
  	}
!     }
  
!   if (p_match_first)
      *p_match_first += next_start_idx;
  
    return match_last;
***************
*** 2171,2188 ****
    re_dfastate_t **trtable;
    unsigned char ch;
  
-   if (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.bufs_len
-       || (re_string_cur_idx (&mctx->input) + 1 >= mctx->input.valid_len
- 	  && mctx->input.valid_len < mctx->input.len))
-     {
-       *err = extend_buffers (mctx);
-       if (BE (*err != REG_NOERROR, 0))
- 	return NULL;
-     }
- 
  #ifdef RE_ENABLE_I18N
        /* If the current state can accept multibyte.  */
!       if (state->accept_mb)
  	{
  	  *err = transit_state_mb (mctx, state);
  	  if (BE (*err != REG_NOERROR, 0))
--- 2201,2209 ----
    re_dfastate_t **trtable;
    unsigned char ch;
  
  #ifdef RE_ENABLE_I18N
        /* If the current state can accept multibyte.  */
!       if (BE (state->accept_mb, 0))
  	{
  	  *err = transit_state_mb (mctx, state);
  	  if (BE (*err != REG_NOERROR, 0))

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]