This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Fix buffer overrun in regexp matcher


On 01/29/2013 11:13 AM, Andreas Schwab wrote:
> When extending regex buffers, make sure we allocate enough room for the
> state log.  Merely doubling the space may not be enough if the current
> node has accepted a long run of characters.  This part of the code only
> triggers with multibyte characters.
> 
> Andreas.
> 
> 	[BZ #15078]
> 	* posix/regexec.c (extend_buffers): Add parameter min_len.
> 	(check_matching): Pass minimum needed length.
> 	(clean_state_log_if_needed): Likewise.
> 	(get_subexp): Likewise.
> 	* posix/Makefile (tests): Add bug-regex34.
> 	(bug-regex34-ENV): Define.
> 	* posix/bug-regex34.c: New file.

This is looking good, but I have one question for you below.
 
> diff --git a/NEWS b/NEWS
> index 9a039d8..ec4175c 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -10,7 +10,7 @@ Version 2.18
>  * The following bugs are resolved with this release:
>  
>    13951, 14200, 14317, 14327, 14496, 14964, 14981, 14982, 14985, 14994,
> -  14996, 15003, 15020, 15023, 15036.
> +  14996, 15003, 15020, 15023, 15036, 15078.
>  
>  
>  Version 2.17
> diff --git a/posix/Makefile b/posix/Makefile
> index 57672d8..6ceb440 100644
> --- a/posix/Makefile
> +++ b/posix/Makefile
> @@ -86,7 +86,7 @@ tests		:= tstgetopt testfnm runtests runptests	     \
>  		   tst-rfc3484-3 \
>  		   tst-getaddrinfo3 tst-fnmatch2 tst-cpucount tst-cpuset \
>  		   bug-getopt1 bug-getopt2 bug-getopt3 bug-getopt4 \
> -		   bug-getopt5 tst-getopt_long1
> +		   bug-getopt5 tst-getopt_long1 bug-regex34
>  xtests		:= bug-ga2
>  ifeq (yes,$(build-shared))
>  test-srcs	:= globtest
> @@ -199,6 +199,7 @@ bug-regex26-ENV = LOCPATH=$(common-objpfx)localedata
>  bug-regex30-ENV = LOCPATH=$(common-objpfx)localedata
>  bug-regex32-ENV = LOCPATH=$(common-objpfx)localedata
>  bug-regex33-ENV = LOCPATH=$(common-objpfx)localedata
> +bug-regex34-ENV = LOCPATH=$(common-objpfx)localedata
>  tst-rxspencer-ARGS = --utf8 rxspencer/tests
>  tst-rxspencer-ENV = LOCPATH=$(common-objpfx)localedata
>  tst-pcre-ARGS = PCRE.tests
> diff --git a/posix/bug-regex34.c b/posix/bug-regex34.c
> new file mode 100644
> index 0000000..bb3b613
> --- /dev/null
> +++ b/posix/bug-regex34.c
> @@ -0,0 +1,46 @@
> +/* Test re_search with multi-byte characters in UTF-8.
> +   Copyright (C) 2013 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#define _GNU_SOURCE 1
> +#include <stdio.h>
> +#include <string.h>
> +#include <locale.h>
> +#include <regex.h>
> +
> +static int
> +do_test (void)
> +{
> +  struct re_pattern_buffer r;
> +  /* ကျွန်ုပ်x */
> +  const char *s = "\xe1\x80\x80\xe1\x80\xbb\xe1\x80\xbd\xe1\x80\x94\xe1\x80\xba\xe1\x80\xaf\xe1\x80\x95\xe1\x80\xbax";

Looks good. Matches what I get.

> +
> +  if (setlocale (LC_ALL, "en_US.UTF-8") == NULL)
> +    {
> +      puts ("setlocale failed");
> +      return 1;
> +    }
> +  memset (&r, 0, sizeof (r));
> +
> +  re_compile_pattern ("[^x]x", 5, &r);
> +  /* This was triggering a buffer overflow.  */
> +  re_search (&r, s, strlen (s), 0, strlen (s), 0);
> +  return 0;
> +}
> +
> +#define TEST_FUNCTION do_test ()
> +#include "../test-skeleton.c"
> diff --git a/posix/regexec.c b/posix/regexec.c
> index 7f2de85..5ca2bf6 100644
> --- a/posix/regexec.c
> +++ b/posix/regexec.c
> @@ -197,7 +197,7 @@ static int group_nodes_into_DFAstates (const re_dfa_t *dfa,
>  static int check_node_accept (const re_match_context_t *mctx,
>  			      const re_token_t *node, int idx)
>       internal_function;
> -static reg_errcode_t extend_buffers (re_match_context_t *mctx)
> +static reg_errcode_t extend_buffers (re_match_context_t *mctx, int min_len)
>       internal_function;
>  
>  /* Entry point for POSIX code.  */
> @@ -1160,7 +1160,7 @@ check_matching (re_match_context_t *mctx, int fl_longest_match,
>  	  || (BE (next_char_idx >= mctx->input.valid_len, 0)
>  	      && mctx->input.valid_len < mctx->input.len))
>  	{
> -	  err = extend_buffers (mctx);
> +	  err = extend_buffers (mctx, next_char_idx + 1);

OK.

>  	  if (BE (err != REG_NOERROR, 0))
>  	    {
>  	      assert (err == REG_ESPACE);
> @@ -1738,7 +1738,7 @@ clean_state_log_if_needed (re_match_context_t *mctx, int next_state_log_idx)
>  	  && mctx->input.valid_len < mctx->input.len))
>      {
>        reg_errcode_t err;
> -      err = extend_buffers (mctx);
> +      err = extend_buffers (mctx, next_state_log_idx + 1);

OK.

>        if (BE (err != REG_NOERROR, 0))
>  	return err;
>      }
> @@ -2792,7 +2792,7 @@ get_subexp (re_match_context_t *mctx, int bkref_node, int bkref_str_idx)
>  		  if (bkref_str_off >= mctx->input.len)
>  		    break;
>  
> -		  err = extend_buffers (mctx);
> +		  err = extend_buffers (mctx, bkref_str_off + 1);

OK.

Those are all three calls to extend_buffers, the same ones I found.

>  		  if (BE (err != REG_NOERROR, 0))
>  		    return err;
>  
> @@ -4102,7 +4102,7 @@ check_node_accept (const re_match_context_t *mctx, const re_token_t *node,
>  
>  static reg_errcode_t
>  internal_function __attribute_warn_unused_result__
> -extend_buffers (re_match_context_t *mctx)
> +extend_buffers (re_match_context_t *mctx, int min_len)
>  {
>    reg_errcode_t ret;
>    re_string_t *pstr = &mctx->input;
> @@ -4111,8 +4111,10 @@ extend_buffers (re_match_context_t *mctx)
>    if (BE (INT_MAX / 2 / sizeof (re_dfastate_t *) <= pstr->bufs_len, 0))
>      return REG_ESPACE;
>  
> -  /* Double the lengthes of the buffers.  */
> -  ret = re_string_realloc_buffers (pstr, MIN (pstr->len, pstr->bufs_len * 2));
> +  /* Double the lengthes of the buffers, but allocate at least MIN_LEN.  */

This comment hasn't been true since MIN() was added by:
~~~
commit 8887a920a4b81a500f54893250085e0d1a52cf9a
Author: Ulrich Drepper <drepper@gmail.com>
Date:   Sat May 28 17:14:30 2011 -0400

    Fix unnecessary overallocation due to incomplete character
    
    When incomplete characters are found at the end of a string the
    code ran amok and allocated lots of memory.  Stricter limits
    are now in place.
~~~

> +  ret = re_string_realloc_buffers (pstr,
> +				   MAX (min_len,
> +					MIN (pstr->len, pstr->bufs_len * 2)));

Certainly `pstr->bufs_len * 2` is a doubling of the length of
the mbs or wcs buffers.

However, `pstr->len` is always, give or take, the length of the
raw input string, e.g. mbs_raw.

Thus MIN() never yields anything near double the length.

Given that you extended the function to include a length parameter,
which was a good idea, why can't we just use that?

e.g.

ret = re_string_realloc_buffers (pstr, min_len);

Why wouldn't that just work?

Isn't our goal to minimize the overallocation of the buffers?

>    if (BE (ret != REG_NOERROR, 0))
>      return ret;
>  
> 

Cheers,
Carlos.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]