This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH v2 1/2] iconv_prog: Don't slurp whole input at once [BZ #6050]


On Wed, Aug 12, 2015 at 12:27:45AM +0200, Benjamin Herr wrote:
> As described in the bug report comments, contrary to the comment in
> 'process_fd' , iconv can deal with being passed invalid multibyte
> sequences, as it leaves the input pointer at the start of the sequence
> and we can just call it again once more bytes are availabe.
> 
> Therefore we do not need to read the whole input at once when reading
> from an fd, and can process it in chunks instead. This enables the iconv
> program to be used in pipelines sensibly, and to not choke on very large
> input files.
> 
> To still support reporting the position of invalid sequences in the
> input, we count consumed input bytes explicitly. As overflow of that
> counter is now possible, we use saturating addition and just mention in
> the error message when (size_t) -1 has been reached.
> ---
> 2015-08-08  Benjamin Herr  <ben@0x539.de>
> 
> 	[BZ #6050]
> 	* iconv/iconv_prog.c: Don't slurp the whole input at once, instead
> 	pass chunks to iconv and carry over incomplete multibyte sequences.
>
Could you next time split patches like that to first do refactoring to
shuffle lines around and then actual patch? It simplifies review. 

I read this patch and it look reasonable but is quite big so I would
prefer second set of eyes to also check that.
> 
>  iconv/iconv_prog.c | 247 +++++++++++++++++++++++++++++------------------------
>  1 file changed, 135 insertions(+), 112 deletions(-)
> 
> diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c
> index e249bce..63b8e33 100644
> --- a/iconv/iconv_prog.c
> +++ b/iconv/iconv_prog.c
> @@ -465,29 +465,93 @@ conversion stopped due to problem in writing the output"));
>    return 0;
>  }
>  
> +static void
> +report_iconv_error (size_t position)
> +{
> +  switch (errno)
> +    {
> +    case EILSEQ:
> +      if (position < (size_t) -1)
> +	error (0, 0,
> +	       _("illegal input sequence at position %zu"), position);
> +      else
> +	error (0, 0,
> +	       _("illegal input sequence past position %zu"),
> +	       position - 1);
> +      break;
> +    case EINVAL:
> +      error (0, 0, _("\
> +incomplete character or shift sequence at end of buffer"));
> +      break;
> +    case EBADF:
> +      error (0, 0, _("internal error (illegal descriptor)"));
> +      break;
> +    default:
> +      error (0, 0, _("unknown iconv() error %d"), errno);
> +      break;
> +    }
> +}
>  
ok, as its moved from below.

> +#define BUF_SIZE	32768
>  static int
> -process_block (iconv_t cd, char *addr, size_t len, FILE **output,
> -	       const char *output_file)
> +flush_state (iconv_t cd, FILE **output, const char *output_file, size_t offset)
>  {
> -#define OUTBUF_SIZE	32768
> -  const char *start = addr;
> -  char outbuf[OUTBUF_SIZE];
> +  char outbuf[BUF_SIZE];
> +  char *outptr;
> +  size_t outlen;
> +  size_t n;
> +
> +  /* All the input test is processed.  For state-dependent
> +     character sets we have to flush the state now.  */
> +  outptr = outbuf;
> +  outlen = BUF_SIZE;
> +  n = iconv (cd, NULL, NULL, &outptr, &outlen);
> +
> +  if (outptr != outbuf)
> +    {
> +      if (write_output (outbuf, outptr, output, output_file) == -1)
> +	return -1;
> +    }
> +
> +  if (n == (size_t) -1)
> +    {
> +      report_iconv_error (offset);
> +      return 1;
> +    }
> +
> +  return 0;
> +}
> +
ok
> +static size_t
> +saturating_add (size_t a, size_t b)
> +{
> +  if ((size_t) -1 - a > b)
> +    return a + b;
> +  else
> +    return -1;
> +}
> +
ok
> +static int
> +process_part (iconv_t cd, char **addr, size_t *len, size_t offset,
> +	      FILE **output, const char *output_file)
> +{
> +  const char *start = *addr;
> +  char outbuf[BUF_SIZE];
>    char *outptr;
>    size_t outlen;
>    size_t n;
>    int ret = 0;
>  
> -  while (len > 0)
> +  while (*len > 0)
>      {
>        outptr = outbuf;
> -      outlen = OUTBUF_SIZE;
> -      n = iconv (cd, &addr, &len, &outptr, &outlen);
> +      outlen = BUF_SIZE;
> +      n = iconv (cd, addr, len, &outptr, &outlen);
>  
>        if (n == (size_t) -1 && omit_invalid && errno == EILSEQ)
>  	{
>  	  ret = 1;
> -	  if (len == 0)
> +	  if (*len == 0)
>  	    n = 0;
>  	  else
>  	    errno = E2BIG;
> @@ -500,52 +564,18 @@ process_block (iconv_t cd, char *addr, size_t len, FILE **output,
>  	    break;
>  	}
>  
> -      if (n != (size_t) -1)
> +      /* Incomplete multibyte characters might be completed by the next
> +         chunk, so do not treat them as an error here. */
> +      if (n != (size_t) -1 || errno == EINVAL)
>  	{
> -	  /* All the input test is processed.  For state-dependent
> -	     character sets we have to flush the state now.  */
> -	  outptr = outbuf;
> -	  outlen = OUTBUF_SIZE;
> -	  n = iconv (cd, NULL, NULL, &outptr, &outlen);
> -
> -	  if (outptr != outbuf)
> -	    {
> -	      ret = write_output (outbuf, outptr, output, output_file);
> -	      if (ret != 0)
> -		break;
> -	    }
> -
> -	  if (n != (size_t) -1)
> -	    break;
> -
> -	  if (omit_invalid && errno == EILSEQ)
> -	    {
> -	      ret = 1;
> -	      break;
> -	    }
> +	  ret = 0;
> +	  break;
>  	}
>  
>        if (errno != E2BIG)
>  	{
>  	  /* iconv() ran into a problem.  */
> -	  switch (errno)
> -	    {
> -	    case EILSEQ:
> -	      if (! omit_invalid)
> -		error (0, 0, _("illegal input sequence at position %ld"),
> -		       (long int) (addr - start));
> -	      break;
> -	    case EINVAL:
> -	      error (0, 0, _("\
> -incomplete character or shift sequence at end of buffer"));
> -	      break;
> -	    case EBADF:
> -	      error (0, 0, _("internal error (illegal descriptor)"));
> -	      break;
> -	    default:
> -	      error (0, 0, _("unknown iconv() error %d"), errno);
> -	      break;
> -	    }
> +          report_iconv_error (saturating_add (offset, (*addr - start)));
>  
>  	  return -1;
>  	}
> @@ -554,83 +584,76 @@ incomplete character or shift sequence at end of buffer"));
>    return ret;
>  }
>  
ok
> -
>  static int
> -process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
> +process_block (iconv_t cd, char *addr, size_t len, FILE **output,
> +	       const char *output_file)
>  {
> -  /* we have a problem with reading from a desriptor since we must not
> -     provide the iconv() function an incomplete character or shift
> -     sequence at the end of the buffer.  Since we have to deal with
> -     arbitrary encodings we must read the whole text in a buffer and
> -     process it in one step.  */
> -  static char *inbuf = NULL;
> -  static size_t maxlen = 0;
> -  char *inptr = NULL;
> -  size_t actlen = 0;
> -
> -  while (actlen < maxlen)
> -    {
> -      ssize_t n = read (fd, inptr, maxlen - actlen);
> +  char *start = addr;
>  
> -      if (n == 0)
> -	/* No more text to read.  */
> -	break;
> +  /* Process everything in one go.  */
> +  int ret = process_part (cd, &addr, &len, 0, output, output_file);
>  
> -      if (n == -1)
> -	{
> -	  /* Error while reading.  */
> -	  error (0, errno, _("error while reading the input"));
> -	  return -1;
> -	}
> +  if (ret != 0)
> +    return ret;
>  
> -      inptr += n;
> -      actlen += n;
> +  /* If there is input left over, there is an incomplete multibyte
> +     sequence at the end.  */
> +  if (len > 0)
> +    {
> +      errno = EINVAL;
> +      report_iconv_error (addr - start);
> +      return -1;
>      }
>  
> -  if (actlen == maxlen)
> -    while (1)
> -      {
> -	ssize_t n;
> -	char *new_inbuf;
> +  return flush_state (cd, output, output_file, addr - start);
> +}
>  
> -	/* Increase the buffer.  */
> -	new_inbuf = (char *) realloc (inbuf, maxlen + 32768);
> -	if (new_inbuf == NULL)
> -	  {
> -	    error (0, errno, _("unable to allocate buffer for input"));
> -	    return -1;
> -	  }
> -	inbuf = new_inbuf;
> -	maxlen += 32768;
> -	inptr = inbuf + actlen;
>  
> -	do
> -	  {
> -	    n = read (fd, inptr, maxlen - actlen);
> +static int
> +process_fd (iconv_t cd, int fd, FILE **output, const char *output_file)
> +{
> +  char inbuf[BUF_SIZE];
> +  size_t len = 0;
> +  size_t offset = 0;
> +  ssize_t n;
>  
> -	    if (n == 0)
> -	      /* No more text to read.  */
> -	      break;
> +  /* Read into the buffer past unconsumed bytes from the last iteration.  */
> +  while ((n = read (fd, inbuf + len, BUF_SIZE - len)) > 0)
> +    {
> +      int ret;
> +      char *inptr;
>  
> -	    if (n == -1)
> -	      {
> -		/* Error while reading.  */
> -		error (0, errno, _("error while reading the input"));
> -		return -1;
> -	      }
> +      len += n;
>  
> -	    inptr += n;
> -	    actlen += n;
> -	  }
> -	while (actlen < maxlen);
> +      inptr = inbuf;
> +      /* Process what we have read.  */
> +      ret = process_part (cd, &inptr, &len, offset, output, output_file);
> +      if (ret != 0)
> +	return ret;
> +      /* Keep track of overall position in the input for error reporting.  */
> +      offset = saturating_add (offset, inptr - inbuf);
>  
> -	if (n == 0)
> -	  /* Break again so we leave both loops.  */
> -	  break;
> -      }
> +      /* Keep incomplete multibyte characters, if any.  */
> +      memmove (inbuf, inptr, len);
> +    }
> +
> +  if (n == -1)
> +    {
> +      /* Error while reading.  */
> +      error (0, errno, _("error while reading the input"));
> +      return -1;
> +    }
> +
> +  /* No more input available, so incomplete multibyte sequences are
> +     not going to get completed at this point. */
> +  if (len > 0)
> +    {
> +      errno = EINVAL;
> +      report_iconv_error (offset);
> +      return -1;
> +    }
>  
> -  /* Now we have all the input in the buffer.  Process it in one run.  */
> -  return process_block (cd, inbuf, actlen, output, output_file);
> +  return flush_state (cd, output, output_file, offset);
>  }
>  
>  


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]