This is the mail archive of the libc-alpha@sourceware.org mailing list for the glibc project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Re: [PATCH] Complete GB18030 charmap


Andreas Schwab wrote:
> > How did you develop the patch?
>
> From ICU
> (http://source.icu-project.org/repos/icu/data/trunk/charset/source/gb18030
> and
> http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/gb18030.ucm)
>
> > What testing did you do with this patch?
>
> tst-tables.sh tests for consistency.

I have also tested this patch, checking the conversion table in both
directions (extracted through the attached programs) against the one that
will be used in the next release of libiconv.

The patch is perfect. It contains the following irreversible mappings
in the multibyte -> Unicode direction, for backward compatibility:

0x95329031	U+20087
0x95329033	U+20089
0x95329730	U+200CC
0x9536B937	U+215D7
0x9630BA35	U+2298F
0x9635B630	U+241FE

GNU libiconv also contains the following irreversible mappings
in the multibyte -> Unicode direction, also for backward compatibility:

0x82359037	U+9FB4
0x82359038	U+9FB5
0x82359039	U+9FB6
0x82359130	U+9FB7
0x82359131	U+9FB8
0x82359132	U+9FB9
0x82359133	U+9FBA
0x82359134	U+9FBB
0x84318236	U+FE10
0x84318237	U+FE11
0x84318238	U+FE12
0x84318239	U+FE13
0x84318330	U+FE14
0x84318331	U+FE15
0x84318332	U+FE16
0x84318333	U+FE17
0x84318334	U+FE18
0x84318335	U+FE19

These byte sequences may appear in text files that were created
with previous versions of libiconv. But glibc did not support these byte
sequences (I tested glibc 2.9 and 2.11), so the glibc converter does not
really need to contain them.

In other words, thanks Andreas for having cleaned up the long standing
issues with this converter!

Bruno

/* Copyright (C) 2000-2002, 2004-2005 Free Software Foundation, Inc.
   This file is part of the GNU LIBICONV Library.

   The GNU LIBICONV Library is free software; you can redistribute it
   and/or modify it under the terms of the GNU Library General Public
   License as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.

   The GNU LIBICONV Library is distributed in the hope that it will be
   useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the GNU LIBICONV Library; see the file COPYING.LIB.
   If not, see <http://www.gnu.org/licenses/>.  */

/* Create a table from CHARSET to Unicode. */

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iconv.h>
#include <errno.h>

/* If nonzero, ignore conversions outside Unicode plane 0.
   Set once in main() (nonzero only when the charset is "UTF-8") and
   consulted by ucs4_decode(). */
static int bmp_only;

/* Formats the first BUFLEN bytes of BUF as "0x" followed by two uppercase
   hexadecimal digits per byte.  BUFLEN must be between 1 and 4; any other
   value aborts the program.
   Returns a pointer to a static buffer that is overwritten by the next
   call. */
static const char* hexbuf (unsigned char buf[], unsigned int buflen)
{
  static char msg[50];
  char* cursor = msg;
  unsigned int k;

  if (buflen < 1 || buflen > 4)
    abort();

  cursor += sprintf(cursor,"0x");
  for (k = 0; k < buflen; k++)
    cursor += sprintf(cursor,"%02X",buf[k]);
  return msg;
}

/* Converts the BUFLEN bytes in BUF through the conversion descriptor CD,
   storing the output into OUT, which must have room for 3 unsigned ints.
   Returns
     > 0  the number of unsigned int values stored in OUT,
     0    if iconv reported an incomplete input sequence (EINVAL),
     -1   if iconv reported an invalid input sequence (EILSEQ) or reported
          a nonzero count of non-reversible conversions.
   Any other iconv failure, or a successful conversion that leaves input
   bytes unconsumed, prints a diagnostic to stderr and exits with status 1. */
static int try (iconv_t cd, unsigned char buf[], unsigned int buflen, unsigned int* out)
{
  const size_t outsize = 3*sizeof(unsigned int);
  const char* inbuf = (const char*) buf;
  size_t inbytesleft = buflen;
  char* outbuf = (char*) out;
  size_t outbytesleft = outsize;
  size_t result;
  size_t result2 = 0;

  /* Put the descriptor back into its initial state before each attempt. */
  iconv(cd,NULL,NULL,NULL,NULL);
  result = iconv(cd,(char**)&inbuf,&inbytesleft,&outbuf,&outbytesleft);
  /* Flush pending output.  The status of the flush is kept separately so
     that it does not overwrite the count returned by the conversion. */
  if (result != (size_t)(-1))
    result2 = iconv(cd,NULL,NULL,&outbuf,&outbytesleft);

  if (result == (size_t)(-1) || result2 == (size_t)(-1)) {
    int saved_errno = errno;
    if (saved_errno == EILSEQ)
      return -1;
    if (saved_errno == EINVAL)
      return 0;
    /* fprintf may clobber errno, so restore it for perror. */
    fprintf(stderr,"%s: iconv error: ",hexbuf(buf,buflen));
    errno = saved_errno;
    perror("");
    exit(1);
  }

  if (result > 0 || result2 > 0) /* ignore conversions with transliteration */
    return -1;

  if (inbytesleft != 0) {
    fprintf(stderr,"%s: inbytes = %ld, outbytes = %ld\n",hexbuf(buf,buflen),(long)(buflen-inbytesleft),(long)(outsize-outbytesleft));
    exit(1);
  }
  return (int)((outsize-outbytesleft)/sizeof(unsigned int));
}

/* Returns the out[] buffer as Unicode values, each formatted as 0x%04X and
   separated by a single space.
   OUT holds OUTLEN values; at most 3 values are supported, matching the
   size of the output buffer used by the callers.
   Returns NULL if bmp_only is nonzero and a value is above 0xFFFF (i.e. it
   would need more than 4 hexadecimal digits).  Otherwise returns a pointer
   to a static buffer that is overwritten by the next call; for OUTLEN == 0
   this is the empty string.
   Aborts if the formatted text would not fit into the static buffer. */
static const char* ucs4_decode (const unsigned int* out, unsigned int outlen)
{
  /* Room for 3 values of "0x" plus two hex digits per byte of an
     unsigned int, 2 separating spaces and the terminating NUL.  A buffer
     sized for 4-digit values only would overflow as soon as three values
     are formatted and one of them lies above 0xFFFF. */
  static char text[3*(2+2*sizeof(unsigned int))+2+1];
  size_t used = 0;
  unsigned int k;

  text[0] = '\0';
  for (k = 0; k < outlen; k++) {
    int n;
    /* "0x%04X" produces more than 6 characters exactly when the value
       exceeds 0xFFFF. */
    if (bmp_only && out[k] > 0xFFFF)
      return NULL;
    n = snprintf(text+used,sizeof(text)-used,"%s0x%04X",(k > 0 ? " " : ""),out[k]);
    if (n < 0 || (size_t)n >= sizeof(text)-used)
      abort();
    used += (size_t)n;
  }
  return text;
}

/* Tries every byte value at position LEN of BUF (positions 0..LEN-1 are
   already filled in) and prints one "bytes<TAB>unicode" line to stdout for
   each sequence that converts.  When a sequence is reported as incomplete,
   the search continues one byte deeper; sequences of 3 bytes are only
   extended when SEARCH_DEPTH > 3, and an incomplete 4-byte sequence is a
   fatal error.  OUT is scratch space for 3 converted values. */
static void scan_prefix (iconv_t cd, unsigned char buf[], unsigned int len, int search_depth, unsigned int out[])
{
  const unsigned int depth = len + 1;
  unsigned int byte;

  for (byte = 0; byte < 0x100; byte++) {
    int count;

    buf[len] = byte;
    count = try(cd,buf,depth,out);
    if (count > 0) {
      const char* unicode = ucs4_decode(out,count);
      if (unicode != NULL)
        printf("%s\t%s\n",hexbuf(buf,depth),unicode);
    } else if (count == 0) {
      if (depth == 4) {
        fprintf(stderr,"%s: incomplete byte sequence\n",hexbuf(buf,4));
        exit(1);
      }
      if (depth < 3 || search_depth > 3)
        scan_prefix(cd,buf,depth,search_depth,out);
    }
    /* count < 0: invalid sequence, nothing to print. */
  }
}

/* table-from: prints the conversion table from the charset named by
   argv[1] to Unicode, one line per convertible byte sequence. */
int main (int argc, char* argv[])
{
  const char* charset;
  iconv_t cd;
  int search_depth;
  int is_utf8;
  unsigned int out[3];
  unsigned char buf[4];

  if (argc != 2) {
    fprintf(stderr,"Usage: table-from charset\n");
    exit(1);
  }
  charset = argv[1];

  cd = iconv_open("UCS-4LE",charset);
  if (cd == (iconv_t)(-1)) {
    perror("iconv_open");
    exit(1);
  }

  /* When testing UTF-8, stop at 0x10000, otherwise the output file gets too
     big. */
  is_utf8 = (strcmp(charset,"UTF-8") == 0);
  bmp_only = is_utf8;
  search_depth = (is_utf8 ? 3 : 4);

  scan_prefix(cd,buf,0,search_depth,out);

  if (iconv_close(cd) < 0) {
    perror("iconv_close");
    exit(1);
  }

  if (ferror(stdin) || ferror(stdout) || fclose(stdout)) {
    fprintf(stderr,"I/O error\n");
    exit(1);
  }

  return 0;
}
/* Copyright (C) 2000-2002, 2004-2005 Free Software Foundation, Inc.
   This file is part of the GNU LIBICONV Library.

   The GNU LIBICONV Library is free software; you can redistribute it
   and/or modify it under the terms of the GNU Library General Public
   License as published by the Free Software Foundation; either version 2
   of the License, or (at your option) any later version.

   The GNU LIBICONV Library is distributed in the hope that it will be
   useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Library General Public License for more details.

   You should have received a copy of the GNU Library General Public
   License along with the GNU LIBICONV Library; see the file COPYING.LIB.
   If not, see <http://www.gnu.org/licenses/>.  */

/* Create a table from Unicode to CHARSET. */

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <iconv.h>
#include <errno.h>

/* table-to: prints the conversion table from Unicode to the charset named
   by argv[1], one "bytes<TAB>unicode" line per convertible code point. */
int main (int argc, char* argv[])
{
  const char* charset;
  iconv_t cd;
  int bmp_only;
  unsigned int limit;
  unsigned int codepoint;
  unsigned char encoded[10];

  if (argc != 2) {
    fprintf(stderr,"Usage: table-to charset\n");
    exit(1);
  }
  charset = argv[1];

  cd = iconv_open(charset,"UCS-4LE");
  if (cd == (iconv_t)(-1)) {
    perror("iconv_open");
    exit(1);
  }

  /* When testing UTF-8, stop at 0x10000, otherwise the output file gets too
     big. */
  bmp_only = (strcmp(charset,"UTF-8") == 0);
  limit = (bmp_only ? 0x10000 : 0x110000);

  for (codepoint = 0; codepoint < limit; codepoint++) {
    unsigned int in = codepoint;
    const char* inbuf = (const char*) &in;
    size_t inbytesleft = sizeof(unsigned int);
    char* outbuf = (char*)encoded;
    size_t outbytesleft = sizeof(encoded);
    size_t converted;
    size_t flushed = 0;

    /* Reset the descriptor, convert one code point, then flush. */
    iconv(cd,NULL,NULL,NULL,NULL);
    converted = iconv(cd,(char**)&inbuf,&inbytesleft,&outbuf,&outbytesleft);
    if (converted != (size_t)(-1))
      flushed = iconv(cd,NULL,NULL,&outbuf,&outbytesleft);

    if (converted == (size_t)(-1) || flushed == (size_t)(-1)) {
      int saved_errno = errno;
      /* EILSEQ means the code point is not convertible: skip it. */
      if (saved_errno == EILSEQ)
        continue;
      fprintf(stderr,"0x%02X: iconv error: ",codepoint);
      errno = saved_errno;
      perror("");
      exit(1);
    }

    /* ignore conversions with transliteration */
    if (converted != 0)
      continue;

    if (inbytesleft == 0 && outbytesleft < sizeof(encoded)) {
      unsigned int nbytes = sizeof(encoded) - outbytesleft;
      unsigned int k;
      printf("0x");
      for (k = 0; k < nbytes; k++)
        printf("%02X",encoded[k]);
      printf("\t0x%04X\n",codepoint);
      continue;
    }

    /* Language tags may silently be dropped. */
    if (inbytesleft == 0 && codepoint >= 0xe0000 && codepoint < 0xe0080)
      continue;

    fprintf(stderr,"0x%02X: inbytes = %ld, outbytes = %ld\n",codepoint,(long)(sizeof(unsigned int)-inbytesleft),(long)(sizeof(encoded)-outbytesleft));
    exit(1);
  }

  if (iconv_close(cd) < 0) {
    perror("iconv_close");
    exit(1);
  }

  if (ferror(stdin) || ferror(stdout) || fclose(stdout)) {
    fprintf(stderr,"I/O error\n");
    exit(1);
  }

  return 0;
}

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]