This is the mail archive of the libc-alpha@sources.redhat.com mailing list for the glibc project.
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]
Re: making the wide character properties Unicode compatible

To: libc-alpha at sources dot redhat dot com
Subject: Re: making the wide character properties Unicode compatible
From: Bruno Haible <haible at ilog dot fr>
Date: Sun, 1 Oct 2000 14:45:09 +0200 (CEST)
References: <14799.18936.302663.551703@honolulu.ilog.fr><m3vgvjixo5.fsf@otr.mynet.cygnus.com>
Ulrich Drepper writes:

> >   - xdigit: is lacking all the foreign digit characters, but SUSV2 says
> >       "The definition of character class xdigit requires that the characters
> >        included in character class digit be included here also."
> 
> I challenge this one.  It does not make any sense since then you would
> also have to add all forms of the letters.  This is simply unreasonable.

Actually, ISO C 99 *mandates* that the 'xdigit' class contains only ASCII
characters, and likewise the 'digit' class! Therefore here is a small adjustment
for the i18n file, which also keeps gen-unicode-ctype.c consistent with it
(and makes the code follow the GNU coding style).


2000-09-30  Bruno Haible  <haible@clisp.cons.org>

	* gen-unicode-ctype.c (is_digit, is_xdigit): Remove all non-ASCII
	digits.
	(is_alpha): Add them here.
	* locales/i18n (digit): Remove all non-ASCII digits.
	(alpha): Add them here.

*** glibc-20000928/localedata/gen-unicode-ctype.c.bak	Tue Sep 26 01:50:03 2000
--- glibc-20000928/localedata/gen-unicode-ctype.c	Fri Sep 29 01:23:19 2000
***************
*** 166,186 ****
        int n;
  
        lineno++;
!       n = getfield(stream, field0, ';');
!       n += getfield(stream, field1, ';');
!       n += getfield(stream, field2, ';');
!       n += getfield(stream, field3, ';');
!       n += getfield(stream, field4, ';');
!       n += getfield(stream, field5, ';');
!       n += getfield(stream, field6, ';');
!       n += getfield(stream, field7, ';');
!       n += getfield(stream, field8, ';');
!       n += getfield(stream, field9, ';');
!       n += getfield(stream, field10, ';');
!       n += getfield(stream, field11, ';');
!       n += getfield(stream, field12, ';');
!       n += getfield(stream, field13, ';');
!       n += getfield(stream, field14, '\n');
        if (n == 0)
  	break;
        if (n != 15)
--- 166,186 ----
        int n;
  
        lineno++;
!       n = getfield (stream, field0, ';');
!       n += getfield (stream, field1, ';');
!       n += getfield (stream, field2, ';');
!       n += getfield (stream, field3, ';');
!       n += getfield (stream, field4, ';');
!       n += getfield (stream, field5, ';');
!       n += getfield (stream, field6, ';');
!       n += getfield (stream, field7, ';');
!       n += getfield (stream, field8, ';');
!       n += getfield (stream, field9, ';');
!       n += getfield (stream, field10, ';');
!       n += getfield (stream, field11, ';');
!       n += getfield (stream, field12, ';');
!       n += getfield (stream, field13, ';');
!       n += getfield (stream, field14, '\n');
        if (n == 0)
  	break;
        if (n != 15)
***************
*** 196,216 ****
  	{
  	  /* Deal with a range. */
  	  lineno++;
! 	  n = getfield(stream, field0, ';');
! 	  n += getfield(stream, field1, ';');
! 	  n += getfield(stream, field2, ';');
! 	  n += getfield(stream, field3, ';');
! 	  n += getfield(stream, field4, ';');
! 	  n += getfield(stream, field5, ';');
! 	  n += getfield(stream, field6, ';');
! 	  n += getfield(stream, field7, ';');
! 	  n += getfield(stream, field8, ';');
! 	  n += getfield(stream, field9, ';');
! 	  n += getfield(stream, field10, ';');
! 	  n += getfield(stream, field11, ';');
! 	  n += getfield(stream, field12, ';');
! 	  n += getfield(stream, field13, ';');
! 	  n += getfield(stream, field14, '\n');
  	  if (n != 15)
  	    {
  	      fprintf (stderr, "missing end range in '%s':%d\n",
--- 196,216 ----
  	{
  	  /* Deal with a range. */
  	  lineno++;
! 	  n = getfield (stream, field0, ';');
! 	  n += getfield (stream, field1, ';');
! 	  n += getfield (stream, field2, ';');
! 	  n += getfield (stream, field3, ';');
! 	  n += getfield (stream, field4, ';');
! 	  n += getfield (stream, field5, ';');
! 	  n += getfield (stream, field6, ';');
! 	  n += getfield (stream, field7, ';');
! 	  n += getfield (stream, field8, ';');
! 	  n += getfield (stream, field9, ';');
! 	  n += getfield (stream, field10, ';');
! 	  n += getfield (stream, field11, ';');
! 	  n += getfield (stream, field12, ';');
! 	  n += getfield (stream, field13, ';');
! 	  n += getfield (stream, field14, '\n');
  	  if (n != 15)
  	    {
  	      fprintf (stderr, "missing end range in '%s':%d\n",
***************
*** 390,406 ****
  	      || (unicode_attributes[ch].category[0] == 'S'
  		  && unicode_attributes[ch].category[1] == 'o'
  		  && strstr (unicode_attributes[ch].name, " LETTER ")
! 		     != NULL)));
  }
  
  static bool
  is_digit (unsigned int ch)
  {
    return (unicode_attributes[ch].name != NULL
  	  && unicode_attributes[ch].category[0] == 'N'
  	  && unicode_attributes[ch].category[1] == 'd');
    /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
       a zero.  Must add <0> in front of them by hand.  */
  }
  
  static bool
--- 390,424 ----
  	      || (unicode_attributes[ch].category[0] == 'S'
  		  && unicode_attributes[ch].category[1] == 'o'
  		  && strstr (unicode_attributes[ch].name, " LETTER ")
! 		     != NULL)
! 	      /* Consider all the non-ASCII digits as alphabetic.
! 		 ISO C 99 forbids us to have them in category "digit",
! 		 but we want iswalnum to return true on them.  */
! 	      || (unicode_attributes[ch].category[0] == 'N'
! 		  && unicode_attributes[ch].category[1] == 'd'
! 		  && !(ch >= 0x0030 && ch <= 0x0039))));
  }
  
  static bool
  is_digit (unsigned int ch)
  {
+ #if 0
    return (unicode_attributes[ch].name != NULL
  	  && unicode_attributes[ch].category[0] == 'N'
  	  && unicode_attributes[ch].category[1] == 'd');
    /* Note: U+0BE7..U+0BEF and U+1369..U+1371 are digit systems without
       a zero.  Must add <0> in front of them by hand.  */
+ #else
+   /* SUSV2 gives us some freedom for the "digit" category, but ISO C 99
+      takes it away:
+      7.25.2.1.5:
+         The iswdigit function tests for any wide character that corresponds
+         to a decimal-digit character (as defined in 5.2.1).
+      5.2.1:
+         the 10 decimal digits 0 1 2 3 4 5 6 7 8 9
+    */
+   return (ch >= 0x0030 && ch <= 0x0039);
+ #endif
  }
  
  static bool
***************
*** 455,463 ****
--- 473,495 ----
  static bool
  is_xdigit (unsigned int ch)
  {
+ #if 0
    return is_digit (ch)
  	 || (ch >= 0x0041 && ch <= 0x0046)
  	 || (ch >= 0x0061 && ch <= 0x0066);
+ #else
+   /* SUSV2 gives us some freedom for the "xdigit" category, but ISO C 99
+      takes it away:
+      7.25.2.1.12:
+         The iswxdigit function tests for any wide character that corresponds
+         to a hexadecimal-digit character (as defined in 6.4.4.1).
+      6.4.4.1:
+         hexadecimal-digit: one of 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
+    */
+   return (ch >= 0x0030 && ch <= 0x0039)
+ 	 || (ch >= 0x0041 && ch <= 0x0046)
+ 	 || (ch >= 0x0061 && ch <= 0x0066);
+ #endif
  }
  
  static bool
*** glibc-20000928/localedata/dump-ctype.c.bak	Tue Sep 26 01:50:48 2000
--- glibc-20000928/localedata/dump-ctype.c	Fri Sep 29 01:22:32 2000
***************
*** 113,119 ****
      }
  }
  
! int main (int argc, char *argv[])
  {
    size_t i;
  
--- 113,120 ----
      }
  }
  
! int
! main (int argc, char *argv[])
  {
    size_t i;
  
*** glibc-20000928/localedata/locales/i18n.bak	Tue Sep 26 14:41:43 2000
--- glibc-20000928/localedata/locales/i18n	Sat Sep 30 02:36:34 2000
***************
*** 305,317 ****
     <UFE70>..<UFE72>;<UFE74>;<UFE76>..<UFEFC>;/
  % HALFWIDTH AND FULLWIDTH FORMS/
     <UFF21>..<UFF3A>;<UFF41>..<UFF5A>;<UFF66>..<UFFBE>;<UFFC2>..<UFFC7>;/
!    <UFFCA>..<UFFCF>;<UFFD2>..<UFFD7>;<UFFDA>..<UFFDC>
! 
! % The "digit" class of the "i18n" FDCC-set is reflecting
! % the recommendations in TR 10176 annex A
! digit /
! % TABLE 1 BASIC LATIN/
!   <U0030>..<U0039>;/
  % TABLE 15 and 16 ARABIC/
    <U0660>..<U0669>;<U06F0>..<U06F9>;/
  % TABLE 17 DEVANAGARI/
--- 305,314 ----
     <UFE70>..<UFE72>;<UFE74>;<UFE76>..<UFEFC>;/
  % HALFWIDTH AND FULLWIDTH FORMS/
     <UFF21>..<UFF3A>;<UFF41>..<UFF5A>;<UFF66>..<UFFBE>;<UFFC2>..<UFFC7>;/
!    <UFFCA>..<UFFCF>;<UFFD2>..<UFFD7>;<UFFDA>..<UFFDC>;/
! % The non-ASCII number characters are included here because ISO C 99    /
! % forbids us to classify them as digits; however, they behave more like /
! % alphanumeric than like punctuation.                                   /
  % TABLE 15 and 16 ARABIC/
    <U0660>..<U0669>;<U06F0>..<U06F9>;/
  % TABLE 17 DEVANAGARI/
***************
*** 349,354 ****
--- 346,356 ----
  % HALFWIDTH AND FULLWIDTH FORMS/
     <UFF10>..<UFF19>
  
+ % The "digit" class must only contain the BASIC LATIN digits, says ISO C 99
+ % (sections 7.25.2.1.5 and 5.2.1).
+ digit /
+    <U0030>..<U0039>
+ 
  outdigit <U0030>..<U0039>
  
  space /
***************
*** 602,607 ****
--- 604,611 ----
     <UFFD2>..<UFFD7>;<UFFDA>..<UFFDC>;<UFFE0>..<UFFE6>;<UFFE8>..<UFFEE>;/
     <UFFF9>..<UFFFD>
  
+ % The "xdigit" class must only contain the BASIC LATIN digits and A-F, a-f,
+ % says ISO C 99 (sections 7.25.2.1.12 and 6.4.4.1).
  xdigit /
     <U0030>..<U0039>;<U0041>..<U0046>;<U0061>..<U0066>
Follow-Ups:
- Re: making the wide character properties Unicode compatible
  - From: Ulrich Drepper
Index Nav:	[Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav:	[Date Prev] [Date Next]	[Thread Prev] [Thread Next]