This is the mail archive of the newlib@sourceware.org mailing list for the newlib project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

[PATCH/RFA] Add CJK ambiguous character handling dependent on language


Hi,

as discussed three weeks ago, I'll now propose the following patch.  It
changes __wcwidth along the lines of Markus Kuhn's code and Iwamuro
Motonori's proposal to use the language set via setlocale(3) to return
different character widths for the "CJK Ambiguous Width" category of
characters.  Tested on Cygwin.

Ok to apply?


Thanks,
Corinna


	* libc/locale/locale.c (lc_ctype_cjk_lang): New static int.
	(loadlocale): In case of setting LC_CTYPE, set lc_ctype_cjk_lang
	to 1 if the language is "ja", "ko", or "zh", to 0 otherwise.
	(__locale_cjk_lang): New function.
	* libc/string/local.h (__locale_cjk_lang): Declare.
	* libc/string/wcwidth.c: Fix comments.
	(__wcwidth): Handle CJK Ambiguous Width characters according
	to setting returned by __locale_cjk_lang.


Index: libc/locale/locale.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/locale/locale.c,v
retrieving revision 1.19
diff -u -p -r1.19 locale.c
--- libc/locale/locale.c	13 May 2009 14:54:11 -0000	1.19
+++ libc/locale/locale.c	3 Jun 2009 10:54:53 -0000
@@ -204,6 +204,7 @@ static const char *__get_locale_env(stru
 
 static char lc_ctype_charset[ENCODING_LEN + 1] = "ASCII";
 static char lc_message_charset[ENCODING_LEN + 1] = "ASCII";
+static int lc_ctype_cjk_lang = 0;
 
 char *
 _DEFUN(_setlocale_r, (p, category, locale),
@@ -602,6 +603,14 @@ loadlocale(struct _reent *p, int categor
       __wctomb = l_wctomb;
       __mbtowc = l_mbtowc;
       __set_ctype (charset);
+      /* Check for the language part of the locale specifier.  In case
+         of "ja", "ko", or "zh", assume the use of CJK fonts.  This is
+	 stored in lc_ctype_cjk_lang and tested in wcwidth() to figure
+	 out the width to return (1 or 2) for the "CJK Ambiguous Width"
+	 category of characters. */
+      lc_ctype_cjk_lang = (strncmp (locale, "ja", 2) == 0
+			   || strncmp (locale, "ko", 2) == 0
+			   || strncmp (locale, "zh", 2) == 0);
 #endif
     }
   else if (category == LC_MESSAGES)
@@ -645,6 +654,12 @@ _DEFUN_VOID(__locale_msgcharset)
   return lc_message_charset;
 }
 
+int
+_DEFUN_VOID(__locale_cjk_lang)
+{
+  return lc_ctype_cjk_lang;
+}
+
 struct lconv *
 _DEFUN(_localeconv_r, (data), 
       struct _reent *data)
Index: libc/string/local.h
===================================================================
RCS file: /cvs/src/src/newlib/libc/string/local.h,v
retrieving revision 1.2
diff -u -p -r1.2 local.h
--- libc/string/local.h	2 Jun 2009 09:41:06 -0000	1.2
+++ libc/string/local.h	3 Jun 2009 10:54:53 -0000
@@ -3,3 +3,7 @@
 
 /* internal function to compute width of wide char. */
 int _EXFUN (__wcwidth, (wint_t));
+
+/* Defined in locale/locale.c.  Returns a value != 0 if the current
+   language is assumed to use CJK fonts. */
+int __locale_cjk_lang ();
Index: libc/string/wcwidth.c
===================================================================
RCS file: /cvs/src/src/newlib/libc/string/wcwidth.c,v
retrieving revision 1.3
diff -u -p -r1.3 wcwidth.c
--- libc/string/wcwidth.c	15 May 2009 11:40:28 -0000	1.3
+++ libc/string/wcwidth.c	3 Jun 2009 10:54:53 -0000
@@ -127,7 +127,7 @@ bisearch(wint_t ucs, const struct interv
 }
 #endif /* _MB_CAPABLE */
 
-/* The following two functions define the column width of an ISO 10646
+/* The following function defines the column width of an ISO 10646
  * character as follows:
  *
  *    - The null character (U+0000) has a column width of 0.
@@ -135,6 +135,11 @@ bisearch(wint_t ucs, const struct interv
  *    - Other C0/C1 control characters and DEL will lead to a return
  *      value of -1.
  *
+ *    - If the current language is recognized as a language usually using
+ *      CJK fonts, spacing characters in the East Asian Ambiguous (A)
+ *      category as defined in Unicode Technical Report #11 have a column
+ *      width of 2.
+ *
  *    - Non-spacing and enclosing combining characters (general
  *      category code Mn or Me in the Unicode database) have a
  *      column width of 0.
@@ -155,7 +160,7 @@ bisearch(wint_t ucs, const struct interv
  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
  *      etc.) have a column width of 1.
  *
- * This implementation assumes that wchar_t characters are encoded
+ * This implementation assumes that wint_t characters are encoded
  * in ISO 10646.
  */
 
@@ -164,6 +169,62 @@ _DEFUN (__wcwidth, (ucs),
 	_CONST wint_t ucs)
 {
 #ifdef _MB_CAPABLE
+  /* sorted list of non-overlapping intervals of East Asian Ambiguous
+   * characters, generated by "uniset +WIDTH-A -cat=Me -cat=Mn -cat=Cf c" */
+  static const struct interval ambiguous[] = {
+    { 0x00A1, 0x00A1 }, { 0x00A4, 0x00A4 }, { 0x00A7, 0x00A8 },
+    { 0x00AA, 0x00AA }, { 0x00AE, 0x00AE }, { 0x00B0, 0x00B4 },
+    { 0x00B6, 0x00BA }, { 0x00BC, 0x00BF }, { 0x00C6, 0x00C6 },
+    { 0x00D0, 0x00D0 }, { 0x00D7, 0x00D8 }, { 0x00DE, 0x00E1 },
+    { 0x00E6, 0x00E6 }, { 0x00E8, 0x00EA }, { 0x00EC, 0x00ED },
+    { 0x00F0, 0x00F0 }, { 0x00F2, 0x00F3 }, { 0x00F7, 0x00FA },
+    { 0x00FC, 0x00FC }, { 0x00FE, 0x00FE }, { 0x0101, 0x0101 },
+    { 0x0111, 0x0111 }, { 0x0113, 0x0113 }, { 0x011B, 0x011B },
+    { 0x0126, 0x0127 }, { 0x012B, 0x012B }, { 0x0131, 0x0133 },
+    { 0x0138, 0x0138 }, { 0x013F, 0x0142 }, { 0x0144, 0x0144 },
+    { 0x0148, 0x014B }, { 0x014D, 0x014D }, { 0x0152, 0x0153 },
+    { 0x0166, 0x0167 }, { 0x016B, 0x016B }, { 0x01CE, 0x01CE },
+    { 0x01D0, 0x01D0 }, { 0x01D2, 0x01D2 }, { 0x01D4, 0x01D4 },
+    { 0x01D6, 0x01D6 }, { 0x01D8, 0x01D8 }, { 0x01DA, 0x01DA },
+    { 0x01DC, 0x01DC }, { 0x0251, 0x0251 }, { 0x0261, 0x0261 },
+    { 0x02C4, 0x02C4 }, { 0x02C7, 0x02C7 }, { 0x02C9, 0x02CB },
+    { 0x02CD, 0x02CD }, { 0x02D0, 0x02D0 }, { 0x02D8, 0x02DB },
+    { 0x02DD, 0x02DD }, { 0x02DF, 0x02DF }, { 0x0391, 0x03A1 },
+    { 0x03A3, 0x03A9 }, { 0x03B1, 0x03C1 }, { 0x03C3, 0x03C9 },
+    { 0x0401, 0x0401 }, { 0x0410, 0x044F }, { 0x0451, 0x0451 },
+    { 0x2010, 0x2010 }, { 0x2013, 0x2016 }, { 0x2018, 0x2019 },
+    { 0x201C, 0x201D }, { 0x2020, 0x2022 }, { 0x2024, 0x2027 },
+    { 0x2030, 0x2030 }, { 0x2032, 0x2033 }, { 0x2035, 0x2035 },
+    { 0x203B, 0x203B }, { 0x203E, 0x203E }, { 0x2074, 0x2074 },
+    { 0x207F, 0x207F }, { 0x2081, 0x2084 }, { 0x20AC, 0x20AC },
+    { 0x2103, 0x2103 }, { 0x2105, 0x2105 }, { 0x2109, 0x2109 },
+    { 0x2113, 0x2113 }, { 0x2116, 0x2116 }, { 0x2121, 0x2122 },
+    { 0x2126, 0x2126 }, { 0x212B, 0x212B }, { 0x2153, 0x2154 },
+    { 0x215B, 0x215E }, { 0x2160, 0x216B }, { 0x2170, 0x2179 },
+    { 0x2190, 0x2199 }, { 0x21B8, 0x21B9 }, { 0x21D2, 0x21D2 },
+    { 0x21D4, 0x21D4 }, { 0x21E7, 0x21E7 }, { 0x2200, 0x2200 },
+    { 0x2202, 0x2203 }, { 0x2207, 0x2208 }, { 0x220B, 0x220B },
+    { 0x220F, 0x220F }, { 0x2211, 0x2211 }, { 0x2215, 0x2215 },
+    { 0x221A, 0x221A }, { 0x221D, 0x2220 }, { 0x2223, 0x2223 },
+    { 0x2225, 0x2225 }, { 0x2227, 0x222C }, { 0x222E, 0x222E },
+    { 0x2234, 0x2237 }, { 0x223C, 0x223D }, { 0x2248, 0x2248 },
+    { 0x224C, 0x224C }, { 0x2252, 0x2252 }, { 0x2260, 0x2261 },
+    { 0x2264, 0x2267 }, { 0x226A, 0x226B }, { 0x226E, 0x226F },
+    { 0x2282, 0x2283 }, { 0x2286, 0x2287 }, { 0x2295, 0x2295 },
+    { 0x2299, 0x2299 }, { 0x22A5, 0x22A5 }, { 0x22BF, 0x22BF },
+    { 0x2312, 0x2312 }, { 0x2460, 0x24E9 }, { 0x24EB, 0x254B },
+    { 0x2550, 0x2573 }, { 0x2580, 0x258F }, { 0x2592, 0x2595 },
+    { 0x25A0, 0x25A1 }, { 0x25A3, 0x25A9 }, { 0x25B2, 0x25B3 },
+    { 0x25B6, 0x25B7 }, { 0x25BC, 0x25BD }, { 0x25C0, 0x25C1 },
+    { 0x25C6, 0x25C8 }, { 0x25CB, 0x25CB }, { 0x25CE, 0x25D1 },
+    { 0x25E2, 0x25E5 }, { 0x25EF, 0x25EF }, { 0x2605, 0x2606 },
+    { 0x2609, 0x2609 }, { 0x260E, 0x260F }, { 0x2614, 0x2615 },
+    { 0x261C, 0x261C }, { 0x261E, 0x261E }, { 0x2640, 0x2640 },
+    { 0x2642, 0x2642 }, { 0x2660, 0x2661 }, { 0x2663, 0x2665 },
+    { 0x2667, 0x266A }, { 0x266C, 0x266D }, { 0x266F, 0x266F },
+    { 0x273D, 0x273D }, { 0x2776, 0x277F }, { 0xE000, 0xF8FF },
+    { 0xFFFD, 0xFFFD }, { 0xF0000, 0xFFFFD }, { 0x100000, 0x10FFFD }
+  };
   /* sorted list of non-overlapping intervals of non-spacing characters */
   /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */
   static const struct interval combining[] = {
@@ -217,6 +278,12 @@ _DEFUN (__wcwidth, (ucs),
     { 0xE0100, 0xE01EF }
   };
 
+  /* binary search in table of ambiguous characters */
+  if (__locale_cjk_lang ()
+      && bisearch(ucs, ambiguous,
+		  sizeof(ambiguous) / sizeof(struct interval) - 1))
+    return 2;
+
   /* test for 8-bit control characters */
   if (ucs == 0)
     return 0;


-- 
Corinna Vinschen
Cygwin Project Co-Leader
Red Hat


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]