This is the mail archive of the kawa@sourceware.org mailing list for the Kawa project.


Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]
Other format: [Raw text]

Patch to make XMLPrinter encoding-aware


Hello!

I've extended XMLPrinter with a field escapeCharsetEncoder ::java.nio.charset.CharsetEncoder and 2 methods:
* method (gnu.xml.XMLPrinter:setEscapeCharset printer charset), accepts string or java.nio.charset.Charset as the 2nd argument
* method (gnu.xml.XMLPrinter:setDefaultEscapeCharset printer), the same as above but uses the system charset

If either of these methods is called, XMLPrinter will &#...;-escape only those characters that cannot be encoded in the selected charset (that is, those for which the encoder's canEncode returns #f).


The patch also adds a call to xout.setDefaultEscapeCharset(); in gnu.kawa.functions.DisplayFormat, so that XML objects are displayed without unnecessary encoding, BUT frankly speaking, I'm NOT SURE this is the right way to do it: I guess the charset for display format should be set by port-char-encoding. But I don't know how to do it correctly...

So for now I've just attached the 2nd patch, "change-just-XMLPrinter.diff", that doesn't touch DisplayFormat. :) This one is guaranteed not to break anything.


Here's an example of what the patch does (if DisplayFormat is changed):
$ java -cp kawa-1.12.jar -Dfile.encoding=Shift-JIS kawa.repl -e '(display (html:p "\u0414 \u00E6 \u662F"))(newline)' |iconv -f sjis
<p xmlns="http://www.w3.org/1999/xhtml">Д &#xE6; 是</p>
$ java -cp kawa-1.12.jar -Dfile.encoding=cp1251 kawa.repl -e '(display (html:p "\u0414 \u00E6 \u662F"))(newline)' |iconv -f cp1251
<p xmlns="http://www.w3.org/1999/xhtml">Д &#xE6; &#x662F;</p>
$ java -cp kawa-1.12.jar -Dfile.encoding=utf-8 kawa.repl -e '(display (html:p "\u0414 \u00E6 \u662F"))(newline)'
<p xmlns="http://www.w3.org/1999/xhtml">Д æ 是</p>
$ java -cp kawa-1.12.jar -Dfile.encoding=cp1252 kawa.repl -e '(display (html:p "\u0414 \u00E6 \u662F"))(newline)' |iconv -f cp1252
<p xmlns="http://www.w3.org/1999/xhtml">&#x414; æ &#x662F;</p>
diff --git a/gnu/kawa/functions/DisplayFormat.java b/gnu/kawa/functions/DisplayFormat.java
index e91cbcc..657d51e 100644
--- a/gnu/kawa/functions/DisplayFormat.java
+++ b/gnu/kawa/functions/DisplayFormat.java
@@ -342,6 +342,7 @@ public class DisplayFormat extends AbstractFormat
         Writer wout = out instanceof Writer ? (Writer) out
           : new ConsumerWriter(out);
         XMLPrinter xout = new XMLPrinter(wout);
+        xout.setDefaultEscapeCharset();
         xout.writeObject(obj);
         xout.closeThis();
       }
diff --git a/gnu/xml/XMLPrinter.java b/gnu/xml/XMLPrinter.java
index 8abe9e4..4e1122b 100644
--- a/gnu/xml/XMLPrinter.java
+++ b/gnu/xml/XMLPrinter.java
@@ -4,6 +4,7 @@
 package gnu.xml;
 import gnu.lists.*;
 import java.io.*;
+import java.nio.charset.*;
 import gnu.text.*;
 import gnu.math.RealNum;
 import gnu.text.PrettyWriter;
@@ -45,6 +46,11 @@ public class XMLPrinter extends OutPort
   public int useEmptyElementTag = 2;
   public boolean escapeText = true;
   public boolean escapeNonAscii = true;
+  /** Charset encoder used to check which non-ASCII characters
+  * must be escaped (those it cannot encode). Set indirectly by
+  * setEscapeCharset or setDefaultEscapeCharset. Consulted only
+  * when escapeNonAscii is false; null means no such check. */
+  protected CharsetEncoder escapeCharsetEncoder = null;
   boolean isHtml = false;
   boolean isHtmlOrXhtml = false;
   boolean undeclareNamespaces = false;
@@ -154,10 +160,54 @@ public class XMLPrinter extends OutPort
     if ("plain".equals(style))
       escapeText = false;
   }
+  
+  public void setEscapeCharset(Charset charset) {
+    try {
+      String canonical = charset.name();
+      boolean ascii = "US-ASCII".equals(canonical);
+      boolean unicode = "UTF-8".equals(canonical)
+        || "UTF-16".equals(canonical)
+        || "UTF-16BE".equals(canonical)
+        || "UTF-16LE".equals(canonical);
+      // Only plain ASCII output forces every non-ASCII character
+      // to be hex-escaped unconditionally.
+      escapeNonAscii = ascii;
+      // ASCII and the Unicode encodings need no per-character
+      // check, so an encoder is kept only for other charsets.
+      escapeCharsetEncoder
+        = (ascii || unicode) ? null : charset.newEncoder();
+    }
+    catch (RuntimeException e) {
+      // Unusable charset (e.g. null, or no encoder): escape all.
+      escapeNonAscii = true;
+      escapeCharsetEncoder = null;
+    }
+  }
+
+  public void setEscapeCharset(String charsetName) {
+    try {  // resolve the name, then delegate to the Charset overload
+      setEscapeCharset(Charset.forName(charsetName));
+    }
+    catch (RuntimeException e) {  // null/illegal/unknown name: escape all non-ASCII
+      escapeNonAscii = true;
+      escapeCharsetEncoder = null;
+    }
+  }
+  
+  public void setDefaultEscapeCharset() { // configure escaping from the default charset
+    /* #ifdef JAVA5 */
+    Charset defaultCharset = Charset.defaultCharset();
+    /* #else */
+    // String defaultCharset = System.getProperty("file.encoding", "US-ASCII");
+    /* #endif */
+    setEscapeCharset(defaultCharset); // Charset or String overload, per the branch above
+  }
 
   boolean mustHexEscape (int v)
   {
-    return (v >= 127 && (v <= 159 || escapeNonAscii))
+    return (v >= 127 && (v <= 159 || escapeNonAscii
+        ||  (escapeCharsetEncoder != null 
+          && !escapeCharsetEncoder.canEncode(new String(Character.toChars(v))))))
       || v == 0x2028
       // We must escape control characters in attributes,
       // since otherwise they get normalized to ' '.
diff --git a/gnu/xml/XMLPrinter.java b/gnu/xml/XMLPrinter.java
index 8abe9e4..4e1122b 100644
--- a/gnu/xml/XMLPrinter.java
+++ b/gnu/xml/XMLPrinter.java
@@ -4,6 +4,7 @@
 package gnu.xml;
 import gnu.lists.*;
 import java.io.*;
+import java.nio.charset.*;
 import gnu.text.*;
 import gnu.math.RealNum;
 import gnu.text.PrettyWriter;
@@ -45,6 +46,11 @@ public class XMLPrinter extends OutPort
   public int useEmptyElementTag = 2;
   public boolean escapeText = true;
   public boolean escapeNonAscii = true;
+  /** Charset encoder used to check which non-ASCII characters
+  * must be escaped (those it cannot encode). Set indirectly by
+  * setEscapeCharset or setDefaultEscapeCharset. Consulted only
+  * when escapeNonAscii is false; null means no such check. */
+  protected CharsetEncoder escapeCharsetEncoder = null;
   boolean isHtml = false;
   boolean isHtmlOrXhtml = false;
   boolean undeclareNamespaces = false;
@@ -154,10 +160,54 @@ public class XMLPrinter extends OutPort
     if ("plain".equals(style))
       escapeText = false;
   }
+  
+  public void setEscapeCharset(Charset charset) {
+    try {
+      String canonical = charset.name();
+      boolean ascii = "US-ASCII".equals(canonical);
+      boolean unicode = "UTF-8".equals(canonical)
+        || "UTF-16".equals(canonical)
+        || "UTF-16BE".equals(canonical)
+        || "UTF-16LE".equals(canonical);
+      // Only plain ASCII output forces every non-ASCII character
+      // to be hex-escaped unconditionally.
+      escapeNonAscii = ascii;
+      // ASCII and the Unicode encodings need no per-character
+      // check, so an encoder is kept only for other charsets.
+      escapeCharsetEncoder
+        = (ascii || unicode) ? null : charset.newEncoder();
+    }
+    catch (RuntimeException e) {
+      // Unusable charset (e.g. null, or no encoder): escape all.
+      escapeNonAscii = true;
+      escapeCharsetEncoder = null;
+    }
+  }
+
+  public void setEscapeCharset(String charsetName) {
+    try {  // resolve the name, then delegate to the Charset overload
+      setEscapeCharset(Charset.forName(charsetName));
+    }
+    catch (RuntimeException e) {  // null/illegal/unknown name: escape all non-ASCII
+      escapeNonAscii = true;
+      escapeCharsetEncoder = null;
+    }
+  }
+  
+  public void setDefaultEscapeCharset() { // configure escaping from the default charset
+    /* #ifdef JAVA5 */
+    Charset defaultCharset = Charset.defaultCharset();
+    /* #else */
+    // String defaultCharset = System.getProperty("file.encoding", "US-ASCII");
+    /* #endif */
+    setEscapeCharset(defaultCharset); // Charset or String overload, per the branch above
+  }
 
   boolean mustHexEscape (int v)
   {
-    return (v >= 127 && (v <= 159 || escapeNonAscii))
+    return (v >= 127 && (v <= 159 || escapeNonAscii
+        ||  (escapeCharsetEncoder != null 
+          && !escapeCharsetEncoder.canEncode(new String(Character.toChars(v))))))
       || v == 0x2028
       // We must escape control characters in attributes,
       // since otherwise they get normalized to ' '.

Index Nav: [Date Index] [Subject Index] [Author Index] [Thread Index]
Message Nav: [Date Prev] [Date Next] [Thread Prev] [Thread Next]