This is the mail archive of the
kawa@sourceware.org
mailing list for the Kawa project.
Patch to make XMLPrinter encoding-aware
- From: Дмитрий <dmymd at yandex dot ru>
- To: kawa at sources dot redhat dot com
- Date: Fri, 27 Jul 2012 23:03:43 +0400
- Subject: Patch to make XMLPrinter encoding-aware
Hello!
I've extended XMLPrinter with a field escapeCharsetEncoder ::java.nio.charset.CharsetEncoder and 2 methods:
* method (gnu.xml.XMLPrinter:setEscapeCharset printer charset), accepts string or java.nio.charset.Charset as the 2nd argument
* method (gnu.xml.XMLPrinter:setDefaultEscapeCharset printer), the same as above but uses the system charset
If either of these methods is called, XMLPrinter will &#...;-encode only those characters that can't be encoded with these charsets (for which canEncode(char) returns #f)
The patch also adds a call to xout.setDefaultEscapeCharset(); in gnu.kawa.functions.DisplayFormat, so that XML objects are displayed without unnecessary encoding, BUT frankly speaking, I'm NOT SURE this is the right way to do it: I guess the charset for display format should be set by port-char-encoding. But I don't know how to do it correctly...
So for now I've just attached the 2nd patch, "change-just-XMLPrinter.diff", that doesn't touch DisplayFormat. :) This one is guaranteed not to break anything.
Here's an example of what the patch does (if DisplayFormat is changed):
$ java -cp kawa-1.12.jar -Dfile.encoding=Shift-JIS kawa.repl -e '(display (html:p "\u0414 \u00E6 \u662F"))(newline)' |iconv -f sjis
<p xmlns="http://www.w3.org/1999/xhtml">Д &#xE6; 是</p>
$ java -cp kawa-1.12.jar -Dfile.encoding=cp1251 kawa.repl -e '(display (html:p "\u0414 \u00E6 \u662F"))(newline)' |iconv -f cp1251
<p xmlns="http://www.w3.org/1999/xhtml">Д &#xE6; &#x662F;</p>
$ java -cp kawa-1.12.jar -Dfile.encoding=utf-8 kawa.repl -e '(display (html:p "\u0414 \u00E6 \u662F"))(newline)'
<p xmlns="http://www.w3.org/1999/xhtml">Д æ 是</p>
$ java -cp kawa-1.12.jar -Dfile.encoding=cp1252 kawa.repl -e '(display (html:p "\u0414 \u00E6 \u662F"))(newline)' |iconv -f cp1252
<p xmlns="http://www.w3.org/1999/xhtml">&#x414; æ &#x662F;</p>
diff --git a/gnu/kawa/functions/DisplayFormat.java b/gnu/kawa/functions/DisplayFormat.java
index e91cbcc..657d51e 100644
--- a/gnu/kawa/functions/DisplayFormat.java
+++ b/gnu/kawa/functions/DisplayFormat.java
@@ -342,6 +342,7 @@ public class DisplayFormat extends AbstractFormat
Writer wout = out instanceof Writer ? (Writer) out
: new ConsumerWriter(out);
XMLPrinter xout = new XMLPrinter(wout);
+ xout.setDefaultEscapeCharset();
xout.writeObject(obj);
xout.closeThis();
}
diff --git a/gnu/xml/XMLPrinter.java b/gnu/xml/XMLPrinter.java
index 8abe9e4..4e1122b 100644
--- a/gnu/xml/XMLPrinter.java
+++ b/gnu/xml/XMLPrinter.java
@@ -4,6 +4,7 @@
package gnu.xml;
import gnu.lists.*;
import java.io.*;
+import java.nio.charset.*;
import gnu.text.*;
import gnu.math.RealNum;
import gnu.text.PrettyWriter;
@@ -45,6 +46,11 @@ public class XMLPrinter extends OutPort
public int useEmptyElementTag = 2;
public boolean escapeText = true;
public boolean escapeNonAscii = true;
+ /** Charset encoder used to check which non-ASCII characters
+ * should be encoded. Can be set indirectly by setEscapeCharset
+ * or setDefaultEscapeCharset. Makes sense only if escapeNonAscii
+ * is set to false. */
+ protected CharsetEncoder escapeCharsetEncoder = null;
boolean isHtml = false;
boolean isHtmlOrXhtml = false;
boolean undeclareNamespaces = false;
@@ -154,10 +160,54 @@ public class XMLPrinter extends OutPort
if ("plain".equals(style))
escapeText = false;
}
+
+ /**
+  * Configures non-ASCII escaping for the given output charset.
+  * For US-ASCII, escapeNonAscii is turned on and no encoder is kept.
+  * For UTF-8, UTF-16, UTF-16BE and UTF-16LE, escapeNonAscii is turned
+  * off and no encoder is kept. For any other charset, escapeNonAscii is
+  * turned off and a new encoder obtained from the charset is stored in
+  * escapeCharsetEncoder. If a RuntimeException is thrown along the way,
+  * escapeNonAscii is turned on and the encoder is cleared.
+  *
+  * @param charset the charset whose canonical name selects the mode
+  */
+ public void setEscapeCharset(Charset charset) {
+   try {
+     String canonical = charset.name();
+     boolean asciiOnly = "US-ASCII".equals(canonical);
+     boolean unicode = "UTF-8".equals(canonical)
+       || "UTF-16BE".equals(canonical)
+       || "UTF-16LE".equals(canonical)
+       || "UTF-16".equals(canonical);
+     // The flag is assigned before the encoder is requested, so the
+     // field updates happen in the same order on every path.
+     escapeNonAscii = asciiOnly;
+     escapeCharsetEncoder = (asciiOnly || unicode) ? null : charset.newEncoder();
+   }
+   catch (RuntimeException e) {
+     escapeNonAscii = true;
+     escapeCharsetEncoder = null;
+   }
+ }
+
+ /**
+  * Configures non-ASCII escaping for the charset with the given name.
+  * The name is resolved with Charset.forName and the result is handed
+  * to setEscapeCharset(Charset). If the lookup or the delegated call
+  * throws a RuntimeException, escapeNonAscii is turned on and
+  * escapeCharsetEncoder is cleared.
+  *
+  * @param charsetName name of the charset to resolve
+  */
+ public void setEscapeCharset(String charsetName) {
+   try {
+     // The resolved charset must actually be applied; looking it up
+     // without using it would leave the printer settings untouched.
+     setEscapeCharset(Charset.forName(charsetName));
+   }
+   catch (RuntimeException e) {
+     escapeNonAscii = true;
+     escapeCharsetEncoder = null;
+   }
+ }
+
+ /**
+  * Configures non-ASCII escaping from the value returned by
+  * Charset.defaultCharset(), by passing it to setEscapeCharset.
+  */
+ public void setDefaultEscapeCharset() {
+ /* #ifdef JAVA5 */
+ Charset defaultCharset = Charset.defaultCharset();
+ /* #else */
+ // String defaultCharset = System.getProperty("file.encoding", "US-ASCII");
+ /* #endif */
+ // NOTE(review): the #ifdef/#else/#endif comments above look like markers
+ // for a source preprocessor, with the commented-out line as the
+ // alternative branch -- confirm before reformatting them.
+ setEscapeCharset(defaultCharset);
+ }
boolean mustHexEscape (int v)
{
- return (v >= 127 && (v <= 159 || escapeNonAscii))
+ return (v >= 127 && (v <= 159 || escapeNonAscii
+ || (escapeCharsetEncoder != null
+ && !escapeCharsetEncoder.canEncode(new String(Character.toChars(v))))))
|| v == 0x2028
// We must escape control characters in attributes,
// since otherwise they get normalized to ' '.
diff --git a/gnu/xml/XMLPrinter.java b/gnu/xml/XMLPrinter.java
index 8abe9e4..4e1122b 100644
--- a/gnu/xml/XMLPrinter.java
+++ b/gnu/xml/XMLPrinter.java
@@ -4,6 +4,7 @@
package gnu.xml;
import gnu.lists.*;
import java.io.*;
+import java.nio.charset.*;
import gnu.text.*;
import gnu.math.RealNum;
import gnu.text.PrettyWriter;
@@ -45,6 +46,11 @@ public class XMLPrinter extends OutPort
public int useEmptyElementTag = 2;
public boolean escapeText = true;
public boolean escapeNonAscii = true;
+ /** Charset encoder used to check which non-ASCII characters
+ * should be encoded. Can be set indirectly by setEscapeCharset
+ * or setDefaultEscapeCharset. Makes sense only if escapeNonAscii
+ * is set to false. */
+ protected CharsetEncoder escapeCharsetEncoder = null;
boolean isHtml = false;
boolean isHtmlOrXhtml = false;
boolean undeclareNamespaces = false;
@@ -154,10 +160,54 @@ public class XMLPrinter extends OutPort
if ("plain".equals(style))
escapeText = false;
}
+
+ /**
+  * Configures non-ASCII escaping for the given output charset.
+  * For US-ASCII, escapeNonAscii is turned on and no encoder is kept.
+  * For UTF-8, UTF-16, UTF-16BE and UTF-16LE, escapeNonAscii is turned
+  * off and no encoder is kept. For any other charset, escapeNonAscii is
+  * turned off and a new encoder obtained from the charset is stored in
+  * escapeCharsetEncoder. If a RuntimeException is thrown along the way,
+  * escapeNonAscii is turned on and the encoder is cleared.
+  *
+  * @param charset the charset whose canonical name selects the mode
+  */
+ public void setEscapeCharset(Charset charset) {
+   try {
+     String canonical = charset.name();
+     boolean asciiOnly = "US-ASCII".equals(canonical);
+     boolean unicode = "UTF-8".equals(canonical)
+       || "UTF-16BE".equals(canonical)
+       || "UTF-16LE".equals(canonical)
+       || "UTF-16".equals(canonical);
+     // The flag is assigned before the encoder is requested, so the
+     // field updates happen in the same order on every path.
+     escapeNonAscii = asciiOnly;
+     escapeCharsetEncoder = (asciiOnly || unicode) ? null : charset.newEncoder();
+   }
+   catch (RuntimeException e) {
+     escapeNonAscii = true;
+     escapeCharsetEncoder = null;
+   }
+ }
+
+ /**
+  * Configures non-ASCII escaping for the charset with the given name.
+  * The name is resolved with Charset.forName and the result is handed
+  * to setEscapeCharset(Charset). If the lookup or the delegated call
+  * throws a RuntimeException, escapeNonAscii is turned on and
+  * escapeCharsetEncoder is cleared.
+  *
+  * @param charsetName name of the charset to resolve
+  */
+ public void setEscapeCharset(String charsetName) {
+   try {
+     // The resolved charset must actually be applied; looking it up
+     // without using it would leave the printer settings untouched.
+     setEscapeCharset(Charset.forName(charsetName));
+   }
+   catch (RuntimeException e) {
+     escapeNonAscii = true;
+     escapeCharsetEncoder = null;
+   }
+ }
+
+ /**
+  * Configures non-ASCII escaping from the value returned by
+  * Charset.defaultCharset(), by passing it to setEscapeCharset.
+  */
+ public void setDefaultEscapeCharset() {
+ /* #ifdef JAVA5 */
+ Charset defaultCharset = Charset.defaultCharset();
+ /* #else */
+ // String defaultCharset = System.getProperty("file.encoding", "US-ASCII");
+ /* #endif */
+ // NOTE(review): the #ifdef/#else/#endif comments above look like markers
+ // for a source preprocessor, with the commented-out line as the
+ // alternative branch -- confirm before reformatting them.
+ setEscapeCharset(defaultCharset);
+ }
boolean mustHexEscape (int v)
{
- return (v >= 127 && (v <= 159 || escapeNonAscii))
+ return (v >= 127 && (v <= 159 || escapeNonAscii
+ || (escapeCharsetEncoder != null
+ && !escapeCharsetEncoder.canEncode(new String(Character.toChars(v))))))
|| v == 0x2028
// We must escape control characters in attributes,
// since otherwise they get normalized to ' '.