- From: Martin Duerst <duerst@w3.org>
- Date: Tue, 13 Aug 2002 11:38:28 +0900
- To: www-validator@w3.org
Hello Terje, others,
Here is some code (in Java, from the RDF validator) that
does charset/encoding detection for XML. I guess it can be
adapted to include HTML, too, and changed to perl.
Regards, Martin.
/*
* Given a URI string, open it, read its contents into a String
* and return the String
*
*@param uri the URI to open
*@return the content at the URI or null if any error occurs
*/
private String getRDFfromURI (String uri) throws getRDFException
{
/* add something like this code here, to allow reading from a file:
(if we really want to allow this!)
File ff = new File(uri);
in = new FileInputStream(ff);
*/
URL url = null;
try {
url = new URL(uri);
} catch (MalformedURLException e) {
throw new getRDFException("Malformed URI.");
}
URLConnection con = null;
try {
con = url.openConnection();
con.setRequestProperty("Accept", "application/rdf+xml");
con.connect();
} catch (Exception e) {
throw new getRDFException("Unable to open connection.");
}
String contentT = con.getContentType();
String HTTPcharset = null;
if (contentT != null) {
ContentType contentType = null;
try {
} catch (javax.mail.internet.ParseException e) {
throw new getRDFException("Unparsable content type.");
}
HTTPcharset = contentType.getParameter("charset");
}
// need buffer for lookahead for encoding detection
BufferedInputStream bis = null;
try {
bis = new BufferedInputStream(con.getInputStream());
} catch (IOException e) {
throw new getRDFException("Cannot open stream.");
}
bis.mark(200); // mark start so that we can get back to it
String s = "";
try { // read start of file as bytes
int c;
int numRead = 0;
contentType = new ContentType(con.getContentType());
while ((c = bis.read()) != -1) {
s += (char)c;
if (numRead++ >= 195) break;
}
} catch (IOException e) {
throw new getRDFException("IOException while starting reading.");
}
if (s.equals(""))
// Nothing was returned
throw new getRDFException("Empty document, ignored.");
// A server could return content but not the RDF/XML that
// we need. Check the beginning of s and if it looks like
// a generic HTML message, return an error.
if (s.startsWith("<!DOCTYPE"))
throw new getRDFException("Document looks like HTML, ignored.");
String APPFcharset = null; // 'charset' according to XML APP. F
int ignoreBytes = 0;
if (s.startsWith("\u00FE\u00FF")) {
APPFcharset = "UTF-16BE";
ignoreBytes = 2;
}
else if (s.startsWith("\u00FF\u00FE")) {
APPFcharset = "UTF-16LE";
ignoreBytes = 2;
}
else if (s.startsWith("\u00EF\u00BB\u00BF")) {
APPFcharset = "UTF-8";
ignoreBytes = 3;
}
else if (s.startsWith("\u0000<\u0000?")) {
APPFcharset = "UTF-16BE";
}
else if (s.startsWith("<\u0000?\u0000")) {
APPFcharset = "UTF-16LE";
}
else if (s.startsWith("<?xml")) {
APPFcharset = "US-ASCII";
}
else if (s.startsWith("\u004C\u006F\u00A7\u0094")) {
APPFcharset = "CP037"; // EBCDIC
}
// convert start of xml input according to APPFcharset
String xmlstart = null;
try {
xmlstart = new
String(s.substring(ignoreBytes).getBytes("iso-8859-1\
"), APPFcharset);
} catch (UnsupportedEncodingException e) {
throw new getRDFException("Unsupported encoding
'"+APPFcharset+"'."\
);
}
RE r;
try {
r = new RE("<\\?xml[ \\t\\n\\r]+version[ \\t\\n\\r]?=[
\\t\\n\\r]?(\
['\"])([a-zA-Z0-9_:]|\\.|-)+\\1[ \\t\\n\\r]+encoding[ \\t\\n\\r]?=[
\\t\\n\\r]?\
(['\"])([A-Za-z]([A-Za-z0-9._]|-)*)\\3");
} catch (RESyntaxException res) {
throw new getRDFException("Wrong regular expression syntax.");
}
// r.setMatchFlags(MATCH_NORMAL | MATCH_SINGLELINE);
String XMLcharset = null;
if (r.match(xmlstart) && r.getParenStart(0)==0)
XMLcharset = r.getParen(4);
if (HTTPcharset != null)
HTTPcharset = HTTPcharset.toUpperCase();
if (XMLcharset != null)
XMLcharset = XMLcharset.toUpperCase();
String finalCharset = null;
if (HTTPcharset != null) {
if (XMLcharset != null && !HTTPcharset.equals(XMLcharset))
throw new getRDFException("Charset conflict: Content-Type: "
+ contentT+ ". XML encoding: " + XMLcharset + ".");
finalCharset = HTTPcharset;
}
else if (XMLcharset != null)
finalCharset = XMLcharset;
if ((finalCharset != null && finalCharset.equals("UTF-16")) ||
(finalCharset == null && APPFcharset.startsWith("UTF-16")))
if (ignoreBytes == 2)
finalCharset = APPFcharset; // use correct endianness
else
throw new getRDFException("Illegal XML: UTF-16 without BOM.");
if (finalCharset == null)
finalCharset = "UTF-8";
try {
bis.reset(); // move back to start of stream
bis.skip(ignoreBytes); // skip BOM
} catch (IOException e) {
throw new getRDFException("IOException while resetting stream.");
}
InputStreamReader isr = null;
try {
isr = new InputStreamReader(bis, finalCharset);
} catch (UnsupportedEncodingException e) {
throw new getRDFException("Unsupported encoding
'"+finalCharset+"'.\
");
}
StringBuffer sb = new StringBuffer("");
int charnum = 0;
try { // read whole file as characters
int c;
while ((c = isr.read()) != -1) {
sb.append((char)c);
charnum++;
}
} catch (IOException e) {
throw new getRDFException("IOException while reading URI at
charact\
er "
+ charnum + " using encoding " + XMLcharset + ".");
}
// todo: fix encoding parameter in xml pseudo-PI
return sb.toString();
}
Received on Monday, 12 August 2002 22:38:39 UTC