- From: Martin Duerst <duerst@w3.org>
- Date: Tue, 13 Aug 2002 11:38:28 +0900
- To: www-validator@w3.org
Hello Terje, others, Here is some code (in Java, from the RDF validator) that does charset/encoding detection for XML. I guess it can be adapted to include HTML, too, and changed to perl. Regards, Martin. /* * Given a URI string, open it, read its contents into a String * and return the String * *@param uri the URI to open *@return the content at the URI or null if any error occurs */ private String getRDFfromURI (String uri) throws getRDFException { /* add something like this code here, to allow reading from a file: (if we really want to allow this!) File ff = new File(uri); in = new FileInputStream(ff); */ URL url = null; try { url = new URL(uri); } catch (MalformedURLException e) { throw new getRDFException("Malformed URI."); } URLConnection con = null; try { con = url.openConnection(); con.setRequestProperty("Accept", "application/rdf+xml"); con.connect(); } catch (Exception e) { throw new getRDFException("Unable to open connection."); } String contentT = con.getContentType(); String HTTPcharset = null; if (contentT != null) { ContentType contentType = null; try { } catch (javax.mail.internet.ParseException e) { throw new getRDFException("Unparsable content type."); } HTTPcharset = contentType.getParameter("charset"); } // need buffer for lookahead for encoding detection BufferedInputStream bis = null; try { bis = new BufferedInputStream(con.getInputStream()); } catch (IOException e) { throw new getRDFException("Cannot open stream."); } bis.mark(200); // mark start so that we can get back to it String s = ""; try { // read start of file as bytes int c; int numRead = 0; contentType = new ContentType(con.getContentType()); while ((c = bis.read()) != -1) { s += (char)c; if (numRead++ >= 195) break; } } catch (IOException e) { throw new getRDFException("IOException while starting reading."); } if (s.equals("")) // Nothing was returned throw new getRDFException("Empty document, ignored."); // A server could return content but not the RDF/XML that // we need. Check the beginning of s and if it looks like // a generic HTML message, return an error. if (s.startsWith("<!DOCTYPE")) throw new getRDFException("Document looks like HTML, ignored."); String APPFcharset = null; // 'charset' according to XML APP. F int ignoreBytes = 0; if (s.startsWith("\u00FE\u00FF")) { APPFcharset = "UTF-16BE"; ignoreBytes = 2; } else if (s.startsWith("\u00FF\u00FE")) { APPFcharset = "UTF-16LE"; ignoreBytes = 2; } else if (s.startsWith("\u00EF\u00BB\u00BF")) { APPFcharset = "UTF-8"; ignoreBytes = 3; } else if (s.startsWith("\u0000<\u0000?")) { APPFcharset = "UTF-16BE"; } else if (s.startsWith("<\u0000?\u0000")) { APPFcharset = "UTF-16LE"; } else if (s.startsWith("<?xml")) { APPFcharset = "US-ASCII"; } else if (s.startsWith("\u004C\u006F\u00A7\u0094")) { APPFcharset = "CP037"; // EBCDIC } // convert start of xml input according to APPFcharset String xmlstart = null; try { xmlstart = new String(s.substring(ignoreBytes).getBytes("iso-8859-1\ "), APPFcharset); } catch (UnsupportedEncodingException e) { throw new getRDFException("Unsupported encoding '"+APPFcharset+"'."\ ); } RE r; try { r = new RE("<\\?xml[ \\t\\n\\r]+version[ \\t\\n\\r]?=[ \\t\\n\\r]?(\ ['\"])([a-zA-Z0-9_:]|\\.|-)+\\1[ \\t\\n\\r]+encoding[ \\t\\n\\r]?=[ \\t\\n\\r]?\ (['\"])([A-Za-z]([A-Za-z0-9._]|-)*)\\3"); } catch (RESyntaxException res) { throw new getRDFException("Wrong regular expression syntax."); } // r.setMatchFlags(MATCH_NORMAL | MATCH_SINGLELINE); String XMLcharset = null; if (r.match(xmlstart) && r.getParenStart(0)==0) XMLcharset = r.getParen(4); if (HTTPcharset != null) HTTPcharset = HTTPcharset.toUpperCase(); if (XMLcharset != null) XMLcharset = XMLcharset.toUpperCase(); String finalCharset = null; if (HTTPcharset != null) { if (XMLcharset != null && !HTTPcharset.equals(XMLcharset)) throw new getRDFException("Charset conflict: Content-Type: " + contentT+ ". XML encoding: " + XMLcharset + "."); finalCharset = HTTPcharset; } else if (XMLcharset != null) finalCharset = XMLcharset; if ((finalCharset != null && finalCharset.equals("UTF-16")) || (finalCharset == null && APPFcharset.startsWith("UTF-16"))) if (ignoreBytes == 2) finalCharset = APPFcharset; // use correct endianness else throw new getRDFException("Illegal XML: UTF-16 without BOM."); if (finalCharset == null) finalCharset = "UTF-8"; try { bis.reset(); // move back to start of stream bis.skip(ignoreBytes); // skip BOM } catch (IOException e) { throw new getRDFException("IOException while resetting stream."); } InputStreamReader isr = null; try { isr = new InputStreamReader(bis, finalCharset); } catch (UnsupportedEncodingException e) { throw new getRDFException("Unsupported encoding '"+finalCharset+"'.\ "); } StringBuffer sb = new StringBuffer(""); int charnum = 0; try { // read whole file as characters int c; while ((c = isr.read()) != -1) { sb.append((char)c); charnum++; } } catch (IOException e) { throw new getRDFException("IOException while reading URI at charact\ er " + charnum + " using encoding " + XMLcharset + "."); } // todo: fix encoding parameter in xml pseudo-PI return sb.toString(); }
Received on Monday, 12 August 2002 22:38:39 UTC