Some code to detect charset (for XML, in Java) from Martin Duerst on 2002-08-13 (www-validator@w3.org from August 2002)

From: Martin Duerst <duerst@w3.org>
Date: Tue, 13 Aug 2002 11:38:28 +0900
To: www-validator@w3.org
Message-Id: <4.2.0.58.J.20020813113123.0363ba10@localhost>
Hello Terje, others,

Here is some code (in Java, from the RDF validator) that
does charset/encoding detection for XML. I guess it can be
adapted to include HTML, too, and changed to perl.

Regards,    Martin.

     /*
      * Given a URI string, open it, read its contents into a String
      * and return the String
      *
      *@param uri the URI to open
      *@return the content at the URI or null if any error occurs
      */
     private String getRDFfromURI (String uri) throws getRDFException
     {
         /* add something like this code here, to allow reading from a file:
            (if we really want to allow this!)
            File ff = new File(uri);
            in = new FileInputStream(ff);
         */
         URL url = null;
         try {
             url = new URL(uri);
         } catch (MalformedURLException e) {
             throw new getRDFException("Malformed URI.");
         }

         URLConnection con = null;
         try {
             con = url.openConnection();
             con.setRequestProperty("Accept", "application/rdf+xml");
             con.connect();
         } catch (Exception e) {
             throw new getRDFException("Unable to open connection.");
         }
         String contentT = con.getContentType();
         String HTTPcharset = null;
         if (contentT != null) {
             ContentType contentType = null;
             try {
                  } catch (javax.mail.internet.ParseException e) {
                 throw new getRDFException("Unparsable content type.");
             }
             HTTPcharset = contentType.getParameter("charset");
         }

         // need buffer for lookahead for encoding detection
         BufferedInputStream bis = null;
         try {
             bis = new BufferedInputStream(con.getInputStream());
         } catch (IOException e) {
             throw new getRDFException("Cannot open stream.");
         }
         bis.mark(200); // mark start so that we can get back to it
         String s = "";

         try {  // read start of file as bytes
             int c;
             int numRead = 0;
            contentType = new ContentType(con.getContentType());
             while ((c = bis.read()) != -1) {
                 s += (char)c;
                 if (numRead++ >= 195) break;
             }
         } catch (IOException e) {
             throw new getRDFException("IOException while starting reading.");
         }

         if (s.equals(""))
             // Nothing was returned
             throw new getRDFException("Empty document, ignored.");

         // A server could return content but not the RDF/XML that
         // we need.  Check the beginning of s and if it looks like
         // a generic HTML message, return an error.
         if (s.startsWith("<!DOCTYPE"))
             throw new getRDFException("Document looks like HTML, ignored.");

         String APPFcharset = null;  // 'charset' according to XML APP. F
         int ignoreBytes = 0;
         if (s.startsWith("\u00FE\u00FF")) {
             APPFcharset = "UTF-16BE";
             ignoreBytes = 2;
         }
         else if (s.startsWith("\u00FF\u00FE")) {
             APPFcharset = "UTF-16LE";
             ignoreBytes = 2;
         }
         else if (s.startsWith("\u00EF\u00BB\u00BF")) {
             APPFcharset = "UTF-8";
             ignoreBytes = 3;
         }
         else if (s.startsWith("\u0000<\u0000?")) {
             APPFcharset = "UTF-16BE";
         }
         else if (s.startsWith("<\u0000?\u0000")) {
             APPFcharset = "UTF-16LE";
         }
         else if (s.startsWith("<?xml")) {
             APPFcharset = "US-ASCII";
         }
         else if (s.startsWith("\u004C\u006F\u00A7\u0094")) {
             APPFcharset = "CP037";  // EBCDIC
         }

         // convert start of xml input according to APPFcharset
         String xmlstart = null;
         try {
             xmlstart = new 
String(s.substring(ignoreBytes).getBytes("iso-8859-1\
"), APPFcharset);
         } catch (UnsupportedEncodingException e) {
             throw new getRDFException("Unsupported encoding 
'"+APPFcharset+"'."\
);
         }
         RE r;
         try {
             r = new RE("<\\?xml[ \\t\\n\\r]+version[ \\t\\n\\r]?=[ 
\\t\\n\\r]?(\
['\"])([a-zA-Z0-9_:]|\\.|-)+\\1[ \\t\\n\\r]+encoding[ \\t\\n\\r]?=[ 
\\t\\n\\r]?\
(['\"])([A-Za-z]([A-Za-z0-9._]|-)*)\\3");
         } catch (RESyntaxException res) {
             throw new getRDFException("Wrong regular expression syntax.");
         }
         // r.setMatchFlags(MATCH_NORMAL | MATCH_SINGLELINE);
         String XMLcharset = null;
         if (r.match(xmlstart) && r.getParenStart(0)==0)
             XMLcharset = r.getParen(4);
         if (HTTPcharset != null)
             HTTPcharset = HTTPcharset.toUpperCase();
         if (XMLcharset != null)
             XMLcharset = XMLcharset.toUpperCase();

         String finalCharset = null;
         if (HTTPcharset != null) {
             if (XMLcharset != null && !HTTPcharset.equals(XMLcharset))
                 throw new getRDFException("Charset conflict: Content-Type: "
                     + contentT+ ". XML encoding: " +  XMLcharset + ".");
             finalCharset = HTTPcharset;
         }
         else if (XMLcharset != null)
             finalCharset = XMLcharset;
         if ((finalCharset != null && finalCharset.equals("UTF-16")) ||
                 (finalCharset == null && APPFcharset.startsWith("UTF-16")))
             if (ignoreBytes == 2)
                 finalCharset = APPFcharset;  // use correct endianness
             else
                 throw new getRDFException("Illegal XML: UTF-16 without BOM.");
         if (finalCharset == null)
             finalCharset = "UTF-8";

         try {
             bis.reset();                 // move back to start of stream
             bis.skip(ignoreBytes);       // skip BOM
         } catch (IOException e) {
             throw new getRDFException("IOException while resetting stream.");
         }

         InputStreamReader isr = null;
         try {
             isr = new InputStreamReader(bis, finalCharset);
         } catch (UnsupportedEncodingException e) {
             throw new getRDFException("Unsupported encoding 
'"+finalCharset+"'.\
");
         }

         StringBuffer sb = new StringBuffer("");
         int charnum = 0;
         try {  // read whole file as characters
             int c;
             while ((c = isr.read()) != -1) {
                 sb.append((char)c);
                 charnum++;
             }
         } catch (IOException e) {
             throw new getRDFException("IOException while reading URI at 
charact\
er "
                 + charnum + " using encoding " + XMLcharset + ".");
         }

         // todo: fix encoding parameter in xml pseudo-PI

         return sb.toString();
     }
Received on Monday, 12 August 2002 22:38:39 UTC