2002/css-validator/org/w3c/css/css DocumentParser.java,NONE,1.1 TagSoupStyleSheetHandler.java,NONE,1.1 CssValidator.java,1.4,1.5 StyleSheetCom.java,1.16,1.17

Update of /sources/public/2002/css-validator/org/w3c/css/css
In directory hutz:/tmp/cvs-serv18715/org/w3c/css/css

Modified Files:
	CssValidator.java StyleSheetCom.java 
Added Files:
	DocumentParser.java TagSoupStyleSheetHandler.java 
Log Message:
Added the TagSoup parser to cope with HTML, replacing the old html4 parser; also improved the code that handles multiple types of documents. Re-reading the stream is now avoided.

Index: StyleSheetCom.java
===================================================================
RCS file: /sources/public/2002/css-validator/org/w3c/css/css/StyleSheetCom.java,v
retrieving revision 1.16
retrieving revision 1.17
diff -u -d -r1.16 -r1.17
--- StyleSheetCom.java	9 Mar 2007 04:40:02 -0000	1.16
+++ StyleSheetCom.java	24 Apr 2007 11:14:22 -0000	1.17
@@ -7,11 +7,6 @@
 
 package org.w3c.css.css;
 
-import html.tags.HtmlParser;
-import html.tags.HtmlParserListener;
-import html.tags.HtmlTag;
-import html.tags.HtmlTree;
-
 import java.io.File;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
@@ -33,7 +28,7 @@
 /**
  * @version $Revision$import javax.servlet.http.HttpServletResponse;
  */
-public class StyleSheetCom implements HtmlParserListener {
+public class StyleSheetCom {
 
     /*    ApplContext ac = new ApplContext("ja, en, zh"); */
     ApplContext ac;
@@ -54,21 +49,34 @@
     private Exception exception;
 
     public void htmlRequest() throws Exception {
+	StyleSheet style = null;
 
- 	System.err.println( "html request " + htmlURL);
-	HtmlParser htmlParser = new HtmlParser(ac, "html4", htmlURL.toString());
-	try {
-	    Util.fromHTMLFile = true;
-	    htmlParser.addParserListener(this);
-	    htmlParser.run();
-	    if (exception != null) {
-		throw (Exception) exception.fillInStackTrace();
+	TagSoupStyleSheetHandler handler = new TagSoupStyleSheetHandler(htmlURL, ac);
+	handler.parse(htmlURL);
+	style = handler.getStyleSheet();
+	if (style != null) {
+	    style.setType("text/html");
+	}
+	if (style != null) {
+	    style.findConflicts(ac);
+	    if (documentBase.startsWith("html")) {
+		StyleSheetGeneratorHTML2 output =
+		    new StyleSheetGeneratorHTML2(ac, file,
+						 style,
+						 documentBase,
+						 warningLevel);
+		output.print(out);
+	    } else {
+		StyleSheetGenerator2 style2 = new StyleSheetGenerator2(file,
+								       style,
+								       documentBase,
+								       warningLevel);
+		style2.print(out);
 	    }
-	} catch (html.parser.XMLInputException e) {
-	    xmlRequest();
-	} finally {
-	    Util.fromHTMLFile = false;
+	} else {
+	    System.err.println("No style sheet found in your HTML document");
 	}
+	ac.setInput("text/xml");
     }
 
     public void xmlRequest() throws Exception {
@@ -287,72 +295,6 @@
 	}
     }
 
-    /**
-     * Notifies root creation.
-     *
-     * Sent when the parser builds the root of the HTML tree.
-     *
-     * @param url the URL being parsed.
-     * @param root the new root Tag for this parser.
-     */
-    public void notifyCreateRoot(URL url, HtmlTag root) {
-    }
-
-    public void notifyActivity(int lines, long bytes) {
-    }
-
-    public void notifyConnection(URLConnection cnx) {
-    }
-
-    /**
-     * Notifies successful termination.
-     *
-     * @param root the root of the current Tree.
-     */
-    public void notifyEnd(HtmlTag root, String contentType) {
-
-	StyleSheet style = null;
-
-	if (root != null) {
-	    style = ((HtmlTree) root).getStyleSheet();
-	}
-
-	if (style != null) {
-	    style.findConflicts(ac);
-	    if (documentBase.startsWith("html")) {
-		StyleSheetGeneratorHTML2 output =
-		    new StyleSheetGeneratorHTML2(ac, file,
-						 style,
-						 contenttype,
-						 warningLevel);
-		output.print(out);
-	    } else {
-		StyleSheetGenerator2 style2 = new StyleSheetGenerator2(file,
-								       style,
-								       contenttype,
-								       warningLevel);
-		style2.print(out);
-	    }
-	} else {
-	    System.err.println("No style sheet found in your HTML document");
-	}
-	ac.setInput(contentType);
-    }
-
-    /**
-     * Notifies a fatal error.
-     *
-     * This notification is sent when the parser need to stop during or before
-     * parsing, due to an unexpected exception.
-     *
-     * @param root the root of the current Tree, if any.
-     * @param x the exception that cause the Parser stop
-     * @param msg an error message information
-     */
-    public void notifyFatalError(HtmlTag root, Exception x, String s) {
-	exception = x;
-    }
-
     private static CssSelectors createSelectors(String s) {
 	try {
 	    CssFouffa fouffa =

--- NEW FILE: DocumentParser.java ---
//
// $Id: DocumentParser.java,v 1.1 2007/04/24 11:14:22 ylafon Exp $
// From Philippe Le Hegaret (Philippe.Le_Hegaret@sophia.inria.fr)
//
// (c) COPYRIGHT MIT and INRIA, 1997.
// Please first read the full copyright statement in file COPYRIGHT.html

package org.w3c.css.css;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

import org.w3c.css.util.ApplContext;
import org.w3c.css.util.HTTPURL;
import org.w3c.css.util.Util;

import org.w3c.www.mime.MimeType;
import org.w3c.www.mime.MimeTypeFormatException;

/**
 * Retrieves the document designated by a URL and extracts its style sheet.
 *
 * <p>For URLs that are neither <code>http</code> nor <code>https</code> the
 * kind of document is chosen from the extension of the URL. For HTTP(S)
 * URLs it is chosen from the <code>Content-Type</code> response header.
 * HTML documents go through {@link TagSoupStyleSheetHandler}, XML/XHTML
 * documents through <code>XMLStyleSheetHandler</code> and CSS documents
 * through <code>StyleSheetParser</code>.
 *
 * @version $Revision: 1.1 $
 */
public final class DocumentParser {

    /** The style sheet extracted from the document, possibly null. */
    private StyleSheet style;
    /** The URL of the document; updated after redirects/Content-Location. */
    private URL htmlURL;
    private ApplContext ac;

    /**
     * Create a new DocumentParser and parse the designated document.
     *
     * @param ac the application context used for the request
     * @param urlString the URL of the document to parse
     * @exception Exception if the URL is invalid, the document can not be
     *            retrieved or parsed, or its kind is not recognized
     */
    public DocumentParser(ApplContext ac, String urlString)
        throws Exception {
        this.htmlURL = HTTPURL.getURL(urlString);
        this.ac = ac;
        urlString = htmlURL.toString();
        String media = ac.getMedium();
        String urlProtocol = htmlURL.getProtocol();

        if ("http".equals(urlProtocol) || "https".equals(urlProtocol)) {
            parseFromConnection(urlString, media);
        } else {
            parseFromExtension(urlString.toLowerCase(), media);
        }
    }

    /**
     * Parses a non-HTTP document, choosing the parser from the extension
     * of the URL.
     *
     * @param urlLower the lower-cased URL of the document
     * @param media the medium passed to the CSS parser
     * @exception Exception if the extension is unknown or parsing fails
     */
    private void parseFromExtension(String urlLower, String media)
        throws Exception {
        if (urlLower.endsWith(".css")) {
            StyleSheetParser parser = new StyleSheetParser();
            parser.parseURL(ac, htmlURL, null, null, media,
                            StyleSheetOrigin.AUTHOR);
            style = parser.getStyleSheet();
        } else if (urlLower.endsWith(".html")
                   || urlLower.endsWith(".shtml")
                   || urlLower.endsWith("/")) {
            TagSoupStyleSheetHandler handler;
            handler = new TagSoupStyleSheetHandler(htmlURL, ac);
            handler.parse(htmlURL);
            style = handler.getStyleSheet();
            if (style != null) {
                style.setType("text/html");
            }
        } else if (urlLower.endsWith(".xhtml")
                   || urlLower.endsWith(".xml")) {
            XMLStyleSheetHandler handler;
            handler = new XMLStyleSheetHandler(htmlURL, ac);
            handler.parse(htmlURL);
            style = handler.getStyleSheet();
            if (style != null) {
                style.setType("text/xml");
            }
        } else {
            throw new Exception("Unknown file");
        }
    }

    /**
     * Parses an HTTP(S) document, choosing the parser from the
     * Content-Type of the response.
     *
     * @param urlString the URL as requested, used as the system identifier
     *        handed to the HTML and XML handlers
     * @param media the medium passed to the CSS parser
     * @exception Exception if the connection fails, the content type is
     *            malformed or unsupported, or parsing fails
     */
    private void parseFromConnection(String urlString, String media)
        throws Exception {
        URLConnection connection = null;

        try {
            connection = HTTPURL.getConnection(htmlURL, ac);
            // the connection may have followed redirects
            htmlURL = connection.getURL();

            String httpCL = connection.getHeaderField("Content-Location");
            if (httpCL != null) {
                htmlURL = HTTPURL.getURL(htmlURL, httpCL);
            }

            String cType = connection.getContentType();
            if (cType == null) {
                cType = "unknown/unknown";
            }
            MimeType contentType;
            try {
                contentType = new MimeType(cType);
            } catch (MimeTypeFormatException ex) {
                // A malformed header used to leave contentType null and
                // end in a NullPointerException below; report it as an
                // unsupported type instead.
                IOException ioe =
                    new IOException("Unknown mime type : " + cType);
                ioe.initCause(ex);
                throw ioe;
            }

            if (Util.onDebug) {
                System.err.println( "[DEBUG] content type is [" +
                                    contentType + ']');
            }

            if (contentType.match(MimeType.TEXT_HTML) ==
                                       MimeType.MATCH_SPECIFIC_SUBTYPE) {
                TagSoupStyleSheetHandler handler;
                handler = new TagSoupStyleSheetHandler(htmlURL, ac);
                handler.parse(urlString, connection);
                style = handler.getStyleSheet();

                if (style != null) {
                    style.setType("text/html");
                }
            } else if (contentType.match(MimeType.TEXT_CSS) ==
                                       MimeType.MATCH_SPECIFIC_SUBTYPE ) {
                StyleSheetParser parser = new StyleSheetParser();
                parser.parseURL(ac, htmlURL, null, null, media,
                                StyleSheetOrigin.AUTHOR);
                style = parser.getStyleSheet();
            } else if ((contentType.match(MimeType.TEXT_XML) ==
                                        MimeType.MATCH_SPECIFIC_SUBTYPE) ||
                (contentType.match(MimeType.APPLICATION_XHTML_XML) ==
                                        MimeType.MATCH_SPECIFIC_SUBTYPE)) {
                XMLStyleSheetHandler handler;
                handler = new XMLStyleSheetHandler(htmlURL, ac);
                handler.parse(urlString, connection);
                style = handler.getStyleSheet();

                if (style != null) {
                    style.setType("text/xml");
                }
            } else {
                throw new IOException("Unknown mime type : "+ contentType);
            }
        } finally {
            if (connection != null) {
                try {
                    connection.getInputStream().close();
                } catch (Exception ignored) {
                    // best effort: the stream may never have been opened
                    // or may already be closed by the handler
                }
            }
        }
    }

    /**
     * Returns the recognized style sheet.
     * @return A style sheet.
     */
    public StyleSheet getStyleSheet() {
        return style;
    }

} // DocumentParser

Index: CssValidator.java
===================================================================
RCS file: /sources/public/2002/css-validator/org/w3c/css/css/CssValidator.java,v
retrieving revision 1.4
retrieving revision 1.5
diff -u -d -r1.4 -r1.5
--- CssValidator.java	9 Mar 2007 04:40:02 -0000	1.4
+++ CssValidator.java	24 Apr 2007 11:14:22 -0000	1.5
@@ -143,7 +143,7 @@
 		    uri = HTTPURL.getURL(uri).toString(); // needed to be sure
 		    // that it is a valid
 		    // url
-		    HTMLStyleSheetParser URLparser = new HTMLStyleSheetParser(style.ac,
+		    DocumentParser URLparser = new DocumentParser(style.ac,
 			    uri);
 		    
 		    style.handleRequest(style.ac, uri, URLparser.getStyleSheet(), 

--- NEW FILE: TagSoupStyleSheetHandler.java ---
/*
 * Copyright (c) 2001 World Wide Web Consortium,
 * (Massachusetts Institute of Technology, Institut National de
 * Recherche en Informatique et en Automatique, Keio University). All
 * Rights Reserved. This program is distributed under the W3C's Software
 * Intellectual Property License. This program is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
 * PURPOSE.
 * See W3C License http://www.w3.org/Consortium/Legal/ for more details.
 *
 * $Id: TagSoupStyleSheetHandler.java,v 1.1 2007/04/24 11:14:22 ylafon Exp $
 */
package org.w3c.css.css;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringBufferInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Hashtable;

import org.w3c.css.parser.CssError;
import org.w3c.css.parser.Errors;
import org.w3c.css.util.Warning;
import org.w3c.css.util.Warnings;
import org.w3c.css.util.ApplContext;
import org.w3c.css.util.HTTPURL;
import org.w3c.css.util.InvalidParamException;
import org.w3c.css.util.Util;
import org.w3c.css.util.xml.XMLCatalog;
import org.w3c.www.mime.MimeType;
import org.w3c.www.mime.MimeTypeFormatException;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.ext.LexicalHandler;

/**
 * @version $Revision: 1.1 $
 * @author  Philippe Le Hegaret
 */
public class TagSoupStyleSheetHandler implements ContentHandler,
    LexicalHandler, ErrorHandler, EntityResolver {

    static String XHTML_NS = "http://www.w3.org/1999/xhtml";

    private static long autoIdCount;

    String namespaceURI;
    boolean isRoot = true;

    ApplContext    ac;
    URL documentURI = null;
    URL baseURI = null;

    //  StyleSheet styleSheet = new StyleSheet();
    StyleSheetParser styleSheetParser = new StyleSheetParser();

    boolean inStyle = false;
    String media  = null;
    String type  = null;
    String title = null;
    StringBuffer text = new StringBuffer(255);

    Locator locator;

    static XMLCatalog catalog = new XMLCatalog();

    /**
     * Creates a new TagSoupStyleSheetHandler
     */
    public TagSoupStyleSheetHandler(URL baseURI, ApplContext ac) {
	this.documentURI = baseURI;
	this.baseURI     = baseURI;
	this.ac = ac;
    }

    public void setDocumentLocator (Locator locator) {
	this.locator = locator;
    }

    public void startDocument ()
        throws SAXException {
    }

    public void endDocument()
        throws SAXException {
	ac.setInput("text/xml");
    }

    public void startPrefixMapping (String prefix, String uri)
        throws SAXException {
    }

     public void endPrefixMapping (String prefix)
	 throws SAXException {
     }

    public void characters (char ch[], int start, int length)
        throws SAXException {
	if (inStyle) {
	    text.append(ch, start, length);
	}
    }

    public void comment (char ch[], int start, int length)
        throws SAXException {
	if (inStyle) {
            int line = (locator != null ? locator.getLineNumber() : -1);
            Warning w = new Warning(baseURI.toString(), line,
                    "style-inside-comment", 0, ac);
            Warnings warnings = new Warnings(ac.getWarningLevel());
            warnings.addWarning(w);
            styleSheetParser.notifyWarnings(warnings);
	    //text.append(ch, start, length); // ignoring, per http://www.w3.org/Bugs/Public/show_bug.cgi?id=761
	}
    }

    public void ignorableWhitespace (char ch[], int start, int length)
        throws SAXException {
    }

    public void processingInstruction (String target, String data)
        throws SAXException {
	Hashtable atts = getValues(data);

	if ("xml-stylesheet".equals(target)) {
	    String rel  = (String) atts.get("alternate");
	    String type  = (String) atts.get("type");
	    String href = (String) atts.get("href");

	    if (Util.onDebug) {
		System.err.println("<?xml-stylesheet alternate=\"" + rel
				   + "\" type=\"" + type
				   + "\"" + "   href=\"" + href + "\"?>");
	    }

	    if ("yes".equalsIgnoreCase(rel)) {
		rel = "alternate stylesheet";
	    } else {
		rel = "stylesheet";
	    }
	    if (href == null) {
		int line = -1;

		if (locator != null) {
		    line = locator.getLineNumber();
		}
		CssError er =
		    new CssError(baseURI.toString(), line,
				 new InvalidParamException("unrecognized.link", ac));
		Errors ers = new Errors();
		ers.addError(er);
		styleSheetParser.notifyErrors(ers);
	    }

	    if (href.charAt(0) == '#') {
		// internal style sheet, will be processed by the parser
		return;
	    }

	    if (type != null) {
		MimeType mt = null;
		try {
		    new MimeType(type);
		} catch (Exception ex) { /* at worst, null */ };
		if (mt != null && (MimeType.TEXT_CSS.match(mt) ==
		                           MimeType.MATCH_SPECIFIC_SUBTYPE)) {
		// we're dealing with a stylesheet...
		    URL url;

		    try {
			if (baseURI != null) {
			    url = new URL(baseURI, href);
			} else {
			    url = new URL(href);
			}
		    } catch (MalformedURLException e) {
			return; // Ignore errors
		    }

		    if (Util.onDebug) {
			System.err.println("[TagSoupStyleSheetHandler::"+
					   "initialize(): "
					   + "should parse CSS url: "
					   + url.toString() + "]");
		    }
		    String media = (String) atts.get("media");
		    if (media == null) {
			media="all";
		    }
		    styleSheetParser.parseURL(ac,
					      url,
					      (String) atts.get("title"),
					      rel,
					      media,
					      StyleSheetOrigin.AUTHOR);
		    if (Util.onDebug) {
			System.err.println("[parsed!]");
		    }
		}
	    }
	}
    }

    public void skippedEntity (String name)
        throws SAXException {
    }

    public void startElement(String namespaceURI,
			     String localName,
			     String qName,
			     Attributes atts) throws SAXException {
	if (isRoot) {
	    this.namespaceURI = namespaceURI;
	    isRoot = false;
	}
	if (XHTML_NS.equals(namespaceURI)) {
	    if ("base".equals(localName)) {
		String href = atts.getValue("href");

		if (Util.onDebug) {
		    System.err.println("BASE href=\"" + href + "\"");
		}

		if (href != null) {
		    //URL url;

		    try {
			baseURI = new URL(documentURI, href);
			documentURI = baseURI;
		    } catch (MalformedURLException e) {
			return; // Ignore errors
		    }
		}

	    } else if ("link".equals(localName)) {

		String rel  = atts.getValue("rel");
		String type  = atts.getValue("type");
		String href = atts.getValue("href");

		if (Util.onDebug) {
		    System.err.println("link rel=\"" + rel
				       + "\" type=\"" + type
				       + "\"" + "   href=\"" + href + "\"");
		}
		if (type == null) {
		    return;
		}
		MimeType mt = null;
		try {
		    mt = new MimeType(type);
		} catch (MimeTypeFormatException mtfe) {
		    return;
		}
		if (MimeType.TEXT_CSS.match(mt) !=
		                            MimeType.MATCH_SPECIFIC_SUBTYPE) {
		    return;
		}
		if (href == null) {
		    int line = -1;

		    if (locator != null) {
			line = locator.getLineNumber();
		    }
		    CssError er =
			new CssError(baseURI.toString(), line,
				                    new InvalidParamException(
					            "unrecognized.link", ac));
		    Errors ers = new Errors();
		    ers.addError(er);
		    styleSheetParser.notifyErrors(ers);
		    return;
		}

		if ((rel != null) &&
		              rel.toLowerCase().indexOf("stylesheet") != -1) {
		    // we're dealing with a stylesheet...
		    // @@TODO alternate stylesheet
		    URL url;

		    try {
			if (baseURI != null) {
			    url = new URL(baseURI, href);
			} else {
			    url = new URL(href);
			}
		    } catch (MalformedURLException e) {
			return; // Ignore errors
		    }

		    if (Util.onDebug) {
			System.err.println("[TagSoupStyleSheetHandler::initialize(): "
					   + "should parse CSS url: "
					   + url.toString() + "]");
		    }
		    String media = atts.getValue("media");
		    if (media == null) {
			media="all";
		    }
		    styleSheetParser.parseURL(ac,
					      url,
					      atts.getValue("title"),
					      rel,
					      media,
					      StyleSheetOrigin.AUTHOR);
		    if (Util.onDebug) {
			System.err.println("[parsed!]");
		    }
		}
	    } else if ("style".equals(localName)) {
		media  = atts.getValue("media");
		type  = atts.getValue("type");
		title = atts.getValue("title");

		if (media == null) {
		    media = "all";
		}
		if (Util.onDebug) {
		    System.err.println("style media=\"" + media
				       + "\" type=\"" + type
				       + "\"" + "   title=\"" + title + "\"");
		}

		if (type == null) {
		    int line = -1;

		    if (locator != null) {
			line = locator.getLineNumber();
		    }
		    CssError er =
			new CssError(baseURI.toString(), line,
				     new InvalidParamException(
					             "unrecognized.link", ac));
		    Errors ers = new Errors();
		    ers.addError(er);
		    styleSheetParser.notifyErrors(ers);
		} else {
		    try {
			MimeType mt = new MimeType(type);
			if (MimeType.TEXT_CSS.match(mt) ==
			                     MimeType.MATCH_SPECIFIC_SUBTYPE) {
			    text.setLength(0);
			    inStyle = true;
			}
		    } catch (MimeTypeFormatException ex) {
			// do nothing
		    }
		}
	    } else if (atts.getValue("style") != null) {
		String value = atts.getValue("style");

		if (value != null) { // here we have a style attribute
		    String id = atts.getValue("id");
		    handleStyleAttribute(value, id);
		}
	    }
	} else {
	    // the style attribute, recommended by UI Tech TF
	    String value = atts.getValue(XHTML_NS, "style");

	    if (value != null) { // here we have a style attribute
		String id = atts.getValue(XHTML_NS, "id");
		handleStyleAttribute(value, id);
	    }
	}
    }

    public void endElement (String namespaceURI, String localName,
                            String qName)
        throws SAXException {
	int line = 0;

	if (locator != null) {
	    line = locator.getLineNumber();
	}
	if (XHTML_NS.equals(namespaceURI)) {
	    if ("style".equals(localName)) {
		if (inStyle) {
		    inStyle = false;
		    if (text.length() != 0) {
			if (Util.onDebug) {
			    System.err.println( "PARSE [" + text.toString() + "]" );
			}
			styleSheetParser
			    .parseStyleElement(ac,
					       new StringBufferInputStream(text.toString()),
					       title, media,
					       documentURI, line);
		    }
		}

	    }
	}
    }

    public void handleStyleAttribute(String value, String id) {
	if (id == null) { // but we have no id: create one.
	    // a normal id should NOT contain a "#" character.
	    id = "#autoXML" + autoIdCount;
	    // workaround a java hashcode bug.
	    id += "" + autoIdCount++;
	}
	int line = 0;
	if (locator != null) {
	    line = locator.getLineNumber();
	}
	// parse the style attribute.
	styleSheetParser
	    .parseStyleAttribute(ac,
				 new ByteArrayInputStream(value.getBytes()),
				 id, documentURI, line);
    }

    public StyleSheet getStyleSheet() {
	return styleSheetParser.getStyleSheet();
    }

    public void startDTD (String name, String publicId,
                                   String systemId)
        throws SAXException {
    }

    public void endDTD ()
        throws SAXException {
    }

    public void startEntity (String name)
        throws SAXException {
    }

    public void endEntity (String name)
        throws SAXException {
    }

    public void startCDATA ()
        throws SAXException {
    }

    public void endCDATA ()
        throws SAXException {
    }

    public void error(SAXParseException exception) throws SAXException {
    }

    public void fatalError(SAXParseException exception) throws SAXException {
	throw exception;
    }

    public void warning(SAXParseException exception) throws SAXException {
    }

    public InputSource resolveEntity(String publicId, String systemId)
	throws SAXException, IOException {
	String uri = null;

	if (publicId != null) {
	    if ("-//W3C//DTD XHTML 1.0 Transitional//EN".equals(publicId)) {
		if (!"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd".equals(systemId)) {
		    if (ac != null && ac.getFrame() != null) {
			ac.getFrame().addWarning("xhtml.system_identifier.invalid");
		    }
		}
	    } else if ("-//W3C//DTD XHTML 1.0 Strict//EN".equals(publicId)) {
		if (!"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd".equals(systemId)) {
		    if (ac != null && ac.getFrame() != null) {
			ac.getFrame().addWarning("xhtml.system_identifier.invalid");
		    }
		}
	    } else if ("-//W3C//DTD XHTML 1.0 Frameset//EN".equals(publicId)) {
		if (!"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd".equals(systemId)) {
		    if (ac != null && ac.getFrame() != null) {
			ac.getFrame().addWarning("xhtml.system_identifier.invalid");
		    }
		}
	    }
	    uri = catalog.getProperty(publicId);
	}
	if (uri == null && systemId != null) {
	    uri = catalog.getProperty(systemId);
	}
	if (uri != null) {
	    return new InputSource(uri);
	} else {
	    return new InputSource(new URL(baseURI, systemId).toString());
	}
    }

    void parse(URL url) throws Exception {
	InputSource source = new InputSource();
	URLConnection connection;
	InputStream in;
	org.xml.sax.XMLReader xmlParser = new org.ccil.cowan.tagsoup.Parser();
	try {
	    xmlParser.setProperty("http://xml.org/sax/properties/lexical-handler",
				  this);
	    xmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
	    xmlParser.setFeature("http://xml.org/sax/features/validation", false);
	    /*
	      xmlParser.setFeature("http://xml.org/sax/features/external-parameter-entities",
				  false);
	    xmlParser.setFeature("http://xml.org/sax/features/external-general-entities",
				  false);
	    */
	} catch (Exception ex) {
	    ex.printStackTrace();
	}
	xmlParser.setContentHandler(this);

	connection = HTTPURL.getConnection(url, ac);
	in = connection.getInputStream();
	String httpCL = connection.getHeaderField("Content-Location");
	if (httpCL != null) {
	    baseURI = HTTPURL.getURL(baseURI, httpCL);
	    documentURI = baseURI;
	}
	String ctype = connection.getContentType();
	if (ctype != null) {
	    try {
		MimeType repmime = new MimeType(ctype);
		if (repmime.hasParameter("charset"))
		    source.setEncoding(repmime.getParameterValue("charset"));
	    } catch (Exception ex) {}
	}
	source.setByteStream(in);
	try {
	    xmlParser.parse(url.toString());
	} finally {
	    in.close();
	}
    }

    void parse(String urlString, URLConnection connection) throws Exception {
	org.xml.sax.XMLReader xmlParser = new org.apache.xerces.parsers.SAXParser();
	try {
	    xmlParser.setProperty("http://xml.org/sax/properties/lexical-handler",
				  this);
	    xmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);

	    xmlParser.setFeature("http://xml.org/sax/features/validation", false);
	    xmlParser.setErrorHandler(this);
	    xmlParser.setEntityResolver(this);
	} catch (Exception ex) {
	    ex.printStackTrace();
	}
	xmlParser.setContentHandler(this);
	InputStream cis = connection.getInputStream();
	InputSource source = new InputSource(cis);
	String ctype = connection.getContentType();
	if (ctype != null) {
	    try {
		MimeType repmime = new MimeType(ctype);
		if (repmime.hasParameter("charset")) {
		    source.setEncoding(repmime.getParameterValue("charset"));
		} else {
		    // if text/html and no given charset, let's assume
		    // iso-8859-1. Ideally, the parser would change the
		    // encoding if it find a mismatch, not sure, but well...
		    if (repmime.match(MimeType.TEXT_HTML) ==
			                     MimeType.MATCH_SPECIFIC_SUBTYPE) {
			source.setEncoding("iso-8859-1");
		    }
		}
	    } catch (Exception ex) {}
	}
	source.setSystemId(urlString);
	try {
	    xmlParser.parse(source);
	} finally {
	    cis.close();
	}
    }

    Hashtable getValues(String data) {
	int length = data.length();
	int current = 0;
	char c;
	StringBuffer name = new StringBuffer(10);
	StringBuffer value = new StringBuffer(128);
	StringBuffer entity_name = new StringBuffer(16);
	int state = 0;
	Hashtable table = new Hashtable();

	while (current < length) {
	    c = data.charAt(current);

	    switch (state) {
	    case 0:
		switch (c) {
		case ' ': case '\t': case '\n': // \r are normalized per XML spec
		    // nothing
		    break;
		case '"': case '\'':
		    return table;
		case 'h': case 't': case 'm': case 'c': case 'a':
		case 'r':
		    name.setLength(0); // reset the name
		    value.setLength(0); // reset the value
		    name.append(c); // start to build the name
		    state = 1;
		    break;
		default:
		    // anything else is invalid
		    return table;
		}
		break;
	    case 1: // in the "attribute" name inside the PI
		if ((c >= 'a') && (c <= 'z')) {
		    name.append(c);
		} else if ((c == ' ') || (c == '\t') || (c == '\n')) {
		    state = 2;
		} else if (c == '=') {
		    state = 3;
		} else {
		    // anything else is invalid
		    state = 0;
		}
		break;
	    case 2: // waiting for =
		switch (c) {
		case ' ': case '\t': case '\n':
		    // nothing
		    break;
		case '=':
		    state = 3;
		default:
		    // anything else is invalid
		    return table;
		}
		break;
	    case 3: // waiting for ' or "
		switch (c) {
		case ' ': case '\t': case '\n':
		    // nothing
		    break;
		case '"':
		    state = 4;
		    break;
		case '\'':
		    state = 5;
		    break;
		default:
		    // anything else is invalid
		    return table;
		}
		break;
	    case 4: case 5: // in the "attribute" value inside the PI
		switch (c) {
		case '&':
		    // predefined entities amp, lt, gt, quot, apos
		    entity_name.setLength(0);
		    state += 10;
		    break;
		case '<':
		    return table;
		case '"':
		    if (state == 4) {
			state = 6;
		    } else {
			value.append(c);
		    }
		    break;
		case '\'':
		    if (state == 5) {
			state = 6;
		    } else {
			value.append(c);
		    }
		    break;
		default:
		    value.append(c);
		}
		break;
	    case 6: // waiting a white space
		table.put(name.toString(), value.toString());
		name.setLength(0); // reset the name
		value.setLength(0); // reset the value
		switch (c) {
		case ' ': case '\n': case '\t':
		    state = 0;
		    break;
		default:
		    return table;
		}
		break;
	    case 14: case 15: // in the entity
		switch (c) {
		case 'a': case 'm': case 'p':
		case 'l': case 't': case 'g':
		case 'q': case 'u': case 'o':
		case 's':
		    entity_name.append(c);
		    break;
		case ';':
		    String entity = entity_name.toString();
		    if ("amp".equals(entity)) {
			value.append('&');
		    } else if ("lt".equals(entity)) {
			value.append('<');
		    } else if ("gt".equals(entity)) {
			value.append('>');
		    } else if ("quote".equals(entity)) {
			value.append('"');
		    } else if ("apos".equals(entity)) {
			value.append('\'');
		    } else {
			return table;
		    }
		    state -= 10;
		    break;
		default:
		    return table;
		}
	    }
	    current ++;
	}
	if (name.length() != 0 && value.length() != 0) {
	    table.put(name.toString(), value.toString());
	}
	return table;
    }
}

Received on Tuesday, 24 April 2007 11:14:28 UTC