Updated 12 Feb 2000
This is a Java version of HTML Tidy Release 13 Jan
2000. Copyright © 1999 W3C, see
Tidy.java
for the copyright notice.
Thanks to the people at Docuverse for assistance with DOM support for Java HTML Tidy.
I have made available:
To use the Tidy Java Bean, just include JTidy\lib\Tidy.jar
in
your classpath.
To build Tidy from the source, you need a Java compiler/runtime
environment, supporting Java 1.1 or higher. First, download and
expand the archive. For Win 9x/NT, build it using the batch file
JTidy\make\build.bat
as follows:
cd JTidy\make
build c: 13jan2000
Where c:
is the root where you expanded the JTidy archive,
and
13jan2000
is the directory under JTidy\src
where the source is located.
NOTE: build.bat
assumes that the environment variable
java_home
points to your JDK installation, and that the
JDK tools are in your path.
For Unix environments, either Cygwin or true, use the makefile
in JTidy\make.
The main class is: org.w3c.tidy.Tidy
docTypeStr
to
docType
and handled the same as configuration file 'doctype'
string. Fixed potential IndexOutOfBoundsException's
in
Clean.createProps.
InputStreamName
property to Bean.
Tried speed optimization
in Lexer.wstrcasecmp.
org.w3c.tidy.Node
protected, as I would like to phase out
external use of this class, and phase in DOM-style access to the parse
tree. Below is a code example of DOM-style parse tree traversal
and printing. Note that Java Tidy cannot yet be called DOM-compliant,
but it's getting there.
TidyMessages.properties
to the
org.w3c.tidy
package. Propagated
MissingResourceException
in static initializer of
Report
as an Error
since it represents a
severe error.
Lexer.lexbuf
to type
byte[]
.
Since Tidy stores lexbuf
as UTF-8 encoded bytes,
conversions of sequences of bytes of lexbuf
to
String's
need to take UTF-8 encoding into account; it was also a
waste of memory to represent it as a char[]
.
Thanks to Mark Diekhans for contributing this change.
Lexer.getToken
.
CM_HEAD
fix from Dave Raggett (posted to HTML tidy
mailing list) for object
entry in TagTable
.
Fixed cause of NullPointerException's in
Node.insertNodeAfterElement
.
Lexer.addGenerator().
Got rid of references to Lexer.lexbuf
in class PPrint
and other classes - used node.textarray
instead.
This is in preparation for more DOM implementation.
import java.io.IOException;
import java.net.URL;
import java.io.BufferedInputStream;
import java.io.FileOutputStream;
import java.io.PrintWriter;
import java.io.FileWriter;
import org.w3c.tidy.Tidy;

/**
 * This program shows how HTML could be tidied directly from
 * a URL stream, and running on separate threads. Note the use
 * of the 'parse' method to parse from an InputStream, and send
 * the pretty-printed result to an OutputStream.
 * In this example thread th1 outputs XML, and thread th2 outputs
 * HTML. This shows that properties are per instance of Tidy.
 */
public class Test16 implements Runnable {

    /** Address of the document to fetch and tidy. */
    private String url;

    /** Name of the file that receives the tidied output. */
    private String outFileName;

    /** Name of the file that receives Tidy's error output. */
    private String errOutFileName;

    /** Value passed to {@code Tidy.setXmlOut} for this instance. */
    private boolean xmlOut;

    /**
     * Creates one tidy job.
     *
     * @param url            address of the document to read
     * @param outFileName    file to write the tidied document to
     * @param errOutFileName file to write Tidy's messages to
     * @param xmlOut         value handed to {@code Tidy.setXmlOut}
     */
    public Test16(String url, String outFileName,
                  String errOutFileName, boolean xmlOut) {
        this.url = url;
        this.outFileName = outFileName;
        this.errOutFileName = errOutFileName;
        this.xmlOut = xmlOut;
    }

    /**
     * Opens the URL, runs it through a private Tidy instance and writes the
     * result to the output file. An IOException is reported on standard
     * output rather than propagated. All streams opened here are closed
     * before returning, whether or not parsing succeeded.
     */
    public void run() {
        Tidy tidy = new Tidy();
        tidy.setXmlOut(xmlOut);

        PrintWriter errOut = null;
        BufferedInputStream in = null;
        FileOutputStream out = null;
        try {
            // autoflush=true so messages reach the file as they are written
            errOut = new PrintWriter(new FileWriter(errOutFileName), true);
            tidy.setErrout(errOut);
            in = new BufferedInputStream(new URL(url).openStream());
            out = new FileOutputStream(outFileName);
            tidy.parse(in, out);
        } catch (IOException e) {
            System.out.println(this.toString() + e.toString());
        } finally {
            // Release in reverse order of acquisition; each close is
            // attempted even if an earlier one fails.
            if (out != null) {
                try {
                    out.close();
                } catch (IOException e) {
                    // A failed close of the output may mean lost data: report it.
                    System.out.println(this.toString() + e.toString());
                }
            }
            if (in != null) {
                try {
                    in.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done about a failed close on input.
                }
            }
            if (errOut != null) {
                errOut.close();
            }
        }
    }

    /**
     * Starts two jobs on separate threads: the first with XML output
     * enabled, the second with it disabled.
     *
     * @param args six values: url, output file and error file for the first
     *             job, followed by the same three for the second job
     */
    public static void main(String[] args) {
        if (args.length < 6) {
            System.err.println(
                "Usage: java Test16 <url1> <out1> <err1> <url2> <out2> <err2>");
            System.exit(1);
        }

        Test16 t1 = new Test16(args[0], args[1], args[2], true);
        Test16 t2 = new Test16(args[3], args[4], args[5], false);
        Thread th1 = new Thread(t1);
        Thread th2 = new Thread(t2);
        th1.start();
        th2.start();
    }
}
import java.io.PrintWriter; import java.io.FileInputStream; import java.io.IOException; import org.w3c.dom.Attr; import org.w3c.dom.Document; import org.w3c.dom.NamedNodeMap; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.tidy.Tidy; /** * A sample DOM writer. This sample program illustrates how to * traverse a DOM tree in order to print a document that is parsed. * */ public class TestDOM { protected PrintWriter out; public TestDOM() { out = new PrintWriter(System.out); } /** Prints the specified node, recursively. */ public void print(Node node) { if ( node == null ) { return; } int type = node.getNodeType(); switch ( type ) { case Node.DOCUMENT_NODE: out.println(""); print(((Document)node).getDocumentElement()); out.flush(); break; case Node.ELEMENT_NODE: out.print('<'); out.print(node.getNodeName()); NamedNodeMap attrs = node.getAttributes(); for ( int i = 0; i < attrs.getLength(); i++ ) { out.print(' '); out.print(attrs.item(i).getNodeName()); out.print("=\""); out.print(attrs.item(i).getNodeValue()); out.print('"'); } out.print('>'); out.println(); // HACK NodeList children = node.getChildNodes(); if ( children != null ) { int len = children.getLength(); for ( int i = 0; i < len; i++ ) { print(children.item(i)); } } break; case Node.TEXT_NODE: out.print(node.getNodeValue()); break; } if ( type == Node.ELEMENT_NODE ) { out.print(""); out.print(node.getNodeName()); out.print('>'); out.println(); // HACK } out.flush(); } public static void main(String args[]) { if ( args.length == 0 ) { System.exit(1); } System.err.println(args[0]); FileInputStream in; Tidy tidy = new Tidy(); TestDOM t = new TestDOM(); try { in = new FileInputStream(args[0]); tidy.setMakeClean(true); tidy.setXmlTags(true); t.print(tidy.parseDOM(in, null)); } catch ( IOException e ) { System.err.println( e.toString() ); } } }
Send questions, comments, or bug reports to Andy Quick.