Java HTML Tidy

Updated 12 Feb 2000


This is a Java version of HTML Tidy Release 13 Jan 2000 Copyright © 1999 W3C, see Tidy.java for the copyright notice.

Thanks to the people at Docuverse for assistance with DOM support for Java HTML Tidy.

I have made available:

To use the Tidy Java Bean, just include JTidy\lib\Tidy.jar in your classpath.

To build Tidy from the source, you need a Java compiler and runtime environment supporting Java 1.1 or higher. First, download and expand the archive. On Win 9x/NT, build it using the batch file JTidy\make\build.bat as follows:


    cd JTidy\make

    build c: 13jan2000

Here, c: is the root directory where you expanded the JTidy archive, and 13jan2000 is the directory under JTidy\src where the source is located. NOTE: build.bat assumes that the environment variable java_home points to your JDK installation, and that the JDK tools are in your path.

For Unix environments (either Cygwin or a true Unix system), use the makefile in JTidy\make.

The main class is: org.w3c.tidy.Tidy


What's New


Code example of how to use the Tidy Java Bean


import java.io.IOException;

import java.net.URL;

import java.io.BufferedInputStream;

import java.io.FileOutputStream;

import java.io.PrintWriter;

import java.io.FileWriter;

import org.w3c.tidy.Tidy;





/**
 * Shows how HTML can be tidied directly from a URL stream, with two
 * conversions running on separate threads.  Note the use of the
 * 'parse' method to read from an InputStream and send the
 * pretty-printed result to an OutputStream.
 * In this example thread th1 outputs XML, and thread th2 outputs
 * HTML.  This shows that properties are per instance of Tidy.
 *
 * Usage: java Test16 url1 out1 err1 url2 out2 err2
 */
public class Test16 implements Runnable {

    /** Address of the document to fetch and tidy. */
    private final String url;
    /** Name of the file that receives the tidied document. */
    private final String outFileName;
    /** Name of the file that receives Tidy's error output. */
    private final String errOutFileName;
    /** Value handed to Tidy.setXmlOut for this instance. */
    private final boolean xmlOut;

    /**
     * Creates one tidy job.
     *
     * @param url            address of the document to read
     * @param outFileName    file to write the tidied result to
     * @param errOutFileName file to write Tidy's error output to
     * @param xmlOut         value passed to Tidy.setXmlOut
     */
    public Test16(String url, String outFileName,
                  String errOutFileName, boolean xmlOut)
    {
        this.url = url;
        this.outFileName = outFileName;
        this.errOutFileName = errOutFileName;
        this.xmlOut = xmlOut;
    }

    /**
     * Opens the URL, runs Tidy over its content and writes the result
     * and the error log to the configured files.  An IOException is
     * reported on System.out; every stream that was opened is closed
     * before the method returns.
     */
    public void run()
    {
        BufferedInputStream in = null;
        FileOutputStream out = null;
        PrintWriter errOut = null;
        Tidy tidy = new Tidy();

        tidy.setXmlOut(xmlOut);
        try {
            errOut = new PrintWriter(new FileWriter(errOutFileName), true);
            tidy.setErrout(errOut);
            in = new BufferedInputStream(new URL(url).openStream());
            out = new FileOutputStream(outFileName);
            tidy.parse(in, out);
        }
        catch ( IOException e ) {
            System.out.println( this.toString() + e.toString() );
        }
        finally {
            // Release the network connection and both files even when
            // opening or parsing failed part-way through.
            if ( in != null ) {
                try {
                    in.close();
                }
                catch ( IOException e ) {
                    System.out.println( this.toString() + e.toString() );
                }
            }
            if ( out != null ) {
                try {
                    out.close();
                }
                catch ( IOException e ) {
                    System.out.println( this.toString() + e.toString() );
                }
            }
            if ( errOut != null ) {
                errOut.close();
            }
        }
    }

    /**
     * Starts two tidy jobs on separate threads: the first produces
     * XML output, the second HTML output.
     *
     * @param args url1 out1 err1 url2 out2 err2
     */
    public static void main( String[] args ) {
        if ( args.length < 6 ) {
            System.err.println(
                "Usage: java Test16 url1 out1 err1 url2 out2 err2" );
            return;
        }

        Test16 t1 = new Test16(args[0], args[1], args[2], true);
        Test16 t2 = new Test16(args[3], args[4], args[5], false);
        Thread th1 = new Thread(t1);
        Thread th2 = new Thread(t2);

        th1.start();
        th2.start();
    }

}

Code example of using Java Tidy as a parser


import java.io.PrintWriter;

import java.io.FileInputStream;

import java.io.IOException;



import org.w3c.dom.Attr;

import org.w3c.dom.Document;

import org.w3c.dom.NamedNodeMap;

import org.w3c.dom.Node;

import org.w3c.dom.NodeList;

import org.w3c.tidy.Tidy;



/**

 * A sample DOM writer. This sample program illustrates how to

 * traverse a DOM tree in order to print a document that is parsed.

 *

 */

public class TestDOM {





   protected PrintWriter out;



   public TestDOM() {

      out = new PrintWriter(System.out);

   }



   /** Prints the specified node, recursively. */

   public void print(Node node) {



      if ( node == null ) {

         return;

      }



      int type = node.getNodeType();

      switch ( type ) {

      case Node.DOCUMENT_NODE:

         out.println("");

         print(((Document)node).getDocumentElement());

         out.flush();

         break;



      case Node.ELEMENT_NODE:

         out.print('<');

         out.print(node.getNodeName());

         NamedNodeMap attrs = node.getAttributes();

         for ( int i = 0; i < attrs.getLength(); i++ ) {

            out.print(' ');

            out.print(attrs.item(i).getNodeName());

            out.print("=\"");

            out.print(attrs.item(i).getNodeValue());

            out.print('"');

         }

         out.print('>');

         out.println(); // HACK

         NodeList children = node.getChildNodes();

         if ( children != null ) {

            int len = children.getLength();

            for ( int i = 0; i < len; i++ ) {

               print(children.item(i));

            }

         }

         break;



      case Node.TEXT_NODE:

         out.print(node.getNodeValue());

         break;



      }



      if ( type == Node.ELEMENT_NODE ) {

         out.print("');

         out.println(); // HACK

      }



      out.flush();



   }



   public static void main(String args[]) {



      if ( args.length == 0 ) {

         System.exit(1);

      }



      System.err.println(args[0]);



      FileInputStream in;

      Tidy tidy = new Tidy();

      TestDOM t = new TestDOM();



      try {

          in = new FileInputStream(args[0]);

          tidy.setMakeClean(true);

          tidy.setXmlTags(true);

          t.print(tidy.parseDOM(in, null));

      }

      catch ( IOException e ) {

          System.err.println( e.toString() );

      }



   }





} 


Send questions, comments, or bug reports to Andy Quick.