#!/usr/bin/perl
#
# stidy, version 1.1
# Copyright (C) 2000 Pete Gelbman
# GNU General Public License
#
# Description:
# ------------
# Script to filter output of Tidy HTML/XHTML parser program. Requires Perl 5.6
# or later. This is just a wrapper that passes input to Tidy and parses Tidy's
# output based on user-specified options.
#
# Two types of post-Tidy parsing options are supported; fine-tuning and stripping:
#
# Basic concept of the fine-tuning option (-f) is to correct Tidy deficiencies,
# and add support for specific Web-authoring tool quirks. Currently fine-tuning is
# limited to adding line-breaks to concatenated comments, and removing some known
# false element attributes.
#
# Basic concept of the stripping option (-s) is to remove all document-structure
# elements from Tidy's output for embedding HTML/XHTML code within server-side
# applications (SSI, CGI, etc...), without creating entire HTML/XHTML documents.
#
# Usage:
# ------
# Run Stidy from the command-line; usage syntax mimics Tidy's. User must specify
# parsing options (-f, -s, or -a for both). If no parsing options are given, Stidy
# simply returns Tidy's output, the same as if you ran Tidy directly. In addition,
# there are several options that can be passed to Tidy: (-c <file>, -i, and -x).
# These specify Tidy's configuration file, indentation, and XHTML output options,
# respectively. Do stidy -h for usage details.
#
# Input/output defaults to STDIN/STDOUT. Errors go to STDERR,
# along with Tidy's. We use warn() to delimit the two.
#
# Changes:
# --------
# - Pete Gelbman, Wed May 10 05:10:25 CST 2000. Created
#
# - Pete Gelbman, Sat May 24 15:56:50 CST 2000. Incorporated
#   IPC::Open2 module to replace temp files. Added support for fine-tuning.
#
# Notes:
# ------
# - All kudos go to Dave Raggett and crew for creating Tidy!
#   http://www.w3.org/People/Raggett/tidy/
#
# - Many thanks to Sebastian Lange for
#   code contribution and support.
#
# - This is pretty icky, please feel free to improve it!
#
# - I've only used/tested this under unix & linux...
#
# +------------------------------------------------------------------------+
# | This program is free software; you can redistribute it and/or modify   |
# | it under the terms of the GNU General Public License as published by   |
# | the Free Software Foundation; either version 2 of the License, or      |
# | (at your option) any later version.                                    |
# |                                                                        |
# | This program is distributed in the hope that it will be useful,        |
# | but WITHOUT ANY WARRANTY; without even the implied warranty of         |
# | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the           |
# | GNU General Public License for more details.                           |
# |                                                                        |
# | GNU General Public License may be found at http://www.gnu.org          |
# +------------------------------------------------------------------------+
#
####################################################################################

use 5.006;
use strict;
use warnings;

use Getopt::Std;
use IPC::Open2;

my $ver = '1.1';

my $version = <<"EOP";

Stidy, version $ver
Copyright (C) 2000 Pete Gelbman
GNU General Public License

Stidy may be copied, modified and redistributed under the terms of the
GNU General Public License as published by the Free Software Foundation
which may be found at http://www.gnu.org

EOP

# NOTE(review): only the last three option lines of the original usage text
# survived; the rest is rebuilt from the option handling below -- confirm wording.
my $usage = <<'EOP';

Usage: stidy [options] [file]

Parsing options:
  -f         Fine-tune Tidy's output
  -s         Strip document-structure elements from Tidy's output
  -a         All (fine-tune and strip)

Tidy options:
  -c <file>  Specify Tidy configuration file
  -i         Specify Tidy's indentation option
  -x         Specify Tidy's XHTML option

Other options:
  -h         Print this help text
  -v         Print version information

EOP

#==========================================================================
# Functions
#==========================================================================

#------------------------
# Program entry point
#------------------------
# Parses the command line, reads the input (file named in $ARGV[0], else
# STDIN), pipes it through Tidy and prints the (optionally post-processed)
# result on STDOUT. Diagnostics go to STDERR.
#
# Returns the process exit status (0 on success, 1 on usage/help/version
# or when Tidy could not be run). Dies on unreadable or tag-less input.
sub main {
    my %opt;

    # Every flag is accepted in lower and upper case, as before.
    unless (getopts('aAfFhHiIvsSVxXc:C:', \%opt)) {
        print $usage;
        return 1;
    }
    if ($opt{h} || $opt{H}) {
        print $usage;
        return 1;    # exit status kept at 1 for backward compatibility
    }
    if ($opt{v} || $opt{V}) {
        print $version;
        return 1;    # exit status kept at 1 for backward compatibility
    }

    # -a simply switches on both post-processing passes.
    my $all       = $opt{a} || $opt{A};
    my $fine_tune = $all || $opt{f} || $opt{F};
    my $strip     = $all || $opt{s} || $opt{S};

    # Options handed through to Tidy. Kept as a list (never a shell
    # string) so a config path with spaces or metacharacters is safe.
    my @tidy_options;
    my $config = defined $opt{c} ? $opt{c} : $opt{C};
    push @tidy_options, '-config', $config
        if defined $config && length $config;
    push @tidy_options, '-i'     if $opt{i} || $opt{I};
    push @tidy_options, '-asxml' if $opt{x} || $opt{X};

    # Get input file, otherwise use STDIN.
    my @input;
    my $path     = $ARGV[0];
    my $has_file = defined $path && length $path;
    if ($has_file) {
        open my $in, '<', $path
            or die "\nStidy: can't open file \"$path\"! ($!)\n";
        @input = <$in>;
        close $in;
    }
    else {
        @input = <STDIN>;
    }

    # Input must contain at least one tag.
    die "\nStidy: input has no HTML/XHTML content!\n"
        unless join('', @input) =~ /<[^>]*>/;

    # Print a Stidy header on the STDERR log, then delimit Tidy's part.
    warn $has_file
        ? "\nStidy (version $ver) Filtering \"$path\"\n"
        : "\nStidy (version $ver) Filtering console input (stdin)\n";
    warn "\nTidy errors:\n";
    warn "------------\n";

    # Spawn Tidy with IPC::Open2. open2() reports failure by dying, so
    # the call is wrapped in eval and the result tested by hand.
    my ($from_tidy, $to_tidy);
    my $child = eval { open2($from_tidy, $to_tidy, 'tidy', @tidy_options) };
    if (!$child) {
        my $error = $@ || 'unknown error';
        warn "\nStidy: tidy IPC error! $!\n$error\n";
        return 1;
    }

    # A Tidy that dies early must not kill us with SIGPIPE mid-write.
    local $SIG{PIPE} = 'IGNORE';
    print {$to_tidy} @input;
    close $to_tidy;    # EOF tells Tidy to start working
    my @tidy_output = <$from_tidy>;
    close $from_tidy;
    waitpid $child, 0;    # reap our child
    my $status = $? >> 8;

    unless (@tidy_output) {
        warn "\nStidy: tidy produced no output (exit status $status)!\n";
        return 1;
    }

    # Delimit STDERR log.
    warn "\nStidy errors:\n";
    warn "-------------\n\n";

    # Lines already end in "\n"; join with '' so no stray blanks are
    # introduced between them (which "@array" interpolation would do).
    my $html = join '', @tidy_output;

    if (!$fine_tune && !$strip) {
        warn "- Ran Tidy only. No post-parsing performed, no errors to report!\n\n";
        print $html;
        return 0;
    }

    ($html) = fineTune($html) if $fine_tune;
    $html = join '', stripDocTags($html) if $strip;
    print $html;
    return 0;
}

#------------------------
# Fine-tune Tidy output
#------------------------
# Takes the whole document as ONE string ($_[0]) and returns a one-element
# list holding the corrected string. Two corrections are applied, each
# reported on STDERR:
#   * a comment glued to preceding markup on the same line is moved onto a
#     line of its own;
#   * BORDERCOLOR, BORDERCOLORLIGHT, BORDERCOLORDARK, ... attributes with
#     double-quoted values are dropped.
sub fineTune {
    my ($html) = @_;
    $html = '' unless defined $html;

    # NOTE(review): the original comment pattern was destroyed by markup
    # stripping; this reconstruction follows the log message -- confirm.
    # The optional trailing newline is consumed so that no blank line is
    # left behind, which also makes the pass idempotent.
    my $broke_comments =
        ($html =~ s/>[ \t]*(<!--.*?-->)[ \t]*\n?/>\n$1\n/gs) ? 1 : 0;

    # One pattern for every BORDERCOLOR* variant, so none is left behind
    # merely because a differently spelled variant was seen first.
    my $dropped_border =
        ($html =~ s/\s*\bBORDERCOLOR[a-z]*\s*=\s*"[^"]*"//gi) ? 1 : 0;

    # Print results to STDERR log.
    warn "- No fine-tuning required!\n\n"
        unless $broke_comments || $dropped_border;
    warn "- Added line breaks to concatenated comments!\n" if $broke_comments;
    warn "- Removed false BORDERCOLOR attributes!\n"        if $dropped_border;

    return ($html);
}

#-------------------------------------
# Strip document structure elements
#-------------------------------------
# Takes the whole document as ONE string ($_[0]) and returns the list of
# remaining lines (each keeping its newline): everything up to and including
# the opening body tag and everything from the closing body tag on is
# removed, leftover structure tags are deleted, and blank lines at the top
# and bottom are trimmed. Progress is reported on STDERR.
sub stripDocTags {
    my ($html) = @_;
    $html = '' unless defined $html;

    # Strip everything except what is between <body> and </body>.
    if ($html =~ s{\A.*<body\b[^>]*>[ \t]*\n?}{}si) {
        warn "- Removed everything up to and including <body>\n";
    }
    if ($html =~ s{</body\s*>.*\z}{}si) {
        warn "- Removed everything after and including </body>\n";
    }

    # Structure markup that may survive when the document has no proper
    # body wrapper.
    # NOTE(review): the original tag names were partly lost to markup
    # stripping; this list is a reconstruction -- confirm.
    my @structure = (
        qr{<!DOCTYPE\b[^>]*>}i,
        qr{<title\b[^>]*>.*?</title\s*>}i,    # title element incl. its text
        qr{</?(?:html|head|title|body)\b[^>]*>}i,
        qr{<meta\b[^>]*>}i,
    );

    my @kept;
    for my $line (split /^/m, $html) {
        my $had_tag = 0;
        for my $tag (@structure) {
            $had_tag = 1 if $line =~ s/$tag//g;
        }

        # A line that held nothing but structure tags is purged entirely;
        # lines that were blank to begin with are kept (trimmed below).
        next if $had_tag && $line !~ /\S/;
        push @kept, $line;
    }

    # Print results to STDERR log.
    warn "- Stripped all document-structure elements; this is not a complete HTML/XHTML document!\n"
        if @kept;

    # Clean up any leftover empty lines on top and bottom. shift/pop is
    # used rather than splice(@a, -$n), which empties the array for $n == 0.
    shift @kept while @kept && $kept[0]  !~ /\S/;
    pop @kept   while @kept && $kept[-1] !~ /\S/;

    return @kept;
}

# Run only when executed as a script, so the file can also be loaded for
# testing without reading STDIN or spawning Tidy.
exit(main()) unless caller;

1;

__END__