- From: Bjoern Hoehrmann <derhoermi@gmx.net>
- Date: Sun, 07 Nov 2004 04:28:15 +0100
- To: public-qa-dev@w3.org
Hi,
The following script is able to validate HTML/XHTML documents using only
CPAN modules, the one exception being SGML::Parser::OpenSP 0.99. Apart from
some features, such as showing an outline of the document and complaining
about encoding errors and missing doctypes, this should be as good as the
current Markup Validator. Of course, output is plain text only and it
lacks error checking, etc.
#!perl -w
# Set the SP_* environment variables at compile time, so that they are
# already in place when the `use SGML::Parser::OpenSP` line further down
# is processed.
# NOTE(review): presumably these make OpenSP treat all input as UTF-8 --
# confirm against the OpenSP documentation.
BEGIN
{
    @ENV{qw(SP_CHARSET_FIXED SP_ENCODING SP_BCTF)} = (1, "UTF-8", "UTF-8");
}
# Constructor: ErrorHandler->new($parser)
# Returns a new handler object that remembers $parser under the key 'p';
# error() later uses it to call split_message().
sub ErrorHandler::new
{
    my ($class, $parser) = @_;
    return bless { p => $parser }, $class;
}
# Handler method: $handler->error($message)
# Passes $message to the stored parser's split_message() and appends
# whatever that returns (called in list context) to $handler->{errors}.
# Returns the result of push, i.e. the new number of collected entries.
sub ErrorHandler::error
{
    my $self = shift;

    # $_[0] is now the message; it is passed on as-is, not as a copy.
    push @{ $self->{errors} }, $self->{p}->split_message($_[0]);
}
use strict;
use warnings;
use SGML::Parser::OpenSP 0.99 qw();
use HTML::Encoding qw();
use HTML::Doctype qw();
use LWP::UserAgent qw();
use Encode qw();
use I18N::Charset qw();
# Catalog files given to the parser via catalogs(): the HTML catalog is
# used for the XHTML-detection pass and for validating non-XHTML documents,
# the XML catalog for validating documents detected as XHTML.
our ($HTML_CATALOG, $XML_CATALOG) = ("sgml.soc", "xml.soc");

# Options passed to the parser's warnings() method for the validation pass
# ('xml' is added there when the document is XHTML).
our @SP_OPTS = qw(non-sgml-char-ref valid no-duplicate);
# Main script: fetch one document, decode it, detect whether it is XHTML,
# validate it, and print every collected message as plain text in the form
#   [line column severity]: text
#
# The URL to check is taken from the first command-line argument; without
# an argument the original demo URL is used.
my $url = @ARGV ? $ARGV[0] : "http://www.sun.com";

my $u = LWP::UserAgent->new;
my $p = SGML::Parser::OpenSP->new;
my $h = HTML::Doctype::Detector->new($p);
my $e = ErrorHandler->new($p);

# Fetch the document. Stop on a failed request rather than validating the
# body of an error response as if it were the requested document.
my $r = $u->get($url);
die "Cannot retrieve $url: " . $r->status_line . "\n"
    unless $r->is_success;

# Determine the character encoding and map it to the name Encode uses.
# Fail with a clear message when either step yields nothing usable.
my $name1 = HTML::Encoding::encoding_from_http_message($r);
die "Cannot determine the character encoding of $url\n"
    unless defined $name1 && length $name1;
my $name2 = I18N::Charset::enco_charset_name($name1);
die "No Encode encoding known for charset '$name1'\n"
    unless defined $name2 && length $name2;
my $text = Encode::decode($name2 => $r->content);

# XHTML detection: first parse, with the doctype detector as handler.
$p->handler($h);
$p->catalogs($HTML_CATALOG);
$p->parse_string($text);
my $is_xhtml = $h->is_xhtml;

# Validation: second parse, with the error-collecting handler and the
# catalog and warning options that match the detected document type.
$p->handler($e);
$p->catalogs($is_xhtml ? $XML_CATALOG : $HTML_CATALOG);
$p->warnings(@SP_OPTS, $is_xhtml ? 'xml' : ());
$p->parse_string($text);

# The errors list does not exist when nothing was reported, so fall back to
# an empty list explicitly.
foreach my $error (@{ $e->{errors} || [] })
{
    my $prim = $error->{primary_message};
    printf "[%4d %4d %s]: %s\n",
        $prim->{LineNumber},
        $prim->{ColumnNumber},
        $prim->{Severity},
        $prim->{Text};
}
This would currently print:
[ 89 17 E]: there is no attribute "LEFTMARGIN"
[ 89 31 E]: there is no attribute "TOPMARGIN"
[ 89 48 E]: there is no attribute "MARGINHEIGHT"
[ 89 64 E]: there is no attribute "MARGINWIDTH"
[ 99 32 E]: document type does not allow element "FORM" here
[ 100 3 E]: document type does not allow element "TR" here
[ 106 77 E]: there is no attribute "BORDER"
[ 667 73 E]: required attribute "TYPE" not specified
regards.
Received on Sunday, 7 November 2004 03:29:04 UTC