- From: Bjoern Hoehrmann <derhoermi@gmx.net>
- Date: Sun, 07 Nov 2004 04:28:15 +0100
- To: public-qa-dev@w3.org
Hi,

Except for SGML::Parser::OpenSP 0.99 the following script is able to
validate HTML/XHTML documents using only CPAN modules; except for some
features like showing an outline of the document and complaining about
encoding errors and missing doctypes this should be as good as the
current Markup Validator. Of course, output is plain text only and it
lacks error checking, etc

#!perl -w
#
# Validate an HTML/XHTML document with SGML::Parser::OpenSP and print one
# line per reported message.
#
# Usage: perl script.pl [URL]
#   URL defaults to http://www.sun.com when no argument is given.

use strict;
use warnings;

# Set in a BEGIN block so the values are in place before the modules
# below are loaded. NOTE(review): presumably these make OpenSP treat the
# input handed to parse_string as UTF-8 -- confirm against the OpenSP
# documentation.
BEGIN {
    $ENV{SP_CHARSET_FIXED} = 1;
    $ENV{SP_ENCODING}      = "UTF-8";
    $ENV{SP_BCTF}          = "UTF-8";
}

# ErrorHandler->new($parser)
#   Creates a handler object that remembers the parser (needed later for
#   split_message) and starts with an empty list of collected errors.
sub ErrorHandler::new {
    my ($class, $parser) = @_;
    return bless { p => $parser, errors => [] }, $class;
}

# $handler->error($event)
#   Called for each error event; stores the event after passing it
#   through the parser's split_message method.
sub ErrorHandler::error {
    my ($self, $event) = @_;
    push @{ $self->{errors} }, $self->{p}->split_message($event);
}

use SGML::Parser::OpenSP 0.99 qw();
use HTML::Encoding            qw();
use HTML::Doctype             qw();
use LWP::UserAgent            qw();
use Encode                    qw();
use I18N::Charset             qw();

our $HTML_CATALOG = "sgml.soc";
our $XML_CATALOG  = "xml.soc";
our @SP_OPTS      = qw/ non-sgml-char-ref valid no-duplicate /;

# Document to validate: first command line argument, or the original
# sample URL when none is given.
my $url = @ARGV ? $ARGV[0] : "http://www.sun.com";

my $u = LWP::UserAgent->new;
my $p = SGML::Parser::OpenSP->new;
my $h = HTML::Doctype::Detector->new($p);
my $e = ErrorHandler->new($p);

# Fetch the document; stop early instead of validating an error page or
# an empty body.
my $r = $u->get($url);
die "Cannot retrieve $url: " . $r->status_line . "\n"
    unless $r->is_success;

# Map the declared encoding to a name Encode understands, then decode
# the raw body into a Perl character string.
my $name1 = HTML::Encoding::encoding_from_http_message($r);
die "Cannot determine the character encoding of $url\n"
    unless defined $name1;

my $name2 = I18N::Charset::enco_charset_name($name1);
die "No Encode support for character encoding '$name1'\n"
    unless defined $name2;

my $text = Encode::decode($name2 => $r->content);

# XHTML detection
# First pass with the doctype detector as handler; its result selects
# the catalog and warning options for the validation pass.
$p->handler($h);
$p->catalogs($HTML_CATALOG);
$p->parse_string($text);

my $is_xhtml = $h->is_xhtml;

# Validation
# Second pass with the collecting error handler.
$p->handler($e);
$p->catalogs($is_xhtml ? $XML_CATALOG : $HTML_CATALOG);
$p->warnings(@SP_OPTS, $is_xhtml ? 'xml' : ());
$p->parse_string($text);

# Report: line, column, severity and text of each primary message.
foreach my $error (@{ $e->{errors} }) {
    my $prim = $error->{primary_message};
    printf "[%4d %4d %s]: %s\n",
        $prim->{LineNumber},
        $prim->{ColumnNumber},
        $prim->{Severity},
        $prim->{Text};
}

This would currently print

  [  89   17 E]: there is no attribute "LEFTMARGIN"
  [  89   31 E]: there is no attribute "TOPMARGIN"
  [  89   48 E]: there is no attribute "MARGINHEIGHT"
  [  89   64 E]: there is no attribute "MARGINWIDTH"
  [  99   32 E]: document type does not allow element "FORM" here
  [ 100    3 E]: document type does not allow element "TR" here
  [ 106   77 E]: there is no attribute "BORDER"
  [ 667   73 E]: required attribute "TYPE" not specified

regards.
Received on Sunday, 7 November 2004 03:29:04 UTC