- From: Olivier Thereaux via cvs-syncmail <cvsmail@w3.org>
- Date: Thu, 12 Jun 2008 22:13:56 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/XHTML/HTMLCompatChecker/bin In directory hutz:/tmp/cvs-serv16831/bin Modified Files: appCcheck.pl Log Message: modularization and refactoring Index: appCcheck.pl =================================================================== RCS file: /sources/public/perl/modules/W3C/XHTML/HTMLCompatChecker/bin/appCcheck.pl,v retrieving revision 1.40 retrieving revision 1.41 diff -u -d -r1.40 -r1.41 --- appCcheck.pl 10 Oct 2006 08:10:06 -0000 1.40 +++ appCcheck.pl 12 Jun 2008 22:13:54 -0000 1.41 @@ -1,363 +1,87 @@ #!/usr/bin/perl -# Copyright (c) 2005-2006 the World Wide Web Consortium : -# Keio University, -# Institut National de Recherche en Informatique et Automatique, -# Massachusetts Institute of Technology. -# -# Written and maintained by the QA-dev tool development team at W3C: -# http://www.w3.org/QA/Tools/qa-dev/ +# Copyright (c) 2005-2008 W3C + + use strict; use warnings; -use XML::Parser; + use CGI qw(param); require LWP::UserAgent; use URI; use CGI::Carp 'fatalsToBrowser'; use HTML::Template 2.6 qw(); +use W3C::XHTML::HTMLCompatChecker; # Define global constants use constant TRUE => 1; use constant FALSE => 0; -use constant APPC_FOUND_XML_DECL => 1; # http://www.w3.org/TR/xhtml1/#C_1 -use constant APPC_FOUND_XML_PI => 2; # http://www.w3.org/TR/xhtml1/#C_1 -use constant APPC_MISSING_SPACE => 3; # http://www.w3.org/TR/xhtml1/#C_2 -use constant APPC_UNMINIMIZED => 4; # http://www.w3.org/TR/xhtml1/#C_2 -use constant APPC_MINIMIZED => 5; # http://www.w3.org/TR/xhtml1/#C_3 -use constant APPC_MANY_ISINDEX => 6; # http://www.w3.org/TR/xhtml1/#C_6 -use constant APPC_ONLY_LANG => 7; # http://www.w3.org/TR/xhtml1/#C_7 -use constant APPC_ONLY_XML_LANG => 8; # http://www.w3.org/TR/xhtml1/#C_7 -use constant APPC_APOS_IN_ATTR => 9; # http://www.w3.org/TR/xhtml1/#C_16 -use constant APPC_APOS_IN_ELEM => 10; # http://www.w3.org/TR/xhtml1/#C_16 - -use constant APPC_ERRO => 0; # @@ -use constant APPC_WARN => 1; # @@ -use constant APPC_INFO => 2; # @@ -use constant APPC_HINT => 3; # @@ - -use constant SEVERITY_NAMES => -{ - APPC_ERRO, "Error", - APPC_WARN, "Warning", - APPC_INFO, "Info", - APPC_HINT, "Hint", -}; - -use constant CRITERIA => -{ - APPC_FOUND_XML_DECL, [ 1, APPC_INFO, "XML declarations are problematic" ], - APPC_FOUND_XML_PI, [ 1, APPC_INFO, "XML processing instructions are problematic" ], - APPC_MISSING_SPACE, [ 2, APPC_ERRO, "<example/> shall be written as <example />" ], - APPC_UNMINIMIZED, [ 2, APPC_ERRO, "For empty elements you shall use <example />" ], - APPC_MINIMIZED, [ 3, APPC_ERRO, "For non-empty elements, you shall use <example></example>" ], - APPC_ONLY_LANG, [ 7, APPC_ERRO, "<example lang='en'> shall be written as <example lang='en' xml:lang='en'>" ], - APPC_ONLY_XML_LANG, [ 7, APPC_ERRO, "<example xml:lang='en'> shall be written as <example lang='en' xml:lang='en'>" ], - APPC_MANY_ISINDEX, [10, APPC_WARN, "Avoid more than one <isindex> element in the <head> element" ], - APPC_APOS_IN_ATTR, [16, APPC_ERRO, "You must write ' as e.g. ' for legacy user agents" ], - APPC_APOS_IN_ELEM, [16, APPC_ERRO, "You must write ' as e.g. ' for legacy user agents" ], -}; - -use constant GUIDELINE_TITLES => -{ - 1, "Processing Instructions and the XML Declaration", - 2, "Empty Elements", - 3, "Element Minimization and Empty Element Content", - 6, "Isindex", - 7, "The lang and xml:lang Attributes", - 16, "The Named Character Reference '", -}; - -use constant EMPTY_ELEMENTS => { map { $_ => 1 } -qw/ - base basefont link area hr img - meta param input isindex col br -/ }; - -# global variables... -our $ISINDEX = 0; -our $IS_RELEVANT_DOC = 1; # whether the checker is relevant to the doctype of the document being processed. -our $IS_RELEVANT_CT = 1; # whether the checker is relevant to the media type of the document being processed. -our $IS_WF = 1; # whether the document is at least well-formed XML -our @MESSAGES; - -## Helper functions ####################################################### -sub is_empty_element { EMPTY_ELEMENTS->{shift()} } -sub is_isindex_element { shift eq "isindex" } -sub is_inside_head { shift->within_element("head") } - -sub report_problem -{ - my $exp = shift; - my $cod = shift; - my $loc = shift; - - my $str = $exp->recognized_string; - my $lin = $exp->current_line; - my $col = $exp->current_column; - my $off = $exp->current_byte; - - # determine position after skipping $loc, e.g. if there is - # - # <p lang = "de" - # xml:lang = "de" - # class = "a b c d e f g" - # id = "example"/> - # - # the error is the / and it would be unhelpful to point at - # the < as expat would do in this case. - - my $left = substr $str, 0, $loc; - my $lines = $left =~ y/\n//; # @@ does \n always work? - $left =~ s/^.*\n//s; # @@ does \n always work? - my $chars = length $left; - - # set new positions - my $posy = $lin + $lines; # advance pointer - my $posx = $lines ? $chars : $col + $chars; # advance or replace - my $posxy = $off + $loc; # advance pointer - - my $stext = SEVERITY_NAMES->{CRITERIA->{$cod}->[1]}; - my $mtext = CRITERIA->{$cod}->[2]; - my $cnum = CRITERIA->{$cod}->[0]; - my $gtitle = GUIDELINE_TITLES->{$cnum}; - - push @MESSAGES, {severity => $stext, line => $posy, column => $posx + 1, cnum => $cnum, message_text => $mtext, guideline_title => $gtitle} - -} - - -## Pre-Parsing routines ################################################### -# make sure we are actually handling XHTML 1.0 documents served as text/html -# some code taken from W3C Markup Validator Codebase - -sub parse_content_type { - my $Content_Type = shift; - my ($ct, @others) = split /\s*;\s*/, lc $Content_Type; - #print p($ct); - if ($ct ne "text/html") { - $IS_RELEVANT_CT = 0; - } - return $ct; -} - - -## Handler for XML::Parser ################################################ - -sub _start -{ - my $exp = shift; - my $ele = shift; - my %att = @_; - my $str = $exp->recognized_string; - my $lin = $exp->current_line; - my $col = $exp->current_column; - my $off = $exp->current_byte; - my $end = length($str) - 1; - - # check for multiple isindex elements - if (is_isindex_element($ele) and - is_inside_head($exp) and - $ISINDEX++) - { - report_problem($exp, APPC_MANY_ISINDEX, 0); - } - - if ($str =~ m|/>$|) - { - # check for preceding space in empty element tag - if ($str !~ m|[ \x0d\x0a\t]/>$|) - { - report_problem($exp, APPC_MISSING_SPACE, $end - 1); - } - - # check that empty element tags are used only for - # elements declared as EMPTY in the DTD - if (!is_empty_element($ele)) - { - report_problem($exp, APPC_MINIMIZED, $end - 1); - } - } - - # check that elements declared as EMPTY use empty element tags - if (is_empty_element($ele)) - { - if ($str !~ m|/>$|) - { - report_problem($exp, APPC_UNMINIMIZED, $end); - } - } - - # check for ' in attribute values - if ($str =~ m|'|) - { - local $_ = $str; - my $len = 0; - - while(s/^(.*?)'//) - { - $len += length $1; - report_problem($exp, APPC_APOS_IN_ATTR, $len); - - } - } - - # check for <p lang="de">...</p> - if (exists $att{'lang'} && not exists $att{'xml:lang'}) - { - report_problem($exp, APPC_ONLY_LANG, $end); - } - - # check for <p xml:lang="de">...</p> - if (exists $att{'xml:lang'} && not exists $att{'lang'}) - { - report_problem($exp, APPC_ONLY_XML_LANG, $end); - } -} - -sub _char -{ - my $exp = shift; - my $txt = shift; - my $str = $exp->recognized_string; - my $lin = $exp->current_line; - my $col = $exp->current_column; - my $off = $exp->current_byte; - - # check for ' in parsed character data - if ($str =~ /'/) - { - local $_ = $str; - my $len = 0; - - while(s/^(.*?)'//) - { - $len += length $1; - report_problem($exp, APPC_APOS_IN_ELEM, $len); - - } - } -} - -sub _proc -{ - # check for XML processing instructions - report_problem(shift, APPC_FOUND_XML_PI, 0); -} - -sub _xmldecl -{ - # check for XML declaration - report_problem(shift, APPC_FOUND_XML_DECL, 0); -} - -sub _doctype -{ - my $exp = shift; - my $doctypename = shift; - my $doctypesys = shift; - my $doctypepub = shift; - my $doctypeint = shift; - if (defined $doctypename) { - $IS_RELEVANT_DOC = 0 if ($doctypename ne "html"); - } - if(defined $doctypesys) { - $_ = $doctypesys; - $IS_RELEVANT_DOC = 0 if (not /http:\/\/www.w3.org\/TR\/xhtml1\/DTD\/xhtml1-(strict|transitional|frameset).dtd/); - } - if (defined $doctypepub) { - $_ = $doctypepub; - $IS_RELEVANT_DOC = 0 if (not /-\/\/W3C\/\/DTD XHTML 1.0 (Strict|Transitional|Frameset)\/\/EN/); - } - if (defined $doctypeint) # there should be no internal subset - { - $IS_RELEVANT_DOC = 0 if (length $doctypeint); - } - $IS_RELEVANT_DOC = 0 if ((not defined $doctypesys) and (not defined $doctypepub)); # should not happen with XHTML 1.0 -} - ## Output routines ######################################################## sub prep_output { my $output_param = shift; my $html_output_template_text = CGI::header(). ' -<!DOCTYPE html - PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" xml:lang="en-US"> <head> -<title>Experimental XHTML 1.0 Appendix C checker</title> -<link rel="stylesheet" type="text/css" href="/unicorn/style/base.css" /> -<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /> +<title>W3C HTML Compatibility Checker for HTML</title> +<style type="text/css" media="all"> + @import "http://validator.w3.org/style/base.css"; + p.submit_button { margin-top: 1em; } +</style> + </head> <body> -<div id="banner"><h1 id="title"><a href="http://www.w3.org"><img src="http://www.w3.org/Icons/WWW/w3c_home.gif" alt="W3C" /></a> XHTML 1.0 Appendix C checker</h1></div> -<p id="tagline">Checks XHTML 1.0 documents against HTML compatibility guidelines</p> + <div id="banner"> + <h1 id="title"> + <a href="http://www.w3.org/"><img alt="W3C" width="110" height="61" id="logo" src="http://validator.w3.org/images/w3c.png" /></a> + <a href="./"><span>HTML Compatibility Checker for HTML</span></a> + </h1> + <p id="tagline">Check XHTML documents against HTML compatibility guidelines</p> + </div> -<form method="get" action="/appc/" enctype="multipart/form-data"> -<div><input name="uri" size="60" <TMPL_IF NAME="uri">value="<TMPL_VAR NAME="uri" ESCAPE="HTML">"</TMPL_IF> /><input type="submit" value="Check" /> + + + +<div id="frontforms"> +<ul id="tabset_tabs"> + <li class="selected"><a href="#validate-by-uri"><span>Check online</span> document</a></li> +</ul> + <div id="fields"> + +<fieldset id="validate-by-uri" class="tabset_content front"> +<form method="get" action=""> +<p> + <label title="Address of page to Validate" for="uri">Address:</label> + <input type="text" name="uri" id="uri" size="60" <TMPL_IF NAME="uri">value="<TMPL_VAR NAME="uri" ESCAPE="HTML">"</TMPL_IF>/> +</p> +<p class="submit_button"><input title="Submit for validation" type="submit" value="Check" /> +</p> + + +</fieldset> +</form> +</div> <dl> <TMPL_LOOP NAME="message_loop"> <dt><strong><TMPL_VAR NAME="severity" ESCAPE="HTML"></strong> Line <TMPL_VAR NAME="line" ESCAPE="HTML"> column <TMPL_VAR NAME="column" ESCAPE="HTML"></dt> <dd><a href="http://www.w3.org/TR/xhtml1/#C_<TMPL_VAR NAME="cnum" ESCAPE="HTML">"><TMPL_VAR NAME="message_text" ESCAPE="HTML"></a></dd> </TMPL_LOOP> - </dl> <TMPL_UNLESS NAME="uri"><p>Enter the URI of an XHTML 1.0 document which you would like to check against the <a href="http://www.w3.org/TR/xhtml1/#guidelines">HTML Compatibility Guidelines</a> .</p></TMPL_UNLESS> -<TMPL_IF NAME="uri"> -<TMPL_UNLESS NAME="is_wf"> -<p>The document appears to not be well-formed XML. Either it is not XHTML (e.g HTML up to version 4.01) or it has XML well-formedness errors. -HTML Compatibility guidelines checking does not apply. +<TMPL_IF NAME="Abort"> +<p>The document was not checked against HTML Compatibility Guidelines. +Reason: <em><TMPL_VAR NAME="Abort_Message"></em>. </p> -<TMPL_IF NAME="wf_errors"> -<p>WF errors: <TMPL_VAR NAME="wf_errors" ESCAPE="HTML"> </p> -</TMPL_IF> -</TMPL_UNLESS> -<TMPL_UNLESS NAME="is_relevant_ct"> -<p>The document is not served with a <code>text/html</code> media type (<TMPL_VAR NAME="content_type" ESCAPE="HTML">). HTML Compatibility guidelines checking does not apply. -</TMPL_UNLESS> -<TMPL_UNLESS NAME="is_relevant_doctype"> -<p>The document does not appear to be XHTML 1.0. HTML Compatibility guidelines checking does not apply.<p> -</TMPL_UNLESS> </TMPL_IF> </div> -</form> - - <ul class="navbar" id="menu"> - <li><a href="" title="About this service">About</a> <span class="hideme">|</span></li> - </ul> - <p id="activity_logos"> - - <a href="http://www.w3.org/QA/" title="W3C\'s Quality Assurance Activity, bringing you free Web quality tools and more"><img src="http://www.w3.org/QA/2002/12/qa-small.png" alt="QA" /></a> - </p> - <p id="support_logo"> - - Support this tool, become a<br /> - <a href="http://www.w3.org/Consortium/supporters"><img src="http://www.w3.org/Consortium/supporter-logos/csupporter.png" alt="W3C Supporter" /></a> - </p> - <p class="copyright"> - <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> © 1994-2006 - <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a>® - - (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>, - <a href="http://www.ercim.org/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>, - <a href="http://www.keio.ac.jp/">Keio</a>), - All Rights Reserved. - W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>, - <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a>, - <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a> - and <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/copyright-software">software licensing</a> - - rules apply. Your interactions with this site are in accordance - with our <a href="http://www.w3.org/Consortium/Legal/privacy-statement#Public">public</a> and - <a href="http://www.w3.org/Consortium/Legal/privacy-statement#Members">Member</a> privacy - statements. - </p> </body> </html> '; @@ -470,77 +194,34 @@ $output->param(warning_count => 0); $output->param(error_count => 0); -if (defined $uri and length $uri and URI->new($uri)->scheme eq "http") -{ - $output->param(uri => $uri); - my $ua = LWP::UserAgent->new; - my $response = $ua->get($uri); - my $xml = undef; - my $ct = undef; - my @content_type_values = undef; - if ($response->is_success) { - $xml = $response->content; - @content_type_values = $response->header('Content-Type'); - $ct = $content_type_values[0]; - } - if (defined $ct and length $ct) { - $ct = &parse_content_type($ct); - $output->param(content_type => $ct); +my $compat_parser = W3C::XHTML::HTMLCompatChecker->new(); +my @checker_messages = $compat_parser->check_uri($uri); +if (exists $checker_messages[0]) { + if ($checker_messages[0]{"severity"} eq "Abort"){ + if ($checker_messages[0]{"message_text"} != "Bad URI") { $output->param(uri => $uri);} + $output->param(Abort=>1); + $output->param(Abort_Message=>$checker_messages[0]{"message_text"}); } - - if (defined $xml and length $xml) - { - my $p = XML::Parser->new; - $p->setHandlers(Doctype => \&_doctype); - - eval { $p->parsestring($xml); }; - $output->param(is_relevant_ct => $IS_RELEVANT_CT); - $output->param(is_relevant_doctype => $IS_RELEVANT_DOC); - - if ($@) # not well-formed - { - $IS_WF = 0; - my $wf_errors = join '', $@; - $output->param(info_count => 1); - $output->param(wf_errors => $wf_errors); - } - elsif ((not $IS_RELEVANT_CT) or (not $IS_RELEVANT_DOC)) # not relevant to this checker - { - #pass - } - else # woot, Well-formed, and relevant. Let's get to work. + else { + $output->param(uri => $uri); + $output->param(message_loop => \@checker_messages); + my (@ERRORS, @WARNINGS, @INFOS); + for (my $i=0; $i < scalar @checker_messages; $i++) { - my $p = XML::Parser->new; - $p->setHandlers(Char => \&_char, - Proc => \&_proc, - Start => \&_start, - XMLDecl => \&_xmldecl); - eval { $p->parsestring($xml); }; - my (@ERRORS, @WARNINGS, @INFOS); - for (my $i=0; $i < scalar @MESSAGES; $i++) - { - if ($MESSAGES[$i]{'severity'} eq "Error") { push @ERRORS, $MESSAGES[$i];} - elsif ($MESSAGES[$i]{'severity'} eq "Warning") {push @WARNINGS, $MESSAGES[$i];} - else {push @INFOS, $MESSAGES[$i];} - } - $output->param(message_loop => \@MESSAGES); - if (@ERRORS) {$output->param(passed => 0) } else {$output->param(passed => 1)} - $output->param(info_loop => \@INFOS); - $output->param(info_count => scalar @INFOS); - $output->param(warning_loop => \@WARNINGS); - $output->param(warning_count => scalar @WARNINGS); - $output->param(error_loop => \@ERRORS); - $output->param(error_count => scalar @ERRORS); + if ($checker_messages[$i]{'severity'} eq "Error") { push @ERRORS, $checker_messages[$i];} + elsif ($checker_messages[$i]{'severity'} eq "Warning") {push @WARNINGS, $checker_messages[$i];} + else {push @INFOS, $checker_messages[$i];} } - $output->param(is_wf => $IS_WF); + if (@ERRORS) {$output->param(passed => 0) } else {$output->param(passed => 1)} + $output->param(info_loop => \@INFOS); + $output->param(info_count => scalar @INFOS); + $output->param(warning_loop => \@WARNINGS); + $output->param(warning_count => scalar @WARNINGS); + $output->param(error_loop => \@ERRORS); + $output->param(error_count => scalar @ERRORS); } - else # houston, we had a problem - { - # TODO some fault code - } - -} +} else {$output->param(uri => $uri);} print $output->output();
Received on Thursday, 12 June 2008 22:14:30 UTC