perl/modules/W3C/XHTML/HTMLCompatChecker/bin appCcheck.pl,1.40,1.41 from Olivier Thereaux via cvs-syncmail on 2008-06-12 (www-validator-cvs@w3.org from June 2008)

From: Olivier Thereaux via cvs-syncmail <cvsmail@w3.org>
Date: Thu, 12 Jun 2008 22:13:56 +0000
To: www-validator-cvs@w3.org
Message-Id: <E1K6v3M-0004OO-8d@lionel-hutz.w3.org>
Update of /sources/public/perl/modules/W3C/XHTML/HTMLCompatChecker/bin
In directory hutz:/tmp/cvs-serv16831/bin

Modified Files:
	appCcheck.pl 
Log Message:
modularization and refactoring

Index: appCcheck.pl
===================================================================
RCS file: /sources/public/perl/modules/W3C/XHTML/HTMLCompatChecker/bin/appCcheck.pl,v
retrieving revision 1.40
retrieving revision 1.41
diff -u -d -r1.40 -r1.41
--- appCcheck.pl	10 Oct 2006 08:10:06 -0000	1.40
+++ appCcheck.pl	12 Jun 2008 22:13:54 -0000	1.41
@@ -1,363 +1,87 @@
 #!/usr/bin/perl
-# Copyright (c) 2005-2006 the World Wide Web Consortium :
-#       Keio University,
-#       Institut National de Recherche en Informatique et Automatique,
-#       Massachusetts Institute of Technology.
-#
-# Written and maintained by the QA-dev tool development team at W3C: 
-# http://www.w3.org/QA/Tools/qa-dev/
+# Copyright (c) 2005-2008 W3C
+
+
 use strict;
 use warnings;
-use XML::Parser;
+
 use CGI qw(param);
 require LWP::UserAgent;
 use URI;
 use CGI::Carp 'fatalsToBrowser';
 use HTML::Template       2.6  qw();
+use W3C::XHTML::HTMLCompatChecker;
 
 # Define global constants
 use constant TRUE  => 1;
 use constant FALSE => 0;
 
 
-use constant APPC_FOUND_XML_DECL =>  1; # http://www.w3.org/TR/xhtml1/#C_1
-use constant APPC_FOUND_XML_PI   =>  2; # http://www.w3.org/TR/xhtml1/#C_1
-use constant APPC_MISSING_SPACE  =>  3; # http://www.w3.org/TR/xhtml1/#C_2
-use constant APPC_UNMINIMIZED    =>  4; # http://www.w3.org/TR/xhtml1/#C_2
-use constant APPC_MINIMIZED      =>  5; # http://www.w3.org/TR/xhtml1/#C_3
-use constant APPC_MANY_ISINDEX   =>  6; # http://www.w3.org/TR/xhtml1/#C_6
-use constant APPC_ONLY_LANG      =>  7; # http://www.w3.org/TR/xhtml1/#C_7
-use constant APPC_ONLY_XML_LANG  =>  8; # http://www.w3.org/TR/xhtml1/#C_7
-use constant APPC_APOS_IN_ATTR   =>  9; # http://www.w3.org/TR/xhtml1/#C_16
-use constant APPC_APOS_IN_ELEM   => 10; # http://www.w3.org/TR/xhtml1/#C_16
-
-use constant APPC_ERRO => 0; # @@
-use constant APPC_WARN => 1; # @@
-use constant APPC_INFO => 2; # @@
-use constant APPC_HINT => 3; # @@
-
-use constant SEVERITY_NAMES =>
-{
-    APPC_ERRO, "Error",
-    APPC_WARN, "Warning",
-    APPC_INFO, "Info",
-    APPC_HINT, "Hint",
-};
-
-use constant CRITERIA =>
-{
-    APPC_FOUND_XML_DECL,    [ 1, APPC_INFO, "XML declarations are problematic"                                              ],
-    APPC_FOUND_XML_PI,      [ 1, APPC_INFO, "XML processing instructions are problematic"                                   ],
-    APPC_MISSING_SPACE,     [ 2, APPC_ERRO, "<example/> shall be written as <example />"                                    ],
-    APPC_UNMINIMIZED,       [ 2, APPC_ERRO, "For empty elements you shall use <example />"                                  ],
-    APPC_MINIMIZED,         [ 3, APPC_ERRO, "For non-empty elements, you shall use <example></example>"                     ],
-    APPC_ONLY_LANG,         [ 7, APPC_ERRO, "<example lang='en'> shall be written as <example lang='en' xml:lang='en'>"     ],
-    APPC_ONLY_XML_LANG,     [ 7, APPC_ERRO, "<example xml:lang='en'> shall be written as <example lang='en' xml:lang='en'>" ],
-    APPC_MANY_ISINDEX,      [10, APPC_WARN, "Avoid more than one <isindex> element in the <head> element"                   ],
-    APPC_APOS_IN_ATTR,      [16, APPC_ERRO, "You must write &apos; as e.g. &#39; for legacy user agents"                    ],
-    APPC_APOS_IN_ELEM,      [16, APPC_ERRO, "You must write &apos; as e.g. &#39; for legacy user agents"                    ],
-};
-
-use constant GUIDELINE_TITLES =>
-{
-    1, "Processing Instructions and the XML Declaration",
-    2, "Empty Elements",
-    3, "Element Minimization and Empty Element Content",
-    6, "Isindex",
-    7, "The lang and xml:lang Attributes",
-    16, "The Named Character Reference &apos;",
-};
-
-use constant EMPTY_ELEMENTS => { map { $_ => 1 }
-qw/
-    base basefont link area hr img
-    meta param input isindex col br
-/ };
-
-# global variables...
-our $ISINDEX = 0;
-our $IS_RELEVANT_DOC = 1; # whether the checker is relevant to the doctype of the document being processed.
-our $IS_RELEVANT_CT = 1; # whether the checker is relevant to the media type of the document being processed.
-our $IS_WF = 1; # whether the document is at least well-formed XML
-our @MESSAGES;
-
-## Helper functions #######################################################
-sub is_empty_element   { EMPTY_ELEMENTS->{shift()} }
-sub is_isindex_element { shift eq "isindex" }
-sub is_inside_head     { shift->within_element("head") }
-
-sub report_problem
-{
-    my $exp = shift;
-    my $cod = shift;
-    my $loc = shift;
-
-    my $str = $exp->recognized_string;
-    my $lin = $exp->current_line;
-    my $col = $exp->current_column;
-    my $off = $exp->current_byte;
-    
-    # determine position after skipping $loc, e.g. if there is
-    #
-    #   <p lang     = "de"
-    #      xml:lang = "de"
-    #      class    = "a b c d e f g"
-    #      id       = "example"/>
-    #
-    # the error is the / and it would be unhelpful to point at
-    # the < as expat would do in this case.
-    
-    my $left = substr $str, 0, $loc;
-    my $lines = $left =~ y/\n//; # @@ does \n always work?
-    $left =~ s/^.*\n//s;         # @@ does \n always work?
-    my $chars = length $left;
-    
-    # set new positions
-    my $posy = $lin + $lines;                   # advance pointer
-    my $posx = $lines ? $chars : $col + $chars; # advance or replace
-    my $posxy = $off + $loc;                    # advance pointer
-
-    my $stext = SEVERITY_NAMES->{CRITERIA->{$cod}->[1]};
-    my $mtext = CRITERIA->{$cod}->[2];
-    my $cnum  = CRITERIA->{$cod}->[0];
-    my $gtitle = GUIDELINE_TITLES->{$cnum};
-
-	push @MESSAGES, {severity => $stext, line => $posy, column => $posx + 1, cnum => $cnum, message_text => $mtext, guideline_title => $gtitle}
-   
-}
-
-
-## Pre-Parsing routines ###################################################
-# make sure we are actually handling XHTML 1.0 documents served as text/html
-# some code taken from W3C Markup Validator Codebase
-
-sub parse_content_type {
-    my $Content_Type = shift;
-    my ($ct, @others) = split /\s*;\s*/, lc $Content_Type;
-    #print p($ct);
-    if ($ct ne "text/html") {
-            $IS_RELEVANT_CT = 0;
-    } 
-    return $ct;
-}
-
-
-## Handler for XML::Parser ################################################
-
-sub _start
-{
-    my $exp = shift;
-    my $ele = shift;
-    my %att = @_;
-    my $str = $exp->recognized_string;
-    my $lin = $exp->current_line;
-    my $col = $exp->current_column;
-    my $off = $exp->current_byte;
-    my $end = length($str) - 1;
-    
-    # check for multiple isindex elements
-    if (is_isindex_element($ele) and
-        is_inside_head($exp) and
-        $ISINDEX++)
-    {
-        report_problem($exp, APPC_MANY_ISINDEX, 0);
-    }
-    
-    if ($str =~ m|/>$|)
-    {
-        # check for preceding space in empty element tag
-        if ($str !~ m|[ \x0d\x0a\t]/>$|)
-        {
-            report_problem($exp, APPC_MISSING_SPACE, $end - 1);
-        }
-        
-        # check that empty element tags are used only for
-        # elements declared as EMPTY in the DTD
-        if (!is_empty_element($ele))
-        {
-            report_problem($exp, APPC_MINIMIZED, $end - 1);
-        }
-    }
-    
-    # check that elements declared as EMPTY use empty element tags
-    if (is_empty_element($ele))
-    {
-        if ($str !~ m|/>$|)
-        {
-            report_problem($exp, APPC_UNMINIMIZED, $end);
-        }
-    }
-    
-    # check for &apos; in attribute values
-    if ($str =~ m|&apos;|)
-    {
-        local $_ = $str;
-        my $len = 0;
-        
-        while(s/^(.*?)&apos;//)
-        {
-            $len += length $1;
-            report_problem($exp, APPC_APOS_IN_ATTR, $len);
-            
-        }
-    }
-    
-    # check for <p lang="de">...</p>
-    if (exists $att{'lang'} && not exists $att{'xml:lang'})
-    {
-        report_problem($exp, APPC_ONLY_LANG, $end);
-    }
-    
-    # check for <p xml:lang="de">...</p>
-    if (exists $att{'xml:lang'} && not exists $att{'lang'})
-    {
-        report_problem($exp, APPC_ONLY_XML_LANG, $end);
-    }
-}
-
-sub _char
-{
-    my $exp = shift;
-    my $txt = shift;
-    my $str = $exp->recognized_string;
-    my $lin = $exp->current_line;
-    my $col = $exp->current_column;
-    my $off = $exp->current_byte;
-    
-    # check for &apos; in parsed character data
-    if ($str =~ /&apos;/)
-    {
-        local $_ = $str;
-        my $len = 0;
-        
-        while(s/^(.*?)&apos;//)
-        {
-            $len += length $1;
-            report_problem($exp, APPC_APOS_IN_ELEM, $len);
-            
-        }
-    }
-}
-
-sub _proc
-{
-    # check for XML processing instructions
-    report_problem(shift, APPC_FOUND_XML_PI, 0);
-}
-
-sub _xmldecl
-{
-    # check for XML declaration
-    report_problem(shift, APPC_FOUND_XML_DECL, 0);
-}
-
-sub _doctype
-{
-    my $exp = shift;
-    my $doctypename = shift;
-    my $doctypesys = shift;
-    my $doctypepub = shift;
-    my $doctypeint = shift;
-    if (defined $doctypename) {
-            $IS_RELEVANT_DOC = 0 if ($doctypename ne "html");
-    }
-    if(defined $doctypesys) {
-            $_ = $doctypesys;
-            $IS_RELEVANT_DOC = 0 if (not /http:\/\/www.w3.org\/TR\/xhtml1\/DTD\/xhtml1-(strict|transitional|frameset).dtd/);
-    }
-    if (defined $doctypepub) {
-            $_ = $doctypepub;
-            $IS_RELEVANT_DOC = 0 if (not /-\/\/W3C\/\/DTD XHTML 1.0 (Strict|Transitional|Frameset)\/\/EN/);
-    }
-    if (defined $doctypeint) # there should be no internal subset
-    { 
-           $IS_RELEVANT_DOC = 0 if (length  $doctypeint);
-    }
-    $IS_RELEVANT_DOC = 0 if ((not defined $doctypesys) and (not defined $doctypepub)); # should not happen with XHTML 1.0
-}
-
 ## Output routines ########################################################
 sub prep_output {
     my $output_param = shift;
     my $html_output_template_text = CGI::header(). '
-<!DOCTYPE html
-	PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
 	 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
 <html xmlns="http://www.w3.org/1999/xhtml" lang="en-US" xml:lang="en-US">
 <head>
-<title>Experimental XHTML 1.0 Appendix C checker</title>
-<link rel="stylesheet" type="text/css" href="/unicorn/style/base.css" />
-<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+<title>W3C HTML Compatibility Checker for HTML</title>
+<style type="text/css" media="all">
+  @import "http://validator.w3.org/style/base.css";
+  p.submit_button { margin-top: 1em; }
+</style>
+
 </head>
 <body>
-<div id="banner"><h1 id="title"><a href="http://www.w3.org"><img src="http://www.w3.org/Icons/WWW/w3c_home.gif" alt="W3C" /></a> XHTML 1.0 Appendix C checker</h1></div>
-<p id="tagline">Checks XHTML 1.0 documents against HTML compatibility guidelines</p>
+   <div id="banner">
+    <h1 id="title">
+      <a href="http://www.w3.org/"><img alt="W3C" width="110" height="61" id="logo" src="http://validator.w3.org/images/w3c.png" /></a>
+			<a href="./"><span>HTML Compatibility Checker for HTML</span></a>
+      </h1>
+      <p id="tagline">Check XHTML documents against HTML compatibility guidelines</p>
+   </div>
 
-<form method="get" action="/appc/" enctype="multipart/form-data">
-<div><input name="uri" size="60" <TMPL_IF NAME="uri">value="<TMPL_VAR NAME="uri" ESCAPE="HTML">"</TMPL_IF> /><input type="submit" value="Check" />
+
+
+
+<div id="frontforms">
+<ul id="tabset_tabs">
+	<li class="selected"><a href="#validate-by-uri"><span>Check online</span> document</a></li>
+</ul>
+   <div id="fields">
+
+<fieldset id="validate-by-uri" class="tabset_content front">
+<form method="get" action="">
+<p>
+	<label title="Address of page to Validate" for="uri">Address:</label>
+     <input type="text" name="uri" id="uri" size="60" <TMPL_IF NAME="uri">value="<TMPL_VAR NAME="uri" ESCAPE="HTML">"</TMPL_IF>/>
+</p>
+<p class="submit_button"><input  title="Submit for validation" type="submit" value="Check" />
+</p>
+
+
+</fieldset>
+</form>
+</div>
 
 <dl>
 <TMPL_LOOP NAME="message_loop">
     <dt><strong><TMPL_VAR NAME="severity" ESCAPE="HTML"></strong> Line <TMPL_VAR NAME="line" ESCAPE="HTML"> column <TMPL_VAR NAME="column" ESCAPE="HTML"></dt>
     <dd><a href="http://www.w3.org/TR/xhtml1/#C_<TMPL_VAR NAME="cnum" ESCAPE="HTML">"><TMPL_VAR NAME="message_text" ESCAPE="HTML"></a></dd>
 </TMPL_LOOP>
-
 </dl>
 
 <TMPL_UNLESS NAME="uri"><p>Enter the URI of an XHTML 1.0 document which you would like to check against the 
 <a href="http://www.w3.org/TR/xhtml1/#guidelines">HTML Compatibility Guidelines</a> .</p></TMPL_UNLESS>
 
-<TMPL_IF NAME="uri">
-<TMPL_UNLESS NAME="is_wf">
-<p>The document appears to not be well-formed XML. Either it is not XHTML (e.g HTML up to version 4.01) or it has XML well-formedness errors. 
-HTML Compatibility guidelines checking does not apply.
+<TMPL_IF NAME="Abort">
+<p>The document was not checked against HTML Compatibility Guidelines.  
+Reason: <em><TMPL_VAR NAME="Abort_Message"></em>.
 </p>
-<TMPL_IF NAME="wf_errors">
-<p>WF errors: <TMPL_VAR NAME="wf_errors" ESCAPE="HTML"> </p>
-</TMPL_IF>
-</TMPL_UNLESS>
-<TMPL_UNLESS NAME="is_relevant_ct">
-<p>The document is not served with a <code>text/html</code> media type (<TMPL_VAR NAME="content_type" ESCAPE="HTML">). HTML Compatibility guidelines checking does not apply.
-</TMPL_UNLESS>
-<TMPL_UNLESS NAME="is_relevant_doctype">
-<p>The document does not appear to be XHTML 1.0. HTML Compatibility guidelines checking does not apply.<p>
-</TMPL_UNLESS>
 </TMPL_IF>
 
 </div>
-</form>
-
-	   <ul class="navbar"  id="menu">
-                <li><a href="" title="About this service">About</a> <span class="hideme">|</span></li>
-              </ul>
-              <p id="activity_logos">
-
-              <a href="http://www.w3.org/QA/" title="W3C\'s Quality Assurance Activity, bringing you free Web quality tools and more"><img src="http://www.w3.org/QA/2002/12/qa-small.png" alt="QA" /></a>
-           </p>
-           <p id="support_logo">
-
-        Support this tool, become a<br />
-        <a href="http://www.w3.org/Consortium/supporters"><img src="http://www.w3.org/Consortium/supporter-logos/csupporter.png" alt="W3C Supporter" /></a>
-           </p>
 
-            <p class="copyright">
-              <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> &copy; 1994-2006
-              <a href="http://www.w3.org/"><acronym title="World Wide Web Consortium">W3C</acronym></a>&reg;
-
-              (<a href="http://www.csail.mit.edu/"><acronym title="Massachusetts Institute of Technology">MIT</acronym></a>,
-              <a href="http://www.ercim.org/"><acronym title="European Research Consortium for Informatics and Mathematics">ERCIM</acronym></a>,
-              <a href="http://www.keio.ac.jp/">Keio</a>),
-              All Rights Reserved.
-              W3C <a href="http://www.w3.org/Consortium/Legal/ipr-notice#Legal_Disclaimer">liability</a>,
-              <a href="http://www.w3.org/Consortium/Legal/ipr-notice#W3C_Trademarks">trademark</a>,
-              <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/copyright-documents">document use</a>
-              and <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/copyright-software">software licensing</a>
-
-              rules apply. Your interactions with this site are in accordance
-              with our <a href="http://www.w3.org/Consortium/Legal/privacy-statement#Public">public</a> and
-              <a href="http://www.w3.org/Consortium/Legal/privacy-statement#Members">Member</a> privacy
-              statements.
-            </p>
 </body>
 </html>
 ';
@@ -470,77 +194,34 @@
 $output->param(warning_count => 0);                         
 $output->param(error_count => 0);
 
-if (defined $uri and length $uri and URI->new($uri)->scheme eq "http")
-{
-     $output->param(uri => $uri);
-     my $ua = LWP::UserAgent->new;
-     my $response = $ua->get($uri);
-     my $xml = undef;
-     my $ct = undef;
-     my @content_type_values = undef;
-     if ($response->is_success) {
-         $xml =  $response->content;
-         @content_type_values = $response->header('Content-Type');
-         $ct = $content_type_values[0];
-    }
-    if (defined $ct and length $ct) {
-        $ct = &parse_content_type($ct);
-        $output->param(content_type => $ct);
+my $compat_parser =  W3C::XHTML::HTMLCompatChecker->new();
+my @checker_messages = $compat_parser->check_uri($uri);
+if (exists $checker_messages[0]) {
+     if ($checker_messages[0]{"severity"} eq "Abort"){
+        if ($checker_messages[0]{"message_text"} != "Bad URI") { $output->param(uri => $uri);}
+        $output->param(Abort=>1);
+        $output->param(Abort_Message=>$checker_messages[0]{"message_text"});
     }
-    
-    if (defined $xml and length $xml)
-    {
-        my $p = XML::Parser->new;
-        $p->setHandlers(Doctype => \&_doctype);
-        
-        eval { $p->parsestring($xml); };
-        $output->param(is_relevant_ct => $IS_RELEVANT_CT);
-        $output->param(is_relevant_doctype => $IS_RELEVANT_DOC);
-
-        if ($@) # not well-formed
-        {
-                $IS_WF = 0;
-                my $wf_errors = join '', $@;
-                $output->param(info_count => 1);
-                $output->param(wf_errors => $wf_errors);      
-        }
-        elsif ((not $IS_RELEVANT_CT) or (not $IS_RELEVANT_DOC)) # not relevant to this checker
-        {
-            #pass 
-        }
-        else # woot, Well-formed, and relevant. Let's get to work.
+    else {
+        $output->param(uri => $uri);
+        $output->param(message_loop => \@checker_messages);
+        my (@ERRORS, @WARNINGS, @INFOS);
+        for (my $i=0; $i < scalar @checker_messages; $i++)
         {
-            my $p = XML::Parser->new;
-            $p->setHandlers(Char    => \&_char,
-                        Proc    => \&_proc,
-                        Start   => \&_start,
-                        XMLDecl => \&_xmldecl);       
-            eval { $p->parsestring($xml); };
-            my (@ERRORS, @WARNINGS, @INFOS);
-            for (my $i=0; $i < scalar @MESSAGES; $i++)
-    	    {
-    	            if ($MESSAGES[$i]{'severity'} eq "Error") {	push @ERRORS, $MESSAGES[$i];}
-                    elsif ($MESSAGES[$i]{'severity'} eq "Warning") {push @WARNINGS, $MESSAGES[$i];}
-                    else {push @INFOS, $MESSAGES[$i];}
-            }
-            $output->param(message_loop => \@MESSAGES);
-            if (@ERRORS) {$output->param(passed => 0) } else {$output->param(passed => 1)}
-            $output->param(info_loop => \@INFOS);
-            $output->param(info_count => scalar @INFOS);
-            $output->param(warning_loop => \@WARNINGS);
-            $output->param(warning_count => scalar @WARNINGS);
-            $output->param(error_loop => \@ERRORS);
-            $output->param(error_count => scalar @ERRORS);
+                if ($checker_messages[$i]{'severity'} eq "Error") {	push @ERRORS, $checker_messages[$i];}
+                elsif ($checker_messages[$i]{'severity'} eq "Warning") {push @WARNINGS, $checker_messages[$i];}
+                else {push @INFOS, $checker_messages[$i];}
         }
-        $output->param(is_wf => $IS_WF);        
+        if (@ERRORS) {$output->param(passed => 0) } else {$output->param(passed => 1)}
+        $output->param(info_loop => \@INFOS);
+        $output->param(info_count => scalar @INFOS);
+        $output->param(warning_loop => \@WARNINGS);
+        $output->param(warning_count => scalar @WARNINGS);
+        $output->param(error_loop => \@ERRORS);
+        $output->param(error_count => scalar @ERRORS);
         
     }
-    else # houston, we had a problem
-    {
-            # TODO some fault code
-    }
-
-}
+} else {$output->param(uri => $uri);} 
 
 print $output->output();
Received on Thursday, 12 June 2008 22:14:30 UTC