validator/httpd/cgi-bin check,1.380,1.381

Update of /sources/public/validator/httpd/cgi-bin
In directory hutz:/tmp/cvs-serv3462/httpd/cgi-bin

Modified Files:
	check 
Log Message:
Implement more sophisticated parsemode determination, and admit to setting
parse mode and not the document's "type". Inline MIME mapping into v.conf.


Index: check
===================================================================
RCS file: /sources/public/validator/httpd/cgi-bin/check,v
retrieving revision 1.380
retrieving revision 1.381
diff -u -d -r1.380 -r1.381
--- check	4 Feb 2005 16:50:13 -0000	1.380
+++ check	4 Feb 2005 18:40:59 -0000	1.381
@@ -82,11 +82,19 @@
 
 #
 # Output flags for error processing
-use constant O_SOURCE  =>  1; # 0000 0001
-use constant O_CHARSET =>  2; # 0000 0010
-use constant O_DOCTYPE =>  4; # 0000 0100
-use constant O_NONE    =>  8; # 0000 1000
+use constant O_SOURCE  => 1; # 0000 0001
+use constant O_CHARSET => 2; # 0000 0010
+use constant O_DOCTYPE => 4; # 0000 0100
+use constant O_NONE    => 8; # 0000 1000
 
+#
+# Parse mode tokens.
+use constant MODE_SGML => 1; # 0000 0001
+use constant MODE_XML  => 2; # 0000 0010
+use constant MODE_TBD  => 4; # 0000 0100, needs further info to decide.
+#@@FIXME: XML WF checking isn't implemented.
+use constant MODE_WF   => 8; # 0000 1000, only XML well-formed checking.
+#@@FIXME;
 
 #
 # Define global variables.
@@ -95,8 +103,8 @@
 
 
 #
-# Things inside BEGIN don't happen on every request in persistent
-# environments, such as mod_perl.  So let's do globals, eg. read config here.
+# Things inside BEGIN don't happen on every request in persistent environments
+# (such as mod_perl); so let's do the globals, eg. read config, here.
 BEGIN {
   # Launder data for -T; -AutoLaunder doesn't catch this one.
   if (exists $ENV{W3C_VALIDATOR_HOME}) {
@@ -183,6 +191,15 @@
   }
 
   #
+  # Change strings to internal constants in MIME type mapping.
+  for (keys %{$CFG->{MIME}}) {
+    if    ($CFG->{MIME}->{$_} eq 'SGML') {$CFG->{MIME}->{$_} = MODE_SGML}
+    elsif ($CFG->{MIME}->{$_} eq 'XML')  {$CFG->{MIME}->{$_} = MODE_XML}
+    elsif ($CFG->{MIME}->{$_} eq 'TBD')  {$CFG->{MIME}->{$_} = MODE_TBD}
+    else                                 {$CFG->{MIME}->{$_} = MODE_TBD};
+  }
+
+  #
   # Use IPC::Run on mod_perl if it's available, IPC::Open3 otherwise.
   $HAVE_IPC_RUN = 0;
   if ($ENV{MOD_PERL}) {
@@ -235,9 +252,12 @@
 delete $ENV{PATH};
 
 
+#@@DEBUG: Dump $CFG datastructure. Used only as a developer aid.
 #use Data::Dumper qw(Dumper);
 #print Dumper($CFG);
 #exit;
+#@@DEBUG;
+
 ###############################################################################
 #### Process CGI variables and initialize. ####################################
 ###############################################################################
@@ -280,7 +300,7 @@
 
 #
 # Misc simple types.
-$File->{Type} = '';
+$File->{Mode} = MODE_SGML; # Default parse mode is SGML.
 
 #
 # Array (ref) used to store character offsets for the XML report.
@@ -450,7 +470,7 @@
 #
 # Try to extract META charset
 # (works only if ascii-based and reasonably clean before <meta>)
-$File = &preparse_meta($File);
+$File = &preparse_meta($File); # First call. Repeated later to fetch the FPI.
 unless ($File->{Charset}->{Use}) {
   $File->{Charset}->{Use} = $File->{Charset}->{META};
 }
@@ -619,13 +639,58 @@
 # Try to extract a DOCTYPE or xmlns.
 $File = &preparse_doctype($File);
 
-
 #
-# Set document type to XHTML if the DOCTYPE was for XHTML.
-# Set document type to MathML if the DOCTYPE was for MathML.
-# This happens when the file is served as text/html
-$File->{Type} = 'xhtml+xml'  if $File->{DOCTYPE} =~ /xhtml/i;
-$File->{Type} = 'mathml+xml' if $File->{DOCTYPE} =~ /mathml/i;
+# Set parse mode.
+if ($File->{DOCTYPE}) {
+  my $fpi = $File->{DOCTYPE};
+  if (exists $CFG->{Types}->{$fpi}) {
+    my $cfg  = $CFG->{Types}->{$fpi};
+    my $mode = $cfg->{'Parse Mode'};
+    if    ($mode eq 'SGML') {$mode = MODE_SGML}
+    elsif ($mode eq 'XML')  {$mode = MODE_XML}
+    else                    {$mode = MODE_TBD}
+&add_warning($File, 'debug', "Mode from DTD was $mode");
+    if ($File->{Mode} == MODE_TBD) {
+      if    ($mode == MODE_SGML) {$File->{Mode} = MODE_SGML}
+      elsif ($mode == MODE_XML)  {$File->{Mode} = MODE_XML}
+      else {
+        $File->{Mode} = MODE_SGML;
+        &add_warning($File, 'Unable to Determine Parse Mode!',
+                     'Falling back to SGML mode.');
+      }
+    } else {
+      unless ($mode == $File->{Mode}) {
+        my $dtd = $mode;
+        my $ct  = $File->{Mode};
+        for ($dtd, $ct) {
+          if    ($_ == MODE_SGML) {$_ = 'SGML'}
+          elsif ($_ == MODE_XML)  {$_ =  'XML'}
+          else                    {$_ = 'SGML'};
+        }
+        unless ($File->{Mode} == MODE_TBD) {
+          &add_warning($File, 'warning', 'Contradictory Parse Modes Detected!', <<".EOF.");
+        The MIME Media Type (<code>$File->{ContentType}</code>)
+        indicated parse mode should be $ct, but the <code>DOCTYPE</code>
+        Declaration indicates $dtd mode.
+        Using $ct mode based on <code>Content-Type</code> header.
+.EOF.
+        }
+      }
+    }
+  } else {
+    if ($File->{Mode} == MODE_TBD) {
+      &add_warning($File, 'warning', 'Unknown Document Type and Parse Mode!', <<".EOF.");
+    The MIME Media Type (<code>$File->{ContentType}</code>) for
+    this document is used to serve both SGML and XML based documents, and
+    no <code>DOCTYPE</code> Declaration was found to disambiguate it.
+    Parsing will continue in SGML mode and with a fallback <code>DOCTYPE</code>
+    similar to HTML 4.01 Transitional.
+.EOF.
+    }
+  }
+}
+&add_warning($File, 'debug', "Final mode is $File->{Mode}");
+
 
 
 #
@@ -707,7 +772,7 @@
             ],
            );
 
-  #FIXME: This needs a UI and testing!
+  #@@FIXME: This needs a UI and testing!
   #
   # Set onsgmls' -E switch to the number of errors requested.
   if ($File->{Opt}->{'Max Errors'} =~ m(^all$)i) {
@@ -723,7 +788,7 @@
   } else {
     push @cmd, '-E' . ($CFG->{'Max Errors'} || 0); # "-E0" means "all".
   }
-  #FIXME;
+  #@@FIXME;
 
   #
   # Temporary filehandles.
@@ -846,21 +911,35 @@
   $File->{Version} = $prettyver;
 }
 
+
 #
-# Warn about unknown Namespaces.
-if (&is_xml($File) and $File->{Namespace}) {
+# Warn about unknown, incorrect, or missing Namespaces.
+if ($File->{Namespace}) {
+  my $ns  = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE;
   my $rns = &ent($File->{Namespace});
-  if (&is_xhtml($File) and $File->{Namespace} ne 'http://www.w3.org/1999/xhtml') {
-    &add_warning($File, 'warning', 'Warning:',
-      "Unknown namespace (&#171;<code>$rns</code>&#187;) for text/html document!",
+
+  if (&is_xml($File)) {
+    if ($ns eq $File->{Namespace}) {
+      &add_warning($File, 'warning', 'Unknown Namespace Found',
+        "Unknown namespace (&#171;<code>$rns</code>&#187;) for $File->{Version} document!",
+      );
+    }
+  } else {
+    &add_warning($File, 'warning', 'Namespace Found in non-XML Document',
+      "Namespace &#171;<code>$rns</code>&#187; found, but document type is not XML!",
     );
-  } elsif (&is_svg($File) and $File->{Namespace} ne 'http://www.w3.org/2000/svg') {
-    &add_warning($File, 'warning', 'Warning:',
-      "Unknown namespace (&#171;<code>$rns</code>&#187;) for SVG document!",
+  }
+} else {
+  if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) {
+    &add_warning($File, 'warning', 'No Namespace Found',
+      "No Namespace was found, but document type requires one to be present!",
     );
   }
 }
 
+
+#
+#@@FIXME: This logic belongs in the templates.
 if (defined $File->{Tentative}) {
   my $class = '';
      $class .= ($File->{Tentative} & T_INFO  ? ' info'    :'');
@@ -886,6 +965,7 @@
 .EOF.
   }
 }
+#@@FIXME;
 
 if ($File->{Opt}->{Output} eq 'xml') {
   &report_xml($File);
@@ -1244,7 +1324,7 @@
     return $File;
   }
 
-  my($type, $ct, $charset)
+  my($mode, $ct, $charset)
     = &parse_content_type(
                           $File,
                           $res->header('Content-Type'),
@@ -1260,7 +1340,7 @@
     $res->decoded_content(charset => 'none') : $res->content;
 
   $File->{Bytes}           = $content;
-  $File->{Type}            = $type;
+  $File->{Mode}            = $mode;
   $File->{ContentType}     = $ct;
   $File->{ContentEnc}      = $res->content_encoding;
   $File->{TransferEnc}     = $res->header('Client-Transfer-Encoding');
@@ -1270,7 +1350,7 @@
   $File->{Size}            = scalar $res->content_length;
   $File->{URI}             = scalar $res->request->uri->canonical;
   $File->{'Is Upload'}     = FALSE;
-
+&add_warning($File, 'debug', "Mode from CT was: $mode");
   return $File;
 
 }
@@ -1288,10 +1368,10 @@
   local $/ = undef; # set line delimiter so that <> reads rest of file
   $file = <$f>;
 
-  my($type, $ct, $charset) = &parse_content_type($File, $h->{'Content-Type'});
+  my($mode, $ct, $charset) = &parse_content_type($File, $h->{'Content-Type'});
 
   $File->{Bytes}           = $file;
-  $File->{Type}            = $type;
+  $File->{Mode}            = $mode;
   $File->{ContentType}     = $ct;
   $File->{Charset}->{HTTP} = lc $charset;
   $File->{Modified}        = $h->{'Last-Modified'};
@@ -1312,7 +1392,7 @@
   my $File = shift; # The master datastructure.
 
   $File->{Bytes}       = $q->param('fragment');
-  $File->{Type}        = 'html';
+  $File->{Mode}        = MODE_TBD;
   $File->{Modified}    = '';
   $File->{Server}      = '';
   $File->{Size}        = '';
@@ -1330,11 +1410,11 @@
   my $Content_Type = shift;
   my $url          = shift;
   my $charset      = '';
-  my $type         = '';
+  my $mode         = '';
 
   my($ct, @param) = split /\s*;\s*/, lc $Content_Type;
 
-  $type = $CFG->{File_Type}->{$ct} || $ct;
+  $mode = $CFG->{MIME}->{$ct} || MODE_TBD;
 
   foreach my $param (@param) {
     my($p, $v) = split /\s*=\s*/, $param;
@@ -1345,7 +1425,8 @@
     }
   }
 
-  if ($type =~ m(/)) {
+  if ($mode =~ m(/)) {
+    my $type = $mode; # $mode is the MIME media type.
     if ($type =~ m(text/css) and defined $url) {
       print redirect
         'http://jigsaw.w3.org/css-validator/validator?uri='
@@ -1382,7 +1463,7 @@
     }
   }
 
-  return $type, $ct, $charset;
+  return $mode, $ct, $charset;
 }
 
 
@@ -2217,14 +2298,8 @@
 
 
 #
-# Utility subs to tell if type "is" something.
-sub is_xml    {shift->{Type} =~ m(^[^+]+\+xml$)};
-sub is_svg    {shift->{Type} =~ m(svg\+xml$)};
-sub is_smil   {shift->{Type} =~ m(smil\+xml$)};
-sub is_html   {shift->{Type} =~ m(html\+sgml$)};
-sub is_xhtml  {shift->{Type} =~ m(xhtml\+xml$)};
-sub is_mathml {shift->{Type} =~ m(mathml\+xml$)};
-
+# Utility sub to tell if mode "is" XML.
+sub is_xml {shift->{Mode} == MODE_XML};
 
 #
 # Do an initial parse of the Document Entity to extract charset from HTML <meta>.
@@ -2914,8 +2989,13 @@
   unless ($File->{Charset}->{Use}) {
     $File->{Charset}->{Use} = $File->{Charset}->{META};
   }
-  $File->{Type} = 'xhtml+xml'  if $File->{DOCTYPE} =~ /xhtml/i;
-  $File->{Type} = 'mathml+xml' if $File->{DOCTYPE} =~ /mathml/i;
+
+  #
+  # @@FIXME: This needs updating for new MIME->Mode config.
+#  $File->{Mode} = MODE_XML if $File->{DOCTYPE} =~ /xhtml/i;
+#  $File->{Mode} = MODE_XML if $File->{DOCTYPE} =~ /mathml/i;
+  # @@FIXME;
+
   $File = &main::parse($File);
   if ($File->{'Is Valid'}) {
     return $File->{ESIS};

Received on Friday, 4 February 2005 18:41:02 UTC