- From: Terje Bless <link@dev.w3.org>
- Date: Fri, 04 Feb 2005 18:41:01 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/validator/httpd/cgi-bin In directory hutz:/tmp/cvs-serv3462/httpd/cgi-bin Modified Files: check Log Message: Implement more sophisticated parsemode determination, and admit to setting parse mode and not the document's "type". Inline MIME mapping into v.conf. Index: check =================================================================== RCS file: /sources/public/validator/httpd/cgi-bin/check,v retrieving revision 1.380 retrieving revision 1.381 diff -u -d -r1.380 -r1.381 --- check 4 Feb 2005 16:50:13 -0000 1.380 +++ check 4 Feb 2005 18:40:59 -0000 1.381 @@ -82,11 +82,19 @@ # # Output flags for error processing -use constant O_SOURCE => 1; # 0000 0001 -use constant O_CHARSET => 2; # 0000 0010 -use constant O_DOCTYPE => 4; # 0000 0100 -use constant O_NONE => 8; # 0000 1000 +use constant O_SOURCE => 1; # 0000 0001 +use constant O_CHARSET => 2; # 0000 0010 +use constant O_DOCTYPE => 4; # 0000 0100 +use constant O_NONE => 8; # 0000 1000 +# +# Parse mode tokens. +use constant MODE_SGML => 1; # 0000 0001 +use constant MODE_XML => 2; # 0000 0010 +use constant MODE_TBD => 4; # 0000 0100, needs further info to decide. +#@@FIXME: XML WF checking isn't implemented. +use constant MODE_WF => 8; # 0000 1000, only XML well-formed checking. +#@@FIXME; # # Define global variables. @@ -95,8 +103,8 @@ # -# Things inside BEGIN don't happen on every request in persistent -# environments, such as mod_perl. So let's do globals, eg. read config here. +# Things inside BEGIN don't happen on every request in persistent environments +# (such as mod_perl); so let's do the globals, eg. read config, here. BEGIN { # Launder data for -T; -AutoLaunder doesn't catch this one. if (exists $ENV{W3C_VALIDATOR_HOME}) { @@ -183,6 +191,15 @@ } # + # Change strings to internal constants in MIME type mapping. + for (keys %{$CFG->{MIME}}) { + if ($CFG->{MIME}->{$_} eq 'SGML') {$CFG->{MIME}->{$_} = MODE_SGML} + elsif ($CFG->{MIME}->{$_} eq 'XML') {$CFG->{MIME}->{$_} = MODE_XML} + elsif ($CFG->{MIME}->{$_} eq 'TBD') {$CFG->{MIME}->{$_} = MODE_TBD} + else {$CFG->{MIME}->{$_} = MODE_TBD}; + } + + # # Use IPC::Run on mod_perl if it's available, IPC::Open3 otherwise. $HAVE_IPC_RUN = 0; if ($ENV{MOD_PERL}) { @@ -235,9 +252,12 @@ delete $ENV{PATH}; +#@@DEBUG: Dump $CFG datastructure. Used only as a developer aid. #use Data::Dumper qw(Dumper); #print Dumper($CFG); #exit; +#@@DEBUG; + ############################################################################### #### Process CGI variables and initialize. #################################### ############################################################################### @@ -280,7 +300,7 @@ # # Misc simple types. -$File->{Type} = ''; +$File->{Mode} = MODE_SGML; # Default parse mode is SGML. # # Array (ref) used to store character offsets for the XML report. @@ -450,7 +470,7 @@ # # Try to extract META charset # (works only if ascii-based and reasonably clean before <meta>) -$File = &preparse_meta($File); +$File = &preparse_meta($File); # First call. Repeated later to fetch the FPI. unless ($File->{Charset}->{Use}) { $File->{Charset}->{Use} = $File->{Charset}->{META}; } @@ -619,13 +639,58 @@ # Try to extract a DOCTYPE or xmlns. $File = &preparse_doctype($File); - # -# Set document type to XHTML if the DOCTYPE was for XHTML. -# Set document type to MathML if the DOCTYPE was for MathML. -# This happens when the file is served as text/html -$File->{Type} = 'xhtml+xml' if $File->{DOCTYPE} =~ /xhtml/i; -$File->{Type} = 'mathml+xml' if $File->{DOCTYPE} =~ /mathml/i; +# Set parse mode. +if ($File->{DOCTYPE}) { + my $fpi = $File->{DOCTYPE}; + if (exists $CFG->{Types}->{$fpi}) { + my $cfg = $CFG->{Types}->{$fpi}; + my $mode = $cfg->{'Parse Mode'}; + if ($mode eq 'SGML') {$mode = MODE_SGML} + elsif ($mode eq 'XML') {$mode = MODE_XML} + else {$mode = MODE_TBD} +&add_warning($File, 'debug', "Mode from DTD was $mode"); + if ($File->{Mode} == MODE_TBD) { + if ($mode == MODE_SGML) {$File->{Mode} = MODE_SGML} + elsif ($mode == MODE_XML) {$File->{Mode} = MODE_XML} + else { + $File->{Mode} = MODE_SGML; + &add_warning($File, 'Unable to Determine Parse Mode!', + 'Falling back to SGML mode.'); + } + } else { + unless ($mode == $File->{Mode}) { + my $dtd = $mode; + my $ct = $File->{Mode}; + for ($dtd, $ct) { + if ($_ == MODE_SGML) {$_ = 'SGML'} + elsif ($_ == MODE_XML) {$_ = 'XML'} + else {$_ = 'SGML'}; + } + unless ($File->{Mode} == MODE_TBD) { + &add_warning($File, 'warning', 'Contradictory Parse Modes Detected!', <<".EOF."); + The MIME Media Type (<code>$File->{ContentType}</code>) + indicated parse mode should be $ct, but the <code>DOCTYPE</code> + Declaration indicates $dtd mode. + Using $ct mode based on <code>Content-Type</code> header. +.EOF. + } + } + } + } else { + if ($File->{Mode} == MODE_TBD) { + &add_warning($File, 'warning', 'Unknown Document Type and Parse Mode!', <<".EOF."); + The MIME Media Type (<code>$File->{ContentType}</code>) for + this document is used to serve both SGML and XML based documents, and + no <code>DOCTYPE</code> Declaration was found to disambiguate it. + Parsing will continue in SGML mode and with a fallback <code>DOCTYPE</code> + similar to HTML 4.01 Transitional. +.EOF. + } + } +} +&add_warning($File, 'debug', "Final mode is $File->{Mode}"); + # @@ -707,7 +772,7 @@ ], ); - #FIXME: This needs a UI and testing! + #@@FIXME: This needs a UI and testing! # # Set onsgmls' -E switch to the number of errors requested. if ($File->{Opt}->{'Max Errors'} =~ m(^all$)i) { @@ -723,7 +788,7 @@ } else { push @cmd, '-E' . ($CFG->{'Max Errors'} || 0); # "-E0" means "all". } - #FIXME; + #@@FIXME; # # Temporary filehandles. @@ -846,21 +911,35 @@ $File->{Version} = $prettyver; } + # -# Warn about unknown Namespaces. -if (&is_xml($File) and $File->{Namespace}) { +# Warn about unknown, incorrect, or missing Namespaces. +if ($File->{Namespace}) { + my $ns = $CFG->{Types}->{$File->{Version}}->{Namespace} || FALSE; my $rns = &ent($File->{Namespace}); - if (&is_xhtml($File) and $File->{Namespace} ne 'http://www.w3.org/1999/xhtml') { - &add_warning($File, 'warning', 'Warning:', - "Unknown namespace («<code>$rns</code>») for text/html document!", + + if (&is_xml($File)) { + if ($ns eq $File->{Namespace}) { + &add_warning($File, 'warning', 'Unknown Namespace Found', + "Unknown namespace («<code>$rns</code>») for $File->{Version} document!", + ); + } + } else { + &add_warning($File, 'warning', 'Namespace Found in non-XML Document', + "Namespace «<code>$rns</code>» found, but document type is not XML!", ); - } elsif (&is_svg($File) and $File->{Namespace} ne 'http://www.w3.org/2000/svg') { - &add_warning($File, 'warning', 'Warning:', - "Unknown namespace («<code>$rns</code>») for SVG document!", + } +} else { + if (&is_xml($File) and $CFG->{Types}->{$File->{Version}}->{Namespace}) { + &add_warning($File, 'warning', 'No Namespace Found', + "No Namespace was found, but document type requires one to be present!", ); } } + +# +#@@FIXME: This logic belongs in the templates. if (defined $File->{Tentative}) { my $class = ''; $class .= ($File->{Tentative} & T_INFO ? ' info' :''); @@ -886,6 +965,7 @@ .EOF. } } +#@@FIXME; if ($File->{Opt}->{Output} eq 'xml') { &report_xml($File); @@ -1244,7 +1324,7 @@ return $File; } - my($type, $ct, $charset) + my($mode, $ct, $charset) = &parse_content_type( $File, $res->header('Content-Type'), @@ -1260,7 +1340,7 @@ $res->decoded_content(charset => 'none') : $res->content; $File->{Bytes} = $content; - $File->{Type} = $type; + $File->{Mode} = $mode; $File->{ContentType} = $ct; $File->{ContentEnc} = $res->content_encoding; $File->{TransferEnc} = $res->header('Client-Transfer-Encoding'); @@ -1270,7 +1350,7 @@ $File->{Size} = scalar $res->content_length; $File->{URI} = scalar $res->request->uri->canonical; $File->{'Is Upload'} = FALSE; - +&add_warning($File, 'debug', "Mode from CT was: $mode"); return $File; } @@ -1288,10 +1368,10 @@ local $/ = undef; # set line delimiter so that <> reads rest of file $file = <$f>; - my($type, $ct, $charset) = &parse_content_type($File, $h->{'Content-Type'}); + my($mode, $ct, $charset) = &parse_content_type($File, $h->{'Content-Type'}); $File->{Bytes} = $file; - $File->{Type} = $type; + $File->{Mode} = $mode; $File->{ContentType} = $ct; $File->{Charset}->{HTTP} = lc $charset; $File->{Modified} = $h->{'Last-Modified'}; @@ -1312,7 +1392,7 @@ my $File = shift; # The master datastructure. $File->{Bytes} = $q->param('fragment'); - $File->{Type} = 'html'; + $File->{Mode} = MODE_TBD; $File->{Modified} = ''; $File->{Server} = ''; $File->{Size} = ''; @@ -1330,11 +1410,11 @@ my $Content_Type = shift; my $url = shift; my $charset = ''; - my $type = ''; + my $mode = ''; my($ct, @param) = split /\s*;\s*/, lc $Content_Type; - $type = $CFG->{File_Type}->{$ct} || $ct; + $mode = $CFG->{MIME}->{$ct} || MODE_TBD; foreach my $param (@param) { my($p, $v) = split /\s*=\s*/, $param; @@ -1345,7 +1425,8 @@ } } - if ($type =~ m(/)) { + if ($mode =~ m(/)) { + my $type = $mode; # $mode is the MIME media type. if ($type =~ m(text/css) and defined $url) { print redirect 'http://jigsaw.w3.org/css-validator/validator?uri=' @@ -1382,7 +1463,7 @@ } } - return $type, $ct, $charset; + return $mode, $ct, $charset; } @@ -2217,14 +2298,8 @@ # -# Utility subs to tell if type "is" something. -sub is_xml {shift->{Type} =~ m(^[^+]+\+xml$)}; -sub is_svg {shift->{Type} =~ m(svg\+xml$)}; -sub is_smil {shift->{Type} =~ m(smil\+xml$)}; -sub is_html {shift->{Type} =~ m(html\+sgml$)}; -sub is_xhtml {shift->{Type} =~ m(xhtml\+xml$)}; -sub is_mathml {shift->{Type} =~ m(mathml\+xml$)}; - +# Utility sub to tell if mode "is" XML. +sub is_xml {shift->{Mode} == MODE_XML}; # # Do an initial parse of the Document Entity to extract charset from HTML <meta>. @@ -2914,8 +2989,13 @@ unless ($File->{Charset}->{Use}) { $File->{Charset}->{Use} = $File->{Charset}->{META}; } - $File->{Type} = 'xhtml+xml' if $File->{DOCTYPE} =~ /xhtml/i; - $File->{Type} = 'mathml+xml' if $File->{DOCTYPE} =~ /mathml/i; + + # + # @@FIXME: This needs updating for new MIME->Mode config. +# $File->{Mode} = MODE_XML if $File->{DOCTYPE} =~ /xhtml/i; +# $File->{Mode} = MODE_XML if $File->{DOCTYPE} =~ /mathml/i; + # @@FIXME; + $File = &main::parse($File); if ($File->{'Is Valid'}) { return $File->{ESIS};
Received on Friday, 4 February 2005 18:41:02 UTC