- From: Olivier Thereaux via cvs-syncmail <cvsmail@w3.org>
- Date: Thu, 28 Jun 2007 00:13:29 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/validator/httpd/cgi-bin In directory hutz:/tmp/cvs-serv11273/httpd/cgi-bin Modified Files: check Log Message: Revisiting the parse mode detection algorithm. In order of priority: * Media type (if Content-Type HTTP info present) * Doctype Declaration * XML Declaration (Which we now detect - wasn't the case before) We now also run the parse mode detection algorithm even for doctype-less documents. Another addition: Parse Mode Factor parameter passed to template showing (in debug mode) which was the deciding factor for the parse mode. Index: check =================================================================== RCS file: /sources/public/validator/httpd/cgi-bin/check,v retrieving revision 1.528 retrieving revision 1.529 diff -u -d -r1.528 -r1.529 --- check 26 Jun 2007 09:08:50 -0000 1.528 +++ check 28 Jun 2007 00:13:26 -0000 1.529 @@ -583,9 +583,9 @@ # # 1. check if there's a doctype # 2. if there is a doctype, parse/validate against that DTD -# 3. if no doctype, check for an xmlns= attribute on the first element -# 4. if there is an xmlns= attribute, check for XML well-formedness -# 5. if there is no xmlns= attribute, and no DOCTYPE, punt. +# 3. if no doctype, check for an xmlns= attribute on the first element, or XML declaration +# 4. if no doctype and XML mode, check for XML well-formedness +# 5. otherwise , punt. # # @@ -600,8 +600,9 @@ $File = &preparse_doctype($File); # -# Set parse mode. -set_parse_mode($File, $CFG) if $File->{DOCTYPE}; +# Determine the parse mode (XML or SGML). +##set_parse_mode($File, $CFG) if $File->{DOCTYPE}; +set_parse_mode($File, $CFG); # # Sanity check Charset information and add any warnings necessary. 
@@ -750,6 +751,7 @@ { name => 'Content-Location', value => $File->{ContentLoc} }, { name => 'Transfer-Encoding', value => $File->{TransferEnc} }, { name => 'Parse Mode', value => $File->{Mode} }, + { name => 'Parse Mode Factor', value => $File->{ModeChoice} }, { name => 'Parser', value => $parser_name }, { name => 'Parser Options', value => join " ", @spopt }, @@ -1893,18 +1895,44 @@ } # -# Set parse mode. +# Set parse mode (SGML or XML) based on a number of preparsed factors: +# * HTTP Content-Type +# * Doctype Declaration +# * XML Declaration sub set_parse_mode { my $File = shift; my $CFG = shift; my $fpi = $File->{DOCTYPE}; + $File->{ModeChoice} = ''; my $parseModeFromDoctype = $CFG->{Types}->{$fpi}->{'Parse Mode'} || 'TBD'; my $parseModeFromMimeType = $File->{Mode}; + my $begincontent = join "",@{$File->{Content}}[0,9]; # for the sake of xml decl detection, + # the 10 first lines should be safe + my $parseModeFromXMLDecl = ( + $begincontent + =~ /^ [\x20|\x9|\xD|\xA]* # whitespace before the decl should not be happening + # but we are greedy for the sake of detection, not validation + <\?xml # start matching an XML Declaration + [\x20|\x9|\xD|\xA]+ # x20, x9, xD and xA are the allowed "xml white space" + version [\x20|\x9|\xD|\xA]* = # for documents, version info is mandatory + [\x20|\x9|\xD|\xA]* ("1.0"|"1.1"|'1.0'|'1.1') # hardcoding the existing XML versions. + # Maybe we should use \d\.\d + ([\x20|\x9|\xD|\xA]+ encoding + [\x20|\x9|\xD|\xA]* = [\x20|\x9|\xD|\xA]* + ("[A-Za-z][a-zA-Z0-9-_]+"|'[A-Za-z][a-zA-Z0-9_]+') + )? # encoding info is optional + ([\x20|\x9|\xD|\xA]+ standalone + [\x20|\x9|\xD|\xA]* = [\x20|\x9|\xD|\xA]* + ("yes"|"no"|'yes'|'no') + )? # ditto standalone info, optional + [\x20|\x9|\xD|\xA]* \?> # end of XML Declaration + /x + ? 
'XML' : 'TBD' ); - - if (($parseModeFromMimeType eq 'TBD') and (!exists $CFG->{Types}->{$fpi})) { - # the mime type is text/html (ambiguous, hence TBD mode) + if (($parseModeFromMimeType eq 'TBD') and ($parseModeFromXMLDecl eq 'TBD') and (!exists $CFG->{Types}->{$fpi})) { + # if the mime type is text/html (ambiguous, hence TBD mode) + # and XML prolog detection was unsuccessful # and the doctype isn't in the catalogue... we scream &add_warning('W08', {W08_mime => $File->{ContentType}}); return; @@ -1912,41 +1940,47 @@ $parseModeFromDoctype = 'TBD' unless $parseModeFromDoctype eq 'SGML' or $parseModeFromDoctype eq 'XML'; - if (($parseModeFromDoctype eq 'TBD') and ($parseModeFromMimeType eq 'TBD')) { - # if both doctype and mime type are useless to give us a parse mode + if (($parseModeFromDoctype eq 'TBD') and ($parseModeFromXMLDecl eq 'TBD') and ($parseModeFromMimeType eq 'TBD')) { + # if all three factors are useless to give us a parse mode # => we use SGML as a default $File->{Mode} = 'SGML'; - + $File->{ModeChoice} = 'Fallback'; + # and send warning about the fallback &add_warning('W06', { W06_mime => $File->{ContentType}, w06_doctype => $File->{Version} }); return; } - elsif ($parseModeFromDoctype eq 'TBD') { - # doctype does not give us anything clear (e.g custom DTD) - # but mime type gives clear indication - # => we just use what the content type tells us - move along + elsif ($parseModeFromMimeType ne 'TBD') { + # if The mime type gives clear indication of the parse mode + if (($parseModeFromDoctype ne 'TBD') and ($parseModeFromMimeType ne $parseModeFromDoctype)) { + # if document-type recommended mode and content-type recommended mode clash + # shoot a warning + &add_warning('W07', { + W07_mime => $File->{ContentType}, + W07_ct => $parseModeFromMimeType, + W07_dtd => $parseModeFromDoctype, + }); + } + # mime type has precedence, we stick to it + $File->{ModeChoice} = 'Mime'; return; } - elsif ($parseModeFromMimeType eq 'TBD') { - # the mime type is 
text/html (ambiguous, hence TBD mode) + elsif ($parseModeFromDoctype ne 'TBD') { + # the mime type is ambiguous (hence we didn't stop at the previous test) # but by now we're sure that the document type is a good indication # so we use that. $File->{Mode} = $parseModeFromDoctype; + $File->{ModeChoice} = 'Doctype'; return; } - elsif ($parseModeFromMimeType ne $parseModeFromDoctype) { - # if document-type recommended mode and content-type recommended mode clash - # shoot a warning - &add_warning('W07', { - W07_mime => $File->{ContentType}, - W07_ct => $parseModeFromMimeType, - W07_dtd => $parseModeFromDoctype, - }); - return; - - } + else { + # this is the last case. We know that not all three modes are TBD, + # yet both mime type and doctype tests have failed => we are saved by the XML declaration + $File->{Mode} = $parseModeFromXMLDecl; + $File->{ModeChoice} = 'XMLDecl'; + } }
Received on Thursday, 28 June 2007 00:13:32 UTC