- From: Ville Skytta via cvs-syncmail <cvsmail@w3.org>
- Date: Fri, 07 May 2010 17:23:00 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/validator/httpd/cgi-bin
In directory hutz:/tmp/cvs-serv13440/httpd/cgi-bin

Modified Files:
    check
Log Message:
When passing transcoded content to external validators (and the internal XML one), fix up charset declarations in content to reflect transcoding.

http://www.w3.org/mid/Pine.LNX.4.64.1004291710580.13453%40zen.rrzn.uni-hannover.de

Index: check
===================================================================
RCS file: /sources/public/validator/httpd/cgi-bin/check,v
retrieving revision 1.774
retrieving revision 1.775
diff -u -d -r1.774 -r1.775
--- check 7 May 2010 09:18:53 -0000 1.774
+++ check 7 May 2010 17:22:58 -0000 1.775
@@ -613,21 +613,13 @@
     # [NOT] loading the XML catalog for entities resolution as it seems to
     # cause a lot of unnecessary DTD/entities fetching
     #$xmlparser->load_catalog(catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'));
+
+    &override_charset($File, "UTF-8");
+
     my $xml_string = join "\n", @{$File->{Content}};
 
     my $xmlws = qr/[\x20\x09\x0D\x0A]/o;
 
-    # the XML parser will check the value of encoding attribute in XML
-    # declaration so we have to amend it to reflect transcoding.
-    # see Bug 4867
-    $xml_string =~ s/
-        (^<\?xml\b[^>]*${xmlws})
-        (encoding${xmlws}*=${xmlws}*
-            (?:(["'])[A-Za-z][a-zA-Z0-9_-]+\3)
-        )
-        (.*?\?>)
-    /$1encoding="UTF-8"$4/sox;
-
     # Is the document standalone? Need to check with a regex because
     # the parser may fail to return a document we could use for this.
     my $standalone = (
@@ -1073,11 +1065,9 @@
 
   if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) {
     # Doctype or charset overridden, need to use $File->{Content} in UTF-8
-    # because $File->{Bytes} is not affected by the overrides. This will
-    # most likely be a source of errors about internal/actual charset
-    # differences as long as our transcoding process does not "fix" the
-    # charset info in XML declaration, meta http-equiv/charset and/or BOM
-    # (any others?).
+    # because $File->{Bytes} is not affected by the overrides.
+
+    &override_charset($File, "UTF-8");
 
     $ct = $File->{ContentType} unless $File->{'Direct Input'};
     my @ct = ($ct => undef, charset => "UTF-8");
@@ -2030,7 +2020,7 @@
       $seen_root = TRUE;
 
       if ($seen_doctype) {
-        # doctype addition aldready done, we move on
+        # doctype addition already done, we move on
         $HTML .= $_[0];
       }
       else {
@@ -2082,6 +2072,52 @@
 }
 
 #
+# Override inline charset declarations, for use e.g. when passing
+# transcoded results to external parsers that use them.
+sub override_charset ($$)
+{
+  my ($File, $charset) = @_;
+
+  my $ws = qr/[\x20\x09\x0D\x0A]/o;
+  my $cs = qr/[A-Za-z][a-zA-Z0-9_-]+/o;
+
+  my $content = join("\n", @{$File->{Content}});
+
+  # <?xml encoding="charset"?>
+  $content =~ s/(
+     (^<\?xml\b[^>]*?${ws}encoding${ws}*=${ws}*(["']))
+     (${cs})
+     (\3.*?\?>)
+    )/lc($4) eq lc($charset) ? "$1" : "$2$charset$5<!-- $1 -->"/esx;
+
+  # <meta charset="charset">
+  $content =~ s/(
+     (<meta\b[^>]*?${ws}charset${ws}*=${ws}*["']?${ws}*)
+     (${cs})
+     (.*?>)
+    )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix;
+
+  # <meta http-equiv="content-type" content="some/type; charset=charset">
+  $content =~ s/(
+     (<meta\b[^>]*${ws}
+      http-equiv${ws}*=${ws}*["']?${ws}*content-type\b[^>]*?${ws}
+      content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
+     (${cs})
+     (.*?>)
+    )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix;
+
+  # <meta content="some/type; charset=charset" http-equiv="content-type">
+  $content =~ s/(
+     (<meta\b[^>]*${ws}
+      content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
+     (${cs})
+     ([^>]*?${ws}http-equiv${ws}*=${ws}*["']?${ws}*content-type\b.*?>)
+    )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix;
+
+  $File->{Content} = [split /\n/, $content];
+}
+
+#
 # Generate a HTML report of detected errors.
 sub report_errors ($)
 {
Received on Friday, 7 May 2010 17:23:02 UTC