- From: Ville Skytta via cvs-syncmail <cvsmail@w3.org>
- Date: Fri, 07 May 2010 17:23:00 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/validator/httpd/cgi-bin
In directory hutz:/tmp/cvs-serv13440/httpd/cgi-bin
Modified Files:
check
Log Message:
When passing transcoded content to external validators (and the internal
XML one), fix up charset declarations in content to reflect transcoding.
http://www.w3.org/mid/Pine.LNX.4.64.1004291710580.13453%40zen.rrzn.uni-hannover.de
Index: check
===================================================================
RCS file: /sources/public/validator/httpd/cgi-bin/check,v
retrieving revision 1.774
retrieving revision 1.775
diff -u -d -r1.774 -r1.775
--- check 7 May 2010 09:18:53 -0000 1.774
+++ check 7 May 2010 17:22:58 -0000 1.775
@@ -613,21 +613,13 @@
# [NOT] loading the XML catalog for entities resolution as it seems to
# cause a lot of unnecessary DTD/entities fetching
#$xmlparser->load_catalog(catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'));
+
+ &override_charset($File, "UTF-8");
+
my $xml_string = join "\n", @{$File->{Content}};
my $xmlws = qr/[\x20\x09\x0D\x0A]/o;
- # the XML parser will check the value of encoding attribute in XML
- # declaration so we have to amend it to reflect transcoding.
- # see Bug 4867
- $xml_string =~ s/
- (^<\?xml\b[^>]*${xmlws})
- (encoding${xmlws}*=${xmlws}*
- (?:(["'])[A-Za-z][a-zA-Z0-9_-]+\3)
- )
- (.*?\?>)
- /$1encoding="UTF-8"$4/sox;
-
# Is the document standalone? Need to check with a regex because
# the parser may fail to return a document we could use for this.
my $standalone = (
@@ -1073,11 +1065,9 @@
if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) {
# Doctype or charset overridden, need to use $File->{Content} in UTF-8
- # because $File->{Bytes} is not affected by the overrides. This will
- # most likely be a source of errors about internal/actual charset
- # differences as long as our transcoding process does not "fix" the
- # charset info in XML declaration, meta http-equiv/charset and/or BOM
- # (any others?).
+ # because $File->{Bytes} is not affected by the overrides.
+
+ &override_charset($File, "UTF-8");
$ct = $File->{ContentType} unless $File->{'Direct Input'};
my @ct = ($ct => undef, charset => "UTF-8");
@@ -2030,7 +2020,7 @@
$seen_root = TRUE;
if ($seen_doctype) {
- # doctype addition aldready done, we move on
+ # doctype addition already done, we move on
$HTML .= $_[0];
}
else {
@@ -2082,6 +2072,52 @@
}
#
+# Override inline charset declarations, for use e.g. when passing
+# transcoded results to external parsers that use them.
+sub override_charset ($$)
+{
+  # Args: $File (hashref; {Content} is an arrayref of lines), $charset (new name).
+  my ($File, $charset) = @_;
+  my $ws = qr/[\x20\x09\x0D\x0A]/;
+  my $cs = qr/[A-Za-z][a-zA-Z0-9_-]+/;
+  my $content = join("\n", @{$File->{Content}});
+
+  # Each pattern rewrites its first match only, and only if it names a different
+  # charset (case-insensitive); the original text is kept in a trailing comment.
+  # <?xml encoding="charset"?>
+  $content =~ s/(
+      (^<\?xml\b[^>]*?${ws}encoding${ws}*=${ws}*(["']))
+      (${cs})
+      (\3.*?\?>)
+    )/lc($4) eq lc($charset) ? "$1" : "$2$charset$5<!-- $1 -->"/esx;
+
+  # <meta charset="charset">
+  $content =~ s/(
+      (<meta\b[^>]*?${ws}charset${ws}*=${ws}*["']?${ws}*)
+      (${cs})
+      (.*?>)
+    )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix;
+
+  # <meta http-equiv="content-type" content="some/type; charset=charset">
+  $content =~ s/(
+      (<meta\b[^>]*${ws}
+        http-equiv${ws}*=${ws}*["']?${ws}*content-type\b[^>]*?${ws}
+        content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
+      (${cs})
+      (.*?>)
+    )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix;
+
+  # <meta content="some/type; charset=charset" http-equiv="content-type">
+  $content =~ s/(
+      (<meta\b[^>]*${ws}
+        content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
+      (${cs})
+      ([^>]*?${ws}http-equiv${ws}*=${ws}*["']?${ws}*content-type\b.*?>)
+    )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix;
+  $File->{Content} = [split /\n/, $content, -1]; # -1: keep trailing empty lines
+}
+
+#
# Generate a HTML report of detected errors.
sub report_errors ($)
{
Received on Friday, 7 May 2010 17:23:02 UTC