validator/httpd/cgi-bin check,1.774,1.775

Update of /sources/public/validator/httpd/cgi-bin
In directory hutz:/tmp/cvs-serv13440/httpd/cgi-bin

Modified Files:
	check 
Log Message:
When passing transcoded content to external validators (and the internal
XML one), fix up charset declarations in content to reflect transcoding.

http://www.w3.org/mid/Pine.LNX.4.64.1004291710580.13453%40zen.rrzn.uni-hannover.de


Index: check
===================================================================
RCS file: /sources/public/validator/httpd/cgi-bin/check,v
retrieving revision 1.774
retrieving revision 1.775
diff -u -d -r1.774 -r1.775
--- check	7 May 2010 09:18:53 -0000	1.774
+++ check	7 May 2010 17:22:58 -0000	1.775
@@ -613,21 +613,13 @@
         # [NOT] loading the XML catalog for entities resolution as it seems to
         # cause a lot of unnecessary DTD/entities fetching
         #$xmlparser->load_catalog(catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'));
+
+        &override_charset($File, "UTF-8");
+
         my $xml_string = join "\n", @{$File->{Content}};
 
         my $xmlws = qr/[\x20\x09\x0D\x0A]/o;
 
-        # the XML parser will check the value of encoding attribute in XML
-        # declaration so we have to amend it to reflect transcoding.
-        # see Bug 4867
-        $xml_string =~ s/
-               (^<\?xml\b[^>]*${xmlws})
-               (encoding${xmlws}*=${xmlws}*
-                   (?:(["'])[A-Za-z][a-zA-Z0-9_-]+\3)
-               )
-               (.*?\?>)
-           /$1encoding="UTF-8"$4/sox;
-
         # Is the document standalone?  Need to check with a regex because
         # the parser may fail to return a document we could use for this.
         my $standalone = (
@@ -1073,11 +1065,9 @@
     if ($File->{Opt}->{DOCTYPE} || $File->{Charset}->{Override}) {
 
         # Doctype or charset overridden, need to use $File->{Content} in UTF-8
-        # because $File->{Bytes} is not affected by the overrides.  This will
-        # most likely be a source of errors about internal/actual charset
-        # differences as long as our transcoding process does not "fix" the
-        # charset info in XML declaration, meta http-equiv/charset and/or BOM
-        # (any others?).
+        # because $File->{Bytes} is not affected by the overrides.
+
+        &override_charset($File, "UTF-8");
 
         $ct = $File->{ContentType} unless $File->{'Direct Input'};
         my @ct = ($ct => undef, charset => "UTF-8");
@@ -2030,7 +2020,7 @@
             $seen_root = TRUE;
             if ($seen_doctype) {
 
-                # doctype addition aldready done, we move on
+                # doctype addition already done, we move on
                 $HTML .= $_[0];
             }
             else {
@@ -2082,6 +2072,52 @@
 }
 
 #
+# Override inline charset declarations, for use e.g. when passing
+# transcoded results to external parsers that use them.
+sub override_charset ($$)
+{
+    my ($File, $charset) = @_;
+
+    my $ws = qr/[\x20\x09\x0D\x0A]/o;
+    my $cs = qr/[A-Za-z][a-zA-Z0-9_-]+/o;
+
+    my $content = join("\n", @{$File->{Content}});
+
+    # <?xml encoding="charset"?>
+    $content =~ s/(
+              (^<\?xml\b[^>]*?${ws}encoding${ws}*=${ws}*(["']))
+              (${cs})
+              (\3.*?\?>)
+          )/lc($4) eq lc($charset) ? "$1" : "$2$charset$5<!-- $1 -->"/esx;
+
+    # <meta charset="charset">
+    $content =~ s/(
+              (<meta\b[^>]*?${ws}charset${ws}*=${ws}*["']?${ws}*)
+              (${cs})
+              (.*?>)
+          )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix;
+
+    # <meta http-equiv="content-type" content="some/type; charset=charset">
+    $content =~ s/(
+              (<meta\b[^>]*${ws}
+                  http-equiv${ws}*=${ws}*["']?${ws}*content-type\b[^>]*?${ws}
+                  content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
+              (${cs})
+              (.*?>)
+          )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix;
+
+    # <meta content="some/type; charset=charset" http-equiv="content-type">
+    $content =~ s/(
+              (<meta\b[^>]*${ws}
+                  content${ws}*=${ws}*["']?[^"'>]+?;${ws}*charset${ws}*=${ws}*)
+              (${cs})
+              ([^>]*?${ws}http-equiv${ws}*=${ws}*["']?${ws}*content-type\b.*?>)
+          )/lc($3) eq lc($charset) ? "$1" : "$2$charset$4<!-- $1 -->"/esix;
+
+    $File->{Content} = [split /\n/, $content];
+}
+
+#
 # Generate a HTML report of detected errors.
 sub report_errors ($)
 {

Received on Friday, 7 May 2010 17:23:02 UTC