W3C home > Mailing lists > Public > www-validator-cvs@w3.org > April 2008

validator/httpd/cgi-bin check,1.584,1.585

From: Olivier Thereaux via cvs-syncmail <cvsmail@w3.org>
Date: Mon, 28 Apr 2008 03:42:45 +0000
To: www-validator-cvs@w3.org
Message-Id: <E1JqKGM-0003rV-0N@lionel-hutz.w3.org>

Update of /sources/public/validator/httpd/cgi-bin
In directory hutz:/tmp/cvs-serv14629/httpd/cgi-bin

Modified Files:
	check 
Log Message:
Trying to solve the headache of the default fallback character encoding:
different specs suggest different encodings, while we try to promote UTF-8
as the best practice. This patch tries to work around the issue by attempting,
in sequence, a fallback to UTF-8, then windows-1252, then iso-8859-1.



Index: check
===================================================================
RCS file: /sources/public/validator/httpd/cgi-bin/check,v
retrieving revision 1.584
retrieving revision 1.585
diff -u -d -r1.584 -r1.585
--- check	23 Apr 2008 04:23:31 -0000	1.584
+++ check	28 Apr 2008 03:42:43 -0000	1.585
@@ -519,8 +519,7 @@
   $File->{Charset}->{Use} = "utf-8";
 } elsif (&is_xml($File) and not $File->{ContentType} =~ m(^text/)) {
   $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.)
-}
-
+} 
 unless ($File->{Charset}->{Use}) {
   $File->{Charset}->{Use} = $File->{Charset}->{META};
 }
@@ -563,12 +562,6 @@
   }
 }
 
-unless ($File->{Charset}->{Use}) { # No charset given...
-  &add_warning('W04', {W04_charset => 'UTF-8'});
-  $File->{Tentative} |= T_ERROR; # Can never be valid.
-  $File->{Charset}->{Use} = 'utf-8';
-}
-
 #
 # Abort if an error was flagged while finding the encoding.
 &abort_if_error_flagged($File, O_CHARSET|O_DOCTYPE);
@@ -598,9 +591,32 @@
 
 # gb18030 requires Encode::HanExtra but no additional alias
 
-#
+$File->{Charset}->{Default} = FALSE;
+unless ($File->{Charset}->{Use}) { # No charset given...
+     $File->{Charset}->{Use} = 'utf-8';
+     $File->{Charset}->{Default} = TRUE;
+    $File->{Tentative} |= T_ERROR; # Can never be valid.
+    &add_warning('W04', {W04_charset => "UTF-8"});
+}
+
+
 # Always transcode, even if the content claims to be UTF-8
 $File = transcode($File);
+if (($File->{ContentType} == "text/html") and ($File->{Charset}->{Default}) and $File->{'Error Flagged'}) {
+    $File->{'Error Flagged'} = FALSE; # reset
+    # we try again, this time with win-1252
+    $File->{Charset}->{Use} = 'windows-1252';
+    &add_warning('W04', {W04_charset => "windows-1252", W04_also_tried=> "UTF-8"});
+    $File = transcode($File);
+}
+if (($File->{ContentType} == "text/html") and ($File->{Charset}->{Default}) and $File->{'Error Flagged'}) {
+    $File->{'Error Flagged'} = FALSE; # reset
+    # we try again, this time with latin1...
+    $File->{Charset}->{Use} = 'iso-8859-1';
+    &add_warning('W04', {W04_charset => "iso-8859-1", W04_also_tried => "UTF-8, windows-1252"});
+    $File = transcode($File);
+}
+# if it still does not work, we abandon hope here
 &abort_if_error_flagged($File, O_CHARSET);
 
 #
Received on Monday, 28 April 2008 03:43:24 GMT

This archive was generated by hypermail 2.2.0+W3C-0.50 : Thursday, 26 April 2012 12:55:01 GMT