validator/httpd/cgi-bin check,1.441,1.442 from Bjoern Hoehrmann via cvs-syncmail on 2005-08-18 (www-validator-cvs@w3.org from August 2005)

From: Bjoern Hoehrmann via cvs-syncmail <cvsmail@w3.org>
Date: Thu, 18 Aug 2005 02:51:56 +0000
To: www-validator-cvs@w3.org
Message-Id: <E1E5aVY-0006Oq-Aa@lionel-hutz.w3.org>
Update of /sources/public/validator/httpd/cgi-bin
In directory hutz:/tmp/cvs-serv24559

Modified Files:
	check 
Log Message:
Use HTML::Encoding instead of proprietary code and some cleanup

Index: check
===================================================================
RCS file: /sources/public/validator/httpd/cgi-bin/check,v
retrieving revision 1.441
retrieving revision 1.442
diff -u -d -r1.441 -r1.442
--- check	17 Aug 2005 02:09:08 -0000	1.441
+++ check	18 Aug 2005 02:51:54 -0000	1.442
@@ -36,27 +36,21 @@
 # when loading modules to prevent non-OO or poorly written modules from
 # polluting our namespace.
 #
-use CGI             2.81 qw(
-                            -newstyle_urls
-                            -private_tempfiles
-                            redirect
-                           ); # 2.81 for XHTML, and import redirect() function.
-
-use CGI::Carp            qw(carp croak fatalsToBrowser);
-use Config::General 2.19 qw(); # Need 2.19 for -AutoLaunder
-use File::Spec           qw();
-use HTML::Parser    3.25 qw(); # Need 3.25 for $p->ignore_elements.
-use HTML::Template  2.6  qw();
-use HTTP::Request        qw();
-use HTTP::Headers::Auth  qw(); # Needs to be imported after other HTTP::*.
-use IO::File             qw();
-use IPC::Open3           qw(open3);
-use Set::IntSpan         qw();
-use Text::Iconv          qw();
-use Text::Wrap           qw(wrap);
-use URI                  qw();
-use URI::Escape          qw(uri_escape);
 
+use CGI                  2.81 qw(-newstyle_urls -private_tempfiles redirect);
+use CGI::Carp                 qw(carp croak fatalsToBrowser);
+use Config::General      2.19 qw(); # Need 2.19 for -AutoLaunder
+use File::Spec                qw();
+use HTML::Parser         3.25 qw(); # Need 3.25 for $p->ignore_elements.
+use HTML::Template       2.6  qw();
+use HTTP::Request             qw();
+use HTTP::Headers::Auth       qw(); # Needs to be imported after other HTTP::*.
+use URI                       qw();
+use URI::Escape               qw(uri_escape);
+use Encode                    qw();
+use HTML::Encoding       0.52 qw();
+use SGML::Parser::OpenSP 0.99 qw();
+  
 ###############################################################################
 #### Constant definitions. ####################################################
 ###############################################################################
@@ -234,7 +228,10 @@
 # @@FIXME This can't be my() as $File will sooner or
 # later be undef and add_warning will cause the script
 # to die. our() seems to work but has other problems.
-our $File;
+# @@FIXME Apparently, this must be set to {} also,
+# otherwise the script might pick up an old object
+# after abort_if_error_flagged under mod_perl.
+our $File = {};
 
 ##############################################
 # Populate $File->{Env} -- Session Metadata. #
@@ -262,12 +259,7 @@
 $File->{Mode} = MODE_SGML; # Default parse mode is SGML.
 
 #
-# Array (ref) used to store character offsets for the XML report.
-$File->{Offsets}->[0] = [0, 0]; # The first item isn't used...
-
-#
 # Listrefs.
-$File->{Lines}      = []; # Line numbers for encoding errors.
 $File->{Warnings}   = []; # Warnings...
 $File->{Namespaces} = []; # Other (non-root) Namespaces.
 
@@ -377,9 +369,7 @@
 #### Output validation results. ###############################################
 ###############################################################################
 
-#
-# Find the XML Encoding.
-$File = &find_xml_encoding($File);
+$File = find_encodings($File);
 
 #
 # Decide on a charset to use (first part)
@@ -406,20 +396,13 @@
   $File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.)
 }
 
-$File->{Content} = &normalize_newlines($File->{Bytes},
-                       exact_charset($File, $File->{Charset}->{Use}));
-
-#
-# Try to extract META charset
-# (works only if ascii-based and reasonably clean before <meta>)
-$File = &preparse_meta($File); # First call. Repeated later to fetch the FPI.
 unless ($File->{Charset}->{Use}) {
   $File->{Charset}->{Use} = $File->{Charset}->{META};
 }
 
 #
 # Handle any Fallback or Override for the charset.
-if (&conflict($File->{Opt}->{Charset}, '(detect automatically)')) {
+if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) {
   # charset=foo was given to the CGI and it wasn't "autodetect".
 
   #
@@ -467,18 +450,16 @@
 &abort_if_error_flagged($File, O_CHARSET|O_DOCTYPE);
 
 #
-# Check the detected Encoding and transcode.
-if (&conflict($File->{Charset}->{Use}, 'utf-8')) {
-  $File = &transcode($File);
-  &abort_if_error_flagged($File, O_CHARSET);
-}
-
-$File = &check_utf8($File); # always check
-$File = &byte_error($File);
+# Always transcode, even if the content claims to be UTF-8
+$File = transcode($File);
+&abort_if_error_flagged($File, O_CHARSET);
 
 #
-# Abort if an error was flagged during transcoding
-&abort_if_error_flagged($File, O_SOURCE|O_CHARSET);
+# Add a warning if doc is UTF-8 and contains a BOM.
+if ($File->{Charset}->{Use} eq 'utf-8' &&
+    $File->{Content}->[0] =~ m(^\x{FEFF})) {
+  &add_warning('W21', {});
+}
 
 #
 # Overall parsing algorithm for documents returned as text/html:
@@ -556,8 +537,6 @@
 sub parse (\$) {
   my $File = shift;
   
-  use SGML::Parser::OpenSP 0.99 qw();
-  
   my $opensp = SGML::Parser::OpenSP->new();
   
   #
@@ -576,6 +555,10 @@
   
   #
   # Parser configuration
+  #
+  # This is broken on Win32 with restrict_file_reading since it
+  # would need to allow access to the temp file directory which
+  # it does not. Not sure how to address that yet.
   $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library});
   $opensp->catalogs($catalog);
   $opensp->show_error_numbers(1);
@@ -594,22 +577,6 @@
   $T->param(opt_debug => $DEBUG);
   $T->param(debug =>
             [
-             # This is obsolete now?
-             { name => 'Command',           value => "SGML::Parser::OpenSP->new" },
-             
-             # These are probably obsolete now, SGML::Parser::OpenSP will
-             # always use <spec encoding="utf-8">... for parse_string. It
-             # might be possible that messing with these variables makes
-             # OpenSP fail to recognize the encoding, I remember something
-             # to this effect but can't reproduce this at the moment. We
-             # rely on SGML::Parser::OpenSP to get this right because it
-             # seems there is no way for Apache2+mod_perl2+OpenSP+SPO to
-             # communicate the correct environment settings to OpenSP.
-             # @@FIXME this note should be removed after some testing.
-             { name => 'SP_CHARSET_FIXED',  value => $ENV{SP_CHARSET_FIXED} },
-             { name => 'SP_ENCODING',       value => $ENV{SP_ENCODING} },
-             { name => 'SP_BCTF',           value => $ENV{SP_BCTF} },
-             
              { name => 'Content-Encoding',  value => $File->{ContentEnc} },
              { name => 'Transfer-Encoding', value => $File->{TransferEnc} },
             ],
@@ -682,7 +649,9 @@
   &prep_template($File, $T);
 
   if (! $File->{Doctype} and ($File->{Version} eq 'unknown' or $File->{Version} eq 'SGML' or (!$File->{Version}))) {
-    # @@TODO@@ we should try falling back on other version info, such as the ones stored in Version_ESIS
+
+    # @@TODO@@ we should try falling back on other version
+    # info, such as the ones stored in Version_ESIS
     $T->param(file_version => '(no Doctype found)');
   }
   else {
@@ -1026,18 +995,12 @@
   my $charset      = '';
   my $mode         = '';
 
+  # @@FIXME @param now unused
   my ($ct, @param) = split /\s*;\s*/, lc $Content_Type;
 
   $mode = $CFG->{MIME}->{$ct} || $ct;
-
-  foreach my $param (@param) {
-    my ($p, $v) = split /\s*=\s*/, $param;
-    next unless $p =~ m(charset)i;
-    if ($v =~ m/([\'\"]?)(\S+)\1/i) {
-      $charset = lc $2;
-      last;
-    }
-  }
+  
+  $charset = HTML::Encoding::encoding_from_content_type($Content_Type);
 
   if ($mode =~ m(/)) { # a "/" means it's unknown or we'd have a mode here.
     if ($ct =~ m(text/css) and defined $url) {
@@ -1075,43 +1038,6 @@
 }
 
 #
-# Normalize newline forms (CRLF/CR/LF) to native newline.
-sub normalize_newlines {
-  my $file = shift;
-  local $_ = shift;  #charset
-  my $pattern = '';
-
-  # don't use backreference parentheses!
-  $pattern = '\x00\x0D(?:\x00\x0A)?|\x00\x0A' if /^utf-16be$/;
-  $pattern = '\x0D\x00(?:\x0A\x00)?|\x0A\x00' if /^utf-16le$/;
-  # $pattern = '\x00\x00\x00\x0D(?:\x00\x00\x00\x0A)?|\x00\x00\x00\x0A' if /^UCS-4be$/;
-  # $pattern = '\x0D\x00\x00\x00(?:\x0A\x00\x00\x00)?|\x0A\x00\x00\x00' if /^UCS-4le$/;
-  # insert other special cases here, such as EBCDIC
-  $pattern = '\x0D(?:\x0A)?|\x0A' if !$pattern;    # all other cases
-
-  return [split /$pattern/, $file];
-}
-
-#
-# find exact charset from general one (utf-16)
-#
-# needed for per-line conversion and line splitting
-# (BE is default, but this will apply only to HTML)
-sub exact_charset {
-  my $File = shift;
-  my $general_charset = shift;
-  my $exact_charset = $general_charset;
-
-  if ($general_charset eq 'utf-16') {
-    if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) {
-      $exact_charset = $File->{Charset}->{Auto};
-    } else { $exact_charset = 'utf-16be'; }
-  }
-  # add same code for ucs-4 here
-  return $exact_charset;
-}
-
-#
 # Return $_[0] encoded for HTML entities (cribbed from merlyn).
 #
 # Note that this is used both for HTML and XML escaping.
@@ -1126,11 +1052,6 @@
 #
 # Truncate source lines for report.
 #
-# This *really* wants Perl 5.8.0 and it's improved UNICODE support.
-# Byte semantics are in effect on all length(), substr(), etc. calls,
-# so offsets will be wrong if there are multi-byte sequences prior to
-# the column where the error is detected.
-#
 sub truncate_line {
   my $line  = shift;
   my $col   = shift;
@@ -1327,7 +1248,6 @@
       $length = $col;
     }
     $left = substr $line, $offset, $length;
-    $left = &ent($left);
   }
 
   #
@@ -1367,11 +1287,10 @@
       $length = length($line) - ($col - 1); # Otherwise get the rest of the line.
     }
     $right = substr $line, $offset, $length;
-    $right = &ent($right);
   }
 
   $char = qq(<strong title="Position where error was detected.">$char</strong>);
-  $line = $left . $char . $right;
+  $line = &ent($left) . $char . &ent($right);
 
   return $line;
 }
@@ -1383,7 +1302,7 @@
 
   # Remove any BOM since we're not at BOT anymore...
   $File->{Content}->[0] =
-    substr $File->{Content}->[0], ($File->{BOM} ? 3 : 0); # remove BOM
+    substr $File->{Content}->[0], ($File->{BOM} ? 1 : 0); # remove BOM
 
   my @source = map({file_source_line => $_}, @{$File->{Content}});
   return \@source;
@@ -1393,7 +1312,7 @@
 # Do an initial parse of the Document Entity to extract FPI.
 sub preparse_doctype {
   my $File = shift;
-
+  
   #
   # Reset DOCTYPE, Root (for second invocation, probably not needed anymore).
   $File->{DOCTYPE}         = '';
@@ -1436,6 +1355,7 @@
 #
 # Print out the raw error output for debugging.
 sub show_errors ($) {
+  # @@FIXME This is broken with SGML::Parser::OpenSP
   my $file_raw_errors = "";
   for (@{shift->{DEBUG}->{Errors}}) {
   $file_raw_errors .= ent $_
@@ -1546,47 +1466,6 @@
 sub is_xml {shift->{Mode} == MODE_XML};
 
 #
-# Do an initial parse of the Document Entity to extract charset from HTML <meta>.
-# (still also extracts FPI, at least to some extent)
-sub preparse_meta {
-  my $File = shift;
-
-  my $dtd = sub {
-    return if $File->{Root};
-    ($File->{Root}, $File->{DOCTYPE}) = shift =~ m(<!DOCTYPE\s+(\w+)\s+PUBLIC\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*>)si;
-  };
-
-  my $start = sub {
-    my $tag  = shift;
-    my $attr = shift;
-    my %attr = map {lc($_) => $attr->{$_}} keys %{$attr};
-
-    if ($File->{Root}) {
-      if (lc $tag eq 'meta') {
-        if (lc $attr{'http-equiv'} eq 'content-type') {
-          if ($attr{content} =~ m(charset\s*=[\s\"\']*([^\s;\"\'>]*))si) {
-            $File->{Charset}->{META} = lc $1;
-          }
-        }
-      }
-      return unless $tag eq $File->{Root};
-    } else {
-      $File->{Root} = $tag;
-    }
-  };
-
-  my $p = HTML::Parser->new(api_version => 3);
-  $p->xml_mode(TRUE);
-  $p->ignore_elements('BODY');
-  $p->ignore_elements('body');
-  $p->handler(declaration => $dtd, 'text');
-  $p->handler(start => $start, 'tag,attr');
-  $p->parse(join "\n", @{$File->{Content}});
-
-  return $File;
-}
-
-#
 # Check charset conflicts and add any warnings necessary.
 sub charset_conflicts {
   my $File = shift;
@@ -1607,19 +1486,19 @@
   #
   # Add a warning if there was charset info conflict (HTTP header,
   # XML declaration, or <meta> element).
-  if (&conflict($File->{Charset}->{HTTP}, $File->{Charset}->{XML})) {
+  if (charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{XML})) {
     &add_warning('W18', {
       W18_http => $cs_http,
       W18_xml  => $cs_xml,
       W18_use  => $cs_use,
     });
-  } elsif (&conflict($File->{Charset}->{HTTP}, $File->{Charset}->{META})) {
+  } elsif (charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META})) {
     &add_warning('W19', {
       W19_http => $cs_http,
       W19_meta => $cs_meta,
       W19_use  => $cs_use,
     });
-  } elsif (&conflict($File->{Charset}->{XML}, $File->{Charset}->{META})) {
+  } elsif (charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META})) {
     &add_warning('W20', {
       W20_http => $cs_xml,
       W20_xml  => $cs_meta,
@@ -1634,110 +1513,112 @@
 # Transcode to UTF-8
 sub transcode {
   my $File = shift;
+  
+  my $general_charset = $File->{Charset}->{Use};
+  my $exact_charset = $general_charset;
 
-  my $cs = $File->{Charset}->{Use};
-  my ($command, $result_charset) = ('', '');
-  if ($CFG->{Charsets}->{$cs}) {
-    ($command, $result_charset) = split(" ", $CFG->{Charsets}->{$cs}, 2);
+  # eeeeek!
+  if ($general_charset eq 'utf-16') {
+    if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) {
+      $exact_charset = $File->{Charset}->{Auto};
+    } else { $exact_charset = 'utf-16be'; }
   }
+  
+  my $cs = $exact_charset;
+  
+  if (!$CFG->{Charsets}->{$cs}) {
+    # The encoding is not supported due to policy
+    # and possibly other reasons
 
-  my $c;
-  $result_charset = exact_charset($File, $result_charset);
-
-  if ($command eq 'I') {
-    # test if given charset is available
-    eval { $c = Text::Iconv->new($result_charset, 'utf-8') };
-    $command = '' if $@;
-  } elsif ($command eq 'X') {
-    $@ = "$File->{Charset}->{Use} undefined; replace by $result_charset";
+    $File->{'Error Flagged'} = TRUE;
+    $File->{E}->param(fatal_transcode_error   => TRUE);
+    $File->{E}->param(fatal_transcode_charset => $cs);
+    
+    # @@FIXME might need better text
+    $File->{E}->param(fatal_transcode_errmsg  => "Encoding not supported.");
+    
+    return $File;
   }
+  
+  # Does the system support decoding this encoding?
+  eval { Encode::decode($cs, ''); };
+  
+  if ($@) {
+    # This system's Encode installation does not support
+    # the character encoding; might need additional modules
 
-  if ($command ne 'I') {
     $File->{'Error Flagged'} = TRUE;
     $File->{E}->param(fatal_transcode_error   => TRUE);
     $File->{E}->param(fatal_transcode_charset => $cs);
-    $File->{E}->param(fatal_transcode_errmsg  => ($@ || ''));
+    
+    # @@FIXME might need better text
+    $File->{E}->param(fatal_transcode_errmsg  => "Encoding not supported.");
+
     return $File;
   }
+  
+  my $output;
+  my $input = $File->{Bytes};
+  
+  # Try to transcode
+  eval {
+    $output = Encode::decode($cs, $input, Encode::FB_CROAK);
+  };
+  
+  # Transcoding failed  
+  if ($@) {
+    $File->{'Error Flagged'} = TRUE;
 
-  my $line = 0;
-  for (@{$File->{Content}}) {
-    my $in = $_;
-    $line++;
-    $_ = $c->convert($_); # $_ is local!!
-    if ($in ne "" and (!defined($_) || $_ eq "")) {
-      push @{$File->{Lines}}, $line;
-      
-      # try to decoded as much as possible of the line
-      my $short = 0;                # longest okay
-      my $long  = (length $in) - 1; # longest unknown
-      
-      while ($long > $short) {
-        # binary search
-        my $try = int (($long+$short+1) / 2);
-        my $converted = $c->convert(substr($in, 0, $try));
-        if (!defined($converted) || $converted eq "") {
-          $long  = $try-1;
-        } else {
-          $short = $try;
-        }
-      }
-      my $remain = (length $in) - $short;
-      $_ = $c->convert(substr($in,0,$short))
-           . "#### $remain byte(s) unconvertible ####";
-    }
+    # @@FIXME might need better text, in particular, this does not tell
+    # where the error occured; it might be possible to emulate that
+    # using a Encode CHECK parameter that modifies the input, then split
+    # the decodable string to give line / column information, or don't
+    # split and report the offset calculated from the result.
+    $File->{E}->param(fatal_byte_error   => TRUE);
+    $File->{E}->param(fatal_byte_lines   => 0);
+    $File->{E}->param(fatal_byte_charset => $cs);
+
+    return $File;
   }
+  
+  # @@FIXME is this what we want?
+  $output =~ s/\015?\012/\n/g;
+  $File->{Content} = [split/\n/, $output];
+  
   return $File;
 }
 
-#
-# Check correctness of UTF-8 both for UTF-8 input and for conversion results
-sub check_utf8 {
+sub find_encodings
+{
   my $File = shift;
-
-  for (my $i = 0; $i < $#{$File->{Content}}; $i++) {
-    # substitution needed for very long lines (>32K), to avoid backtrack
-    # stack overflow. Handily, this also happens to count characters.
-    local $_ = $File->{Content}->[$i];
-    my $count =
-    s/  [\x00-\x7F]                           # ASCII
-      | [\xC2-\xDF]        [\x80-\xBF]        # non-overlong 2-byte sequences
-      |  \xE0[\xA0-\xBF]   [\x80-\xBF]        # excluding overlongs
-      | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}     # straight 3-byte sequences
-      |  \xED[\x80-\x9F]   [\x80-\xBF]        # excluding surrogates
-      |  \xF0[\x90-\xBF]   [\x80-\xBF]{2}     # planes 1-3
-      | [\xF1-\xF3]        [\x80-\xBF]{3}     # planes 4-15
-      |  \xF4[\x80-\x8F][\x80-\xBF]{2}        # plane 16
-     //xg;
-    if (length) {
-      push @{$File->{Lines}}, ($i+1);
-      $File->{Content}->[$i] = "#### encoding problem on this line, not shown ####";
-      $count = 50; # length of above text
-    }
-    $count += 0; # Force numeric.
-    $File->{Offsets}->[$i + 1] = [$count, $File->{Offsets}->[$i]->[1] + $count];
+  my $bom = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes});
+  my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes});
+  
+  if (defined $bom)
+  {
+    # @@FIXME this BOM entry should not be needed at all!
+    $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}"));
+    $File->{Charset}->{Auto} = lc $bom;
   }
-
-  # Add a warning if doc is UTF-8 and contains a BOM.
-  if ($File->{Charset}->{Use} eq 'utf-8' &&
-        $File->{Content}->[0] =~ m(^\xEF\xBB\xBF)) {
-    &add_warning('W21', {});
+  else
+  {
+    $File->{Charset}->{Auto} = lc($first[0]) if @first;
   }
-  return $File;
-}
+  
+  my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes});
+  $File->{Charset}->{XML} = lc $xml if defined $xml;
 
-#
-# byte error analysis
-sub byte_error {
-  my $File = shift;
-  my @lines = @{$File->{Lines}};
-  if (scalar @lines) {
-    $File->{'Error Flagged'} = TRUE;
-    my $lines = join ', ', split ',', Set::IntSpan->new(\@lines)->run_list;
-    $File->{E}->param(fatal_byte_error   => TRUE);
-    $File->{E}->param(fatal_byte_lines   => $lines);
-    $File->{E}->param(fatal_byte_charset => $File->{Charset}->{Use});
+  my %metah;
+  foreach my $try (@first)
+  {
+    # @@FIXME I think the old code used HTML::Parser xml mode, check this is ok
+    my $meta = lc HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try);
+    $metah{$meta}++ if defined $meta and length $meta;
   }
+  
+  my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah;
+  $File->{Charset}->{META} = lc $meta[0] if @meta;
+  
   return $File;
 }
 
@@ -1819,9 +1700,8 @@
     print qq(  <messages>\n);
 
     foreach my $err (@{$File->{Errors}}) {
-      my $offset = $File->{Offsets}->[$err->{line} - 1]->[1] + $err->{char};
       printf <<".EOF.", &ent($err->{msg});
-    <msg line="$err->{line}" col="$err->{char}" offset="$offset">%s</msg>
+    <msg line="$err->{line}" col="$err->{char}">%s</msg>
 .EOF.
     }
     print qq(  </messages>\n);
@@ -1873,11 +1753,6 @@
     my $errnum = 0 ;
     foreach my $err (@{$File->{Errors}}) {
       ++$errnum ;
-      my @offsets = (
-                     $File->{Offsets}->[$err->{line}    ]->[0],
-                     $File->{Offsets}->[$err->{line} - 1]->[1],
-                     $File->{Offsets}->[$err->{line} - 1]->[1] + $err->{char}
-                    );
       printf <<".EOF.", &ent($File->{URI}), &ent($err->{msg});
     <asserts>
       <Assertion rdf:ID="err$errnum">
@@ -1885,7 +1760,6 @@
           <reprOf rdf:resource="%s"/>
           <val:line>$err->{line}</val:line>
           <val:column>$err->{char}</val:column>
-          <val:offset>@offsets</val:offset>
         </subject>
         <result rdf:resource="http://www.w3.org/2003/03/earl/1.00#fails" />
         <testCase rdf:resource="http://www.w3.org/Markup/" />
@@ -1943,11 +1817,6 @@
   unless ($File->{'Is Valid'}) {
     for (my $i = 0; $i <= scalar @{$File->{Errors}}; $i++) {
       my $err = $File->{Errors}->[$i];
-      my @offsets = (
-                     $File->{Offsets}->[$err->{line}    ]->[0],
-                     $File->{Offsets}->[$err->{line} - 1]->[1],
-                     $File->{Offsets}->[$err->{line} - 1]->[1] + $err->{char}
-                    );
       print <<".EOF.";
     [
       earl:testMode earl:Auto;
@@ -1955,7 +1824,6 @@
       rdf:subject [
                     val:column "$err->{char}";
                     val:line   "$err->{line}";
-                    val:offset "@offsets";
                     earl:testSubject <$File->{URI}>
                   ];
       rdf:object [
@@ -1987,88 +1855,11 @@
 }
 
 #
-# Autodetection as in Appendix F of the XML 1.0 Recommendation.
-# <http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing>
-#
-# return values are: (base_encoding, BOMSize, Size, Pattern)
-sub find_base_encoding {
-  local $_ = shift;
-
-  # With a Byte Order Mark:
-  return ('ucs-4be',  4, 4, '\0\0\0(.)')
-    if /^\x00\x00\xFE\xFF/; # UCS-4, big-endian machine (1234)
-  return ('ucs-4le',  4, 4, '(.)\0\0\0')
-    if /^\xFF\xFE\x00\x00/; # UCS-4, little-endian machine (4321)
-  return ('utf-16be', 2, 2, '\0(.)')
-    if /^\xFE\xFF/;         # UTF-16, big-endian.
-  return ('utf-16le', 2, 2, '(.)\0')
-    if /^\xFF\xFE/;         # UTF-16, little-endian.
-  return ('utf-8',    3, 1, '')
-    if /^\xEF\xBB\xBF/; # UTF-8.
-
-  # Without a Byte Order Mark:
-  return ('ucs-4be',  0, 4, '\0\0\0(.)')
-    if /^\x00\x00\x00\x3C/; # UCS-4 or 32bit; big-endian machine (1234 order).
-  return ('ucs-4le',  0, 4, '(.)\0\0\0')
-    if /^\x3C\x00\x00\x00/; # UCS-4 or 32bit; little-endian machine (4321 order).
-  return ('utf-16be', 0, 2, '\0(.)')
-    if /^\x00\x3C\x00\x3F/; # UCS-2, UTF-16, or 16bit; big-endian.
-  return ('utf-16le', 0, 2, '(.)\0')
-    if /^\x3C\x00\x3F\x00/; # UCS-2, UTF-16, or 16bit; little-endian.
-  return ('utf-8',    0, 1, '')
-    if /^\x3C\x3F\x78\x6D/; # UTF-8, ISO-646, ASCII, ISO-8859-*, Shift-JIS, EUC, etc.
-  return ('ebcdic',   0, 1, '')
-    if /^\x4C\x6F\xA7\x94/; # EBCDIC
-  return ('',         0, 1, '');
-                            # nothing in particular
-}
-
-#
-# Find encoding in document according to XML rules
-# Only meaningful if file contains a BOM, or for well-formed XML!
-sub find_xml_encoding {
-  my $File = shift;
-  my ($CodeUnitSize, $Pattern);
-
-  ($File->{Charset}->{Auto}, $File->{BOM}, $CodeUnitSize, $Pattern)
-    = &find_base_encoding($File->{Bytes});
-
-  # 100 arbitrary, but enough in any case
-  my $someBytes = substr $File->{Bytes}, $File->{BOM}, ($CodeUnitSize * 100);
-  my $someText  = '';
-
-  # translate from guessed encoding to ascii-compatible
-  if ($File->{Charset}->{Auto} eq 'ebcdic') {
-
-    # special treatment for EBCDIC, maybe use tr///
-    # work on this later
-  }
-  elsif (!$Pattern) {
-    $someText = $someBytes; # efficiency shortcut
-  }
-  else {
-    # generic code for UTF-16/UCS-4
-    $someBytes =~ /^(($Pattern)*)/s;
-    $someText = $1;                   # get initial piece without chars >255
-    $someText =~ s/$Pattern/$1/sg;    # select the relevant bytes
-  }
-
-  # try to find encoding pseudo-attribute
-  my $s = '[\ \t\n\r]';
-  $someText =~ m(^<\?xml $s+ version $s* = $s* ([\'\"]) [-._:a-zA-Z0-9]+ \1 $s+
-                  encoding $s* = $s* ([\'\"]) ([A-Za-z][-._A-Za-z0-9]*) \2
-                )xso;
-
-  $File->{Charset}->{XML} = lc $3;
-  return $File;
-}
-
-#
 # Abort with a message if an error was flagged at point.
 sub abort_if_error_flagged {
   my $File  = shift;
   my $Flags = shift;
-
+  
   return unless $File->{'Error Flagged'};
   return if     $File->{'Error Handled'}; # Previous error, keep going.
 
@@ -2093,7 +1884,7 @@
 
 #
 # conflicting encodings
-sub conflict {
+sub charset_not_equal {
   my $encodingA = shift;
   my $encodingB = shift;
   return $encodingA && $encodingB && ($encodingA ne $encodingB);
Received on Thursday, 18 August 2005 02:52:07 UTC