- From: Bjoern Hoehrmann via cvs-syncmail <cvsmail@w3.org>
- Date: Thu, 18 Aug 2005 02:51:56 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/validator/httpd/cgi-bin
In directory hutz:/tmp/cvs-serv24559
Modified Files:
check
Log Message:
Use HTML::Encoding instead of proprietary code and some cleanup
Index: check
===================================================================
RCS file: /sources/public/validator/httpd/cgi-bin/check,v
retrieving revision 1.441
retrieving revision 1.442
diff -u -d -r1.441 -r1.442
--- check 17 Aug 2005 02:09:08 -0000 1.441
+++ check 18 Aug 2005 02:51:54 -0000 1.442
@@ -36,27 +36,21 @@
# when loading modules to prevent non-OO or poorly written modules from
# polluting our namespace.
#
-use CGI 2.81 qw(
- -newstyle_urls
- -private_tempfiles
- redirect
- ); # 2.81 for XHTML, and import redirect() function.
-
-use CGI::Carp qw(carp croak fatalsToBrowser);
-use Config::General 2.19 qw(); # Need 2.19 for -AutoLaunder
-use File::Spec qw();
-use HTML::Parser 3.25 qw(); # Need 3.25 for $p->ignore_elements.
-use HTML::Template 2.6 qw();
-use HTTP::Request qw();
-use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*.
-use IO::File qw();
-use IPC::Open3 qw(open3);
-use Set::IntSpan qw();
-use Text::Iconv qw();
-use Text::Wrap qw(wrap);
-use URI qw();
-use URI::Escape qw(uri_escape);
+use CGI 2.81 qw(-newstyle_urls -private_tempfiles redirect);
+use CGI::Carp qw(carp croak fatalsToBrowser);
+use Config::General 2.19 qw(); # Need 2.19 for -AutoLaunder
+use File::Spec qw();
+use HTML::Parser 3.25 qw(); # Need 3.25 for $p->ignore_elements.
+use HTML::Template 2.6 qw();
+use HTTP::Request qw();
+use HTTP::Headers::Auth qw(); # Needs to be imported after other HTTP::*.
+use URI qw();
+use URI::Escape qw(uri_escape);
+use Encode qw();
+use HTML::Encoding 0.52 qw();
+use SGML::Parser::OpenSP 0.99 qw();
+
###############################################################################
#### Constant definitions. ####################################################
###############################################################################
@@ -234,7 +228,10 @@
# @@FIXME This can't be my() as $File will sooner or
# later be undef and add_warning will cause the script
# to die. our() seems to work but has other problems.
-our $File;
+# @@FIXME Apparently, this must be set to {} also,
+# otherwise the script might pick up an old object
+# after abort_if_error_flagged under mod_perl.
+our $File = {};
##############################################
# Populate $File->{Env} -- Session Metadata. #
@@ -262,12 +259,7 @@
$File->{Mode} = MODE_SGML; # Default parse mode is SGML.
#
-# Array (ref) used to store character offsets for the XML report.
-$File->{Offsets}->[0] = [0, 0]; # The first item isn't used...
-
-#
# Listrefs.
-$File->{Lines} = []; # Line numbers for encoding errors.
$File->{Warnings} = []; # Warnings...
$File->{Namespaces} = []; # Other (non-root) Namespaces.
@@ -377,9 +369,7 @@
#### Output validation results. ###############################################
###############################################################################
-#
-# Find the XML Encoding.
-$File = &find_xml_encoding($File);
+$File = find_encodings($File);
#
# Decide on a charset to use (first part)
@@ -406,20 +396,13 @@
$File->{Charset}->{Use} = 'utf-8'; # UTF-8 (image/svg+xml etc.)
}
-$File->{Content} = &normalize_newlines($File->{Bytes},
- exact_charset($File, $File->{Charset}->{Use}));
-
-#
-# Try to extract META charset
-# (works only if ascii-based and reasonably clean before <meta>)
-$File = &preparse_meta($File); # First call. Repeated later to fetch the FPI.
unless ($File->{Charset}->{Use}) {
$File->{Charset}->{Use} = $File->{Charset}->{META};
}
#
# Handle any Fallback or Override for the charset.
-if (&conflict($File->{Opt}->{Charset}, '(detect automatically)')) {
+if (charset_not_equal($File->{Opt}->{Charset}, '(detect automatically)')) {
# charset=foo was given to the CGI and it wasn't "autodetect".
#
@@ -467,18 +450,16 @@
&abort_if_error_flagged($File, O_CHARSET|O_DOCTYPE);
#
-# Check the detected Encoding and transcode.
-if (&conflict($File->{Charset}->{Use}, 'utf-8')) {
- $File = &transcode($File);
- &abort_if_error_flagged($File, O_CHARSET);
-}
-
-$File = &check_utf8($File); # always check
-$File = &byte_error($File);
+# Always transcode, even if the content claims to be UTF-8
+$File = transcode($File);
+&abort_if_error_flagged($File, O_CHARSET);
#
-# Abort if an error was flagged during transcoding
-&abort_if_error_flagged($File, O_SOURCE|O_CHARSET);
+# Add a warning if doc is UTF-8 and contains a BOM.
+if ($File->{Charset}->{Use} eq 'utf-8' &&
+ $File->{Content}->[0] =~ m(^\x{FEFF})) {
+ &add_warning('W21', {});
+}
#
# Overall parsing algorithm for documents returned as text/html:
@@ -556,8 +537,6 @@
sub parse (\$) {
my $File = shift;
- use SGML::Parser::OpenSP 0.99 qw();
-
my $opensp = SGML::Parser::OpenSP->new();
#
@@ -576,6 +555,10 @@
#
# Parser configuration
+ #
+ # This is broken on Win32 with restrict_file_reading since it
+ # would need to allow access to the temp file directory which
+ # it does not. Not sure how to address that yet.
$opensp->search_dirs($CFG->{Paths}->{SGML}->{Library});
$opensp->catalogs($catalog);
$opensp->show_error_numbers(1);
@@ -594,22 +577,6 @@
$T->param(opt_debug => $DEBUG);
$T->param(debug =>
[
- # This is obsolete now?
- { name => 'Command', value => "SGML::Parser::OpenSP->new" },
-
- # These are probably obsolete now, SGML::Parser::OpenSP will
- # always use <spec encoding="utf-8">... for parse_string. It
- # might be possible that messing with these variables makes
- # OpenSP fail to recognize the encoding, I remember something
- # to this effect but can't reproduce this at the moment. We
- # rely on SGML::Parser::OpenSP to get this right because it
- # seems there is no way for Apache2+mod_perl2+OpenSP+SPO to
- # communicate the correct environment settings to OpenSP.
- # @@FIXME this note should be removed after some testing.
- { name => 'SP_CHARSET_FIXED', value => $ENV{SP_CHARSET_FIXED} },
- { name => 'SP_ENCODING', value => $ENV{SP_ENCODING} },
- { name => 'SP_BCTF', value => $ENV{SP_BCTF} },
-
{ name => 'Content-Encoding', value => $File->{ContentEnc} },
{ name => 'Transfer-Encoding', value => $File->{TransferEnc} },
],
@@ -682,7 +649,9 @@
&prep_template($File, $T);
if (! $File->{Doctype} and ($File->{Version} eq 'unknown' or $File->{Version} eq 'SGML' or (!$File->{Version}))) {
- # @@TODO@@ we should try falling back on other version info, such as the ones stored in Version_ESIS
+
+ # @@TODO@@ we should try falling back on other version
+ # info, such as the ones stored in Version_ESIS
$T->param(file_version => '(no Doctype found)');
}
else {
@@ -1026,18 +995,12 @@
my $charset = '';
my $mode = '';
+ # @@FIXME @param now unused
my ($ct, @param) = split /\s*;\s*/, lc $Content_Type;
$mode = $CFG->{MIME}->{$ct} || $ct;
-
- foreach my $param (@param) {
- my ($p, $v) = split /\s*=\s*/, $param;
- next unless $p =~ m(charset)i;
- if ($v =~ m/([\'\"]?)(\S+)\1/i) {
- $charset = lc $2;
- last;
- }
- }
+
+ $charset = HTML::Encoding::encoding_from_content_type($Content_Type);
if ($mode =~ m(/)) { # a "/" means it's unknown or we'd have a mode here.
if ($ct =~ m(text/css) and defined $url) {
@@ -1075,43 +1038,6 @@
}
#
-# Normalize newline forms (CRLF/CR/LF) to native newline.
-sub normalize_newlines {
- my $file = shift;
- local $_ = shift; #charset
- my $pattern = '';
-
- # don't use backreference parentheses!
- $pattern = '\x00\x0D(?:\x00\x0A)?|\x00\x0A' if /^utf-16be$/;
- $pattern = '\x0D\x00(?:\x0A\x00)?|\x0A\x00' if /^utf-16le$/;
- # $pattern = '\x00\x00\x00\x0D(?:\x00\x00\x00\x0A)?|\x00\x00\x00\x0A' if /^UCS-4be$/;
- # $pattern = '\x0D\x00\x00\x00(?:\x0A\x00\x00\x00)?|\x0A\x00\x00\x00' if /^UCS-4le$/;
- # insert other special cases here, such as EBCDIC
- $pattern = '\x0D(?:\x0A)?|\x0A' if !$pattern; # all other cases
-
- return [split /$pattern/, $file];
-}
-
-#
-# find exact charset from general one (utf-16)
-#
-# needed for per-line conversion and line splitting
-# (BE is default, but this will apply only to HTML)
-sub exact_charset {
- my $File = shift;
- my $general_charset = shift;
- my $exact_charset = $general_charset;
-
- if ($general_charset eq 'utf-16') {
- if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) {
- $exact_charset = $File->{Charset}->{Auto};
- } else { $exact_charset = 'utf-16be'; }
- }
- # add same code for ucs-4 here
- return $exact_charset;
-}
-
-#
# Return $_[0] encoded for HTML entities (cribbed from merlyn).
#
# Note that this is used both for HTML and XML escaping.
@@ -1126,11 +1052,6 @@
#
# Truncate source lines for report.
#
-# This *really* wants Perl 5.8.0 and it's improved UNICODE support.
-# Byte semantics are in effect on all length(), substr(), etc. calls,
-# so offsets will be wrong if there are multi-byte sequences prior to
-# the column where the error is detected.
-#
sub truncate_line {
my $line = shift;
my $col = shift;
@@ -1327,7 +1248,6 @@
$length = $col;
}
$left = substr $line, $offset, $length;
- $left = &ent($left);
}
#
@@ -1367,11 +1287,10 @@
$length = length($line) - ($col - 1); # Otherwise get the rest of the line.
}
$right = substr $line, $offset, $length;
- $right = &ent($right);
}
$char = qq(<strong title="Position where error was detected.">$char</strong>);
- $line = $left . $char . $right;
+ $line = &ent($left) . $char . &ent($right);
return $line;
}
@@ -1383,7 +1302,7 @@
# Remove any BOM since we're not at BOT anymore...
$File->{Content}->[0] =
- substr $File->{Content}->[0], ($File->{BOM} ? 3 : 0); # remove BOM
+ substr $File->{Content}->[0], ($File->{BOM} ? 1 : 0); # remove BOM
my @source = map({file_source_line => $_}, @{$File->{Content}});
return \@source;
@@ -1393,7 +1312,7 @@
# Do an initial parse of the Document Entity to extract FPI.
sub preparse_doctype {
my $File = shift;
-
+
#
# Reset DOCTYPE, Root (for second invocation, probably not needed anymore).
$File->{DOCTYPE} = '';
@@ -1436,6 +1355,7 @@
#
# Print out the raw error output for debugging.
sub show_errors ($) {
+ # @@FIXME This is broken with SGML::Parser::OpenSP
my $file_raw_errors = "";
for (@{shift->{DEBUG}->{Errors}}) {
$file_raw_errors .= ent $_
@@ -1546,47 +1466,6 @@
sub is_xml {shift->{Mode} == MODE_XML};
#
-# Do an initial parse of the Document Entity to extract charset from HTML <meta>.
-# (still also extracts FPI, at least to some extent)
-sub preparse_meta {
- my $File = shift;
-
- my $dtd = sub {
- return if $File->{Root};
- ($File->{Root}, $File->{DOCTYPE}) = shift =~ m(<!DOCTYPE\s+(\w+)\s+PUBLIC\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*>)si;
- };
-
- my $start = sub {
- my $tag = shift;
- my $attr = shift;
- my %attr = map {lc($_) => $attr->{$_}} keys %{$attr};
-
- if ($File->{Root}) {
- if (lc $tag eq 'meta') {
- if (lc $attr{'http-equiv'} eq 'content-type') {
- if ($attr{content} =~ m(charset\s*=[\s\"\']*([^\s;\"\'>]*))si) {
- $File->{Charset}->{META} = lc $1;
- }
- }
- }
- return unless $tag eq $File->{Root};
- } else {
- $File->{Root} = $tag;
- }
- };
-
- my $p = HTML::Parser->new(api_version => 3);
- $p->xml_mode(TRUE);
- $p->ignore_elements('BODY');
- $p->ignore_elements('body');
- $p->handler(declaration => $dtd, 'text');
- $p->handler(start => $start, 'tag,attr');
- $p->parse(join "\n", @{$File->{Content}});
-
- return $File;
-}
-
-#
# Check charset conflicts and add any warnings necessary.
sub charset_conflicts {
my $File = shift;
@@ -1607,19 +1486,19 @@
#
# Add a warning if there was charset info conflict (HTTP header,
# XML declaration, or <meta> element).
- if (&conflict($File->{Charset}->{HTTP}, $File->{Charset}->{XML})) {
+ if (charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{XML})) {
&add_warning('W18', {
W18_http => $cs_http,
W18_xml => $cs_xml,
W18_use => $cs_use,
});
- } elsif (&conflict($File->{Charset}->{HTTP}, $File->{Charset}->{META})) {
+ } elsif (charset_not_equal($File->{Charset}->{HTTP}, $File->{Charset}->{META})) {
&add_warning('W19', {
W19_http => $cs_http,
W19_meta => $cs_meta,
W19_use => $cs_use,
});
- } elsif (&conflict($File->{Charset}->{XML}, $File->{Charset}->{META})) {
+ } elsif (charset_not_equal($File->{Charset}->{XML}, $File->{Charset}->{META})) {
&add_warning('W20', {
W20_http => $cs_xml,
W20_xml => $cs_meta,
@@ -1634,110 +1513,112 @@
# Transcode to UTF-8
sub transcode {
my $File = shift;
+
+ my $general_charset = $File->{Charset}->{Use};
+ my $exact_charset = $general_charset;
- my $cs = $File->{Charset}->{Use};
- my ($command, $result_charset) = ('', '');
- if ($CFG->{Charsets}->{$cs}) {
- ($command, $result_charset) = split(" ", $CFG->{Charsets}->{$cs}, 2);
+ # eeeeek!
+ if ($general_charset eq 'utf-16') {
+ if ($File->{Charset}->{Auto} =~ m/^utf-16[bl]e$/) {
+ $exact_charset = $File->{Charset}->{Auto};
+ } else { $exact_charset = 'utf-16be'; }
}
+
+ my $cs = $exact_charset;
+
+ if (!$CFG->{Charsets}->{$cs}) {
+ # The encoding is not supported due to policy
+ # and possibly other reasons
- my $c;
- $result_charset = exact_charset($File, $result_charset);
-
- if ($command eq 'I') {
- # test if given charset is available
- eval { $c = Text::Iconv->new($result_charset, 'utf-8') };
- $command = '' if $@;
- } elsif ($command eq 'X') {
- $@ = "$File->{Charset}->{Use} undefined; replace by $result_charset";
+ $File->{'Error Flagged'} = TRUE;
+ $File->{E}->param(fatal_transcode_error => TRUE);
+ $File->{E}->param(fatal_transcode_charset => $cs);
+
+ # @@FIXME might need better text
+ $File->{E}->param(fatal_transcode_errmsg => "Encoding not supported.");
+
+ return $File;
}
+
+ # Does the system support decoding this encoding?
+ eval { Encode::decode($cs, ''); };
+
+ if ($@) {
+ # This system's Encode installation does not support
+ # the character encoding; might need additional modules
- if ($command ne 'I') {
$File->{'Error Flagged'} = TRUE;
$File->{E}->param(fatal_transcode_error => TRUE);
$File->{E}->param(fatal_transcode_charset => $cs);
- $File->{E}->param(fatal_transcode_errmsg => ($@ || ''));
+
+ # @@FIXME might need better text
+ $File->{E}->param(fatal_transcode_errmsg => "Encoding not supported.");
+
return $File;
}
+
+ my $output;
+ my $input = $File->{Bytes};
+
+ # Try to transcode
+ eval {
+ $output = Encode::decode($cs, $input, Encode::FB_CROAK);
+ };
+
+ # Transcoding failed
+ if ($@) {
+ $File->{'Error Flagged'} = TRUE;
- my $line = 0;
- for (@{$File->{Content}}) {
- my $in = $_;
- $line++;
- $_ = $c->convert($_); # $_ is local!!
- if ($in ne "" and (!defined($_) || $_ eq "")) {
- push @{$File->{Lines}}, $line;
-
- # try to decoded as much as possible of the line
- my $short = 0; # longest okay
- my $long = (length $in) - 1; # longest unknown
-
- while ($long > $short) {
- # binary search
- my $try = int (($long+$short+1) / 2);
- my $converted = $c->convert(substr($in, 0, $try));
- if (!defined($converted) || $converted eq "") {
- $long = $try-1;
- } else {
- $short = $try;
- }
- }
- my $remain = (length $in) - $short;
- $_ = $c->convert(substr($in,0,$short))
- . "#### $remain byte(s) unconvertible ####";
- }
+ # @@FIXME might need better text, in particular, this does not tell
+ # where the error occured; it might be possible to emulate that
+ # using a Encode CHECK parameter that modifies the input, then split
+ # the decodable string to give line / column information, or don't
+ # split and report the offset calculated from the result.
+ $File->{E}->param(fatal_byte_error => TRUE);
+ $File->{E}->param(fatal_byte_lines => 0);
+ $File->{E}->param(fatal_byte_charset => $cs);
+
+ return $File;
}
+
+ # @@FIXME is this what we want?
+ $output =~ s/\015?\012/\n/g;
+ $File->{Content} = [split/\n/, $output];
+
return $File;
}
-#
-# Check correctness of UTF-8 both for UTF-8 input and for conversion results
-sub check_utf8 {
+sub find_encodings
+{
my $File = shift;
-
- for (my $i = 0; $i < $#{$File->{Content}}; $i++) {
- # substitution needed for very long lines (>32K), to avoid backtrack
- # stack overflow. Handily, this also happens to count characters.
- local $_ = $File->{Content}->[$i];
- my $count =
- s/ [\x00-\x7F] # ASCII
- | [\xC2-\xDF] [\x80-\xBF] # non-overlong 2-byte sequences
- | \xE0[\xA0-\xBF] [\x80-\xBF] # excluding overlongs
- | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2} # straight 3-byte sequences
- | \xED[\x80-\x9F] [\x80-\xBF] # excluding surrogates
- | \xF0[\x90-\xBF] [\x80-\xBF]{2} # planes 1-3
- | [\xF1-\xF3] [\x80-\xBF]{3} # planes 4-15
- | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
- //xg;
- if (length) {
- push @{$File->{Lines}}, ($i+1);
- $File->{Content}->[$i] = "#### encoding problem on this line, not shown ####";
- $count = 50; # length of above text
- }
- $count += 0; # Force numeric.
- $File->{Offsets}->[$i + 1] = [$count, $File->{Offsets}->[$i]->[1] + $count];
+ my $bom = HTML::Encoding::encoding_from_byte_order_mark($File->{Bytes});
+ my @first = HTML::Encoding::encoding_from_first_chars($File->{Bytes});
+
+ if (defined $bom)
+ {
+ # @@FIXME this BOM entry should not be needed at all!
+ $File->{BOM} = length(Encode::encode($bom, "\x{FEFF}"));
+ $File->{Charset}->{Auto} = lc $bom;
}
-
- # Add a warning if doc is UTF-8 and contains a BOM.
- if ($File->{Charset}->{Use} eq 'utf-8' &&
- $File->{Content}->[0] =~ m(^\xEF\xBB\xBF)) {
- &add_warning('W21', {});
+ else
+ {
+ $File->{Charset}->{Auto} = lc($first[0]) if @first;
}
- return $File;
-}
+
+ my $xml = HTML::Encoding::encoding_from_xml_document($File->{Bytes});
+ $File->{Charset}->{XML} = lc $xml if defined $xml;
-#
-# byte error analysis
-sub byte_error {
- my $File = shift;
- my @lines = @{$File->{Lines}};
- if (scalar @lines) {
- $File->{'Error Flagged'} = TRUE;
- my $lines = join ', ', split ',', Set::IntSpan->new(\@lines)->run_list;
- $File->{E}->param(fatal_byte_error => TRUE);
- $File->{E}->param(fatal_byte_lines => $lines);
- $File->{E}->param(fatal_byte_charset => $File->{Charset}->{Use});
+ my %metah;
+ foreach my $try (@first)
+ {
+ # @@FIXME I think the old code used HTML::Parser xml mode, check this is ok
+ my $meta = lc HTML::Encoding::encoding_from_meta_element($File->{Bytes}, $try);
+ $metah{$meta}++ if defined $meta and length $meta;
}
+
+ my @meta = sort { $metah{$b} <=> $metah{$a} } keys %metah;
+ $File->{Charset}->{META} = lc $meta[0] if @meta;
+
return $File;
}
@@ -1819,9 +1700,8 @@
print qq( <messages>\n);
foreach my $err (@{$File->{Errors}}) {
- my $offset = $File->{Offsets}->[$err->{line} - 1]->[1] + $err->{char};
printf <<".EOF.", &ent($err->{msg});
- <msg line="$err->{line}" col="$err->{char}" offset="$offset">%s</msg>
+ <msg line="$err->{line}" col="$err->{char}">%s</msg>
.EOF.
}
print qq( </messages>\n);
@@ -1873,11 +1753,6 @@
my $errnum = 0 ;
foreach my $err (@{$File->{Errors}}) {
++$errnum ;
- my @offsets = (
- $File->{Offsets}->[$err->{line} ]->[0],
- $File->{Offsets}->[$err->{line} - 1]->[1],
- $File->{Offsets}->[$err->{line} - 1]->[1] + $err->{char}
- );
printf <<".EOF.", &ent($File->{URI}), &ent($err->{msg});
<asserts>
<Assertion rdf:ID="err$errnum">
@@ -1885,7 +1760,6 @@
<reprOf rdf:resource="%s"/>
<val:line>$err->{line}</val:line>
<val:column>$err->{char}</val:column>
- <val:offset>@offsets</val:offset>
</subject>
<result rdf:resource="http://www.w3.org/2003/03/earl/1.00#fails" />
<testCase rdf:resource="http://www.w3.org/Markup/" />
@@ -1943,11 +1817,6 @@
unless ($File->{'Is Valid'}) {
for (my $i = 0; $i <= scalar @{$File->{Errors}}; $i++) {
my $err = $File->{Errors}->[$i];
- my @offsets = (
- $File->{Offsets}->[$err->{line} ]->[0],
- $File->{Offsets}->[$err->{line} - 1]->[1],
- $File->{Offsets}->[$err->{line} - 1]->[1] + $err->{char}
- );
print <<".EOF.";
[
earl:testMode earl:Auto;
@@ -1955,7 +1824,6 @@
rdf:subject [
val:column "$err->{char}";
val:line "$err->{line}";
- val:offset "@offsets";
earl:testSubject <$File->{URI}>
];
rdf:object [
@@ -1987,88 +1855,11 @@
}
#
-# Autodetection as in Appendix F of the XML 1.0 Recommendation.
-# <http://www.w3.org/TR/2000/REC-xml-20001006#sec-guessing>
-#
-# return values are: (base_encoding, BOMSize, Size, Pattern)
-sub find_base_encoding {
- local $_ = shift;
-
- # With a Byte Order Mark:
- return ('ucs-4be', 4, 4, '\0\0\0(.)')
- if /^\x00\x00\xFE\xFF/; # UCS-4, big-endian machine (1234)
- return ('ucs-4le', 4, 4, '(.)\0\0\0')
- if /^\xFF\xFE\x00\x00/; # UCS-4, little-endian machine (4321)
- return ('utf-16be', 2, 2, '\0(.)')
- if /^\xFE\xFF/; # UTF-16, big-endian.
- return ('utf-16le', 2, 2, '(.)\0')
- if /^\xFF\xFE/; # UTF-16, little-endian.
- return ('utf-8', 3, 1, '')
- if /^\xEF\xBB\xBF/; # UTF-8.
-
- # Without a Byte Order Mark:
- return ('ucs-4be', 0, 4, '\0\0\0(.)')
- if /^\x00\x00\x00\x3C/; # UCS-4 or 32bit; big-endian machine (1234 order).
- return ('ucs-4le', 0, 4, '(.)\0\0\0')
- if /^\x3C\x00\x00\x00/; # UCS-4 or 32bit; little-endian machine (4321 order).
- return ('utf-16be', 0, 2, '\0(.)')
- if /^\x00\x3C\x00\x3F/; # UCS-2, UTF-16, or 16bit; big-endian.
- return ('utf-16le', 0, 2, '(.)\0')
- if /^\x3C\x00\x3F\x00/; # UCS-2, UTF-16, or 16bit; little-endian.
- return ('utf-8', 0, 1, '')
- if /^\x3C\x3F\x78\x6D/; # UTF-8, ISO-646, ASCII, ISO-8859-*, Shift-JIS, EUC, etc.
- return ('ebcdic', 0, 1, '')
- if /^\x4C\x6F\xA7\x94/; # EBCDIC
- return ('', 0, 1, '');
- # nothing in particular
-}
-
-#
-# Find encoding in document according to XML rules
-# Only meaningful if file contains a BOM, or for well-formed XML!
-sub find_xml_encoding {
- my $File = shift;
- my ($CodeUnitSize, $Pattern);
-
- ($File->{Charset}->{Auto}, $File->{BOM}, $CodeUnitSize, $Pattern)
- = &find_base_encoding($File->{Bytes});
-
- # 100 arbitrary, but enough in any case
- my $someBytes = substr $File->{Bytes}, $File->{BOM}, ($CodeUnitSize * 100);
- my $someText = '';
-
- # translate from guessed encoding to ascii-compatible
- if ($File->{Charset}->{Auto} eq 'ebcdic') {
-
- # special treatment for EBCDIC, maybe use tr///
- # work on this later
- }
- elsif (!$Pattern) {
- $someText = $someBytes; # efficiency shortcut
- }
- else {
- # generic code for UTF-16/UCS-4
- $someBytes =~ /^(($Pattern)*)/s;
- $someText = $1; # get initial piece without chars >255
- $someText =~ s/$Pattern/$1/sg; # select the relevant bytes
- }
-
- # try to find encoding pseudo-attribute
- my $s = '[\ \t\n\r]';
- $someText =~ m(^<\?xml $s+ version $s* = $s* ([\'\"]) [-._:a-zA-Z0-9]+ \1 $s+
- encoding $s* = $s* ([\'\"]) ([A-Za-z][-._A-Za-z0-9]*) \2
- )xso;
-
- $File->{Charset}->{XML} = lc $3;
- return $File;
-}
-
-#
# Abort with a message if an error was flagged at point.
sub abort_if_error_flagged {
my $File = shift;
my $Flags = shift;
-
+
return unless $File->{'Error Flagged'};
return if $File->{'Error Handled'}; # Previous error, keep going.
@@ -2093,7 +1884,7 @@
#
# conflicting encodings
-sub conflict {
+sub charset_not_equal {
my $encodingA = shift;
my $encodingB = shift;
return $encodingA && $encodingB && ($encodingA ne $encodingB);
Received on Thursday, 18 August 2005 02:52:07 UTC