- From: Bjoern Hoehrmann via cvs-syncmail <cvsmail@w3.org>
- Date: Tue, 16 Aug 2005 04:40:55 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/validator/httpd/cgi-bin In directory hutz:/tmp/cvs-serv26648 Modified Files: check Log Message: Experimental integration of SGML::Parser::OpenSP plus mod_perl2 enhancements Index: check =================================================================== RCS file: /sources/public/validator/httpd/cgi-bin/check,v retrieving revision 1.436 retrieving revision 1.437 diff -u -d -r1.436 -r1.437 --- check 15 Aug 2005 22:23:57 -0000 1.436 +++ check 16 Aug 2005 04:40:52 -0000 1.437 @@ -16,8 +16,8 @@ $| = 1; # -# We need Perl 5.6.0+. -use 5.006; +# We need Perl 5.8.0+. +use 5.008; ############################################################################### #### Load modules. ############################################################ @@ -87,9 +87,9 @@ use constant MODE_SGML => 1; # 0000 0001 use constant MODE_XML => 2; # 0000 0010 use constant MODE_TBD => 4; # 0000 0100, needs further info to decide. + #@@FIXME: XML WF checking isn't implemented. use constant MODE_WF => 8; # 0000 1000, only XML well-formed checking. -#@@FIXME; # # Define global variables. @@ -208,6 +208,14 @@ # # Use passive FTP by default. $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); + + # + # Read friendly error message file + my $error_messages_list = File::Spec->catfile($CFG->{Paths}->{Templates}, 'en_US', 'error_messages.cfg'); + my %config_opts = (-ConfigFile => $error_messages_list); + my %rsrc = Config::General->new(%config_opts)->getall(); + $RSRC = \%rsrc; + } # end of BEGIN block. # @@ -230,7 +238,10 @@ # # The data structure that will hold all session data. -my $File; +# @@FIXME This can't be my() as $File will sooner or +# later be undef and add_warning will cause the script +# to die. our() seems to work but has other problems. +our $File; ############################################## # Populate $File->{Env} -- Session Metadata. # @@ -278,28 +289,25 @@ $lang, 'result.tmpl'), die_on_bad_params => FALSE, loop_context_vars => TRUE, + cache => TRUE, ); my $E = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'fatal-error.tmpl'), die_on_bad_params => FALSE, + cache => TRUE, ); my $H = HTML::Template->new( filename => File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'http_401_authrequired.tmpl'), die_on_bad_params => FALSE, + cache => TRUE, ); $File->{T} = $T; $File->{E} = $E; $File->{H} = $H; -# Read friendly error message file -my $error_messages_list = File::Spec->catfile($CFG->{Paths}->{Templates}, $lang, 'error_messages.cfg'); -my %config_opts = (-ConfigFile => $error_messages_list); -my %rsrc = Config::General->new(%config_opts)->getall(); -$RSRC = \%rsrc; - $T->param(cfg_home_page => $CFG->{'Home Page'}); undef $lang; @@ -554,161 +562,81 @@ $File = &parse($File); sub parse (\$) { my $File = shift; - + + use SGML::Parser::OpenSP 0.99 qw(); + + my $opensp = SGML::Parser::OpenSP->new(); + # # By default, use SGML catalog file and SGML Declaration. my $catalog = File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'sgml.soc'); + # - # Note: if you feel the urge to remove -R from here, please understand that - # doing so opens a potential security hole. Don't do that; instead just - # make sure you're running OpenSP 1.5 or later. - my @spopt = qw( - -R - -wvalid - -wnon-sgml-char-ref - -wno-duplicate - ); - + my @spopt = qw(valid non-sgml-char-ref no-duplicate); + # # Switch to XML semantics if file is XML. if (&is_xml($File)) { $catalog = File::Spec->catfile($CFG->{Paths}->{SGML}->{Library}, 'xml.soc'); - push(@spopt, '-wxml'); + push(@spopt, 'xml'); } # - # Defaults for SP; turn off fixed charset mode and set encoding to UTF-8. - $ENV{SP_CHARSET_FIXED} = 'NO'; - $ENV{SP_ENCODING} = 'UTF-8'; - $ENV{SP_BCTF} = 'UTF-8'; - - # - # Tell onsgmls about the SGML Library. - $ENV{SGML_SEARCH_PATH} = $CFG->{Paths}->{SGML}->{Library}; - + # Parser configuration + $opensp->search_dirs($CFG->{Paths}->{SGML}->{Library}); + $opensp->catalogs($catalog); + $opensp->show_error_numbers(1); + # - # Set the command to execute. - my @cmd = ($CFG->{Paths}->{SGML}->{Parser}, '-n', '-c', $catalog, @spopt); - + # Note: if you feel the urge to remove -R from here, please understand that + # doing so opens a potential security hole. Don't do that; instead just + # make sure you're running OpenSP 1.5 or later. + $opensp->restrict_file_reading(1); + # # Set debug info for HTML report. $T->param(opt_debug => $DEBUG); $T->param(debug => [ - {name => 'Command', value => "@cmd"}, - {name => 'SP_CHARSET_FIXED', value => $ENV{SP_CHARSET_FIXED}}, - {name => 'SP_ENCODING', value => $ENV{SP_ENCODING}}, - {name => 'SP_BCTF', value => $ENV{SP_BCTF}}, - {name => 'Content-Encoding', value => $File->{ContentEnc}}, - {name => 'Transfer-Encoding', value => $File->{TransferEnc}}, + # This is obsolete now? + { name => 'Command', value => "SGML::Parser::OpenSP->new" }, + + # These are probably obsolete now, SGML::Parser::OpenSP will + # always use <spec encoding="utf-8">... for parse_string. It + # might be possible that messing with these variables makes + # OpenSP fail to recognize the encoding, I remember something + # to this effect but can't reproduce this at the moment. We + # rely on SGML::Parser::OpenSP to get this right because it + # seems there is no way for Apache2+mod_perl2+OpenSP+SPO to + # communicate the correct environment settings to OpenSP. + # @@FIXME this note should be removed after some testing. + { name => 'SP_CHARSET_FIXED', value => $ENV{SP_CHARSET_FIXED} }, + { name => 'SP_ENCODING', value => $ENV{SP_ENCODING} }, + { name => 'SP_BCTF', value => $ENV{SP_BCTF} }, + + { name => 'Content-Encoding', value => $File->{ContentEnc} }, + { name => 'Transfer-Encoding', value => $File->{TransferEnc} }, ], ); - - #@@FIXME: This needs a UI and testing! - # - # Set onsgmls' -E switch to the number of errors requested. - if ($File->{Opt}->{'Max Errors'} =~ m(^all$)i) { - push @cmd, '-E0'; - } elsif ($File->{Opt}->{'Max Errors'} =~ m(^(\d+)$)) { - my $numErr = $1; - if ($numErr >= 200) { - $numErr = 200; - } elsif ($numErr <= 0) { - $numErr = 0; #@@FIXME: Should add feature to supress error output in this case.; - } - push @cmd, '-E' . $numErr; - } else { - push @cmd, '-E' . ($CFG->{'Max Errors'} || 0); # "-E0" means "all". - } - #@@FIXME; - - # - # Temporary filehandles. - my $spin = IO::File->new_tmpfile; - my $spout = IO::File->new_tmpfile; - my $sperr = IO::File->new_tmpfile; - - # - # Dump file to a temp file for parsing. - for (@{$File->{Content}}) { - print $spin $_, "\n"; - } - - # - # seek() to beginning of the file. - seek $spin, 0, 0; - - # - # Run it through SP, redirecting output to temporary files. - my $pid = do { - no warnings 'once'; - local (*SPIN, *SPOUT, *SPERR) = ($spin, $spout, $sperr); - open3("<&SPIN", ">&SPOUT", ">&SPERR", @cmd); - }; - undef $spin; - waitpid $pid, 0; - - # - # Rewind temporary filehandles. - seek $_, 0, 0 for $spout, $sperr; - - # - # Proper text mode for Win32 systems - binmode($spout, ':crlf') if $^O eq "MSWin32"; - - $File = &parse_errors($File, $sperr); # Parse error output. - undef $sperr; # Get rid of no longer needed filehandle. - - $File->{ESIS} = []; - my $elements_found = 0; - while (<$spout>) { - $elements_found++ if /^\(/; - - if (/^Axmlns() \w+ (.*)/ or /^Axmlns:([^ ]+) \w+ (.*)/) { - if (not $File->{Namespace}) { - if ($elements_found == 0 and $1 eq "") { - $File->{Namespace} = $2; - } else { - # @@FIXME: should not happen; - push(@{$File->{Namespaces}}, $2); - } - } else { - push(@{$File->{Namespaces}}, $2) if ($2 ne $File->{Namespace}); - } - } - - next if / IMPLIED$/ && not $DEBUG; - next if /^ASDAFORM CDATA /; - next if /^ASDAPREF CDATA /; - chomp; # Removes trailing newlines - push @{$File->{ESIS}}, $_; - } - undef $spout; - - if (@{$File->{ESIS}} && $File->{ESIS}->[-1] =~ /^C$/) { - pop(@{$File->{ESIS}}); - $File->{'Is Valid'} = TRUE; - } else { - $File->{'Is Valid'} = FALSE; - } - + + my $h = W3C::Validator::ErrorHandler->new($opensp, $File); + + $opensp->handler($h); + $opensp->parse_string(join"\n",@{$File->{Content}}); + + # Make sure there are no circular references, otherwise the script + # would leak memory until mod_perl unloads it which could take some + # time. @@FIXME It's probably overly careful though. + $opensp->handler(undef); + undef $h->{_parser}; + undef $h->{_file}; + undef $h; + undef $opensp; + # # Set Version to be the FPI initially. $File->{Version} = $File->{DOCTYPE}; - - # - # Extract any version attribute from the ESIS. - for (@{$File->{ESIS}}) { - no warnings 'uninitialized'; - next unless /^AVERSION CDATA (.*)/i; - push @{$File->{Version_ESIS}}, $1; - if ($1 =~ '-//W3C//DTD (SGML|XML) Fallback//EN') { - $File->{Tentative} |= (T_ERROR | T_FALL); - my $dtd = $1 eq 'SGML' ? 'HTML 4.01 Transitional' : 'XHTML 1.0 Strict'; - &add_warning('W09', { W09_dtd => $dtd }); - } - } - + return $File; } @@ -790,14 +718,9 @@ } $T->param(file_warnings => $File->{Warnings}); - $T->param(file_outline => &outline($File)) - if $T->param('opt_show_outline'); $T->param(file_source => &source($File)) if $T->param('opt_show_source'); - $T->param(file_parsetree => &parsetree($File)) - if $T->param('opt_show_parsetree'); $T->param('opt_show_esis' => TRUE) if $File->{Opt}->{'Show ESIS'}; - $T->param('file_esis' => &show_esis($File)) if $T->param('opt_show_esis'); $T->param('opt_show_raw_errors' => TRUE) if $File->{Opt}->{'Show Errors'}; $T->param('file_raw_errors' => &show_errors($File)) if $T->param('opt_show_raw_errors'); @@ -919,7 +842,7 @@ sub add_warning ($$) { my $WID = shift; my $params = shift; - + $File->{T}->param($WID => TRUE, %{$params}); $File->{T}->param(have_warnings => TRUE); } @@ -1258,6 +1181,7 @@ sub override_doctype { no strict 'vars'; my $File = shift; + my ($dt) = grep { $_->{Display} eq $File->{Opt}->{DOCTYPE} } values %{$CFG->{Types}}; @@ -1316,81 +1240,6 @@ } # -# Parse errors reported by SP. -sub parse_errors ($$) { - my $File = shift; - my $fh = shift; - - $File->{Errors} = []; # Initialize to an (empty) anonymous array ref. - for (<$fh>) { - - # remove SGML Parser path if it contains colons - s/^\Q$CFG->{Paths}->{SGML}->{Parser}\E// if - $CFG->{Paths}->{SGML}->{Parser} =~ /:/; - - push @{$File->{DEBUG}->{Errors}}, $_; - chomp; - my ($err, @errors); - next if /^<OSFD>0:[0-9]+:[0-9]+:[^A-Z]/; - next if /numbers exceeding 65535 not supported/; - next if /URL Redirected to/; - - my (@_err) = split /:/; - next unless $_err[1] eq '<OSFD>0'; #@@FIXME: This is a polite fiction!; - if ($_err[1] =~ m(^<URL>)) { - @errors = ($_err[0], join(':', $_err[1], $_err[2]), @_err[3..$#_err]); - } else { - @errors = @_err; - } - $err->{src} = $errors[1]; - $err->{line} = $errors[2]; - $err->{char} = $errors[3]; - - # Workaround for onsgmls 1.5 sometimes reporting errors beyond EOL. - if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) { - $err->{char} = $l; - } - $err->{num} = $errors[4] || ''; - $err->{type} = $errors[5] || ''; - if ($err->{type} eq 'E' or $err->{type} eq 'X' or $err->{type} eq 'Q') { - $err->{msg} = join ':', @errors[6 .. $#errors]; - } elsif ($err->{type} eq 'W') { - - #@@FIXME: This is borked after templatification. - # &add_warning($File, 'fake', 'Warning:', - # "Line $err->{line}, column $err->{char}: " . &ent($errors[6])); - #@@FIXME; - $err->{msg} = join ':', @errors[6 .. $#errors]; - } else { - $err->{type} = 'I'; - $err->{num} = ''; - $err->{msg} = join ':', @errors[4 .. $#errors]; - } - - # No or unknown FPI and a relative SI. - if ($err->{msg} =~ m(cannot (open|find))) { - $File->{'Error Flagged'} = TRUE; - $File->{E}->param(fatal_parse_extid_error => TRUE); - $File->{E}->param(fatal_parse_extid_msg => $err->{msg}); - } - - # No DOCTYPE. - if ($err->{msg} =~ m(prolog can\'t be omitted)) { - my $dtd = ($File->{Mode} == MODE_SGML ? - 'HTML 4.01 Transitional' : 'XHTML 1.0 Transitional'); - &add_warning('W09', {W09_dtd => $dtd}); - next; # Don't report this as a normal error. - } - - &abort_if_error_flagged($File, O_DOCTYPE); - $err->{msg} =~ s/^\s*//; - push @{$File->{Errors}}, $err; - } - undef $fh; - return $File; -} - -# # Generate a HTML report of detected errors. sub report_errors ($) { my $File = shift; @@ -1408,7 +1257,7 @@ my $explanation; if ($err->{num}) { - my (undef, $num) = split /\./, $err->{num}; + my $num = $err->{num}; if (exists $Msgs{$num}) { # We've already seen this message... if ($File->{Opt}->{Verbose}) { # ...so only repeat it in Verbose mode. $explanation = qq(\n <div class="hidden mid-$num"></div>\n); @@ -1422,14 +1271,14 @@ my $_msg = $RSRC->{msg}->{nomsg}->{verbose}; $_msg =~ s/<!--MID-->/$num/g; if ($File->{'Is Upload'}) - { - $_msg =~ s/<!--URI-->//g - } - else - { - my $escaped_uri = uri_escape($File->{URI}); - $_msg =~ s/<!--URI-->/$escaped_uri/g; - } + { + $_msg =~ s/<!--URI-->//g + } + else + { + my $escaped_uri = uri_escape($File->{URI}); + $_msg =~ s/<!--URI-->/$escaped_uri/g; + } $explanation .= " $_msg\n"; # The send feedback plea. } @@ -1533,76 +1382,6 @@ } # -# Produce an outline of the document based on Hn elements from the ESIS. -sub outline { - my $File = shift; - - my $outline = ''; - - my $prevlevel = 0; - my $level = 0; - - for (1..$#{$File->{ESIS}}) { - my $line = $File->{ESIS}->[$_]; - next unless ($line && $line =~ /^\(H([1-6])$/i); - - $prevlevel = $level; - $level = $1; - - my $TAB = $level + 2; - - if ($prevlevel == 0) { - $outline .= " <ul>\n"; - } else { - if ($level < $prevlevel) { - $outline .= "</li>\n"; - for (my $i = $prevlevel; $i > $level; $i--) { - $outline .= " " x ($i + 2) . "</ul>\n"; - $outline .= " " x (($i + 2) - 1) . "</li>\n"; - } - } elsif ($level == $prevlevel) { - $outline .= "</li>\n"; - } elsif ($level > $prevlevel) { - if ($level - $prevlevel > 1) { - foreach my $i (($prevlevel + 1) .. ($level - 1)) { - $outline .= "\n". " " x ($i + 2) . "<ul>\n" . " " x ($i + 2); - $outline .= qq(<li class="warning">A level $i heading is missing!); - } - $outline .= "\n" . " " x $TAB . "<ul>\n"; - } else { - $outline .= "\n" . " " x $TAB; - $outline .= "<ul>\n"; - } - } - } - - $line = ''; - my $heading = ''; - until (substr($line, 0, 3) =~ /^\)H$level/i) { - $line = $File->{ESIS}->[$_++]; - if ($line =~ /^-/) { - my $headcont = $line; - substr($headcont, 0, 1) = " "; - $heading .= $headcont; - } elsif ($line =~ /^AALT CDATA( .+)/i) { - my $headcont = $1; - $heading .= $headcont; - } - } - - $heading =~ s/\\011/ /g; - $heading =~ s/\\012/ /g; - $heading =~ s/\\n/ /g; - $heading =~ s/\s+/ /g; - $heading =~ s/^[- ]//; - $heading = &ent($heading); - $outline .= " <li>$heading"; - } - $outline .= " </li></ul>\n" x $level; - return $outline; -} - -# # Create a HTML representation of the document. sub source { my $File = shift; @@ -1616,64 +1395,6 @@ } # -# Create a HTML Parse Tree of the document for validation report. -sub parsetree { - my $File = shift; - my $tree = ''; - - $T->param(file_parsetree_noatt => TRUE) if $File->{Opt}->{'No Attributes'}; - - my $indent = 0; - my $prevdata = ''; - - foreach my $line (@{$File->{ESIS}}) { - - next if ($File->{Opt}->{'No Attributes'} && $line =~ /^A/); - - $line =~ s/\\n/ /g; - $line =~ s/\\011/ /g; - $line =~ s/\\012/ /g; - $line =~ s/\s+/ /g; - next if $line =~ /^-\s*$/; - - if ($line =~ /^-/) { - substr($line, 0, 1) = ' '; - $prevdata .= $line; - next; - } elsif ($prevdata) { - $prevdata =~ s/\s+/ /g; - $tree .= &ent(wrap(' ' x $indent, ' ' x $indent, $prevdata)) . "\n"; - undef $prevdata; - } - - $line = &ent($line); - if ($line =~ /^\)/) { - $indent -= 2; - } - - my $printme; - chomp($printme = $line); - if (my ($close, $elem) = $printme =~ /^([()])(.+)/) { - # reformat and add links on HTML elements - $close = ($close eq ')') ? '/' : ''; # ")" -> close-tag - if (my $u = $CFG->{Elements}->{lc($elem)}) { - $elem = '<a href="' . $CFG->{'Element Ref URI'} . "$u\">$elem</a>"; - } - $printme = "<$close$elem>"; - } else { - $printme =~ s,^A, A,; # indent attributes a bit - } - - $tree .= ' ' x $indent . $printme . "\n"; - - if ($line =~ /^\(/) { - $indent += 2; - } - } - return $tree; -} - -# # Do an initial parse of the Document Entity to extract FPI. sub preparse_doctype { my $File = shift; @@ -1718,19 +1439,6 @@ } # -# Print out the raw ESIS output for debugging. -sub show_esis ($) { - my $file_esis = ""; - for (@{shift->{ESIS}}) { - s/\\012//g; - s/\\n/\n/g; - $file_esis .= ent $_; - $file_esis .= "\n"; - } - return $file_esis; -} - -# # Print out the raw error output for debugging. sub show_errors ($) { my $file_raw_errors = ""; @@ -1820,7 +1528,8 @@ } # Redirect to a GETable URL if method is POST without a file upload. - if ($q->request_method eq 'POST' and not $File->{'Is Upload'}) { + if (defined $q->request_method and $q->request_method eq 'POST' + and not $File->{'Is Upload'}) { my $thispage = &self_url_q($q, $File); print redirect $thispage; exit; @@ -2505,6 +2214,79 @@ ##### +sub W3C::Validator::ErrorHandler::new +{ + my $class = shift; + my $parser = shift; + my $File = shift; + + my $self = { _file => $File, _parser => $parser }; + + # ... + $File->{'Is Valid'} = TRUE; + $File->{Errors} = []; + + bless $self, $class; +} + +sub W3C::Validator::ErrorHandler::error +{ + my $self = shift; + my $error = shift; + my $mess = $self->{_parser}->split_message($error); + my $File = $self->{_file}; + + my $err; + + $err->{src} = '...'; # do this with show_open_entities()? + $err->{line} = $mess->{primary_message}{LineNumber}; + $err->{char} = $mess->{primary_message}{ColumnNumber}; + $err->{num} = $mess->{primary_message}{Number}; + $err->{type} = $mess->{primary_message}{Severity}; + $err->{msg} = $mess->{primary_message}{Text}; + + # ... + $File->{'Is Valid'} = FALSE if $err->{type} eq 'E'; + + # Workaround for onsgmls 1.5 sometimes reporting errors beyond EOL. + # (How true is that? Test cases please.) + if ((my $l = length($File->{Content}->[$err->{line}-1])) < $err->{char}) { + $err->{char} = $l; + } + + # No or unknown FPI and a relative SI. + if ($err->{msg} =~ m(cannot (open|find))) { + $File->{'Error Flagged'} = TRUE; + $File->{E}->param(fatal_parse_extid_error => TRUE); + $File->{E}->param(fatal_parse_extid_msg => $err->{msg}); + } + + # No DOCTYPE. + if ($err->{msg} =~ m(prolog can\'t be omitted)) { + my $dtd = ($File->{Mode} == MODE_SGML ? + 'HTML 4.01 Transitional' : 'XHTML 1.0 Transitional'); + + add_warning('W09', {W09_dtd => $dtd}); + return; # Don't report this as a normal error. + } + + abort_if_error_flagged($File, O_DOCTYPE); + $err->{msg} =~ s/^\s*//; + push @{$File->{Errors}}, $err; + + if (defined $mess->{aux_message}) + { + # "duplicate id ... first defined here" style messages + push @{$File->{Errors}}, { line => $mess->{aux_message}{LineNumber}, + char => $mess->{aux_message}{ColumnNumber}, + msg => $mess->{aux_message}{Text}, + type => 'I', + }; + } +} + +##### + package W3C::Validator::UserAgent; use LWP::UserAgent 1.90 qw(); # Need 1.90 for protocols_(allowed|forbidden)
Received on Tuesday, 16 August 2005 04:41:05 UTC