- From: Martin Duerst <duerst@dev.w3.org>
- Date: Wed, 22 Sep 2004 00:40:42 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/validator/httpd/cgi-bin In directory hutz:/tmp/cvs-serv23184 Modified Files: check Log Message: splitting up &preparse into two subs Index: check =================================================================== RCS file: /sources/public/validator/httpd/cgi-bin/check,v retrieving revision 1.342 retrieving revision 1.343 diff -u -d -r1.342 -r1.343 --- check 17 Sep 2004 14:36:23 -0000 1.342 +++ check 22 Sep 2004 00:40:40 -0000 1.343 @@ -440,7 +440,7 @@ # # Try to extract META charset # (works only if ascii-based and reasonably clean before <meta>) -$File = &preparse($File); +$File = &preparse_meta($File); unless ($File->{Charset}->{Use}) { $File->{Charset}->{Use} = $File->{Charset}->{META}; } @@ -607,7 +607,7 @@ # # Try to extract a DOCTYPE or xmlns. -$File = &preparse($File); +$File = &preparse_doctype($File); # @@ -1886,8 +1886,62 @@ # -# Do an initial parse of the Document Entity to extract charset and FPI. -sub preparse { +# Do an initial parse of the Document Entity to extract FPI. +# (still also extracts charset) +sub preparse_doctype { + my $File = shift; + + # + # Reset DOCTYPE, Root, and Charset (for second invocation). 
+ $File->{Charset}->{META} = ''; + $File->{DOCTYPE} = ''; + $File->{Root} = ''; + + my $dtd = sub { + return if $File->{Root}; + ($File->{Root}, $File->{DOCTYPE}) = shift =~ m(<!DOCTYPE\s+(\w+)\s+PUBLIC\s+(?:[\'\"])([^\"\']+)(?:[\"\']).*>)si; + }; + + my $start = sub { + my $tag = shift; + my $attr = shift; + my %attr = map {lc($_) => $attr->{$_}} keys %{$attr}; + + if ($File->{Root}) { + if (lc $tag eq 'meta') { + if (lc $attr{'http-equiv'} eq 'content-type') { + if ($attr{content} =~ m(charset\s*=[\s\"\']*([^\s;\"\'>]*))si) { + $File->{Charset}->{META} = lc $1; + } + } + } + return unless $tag eq $File->{Root}; + } else { + $File->{Root} = $tag; + } + if ($attr->{xmlns}) {$File->{Namespace} = $attr->{xmlns}}; + }; + + my $p = HTML::Parser->new(api_version => 3); + $p->xml_mode(TRUE); + $p->ignore_elements('BODY'); + $p->ignore_elements('body'); + $p->handler(declaration => $dtd, 'text'); + $p->handler(start => $start, 'tag,attr'); + $p->parse(join "\n", @{$File->{Content}}); + + $File->{DOCTYPE} = '' unless defined $File->{DOCTYPE}; + $File->{DOCTYPE} =~ s(^\s+){ }g; + $File->{DOCTYPE} =~ s(\s+$){ }g; + $File->{DOCTYPE} =~ s(\s+) { }g; + + return $File; +} + +# +# Do an initial parse of the Document Entity to extract charset from HTML <meta>. +# (still also extracts FPI) +sub preparse_meta { my $File = shift; #
Received on Wednesday, 22 September 2004 00:40:43 UTC