- From: Ville Skytta via cvs-syncmail <cvsmail@w3.org>
- Date: Sun, 29 Jul 2007 16:49:03 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LinkChecker/bin
In directory hutz:/tmp/cvs-serv24243

Modified Files:
    checklink
Log Message:
Change parse_document to operate directly on response content refs, decode content encodings, and clean up response contents when done.

Index: checklink
===================================================================
RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v
retrieving revision 4.58
retrieving revision 4.59
diff -u -d -r4.58 -r4.59
--- checklink 29 Jul 2007 13:48:29 -0000 4.58
+++ checklink 29 Jul 2007 16:49:00 -0000 4.59
@@ -796,10 +796,7 @@
   # Record that we have processed this resource
   $processed{$absolute_uri} = 1;
   # Parse the document
-  my $p = &parse_document($uri, $absolute_uri,
-                          $response->content(), 1,
-                          $depth != 0);
-  $response->content("");
+  my $p = &parse_document($uri, $absolute_uri, $response, 1, ($depth != 0));
   my $base = URI->new($p->{base});
 
   # Check anchors
@@ -1228,7 +1225,7 @@
 
 sub parse_document ($$$$$)
 {
-  my ($uri, $location, $document, $links, $rec_needs_links) = @_;
+  my ($uri, $location, $response, $links, $rec_needs_links) = @_;
 
   my $p;
 
@@ -1240,6 +1237,12 @@
     return $p;
   }
 
+  my $docref = undef;
+  # @@@TODO: maybe also do charset decoding some day?
+  $docref = $response->decoded_content(ref => 1, charset => "none")
+    if ($response->can('decoded_content')); # LWP >= 5.802
+  $docref ||= $response->content_ref();
+
   my $start;
   $p = W3C::LinkChecker->new();
   $p->{base} = $location;
@@ -1248,7 +1251,7 @@
     print("Parsing...\n");
   }
   if (!$Opts{Summary_Only} || $Opts{Progress}) {
-    $p->{Total} = ($document =~ tr/\n//);
+    $p->{Total} = ($$docref =~ tr/\n//);
   }
   # We only look for anchors if we are not interested in the links
   # obviously, or if we are running a recursive checking because we
@@ -1259,9 +1262,10 @@
   # Processing instructions are not parsed by process, but in this case
   # it should be. It's expensive, it's horrible, but it's the easiest way
   # for right now.
-  $document =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors};
+  $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors};
 
-  $p->parse($document);
+  $p->parse($$docref);
+  $response->content("");
 
   if (! $Opts{Summary_Only}) {
     my $stop = &get_timestamp();
@@ -1485,21 +1489,17 @@
   my $p;
   if ($being_processed) {
     # Can we really parse the document?
-    my $done = 0;
     if (!defined($results{$uri}{location}{type}) ||
         $results{$uri}{location}{type} !~ $ContentTypes)
     {
       &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
                $uri, $results{$uri}{location}{type})
         if ($Opts{Verbose});
-      $done = 1;
-    } else {
-      # Do it then
-      $p = &parse_document($uri, $response->base(),
-                           $response->content(), 0, $want_links);
+      $response->content("");
+      return;
     }
-    $response->content("");
-    return if $done;
+    # Do it then
+    $p = &parse_document($uri, $response->base(), $response, 0, $want_links);
   } else {
     # We already had the information
     $p->{Anchors} = $results{$uri}{parsing}{Anchors};
Received on Sunday, 29 July 2007 16:49:04 UTC