- From: Mercurial notifier <nobody@w3.org>
- Date: Thu, 05 Aug 2010 14:47:08 +0000
- To: link-checker updates <www-validator-cvs@w3.org>
changeset: 174:cce894fac135
user: ville
date: Sun Jul 29 16:49:00 2007 +0000
files: bin/checklink
description:
Change parse_document to operate directly on response content refs, decode
content encodings, and clean up response contents when done.

diff -r 6fed70c62e92 -r cce894fac135 bin/checklink
--- a/bin/checklink Sun Jul 29 13:48:29 2007 +0000
+++ b/bin/checklink Sun Jul 29 16:49:00 2007 +0000
@@ -5,7 +5,7 @@
 # (c) 1999-2007 World Wide Web Consortium
 # based on Renaud Bruyeron's checklink.pl
 #
-# $Id: checklink,v 4.58 2007-07-29 13:48:29 ville Exp $
+# $Id: checklink,v 4.59 2007-07-29 16:49:00 ville Exp $
 #
 # This program is licensed under the W3C(r) Software License:
 # http://www.w3.org/Consortium/Legal/copyright-software
@@ -192,7 +192,7 @@
   $PROGRAM = 'W3C-checklink';
   $VERSION = '4.3';
   $REVISION = sprintf('version %s (c) 1999-2007 W3C', $VERSION);
-  my ($cvsver) = q$Revision: 4.58 $ =~ /(\d+[\d\.]*\.\d+)/;
+  my ($cvsver) = q$Revision: 4.59 $ =~ /(\d+[\d\.]*\.\d+)/;
   $AGENT = sprintf('%s/%s [%s] %s',
                    $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent());
 
@@ -796,10 +796,7 @@
   # Record that we have processed this resource
   $processed{$absolute_uri} = 1;
   # Parse the document
-  my $p = &parse_document($uri, $absolute_uri,
-                          $response->content(), 1,
-                          $depth != 0);
-  $response->content("");
+  my $p = &parse_document($uri, $absolute_uri, $response, 1, ($depth != 0));
   my $base = URI->new($p->{base});
 
   # Check anchors
@@ -1228,7 +1225,7 @@
 
 sub parse_document ($$$$$)
 {
-  my ($uri, $location, $document, $links, $rec_needs_links) = @_;
+  my ($uri, $location, $response, $links, $rec_needs_links) = @_;
 
   my $p;
 
@@ -1240,6 +1237,12 @@
     return $p;
   }
 
+  my $docref = undef;
+  # @@@TODO: maybe also do charset decoding some day?
+  $docref = $response->decoded_content(ref => 1, charset => "none")
+    if ($response->can('decoded_content')); # LWP >= 5.802
+  $docref ||= $response->content_ref();
+
   my $start;
   $p = W3C::LinkChecker->new();
   $p->{base} = $location;
@@ -1248,7 +1251,7 @@
     print("Parsing...\n");
   }
   if (!$Opts{Summary_Only} || $Opts{Progress}) {
-    $p->{Total} = ($document =~ tr/\n//);
+    $p->{Total} = ($$docref =~ tr/\n//);
   }
   # We only look for anchors if we are not interested in the links
   # obviously, or if we are running a recursive checking because we
@@ -1259,9 +1262,10 @@
   # Processing instructions are not parsed by process, but in this case
   # it should be. It's expensive, it's horrible, but it's the easiest way
   # for right now.
-  $document =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors};
+  $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors};
 
-  $p->parse($document);
+  $p->parse($$docref);
+  $response->content("");
 
   if (! $Opts{Summary_Only}) {
     my $stop = &get_timestamp();
@@ -1485,21 +1489,17 @@
   my $p;
   if ($being_processed) {
     # Can we really parse the document?
-    my $done = 0;
     if (!defined($results{$uri}{location}{type}) ||
         $results{$uri}{location}{type} !~ $ContentTypes)
     {
       &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
                $uri, $results{$uri}{location}{type})
         if ($Opts{Verbose});
-      $done = 1;
-    } else {
-      # Do it then
-      $p = &parse_document($uri, $response->base(),
-                           $response->content(), 0, $want_links);
+      $response->content("");
+      return;
     }
-    $response->content("");
-    return if $done;
+    # Do it then
+    $p = &parse_document($uri, $response->base(), $response, 0, $want_links);
   } else {
     # We already had the information
     $p->{Anchors} = $results{$uri}{parsing}{Anchors};
Received on Thursday, 5 August 2010 14:47:21 UTC