- From: Mercurial notifier <nobody@w3.org>
- Date: Thu, 05 Aug 2010 14:47:13 +0000
- To: link-checker updates <www-validator-cvs@w3.org>
changeset: 228:a546e8de7f77 user: ville date: Mon Apr 21 07:09:53 2008 +0000 files: bin/checklink description: Support Content-Encodings automatically handled by libwww-perl (#5648). diff -r accfd6b9990b -r a546e8de7f77 bin/checklink --- a/bin/checklink Tue Feb 19 22:41:12 2008 +0000 +++ b/bin/checklink Mon Apr 21 07:09:53 2008 +0000 @@ -5,7 +5,7 @@ # (c) 1999-2008 World Wide Web Consortium # based on Renaud Bruyeron's checklink.pl # -# $Id: checklink,v 4.98 2008-02-19 22:41:12 ville Exp $ +# $Id: checklink,v 4.99 2008-04-21 07:09:53 ville Exp $ # # This program is licensed under the W3C(r) Software License: # http://www.w3.org/Consortium/Legal/copyright-software @@ -242,7 +242,7 @@ $PROGRAM = 'W3C-checklink'; $VERSION = '4.3'; $REVISION = sprintf('version %s (c) 1999-2008 W3C', $VERSION); - my ($cvsver) = q$Revision: 4.98 $ =~ /(\d+[\d\.]*\.\d+)/; + my ($cvsver) = q$Revision: 4.99 $ =~ /(\d+[\d\.]*\.\d+)/; $AGENT = sprintf('%s/%s [%s] %s', $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent()); @@ -1088,13 +1088,25 @@ # Can we parse the document? my $failed_reason; my $ct = $response->header('Content-Type'); - my $ce = $response->header('Content-Encoding'); if (!$ct || $ct !~ $ContentTypes) { $failed_reason = "Content-Type for <$request_uri> is " . (defined($ct) ? "'$ct'" : 'undefined'); - } elsif (defined($ce) && $ce ne 'identity') { - # @@@ We could maybe handle gzip... - $failed_reason = "Content-Encoding for <$request_uri> is '$ce'"; + } else { + if ($response->can('decoded_content')) { # LWP >= 5.802 + # Pre-decode Content-Encoding. + # @@@TODO: maybe also decode charsets? + my $docref = $response->decoded_content(ref => 1, charset => 'none'); + if (defined($docref)) { + $response->content_ref($docref); + # Remove Content-Encoding so it won't be decoded again later. + $response->remove_header('Content-Encoding') + } else { + my $ce = $response->header('Content-Encoding'); + $ce = defined($ce) ? "'$ce'" : 'undefined'; + $ct = defined($ct) ? "'$ct'" : 'undefined'; + $failed_reason = "Error decoding document at <$request_uri>, Content-Type $ct, Content-Encoding $ce: '$@'"; + } + } } if ($failed_reason) { # No, there is a problem... @@ -1324,12 +1336,6 @@ return $p; } - my $docref = undef; - # @@@TODO: maybe also do charset decoding some day? - $docref = $response->decoded_content(ref => 1, charset => "none") - if ($response->can('decoded_content')); # LWP >= 5.802 - $docref ||= $response->content_ref(); - my $start; $p = W3C::LinkChecker->new(); $p->{base} = $base_uri; @@ -1337,6 +1343,9 @@ $start = &get_timestamp(); print("Parsing...\n"); } + + # Content-Encoding etc already decoded in get_document(). + my $docref = $response->content_ref(); # Count lines beforehand if needed for progress indicator. In all cases, # the actual final number of lines processed shown is populated by our
Received on Thursday, 5 August 2010 14:47:21 UTC