link-checker commit: Decode content encodings also when parsing for anchors, thanks to Mark Olson.

changeset:   328:a7e0c2202898
user:        ville
date:        Thu Dec 03 20:30:20 2009 +0000
files:       bin/checklink
description:
Decode content encodings also when parsing for anchors, thanks to Mark Olson.


diff -r be2d15135b4e -r a7e0c2202898 bin/checklink
--- a/bin/checklink	Wed Nov 04 18:44:17 2009 +0000
+++ b/bin/checklink	Thu Dec 03 20:30:20 2009 +0000
@@ -5,7 +5,7 @@
 # (c) 1999-2009 World Wide Web Consortium
 # based on Renaud Bruyeron's checklink.pl
 #
-# $Id: checklink,v 4.168 2009-11-04 18:44:17 mernst Exp $
+# $Id: checklink,v 4.169 2009-12-03 20:30:20 ville Exp $
 #
 # This program is licensed under the W3C(r) Software License:
 #       http://www.w3.org/Consortium/Legal/copyright-software
@@ -283,7 +283,7 @@
   $PROGRAM     = 'W3C-checklink';
   $VERSION     = '4.5';
   $REVISION    = sprintf('version %s (c) 1999-2009 W3C', $VERSION);
-  my ($cvsver) = q$Revision: 4.168 $ =~ /(\d+[\d\.]*\.\d+)/;
+  my ($cvsver) = q$Revision: 4.169 $ =~ /(\d+[\d\.]*\.\d+)/;
   $AGENT       = sprintf('%s/%s [%s] %s',
                          $PROGRAM, $VERSION, $cvsver,
                          (W3C::UserAgent::USE_ROBOT_UA
@@ -1210,6 +1210,32 @@
   return;
 }
 
+##########################################
+# Decode Content-Encodings in a response #
+##########################################
+
+sub decode_content ($)  # Arg: a response object. Returns undef on success, else an error message string.
+{
+  my $response = shift;  # updated in place when decoding succeeds
+  my $error = undef;     # stays undef unless decoding fails
+
+  # Undo the Content-Encoding only; charset => 'none' asks for no charset decoding. @@@TODO: maybe also decode charsets?
+  my $docref = $response->decoded_content(ref => 1, charset => 'none');  # ref => 1: ask for a reference to the decoded body
+  if (defined($docref)) {  # a defined result is treated as a successful decode
+    $response->content_ref($docref);  # make the decoded body the response's content
+    # Remove Content-Encoding so it won't be decoded again later (the stored body is no longer encoded).
+    $response->remove_header('Content-Encoding')
+  } else {  # decoding failed: build a diagnostic; $@ is expected to hold the reason left by decoded_content
+    my $ce = $response->header('Content-Encoding');
+    $ce = defined($ce) ? "'$ce'" : 'undefined';  # quote the value, or say 'undefined' if the header is absent
+    my $ct = $response->header('Content-Type');
+    $ct = defined($ct) ? "'$ct'" : 'undefined';  # same presentation as for Content-Encoding
+    my $request_uri = $response->request->url;  # address of the request stored in the response
+    $error = "Error decoding document at <$request_uri>, Content-Type $ct, Content-Encoding $ce: '$@'";
+  }
+  return $error;  # undef (false) when nothing went wrong, so callers can test it in boolean context
+}
+
 #######################################
 # Get and parse a resource to process #
 #######################################
@@ -1270,19 +1296,7 @@
     $failed_reason = "Content-Type for <$request_uri> is " .
       (defined($ct) ? "'$ct'" : 'undefined');
   } else {
-    # Pre-decode Content-Encoding.
-    # @@@TODO: maybe also decode charsets?
-    my $docref = $response->decoded_content(ref => 1, charset => 'none');
-    if (defined($docref)) {
-      $response->content_ref($docref);
-      # Remove Content-Encoding so it won't be decoded again later.
-      $response->remove_header('Content-Encoding')
-    } else {
-      my $ce = $response->header('Content-Encoding');
-      $ce = defined($ce) ? "'$ce'" : 'undefined';
-      $ct = defined($ct) ? "'$ct'" : 'undefined';
-      $failed_reason = "Error decoding document at <$request_uri>, Content-Type $ct, Content-Encoding $ce: '$@'";
-    }
+    $failed_reason = decode_content($response);
   }
   if ($failed_reason) {
     # No, there is a problem...
@@ -1850,6 +1864,10 @@
       return;
     }
     # Do it then
+    if (my $error = decode_content($response)) {
+      &hprintf("%s\n.", $error);
+    }
+    # @@@TODO: this isn't the best thing to do if a decode error occurred
     $p = &parse_document($uri, $response->base(), $response, 0, $want_links);
   } else {
     # We already had the information

Received on Thursday, 5 August 2010 14:47:46 UTC