link-checker commit: Change parse_document to operate directly on response content refs, decode content encodings, and clean up response contents when done. from Mercurial notifier on 2010-08-05 (www-validator-cvs@w3.org from August 2010)

From: Mercurial notifier <nobody@w3.org>
Date: Thu, 05 Aug 2010 14:47:08 +0000
To: link-checker updates <www-validator-cvs@w3.org>
Message-Id: <E1Oh1iu-0005s6-Gu@blinky.w3.org>

changeset:   174:cce894fac135
user:        ville
date:        Sun Jul 29 16:49:00 2007 +0000
files:       bin/checklink
description:
Change parse_document to operate directly on response content refs, decode content encodings, and clean up response contents when done.


diff -r 6fed70c62e92 -r cce894fac135 bin/checklink
--- a/bin/checklink	Sun Jul 29 13:48:29 2007 +0000
+++ b/bin/checklink	Sun Jul 29 16:49:00 2007 +0000
@@ -5,7 +5,7 @@
 # (c) 1999-2007 World Wide Web Consortium
 # based on Renaud Bruyeron's checklink.pl
 #
-# $Id: checklink,v 4.58 2007-07-29 13:48:29 ville Exp $
+# $Id: checklink,v 4.59 2007-07-29 16:49:00 ville Exp $
 #
 # This program is licensed under the W3C(r) Software License:
 #       http://www.w3.org/Consortium/Legal/copyright-software
@@ -192,7 +192,7 @@
   $PROGRAM     = 'W3C-checklink';
   $VERSION     = '4.3';
   $REVISION    = sprintf('version %s (c) 1999-2007 W3C', $VERSION);
-  my ($cvsver) = q$Revision: 4.58 $ =~ /(\d+[\d\.]*\.\d+)/;
+  my ($cvsver) = q$Revision: 4.59 $ =~ /(\d+[\d\.]*\.\d+)/;
   $AGENT       = sprintf('%s/%s [%s] %s',
                          $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent());
 
@@ -796,10 +796,7 @@
   # Record that we have processed this resource
   $processed{$absolute_uri} = 1;
   # Parse the document
-  my $p = &parse_document($uri, $absolute_uri,
-                          $response->content(), 1,
-                          $depth != 0);
-  $response->content("");
+  my $p = &parse_document($uri, $absolute_uri, $response, 1, ($depth != 0));
   my $base = URI->new($p->{base});
 
   # Check anchors
@@ -1228,7 +1225,7 @@
 
 sub parse_document ($$$$$)
 {
-  my ($uri, $location, $document, $links, $rec_needs_links) = @_;
+  my ($uri, $location, $response, $links, $rec_needs_links) = @_;
 
   my $p;
 
@@ -1240,6 +1237,12 @@
     return $p;
   }
 
+  my $docref = undef;
+  # @@@TODO: maybe also do charset decoding some day?
+  $docref = $response->decoded_content(ref => 1, charset => "none")
+    if ($response->can('decoded_content')); # LWP >= 5.802
+  $docref ||= $response->content_ref();
+
   my $start;
   $p = W3C::LinkChecker->new();
   $p->{base} = $location;
@@ -1248,7 +1251,7 @@
     print("Parsing...\n");
   }
   if (!$Opts{Summary_Only} || $Opts{Progress}) {
-    $p->{Total} = ($document =~ tr/\n//);
+    $p->{Total} = ($$docref =~ tr/\n//);
   }
   # We only look for anchors if we are not interested in the links
   # obviously, or if we are running a recursive checking because we
@@ -1259,9 +1262,10 @@
   # Processing instructions are not parsed by process, but in this case
   # it should be. It's expensive, it's horrible, but it's the easiest way
   # for right now.
-  $document =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors};
+  $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors};
 
-  $p->parse($document);
+  $p->parse($$docref);
+  $response->content("");
 
   if (! $Opts{Summary_Only}) {
     my $stop = &get_timestamp();
@@ -1485,21 +1489,17 @@
   my $p;
   if ($being_processed) {
     # Can we really parse the document?
-    my $done = 0;
     if (!defined($results{$uri}{location}{type}) ||
         $results{$uri}{location}{type} !~ $ContentTypes)
     {
       &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
                $uri, $results{$uri}{location}{type})
         if ($Opts{Verbose});
-      $done = 1;
-    } else {
-      # Do it then
-      $p = &parse_document($uri, $response->base(),
-                           $response->content(), 0, $want_links);
+      $response->content("");
+      return;
     }
-    $response->content("");
-    return if $done;
+    # Do it then
+    $p = &parse_document($uri, $response->base(), $response, 0, $want_links);
   } else {
     # We already had the information
     $p->{Anchors} = $results{$uri}{parsing}{Anchors};

Received on Thursday, 5 August 2010 14:47:21 UTC