perl/modules/W3C/LinkChecker/bin checklink,4.21,4.22

Update of /sources/public/perl/modules/W3C/LinkChecker/bin
In directory hutz:/tmp/cvs-serv8657

Modified Files:
	checklink 
Log Message:
WARNING: this code is rather broken...

This is not much more than a proof of concept of how we could be using 
LWP::Parallel::RobotUA for our requests. The basic idea is to send the
list of links as an array to $ua->register() instead of request()ing 
them sequentially.

Done by 
- modifying check_uri to pass an array of URIs to check_validity
- modifying check_validity to pass this array to  get_uris (a clone of get_uri)
- cloning get_uri into get_uris

The latter is only roughly done, and there probably is some loss of 
information (recursion, etc) in the process. 

This is a rather poor patch but the best I can wrap my head around right now, 
feel free to revert if it's too ugly.



Index: checklink
===================================================================
RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v
retrieving revision 4.21
retrieving revision 4.22
diff -u -d -r4.21 -r4.22
--- checklink	15 May 2005 13:46:42 -0000	4.21
+++ checklink	29 Aug 2005 05:11:39 -0000	4.22
@@ -33,10 +33,14 @@
 
 package W3C::UserAgent;
 
-use LWP::RobotUA 1.19 qw();
+#use LWP::RobotUA 1.19 qw();
+use Exporter();
+use LWP::Parallel::RobotUA qw(:CALLBACK);
 # @@@ Needs also W3C::LinkChecker but can't use() it here.
 
-@W3C::UserAgent::ISA = qw(LWP::RobotUA);
+@W3C::UserAgent::ISA = qw(LWP::Parallel::RobotUA Exporter);
+@W3C::UserAgent::EXPORT = @LWP::Parallel::RobotUA::EXPORT_OK;
+
 
 sub new
 {
@@ -69,6 +73,28 @@
   return $response;
 }
 
+
+  # on_return gets called whenever a connection (or its callback)
+  # returns EOF (or any other terminating status code available for
+  # callback functions). Please note that on_return gets called for
+  # any successfully terminated HTTP connection! This does not imply
+  # that the response sent from the server is a success! 
+  # see http://search.cpan.org/~marclang/ParallelUserAgent-2.57/lib/LWP/Parallel.pm
+#  sub on_return {
+#    my ($self, $request, $response, $entry) = @_;
+#    if ($response->is_success) {
+#       print "\n\nWoa! Request to ",$request->url," returned code ", $response->code,
+#          ": ", $response->message, "\n";
+#       print $response->content;
+#    } else {
+#       print "\n\nBummer! Request to ",$request->url," returned code ", $response->code,
+#          ": ", $response->message, "\n";
+#       # print $response->error_as_HTML;
+#    }
+#    return;
+#  }
+
+
 sub redirect_ok
 {
   my ($self, $request, $response) = @_;
@@ -120,7 +146,7 @@
   $REVISION    = sprintf('version %s (c) 1999-2005 W3C', $VERSION);
   my ($cvsver) = q$Revision$ =~ /(\d+[\d\.]*\.\d+)/;
   $AGENT       = sprintf('%s/%s [%s] %s',
-                         $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent());
+                         $PROGRAM, $VERSION, $cvsver, LWP::Parallel::RobotUA->_agent());
 
   # Pull in mod_perl modules if applicable.
   if ($ENV{MOD_PERL}) {
@@ -268,6 +294,13 @@
 $ua->timeout($Opts{Timeout});
 $ua->proxy('http', 'http://' . $Opts{HTTP_Proxy}) if $Opts{HTTP_Proxy};
 
+#settings for the parallel crawling of resources
+# TODO make these configuration options
+$ua->max_req(3); # max num of requests to a given host at a time
+$ua->max_hosts(5); # max num of hosts polled in parallel
+
+
+
 if ($Opts{Command_Line}) {
 
   require Text::Wrap;
@@ -772,6 +805,7 @@
 
   # Build the list of broken URI's
   my %broken;
+  my @links_to_check;
   foreach my $u (keys %links) {
 
     # Don't check mailto: URI's
@@ -783,11 +817,17 @@
     } else {
       &hprintf("Checking link %s\n", $u);
     }
-
+    push (@links_to_check, $u);
     # Check that a link is valid
-    &check_validity($uri, $u,
-                    ($depth != 0 && &in_recursion_scope($u)),
-                    \%links, \%redirects);
+  }
+ # &check_validity($uri, @links_to_check,
+ #                   ($depth != 0 && &in_recursion_scope($u)),
+ #                   \%links, \%redirects);
+ # TODO fix recursion scope issue
+ &check_validity($uri, @links_to_check);
+ 
+ 
+  foreach my $u (keys %links) {
     &hprintf("\tReturn code: %s\n", $results{$u}{location}{code})
       if ($Opts{Verbose});
     if ($results{$u}{location}{success}) {
@@ -1041,7 +1081,13 @@
   return $response if $response;
 
   # Do the query
-  $response = $ua->request($request);
+  $ua->register($request);
+  my $entries = $ua->wait();
+
+  foreach (keys %$entries) {
+     $response = $entries->{$_}->response;
+
+   
 
   # Get the results
   # Record the very first response
@@ -1076,6 +1122,95 @@
   $response->{Realm} = $realm if defined($realm);
 
   return $response;
+  
+  } # end foreach above
+}
+
+sub get_uris (%@;$\%$$$$)
+{
+  # Here we have a lot of extra parameters in order not to lose information
+  # if the function is called several times (401's)
+  my (%methods, @uris, $start, $redirects, $code, $realm, $message, $auth) = @_;
+
+  # $method contains the method used
+  # $uri contains the target of the request
+  # $start is a timestamp (not defined the first time the function is
+  #                        called)
+  # $redirects is a map of redirects
+  # $code is the first HTTP return code
+  # $realm is the realm of the request
+  # $message is the HTTP message received
+  # $auth equals 1 if we want to send out authentication information
+
+  # For timing purposes
+  $start = &get_timestamp() unless defined($start);
+
+  # Prepare the queries
+  foreach my $uri (@uris) { 
+    my $method = $methods{$uri};
+    my $request = new HTTP::Request($method, $uri);
+    $request->header('Accept-Language' => $Opts{Accept_Language})
+      if $Opts{Accept_Language};
+    $request->header('Accept', $Accept);
+    # Are we providing authentication info?
+    if ($auth && $request->url()->host() =~ $Opts{Trusted}) {
+      if (defined($ENV{HTTP_AUTHORIZATION})) {
+        $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
+      } elsif (defined($Opts{User}) && defined($Opts{Password})) {
+        $request->authorization_basic($Opts{User}, $Opts{Password});
+      }
+    }
+
+ 
+  # Check if the IP address is allowed.
+  my $response = &ip_allowed($request->uri());
+  next if $response;
+  
+  # Do the query
+  $ua->register($request);
+  print $request;
+  }
+  
+  my $entries = $ua->wait();
+
+  my %responses;
+
+  foreach (keys %$entries) {
+  
+    my $response = $entries->{$_}->response;
+  my $uri = $response->request()->url();
+  # Get the results
+  # Record the very first response
+  if (! defined($code)) {
+    ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)});
+  }
+  # Authentication requested?
+  if ($response->code() == 401 &&
+      !defined($auth) &&
+      (defined($ENV{HTTP_AUTHORIZATION})
+       || (defined($Opts{User}) && defined($Opts{Password})))) {
+
+    # Set host as trusted domain unless we already have one.
+    if (!$Opts{Trusted}) {
+      my $re = sprintf('^%s$', quotemeta($response->base()->host()));
+      $Opts{Trusted} = qr/$re/io;
+    }
+
+    # Deal with authentication and avoid loops
+    if (! defined($realm)) {
+      $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
+      $realm = $1;
+    }
+    return &get_uri($methods{$uri}, $response->request()->url(),
+                    $start, $redirects, $code, $realm, $message, 1);
+  }
+  # @@@ subtract robot delay from the "fetched in" time?
+    $response->{Realm} = $realm if defined($realm);
+  
+  $responses{$response->request()->url()} = $response;
+  } # end foreach above
+  
+  return %responses;
 }
 
 #########################################
@@ -1344,51 +1479,67 @@
 
 sub check_validity ($$$\%\%)
 {
-  my ($testing, $uri, $want_links, $links, $redirects) = @_;
+  my ($testing, @links_to_check, $want_links, $links, $redirects) = @_;
   # $testing is the URI of the document checked
   # $uri is the URI of the target that we are verifying
   # $want_links is true if we're interested in links in the target doc
   # $links is a hash of the links in the documents checked
   # $redirects is a map of the redirects encountered
 
-  # Checking file: URI's is not allowed with a CGI
-  if ($testing ne $uri) {
-    if (!$Opts{Command_Line} && $testing !~ m/^file:/ && $uri =~ m/^file:/) {
-      my $msg = 'Error: \'file:\' URI not allowed';
-      # Can't test? Return 400 Bad request.
-      $results{$uri}{location}{code}    = 400;
-      $results{$uri}{location}{record}  = 400;
-      $results{$uri}{location}{success} = 0;
-      $results{$uri}{location}{message} = $msg;
-      &hprintf("Error: %d %s\n", 400, $msg) if $Opts{Verbose};
-      return;
+  my @links_checked_final;
+  my %methods;
+  my %being_processed;
+  foreach my $uri (@links_to_check) 
+  {
+    # Checking file: URI's is not allowed with a CGI
+    if ($testing ne $uri) {
+      if (!$Opts{Command_Line} && $testing !~ m/^file:/ && $uri =~ m/^file:/) {
+        my $msg = 'Error: \'file:\' URI not allowed';
+        # Can't test? Return 400 Bad request.
+        $results{$uri}{location}{code}    = 400;
+        $results{$uri}{location}{record}  = 400;
+        $results{$uri}{location}{success} = 0;
+        $results{$uri}{location}{message} = $msg;
+        &hprintf("Error: %d %s\n", 400, $msg) if $Opts{Verbose};
+      }
+    else {
+        push @links_checked_final, $uri;
+      }
     }
-  }
-
-  # Get the document with the appropriate method
-  # Only use GET if there are fragments. HEAD is enough if it's not the
-  # case.
-  my @fragments = keys %{$links->{$uri}{fragments}};
-  my $method = scalar(@fragments) ? 'GET' : 'HEAD';
+  
 
-  my $response;
-  my $being_processed = 0;
-  if ((! defined($results{$uri}))
-      || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
-    $being_processed = 1;
-    $response = &get_uri($method, $uri);
+    # Get the document with the appropriate method
+    # Only use GET if there are fragments. HEAD is enough if it's not the
+    # case.
+    my @fragments = keys %{$links->{$uri}{fragments}};
+    my $method = scalar(@fragments) ? 'GET' : 'HEAD';
+    $methods{$uri}=$method;
+    my $response;
+    $being_processed{$uri} = 0;
+    if ((! defined($results{$uri}))
+        || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
+      $being_processed{$uri} = 1;
+    }
+  }
+  my %responses = &get_uris(%methods, @links_checked_final);
+    
+  foreach my $uri (keys %responses) {  
+    my $response = $responses{$uri};
+    my $method = $methods{$uri};
     # Get the information back from get_uri()
     &record_results($uri, $method, $response);
     # Record the redirects
-    &record_redirects($redirects, $response);
-  }
+    # FIXME where were these set?!
+    #&record_redirects($redirects, $response);
+  
 
+  
   # We got the response of the HTTP request. Stop here if it was a HEAD.
-  return if ($method eq 'HEAD');
+  next if ($method eq 'HEAD');
 
   # There are fragments. Parse the document.
   my $p;
-  if ($being_processed) {
+  if ($being_processed{$uri}) {
     # Can we really parse the document?
     return unless defined($results{$uri}{location}{type});
     if ($results{$uri}{location}{type} !~ $ContentTypes) {
@@ -1413,6 +1564,7 @@
       $results{$uri}{fragments}{$fragment} = 0;
     }
   }
+  }
 }
 
 sub escape_match ($\%)

Received on Monday, 29 August 2005 05:11:49 UTC