- From: Olivier Thereaux via cvs-syncmail <cvsmail@w3.org>
- Date: Mon, 29 Aug 2005 05:11:41 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LinkChecker/bin In directory hutz:/tmp/cvs-serv8657 Modified Files: checklink Log Message: WARNING: this code is rather broken... This is not much more than a proof of concept of how we could be using LWP::Parallel::RobotUA for our requests. The basic idea is to send the list of links as an array to $ua->register() instead of request()ing them sequentially. Done by - modifying check_uri to pass an array of URIs to check_validity - modifying check_validity to pass this array to get_uris (a clone of get_uri) - cloning get_uri into get_uris The latter is only roughly done, and there probably is some loss of information (recursion, etc) in the process. This is a rather poor patch but the best I can wrap my head around right now, feel free to revert if it's too ugly. Index: checklink =================================================================== RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v retrieving revision 4.21 retrieving revision 4.22 diff -u -d -r4.21 -r4.22 --- checklink 15 May 2005 13:46:42 -0000 4.21 +++ checklink 29 Aug 2005 05:11:39 -0000 4.22 @@ -33,10 +33,14 @@ package W3C::UserAgent; -use LWP::RobotUA 1.19 qw(); +#use LWP::RobotUA 1.19 qw(); +use Exporter(); +use LWP::Parallel::RobotUA qw(:CALLBACK); # @@@ Needs also W3C::LinkChecker but can't use() it here. -@W3C::UserAgent::ISA = qw(LWP::RobotUA); +@W3C::UserAgent::ISA = qw(LWP::Parallel::RobotUA Exporter); +@W3C::UserAgent::EXPORT = @LWP::Parallel::RobotUA::EXPORT_OK; + sub new { @@ -69,6 +73,28 @@ return $response; } + + # on_return gets called whenever a connection (or its callback) + # returns EOF (or any other terminating status code available for + # callback functions). Please note that on_return gets called for + # any successfully terminated HTTP connection! This does not imply + # that the response sent from the server is a success! + # see http://search.cpan.org/~marclang/ParallelUserAgent-2.57/lib/LWP/Parallel.pm +# sub on_return { +# my ($self, $request, $response, $entry) = @_; +# if ($response->is_success) { +# print "\n\nWoa! Request to ",$request->url," returned code ", $response->code, +# ": ", $response->message, "\n"; +# print $response->content; +# } else { +# print "\n\nBummer! Request to ",$request->url," returned code ", $response->code, +# ": ", $response->message, "\n"; +# # print $response->error_as_HTML; +# } +# return; +# } + + sub redirect_ok { my ($self, $request, $response) = @_; @@ -120,7 +146,7 @@ $REVISION = sprintf('version %s (c) 1999-2005 W3C', $VERSION); my ($cvsver) = q$Revision$ =~ /(\d+[\d\.]*\.\d+)/; $AGENT = sprintf('%s/%s [%s] %s', - $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent()); + $PROGRAM, $VERSION, $cvsver, LWP::Parallel::RobotUA->_agent()); # Pull in mod_perl modules if applicable. if ($ENV{MOD_PERL}) { @@ -268,6 +294,13 @@ $ua->timeout($Opts{Timeout}); $ua->proxy('http', 'http://' . $Opts{HTTP_Proxy}) if $Opts{HTTP_Proxy}; +#settings for the parallel crawling of resources +# TODO make these configuration options +$ua->max_req(3); # max num of requests to a given host at a time +$ua->max_hosts(5); # max num of hosts polled in parallel + + + if ($Opts{Command_Line}) { require Text::Wrap; @@ -772,6 +805,7 @@ # Build the list of broken URI's my %broken; + my @links_to_check; foreach my $u (keys %links) { # Don't check mailto: URI's @@ -783,11 +817,17 @@ } else { &hprintf("Checking link %s\n", $u); } - + push (@links_to_check, $u); # Check that a link is valid - &check_validity($uri, $u, - ($depth != 0 && &in_recursion_scope($u)), - \%links, \%redirects); + } + # &check_validity($uri, @links_to_check, + # ($depth != 0 && &in_recursion_scope($u)), + # \%links, \%redirects); + # TODO fix recursion scope issue + &check_validity($uri, @links_to_check); + + + foreach my $u (keys %links) { &hprintf("\tReturn code: %s\n", $results{$u}{location}{code}) if ($Opts{Verbose}); if ($results{$u}{location}{success}) { @@ -1041,7 +1081,13 @@ return $response if $response; # Do the query - $response = $ua->request($request); + $ua->register($request); + my $entries = $ua->wait(); + + foreach (keys %$entries) { + $response = $entries->{$_}->response; + + # Get the results # Record the very first response @@ -1076,6 +1122,95 @@ $response->{Realm} = $realm if defined($realm); return $response; + + } # end foreach above +} + +sub get_uris (%@;$\%$$$$) +{ + # Here we have a lot of extra parameters in order not to lose information + # if the function is called several times (401's) + my (%methods, @uris, $start, $redirects, $code, $realm, $message, $auth) = @_; + + # $method contains the method used + # $uri contains the target of the request + # $start is a timestamp (not defined the first time the function is + # called) + # $redirects is a map of redirects + # $code is the first HTTP return code + # $realm is the realm of the request + # $message is the HTTP message received + # $auth equals 1 if we want to send out authentication information + + # For timing purposes + $start = &get_timestamp() unless defined($start); + + # Prepare the queries + foreach my $uri (@uris) { + my $method = $methods{$uri}; + my $request = new HTTP::Request($method, $uri); + $request->header('Accept-Language' => $Opts{Accept_Language}) + if $Opts{Accept_Language}; + $request->header('Accept', $Accept); + # Are we providing authentication info? + if ($auth && $request->url()->host() =~ $Opts{Trusted}) { + if (defined($ENV{HTTP_AUTHORIZATION})) { + $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION}); + } elsif (defined($Opts{User}) && defined($Opts{Password})) { + $request->authorization_basic($Opts{User}, $Opts{Password}); + } + } + + + # Check if the IP address is allowed. + my $response = &ip_allowed($request->uri()); + next if $response; + + # Do the query + $ua->register($request); + print $request; + } + + my $entries = $ua->wait(); + + my %responses; + + foreach (keys %$entries) { + + my $response = $entries->{$_}->response; + my $uri = $response->request()->url(); + # Get the results + # Record the very first response + if (! defined($code)) { + ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)}); + } + # Authentication requested? + if ($response->code() == 401 && + !defined($auth) && + (defined($ENV{HTTP_AUTHORIZATION}) + || (defined($Opts{User}) && defined($Opts{Password})))) { + + # Set host as trusted domain unless we already have one. + if (!$Opts{Trusted}) { + my $re = sprintf('^%s$', quotemeta($response->base()->host())); + $Opts{Trusted} = qr/$re/io; + } + + # Deal with authentication and avoid loops + if (! defined($realm)) { + $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/; + $realm = $1; + } + return &get_uri($methods{$uri}, $response->request()->url(), + $start, $redirects, $code, $realm, $message, 1); + } + # @@@ subtract robot delay from the "fetched in" time? + $response->{Realm} = $realm if defined($realm); + + $responses{$response->request()->url()} = $response; + } # end foreach above + + return %responses; } ######################################### @@ -1344,51 +1479,67 @@ sub check_validity ($$$\%\%) { - my ($testing, $uri, $want_links, $links, $redirects) = @_; + my ($testing, @links_to_check, $want_links, $links, $redirects) = @_; # $testing is the URI of the document checked # $uri is the URI of the target that we are verifying # $want_links is true if we're interested in links in the target doc # $links is a hash of the links in the documents checked # $redirects is a map of the redirects encountered - # Checking file: URI's is not allowed with a CGI - if ($testing ne $uri) { - if (!$Opts{Command_Line} && $testing !~ m/^file:/ && $uri =~ m/^file:/) { - my $msg = 'Error: \'file:\' URI not allowed'; - # Can't test? Return 400 Bad request. - $results{$uri}{location}{code} = 400; - $results{$uri}{location}{record} = 400; - $results{$uri}{location}{success} = 0; - $results{$uri}{location}{message} = $msg; - &hprintf("Error: %d %s\n", 400, $msg) if $Opts{Verbose}; - return; + my @links_checked_final; + my %methods; + my %being_processed; + foreach my $uri (@links_to_check) + { + # Checking file: URI's is not allowed with a CGI + if ($testing ne $uri) { + if (!$Opts{Command_Line} && $testing !~ m/^file:/ && $uri =~ m/^file:/) { + my $msg = 'Error: \'file:\' URI not allowed'; + # Can't test? Return 400 Bad request. + $results{$uri}{location}{code} = 400; + $results{$uri}{location}{record} = 400; + $results{$uri}{location}{success} = 0; + $results{$uri}{location}{message} = $msg; + &hprintf("Error: %d %s\n", 400, $msg) if $Opts{Verbose}; + } + else { + push @links_checked_final, $uri; + } } - } - - # Get the document with the appropriate method - # Only use GET if there are fragments. HEAD is enough if it's not the - # case. - my @fragments = keys %{$links->{$uri}{fragments}}; - my $method = scalar(@fragments) ? 'GET' : 'HEAD'; + - my $response; - my $being_processed = 0; - if ((! defined($results{$uri})) - || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { - $being_processed = 1; - $response = &get_uri($method, $uri); + # Get the document with the appropriate method + # Only use GET if there are fragments. HEAD is enough if it's not the + # case. + my @fragments = keys %{$links->{$uri}{fragments}}; + my $method = scalar(@fragments) ? 'GET' : 'HEAD'; + $methods{$uri}=$method; + my $response; + $being_processed{$uri} = 0; + if ((! defined($results{$uri})) + || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { + $being_processed{$uri} = 1; + } + } + my %responses = &get_uris(%methods, @links_checked_final); + + foreach my $uri (keys %responses) { + my $response = $responses{$uri}; + my $method = $methods{$uri}; # Get the information back from get_uri() &record_results($uri, $method, $response); # Record the redirects - &record_redirects($redirects, $response); - } + # FIXME where were these set?! + #&record_redirects($redirects, $response); + + # We got the response of the HTTP request. Stop here if it was a HEAD. - return if ($method eq 'HEAD'); + next if ($method eq 'HEAD'); # There are fragments. Parse the document. my $p; - if ($being_processed) { + if ($being_processed{$uri}) { # Can we really parse the document? return unless defined($results{$uri}{location}{type}); if ($results{$uri}{location}{type} !~ $ContentTypes) { @@ -1413,6 +1564,7 @@ $results{$uri}{fragments}{$fragment} = 0; } } + } } sub escape_match ($\%)
Received on Monday, 29 August 2005 05:11:49 UTC