- From: Ville Skytta via cvs-syncmail <cvsmail@w3.org>
- Date: Sun, 29 Jul 2007 13:25:24 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LinkChecker/bin In directory hutz:/tmp/cvs-serv10786 Modified Files: checklink Log Message: Refactor IP address allowance and redirect progress callback code into our UserAgent class. Index: checklink =================================================================== RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v retrieving revision 4.55 retrieving revision 4.56 diff -u -d -r4.55 -r4.56 --- checklink 28 Jul 2007 10:44:32 -0000 4.55 +++ checklink 29 Jul 2007 13:25:21 -0000 4.56 @@ -34,7 +34,6 @@ package W3C::UserAgent; use LWP::RobotUA 1.19 qw(); -# @@@ Needs also W3C::LinkChecker but can't use() it here. @W3C::UserAgent::ISA = qw(LWP::RobotUA); @@ -55,45 +54,107 @@ $self->from(undef) unless $from_ok; $self->env_proxy(); + + $self->allow_private_ips(1); + return $self; } +sub allow_private_ips +{ + my $self = shift; + if (@_) { + $self->{Checklink_allow_private_ips} = shift; + if (!$self->{Checklink_allow_private_ips}) { + # Pull in dependencies + require Net::IP; + require Socket; + require Net::hostent; + } + } + return $self->{Checklink_allow_private_ips}; +} + +sub redirect_progress_callback +{ + my $self = shift; + $self->{Checklink_redirect_callback} = shift if @_; + return $self->{Checklink_redirect_callback}; +} + sub simple_request { my $self = shift; - my $response = do { + + my $response = $self->ip_disallowed($_[0]->uri()); + + $response ||= do { local $SIG{__WARN__} = sub { # Suppress some warnings, rt.cpan.org #18902 warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/); }; + # @@@ Why not just $self->SUPER::simple_request? $self->W3C::UserAgent::SUPER::simple_request(@_); }; + if (! defined($self->{FirstResponse})) { $self->{FirstResponse} = $response->code(); $self->{FirstMessage} = $response->message() || '(no message)'; } + return $response; } sub redirect_ok { my ($self, $request, $response) = @_; - if ($self->{Checklink_verbose_progress}) { - # @@@ TODO: when an LWP internal robots.txt request gets redirected, - # this will a bit confusingly print out info about it. Would need a - # robust way of determining whether something is a LWP "internal" request. - &W3C::LinkChecker::hprintf("\n%s %s ", $request->method(),$request->uri()); + + if (my $callback = $self->redirect_progress_callback()) { + &$callback($request->method(), $request->uri()); } + return 0 unless $self->SUPER::redirect_ok($request, $response); - if (my $res = &W3C::LinkChecker::ip_allowed($request->uri())) { + + if (my $res = $self->ip_disallowed($request->uri())) { $response->previous($response->clone()); $response->request($request); $response->code($res->code()); $response->message($res->message()); return 0; } + return 1; } +# +# Checks whether we're allowed to retrieve the document based on it's IP +# address. Takes an URI object and returns a HTTP::Response containing the +# appropriate status and error message if the IP was disallowed, 0 +# otherwise. URIs without hostname or IP address are always allowed, +# including schemes where those make no sense (eg. data:, often javascript:). +# +sub ip_disallowed +{ + my ($self, $uri) = @_; + return 0 if $self->allow_private_ips(); # Short-circuit + + my $hostname = undef; + eval { $hostname = $uri->host() }; # Not all URIs implement host()... + return 0 unless $hostname; + + my $addr = my $iptype = my $resp = undef; + if (my $host = Net::hostent::gethostbyname($hostname)) { + $addr = Socket::inet_ntoa($host->addr()) if $host->addr(); + if ($addr && (my $ip = Net::IP->new($addr))) { + $iptype = $ip->iptype(); + } + } + if ($iptype && $iptype ne 'PUBLIC') { + $resp = HTTP::Response->new(403, + 'Checking non-public IP address disallowed by link checker configuration'); + } + return $resp; +} + # ----------------------------------------------------------------------------- package W3C::LinkChecker; @@ -229,22 +290,6 @@ ); undef $cmdline; -unless ($Opts{Allow_Private_IPs}) { - eval { - require Net::IP; - require Socket; - Socket->import('inet_ntoa'); - require Net::hostent; - }; - if ($@) { - die <<".EOF."; -Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and -Net::hostent modules: -$@ -.EOF. - } -} - # Global variables # What URI's did we process? (used for recursive mode) my %processed; @@ -278,6 +323,16 @@ $ua->conn_cache({ total_capacity => 1}); # 1 keep-alive connection $ua->delay($Opts{Sleep_Time}/60); $ua->timeout($Opts{Timeout}); +eval { + $ua->allow_private_ips($Opts{Allow_Private_IPs}); +}; +if ($@) { + die <<".EOF."; +Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and +Net::hostent modules: +$@ +.EOF. +} if ($Opts{Command_Line}) { @@ -1067,15 +1122,12 @@ } } - # Tell the user agent if we want progress reports (in redirects) or not. - $ua->{Checklink_verbose_progress} = $verbose_progress; - - # Check if the IP address is allowed. - my $response = &ip_allowed($request->uri()); - return $response if $response; + # Tell the user agent if we want progress reports for redirects or not. + $ua->redirect_progress_callback(sub { &hprintf("\n%s %s ", @_); }) + if $verbose_progress; # Do the query - $response = $ua->request($request); + my $response = $ua->request($request); # Get the results # Record the very first response @@ -1860,36 +1912,6 @@ } } -# -# Checks whether we're allowed to retrieve the document based on it's IP -# address. Takes an URI object and returns a HTTP::Response containing the -# appropriate status and error message if the IP was disallowed, 0 -# otherwise. URIs without hostname or IP address are always allowed, -# including schemes where those make no sense (eg. data:, often javascript:). -# -sub ip_allowed ($) -{ - my ($uri) = @_; - return 0 if $Opts{Allow_Private_IPs}; # Short-circuit - - my $hostname = undef; - eval { $hostname = $uri->host() }; # Not all URIs implement host()... - return 0 unless $hostname; - - my $addr = my $iptype = my $resp = undef; - if (my $host = Net::hostent::gethostbyname($hostname)) { - $addr = inet_ntoa($host->addr()) if $host->addr(); - if ($addr && (my $ip = Net::IP->new($addr))) { - $iptype = $ip->iptype(); - } - } - if ($iptype && $iptype ne 'PUBLIC') { - $resp = HTTP::Response->new(403, - 'Checking non-public IP address disallowed by link checker configuration'); - } - return $resp; -} - sub links_summary (\%\%\%\%) { # Advices to fix the problems
Received on Sunday, 29 July 2007 13:25:32 UTC