- From: Mercurial notifier <nobody@w3.org>
- Date: Thu, 05 Aug 2010 14:47:08 +0000
- To: link-checker updates <www-validator-cvs@w3.org>
changeset: 171:af4fe17d6ae9 user: ville date: Sun Jul 29 13:25:21 2007 +0000 files: bin/checklink description: Refactor IP address allowance and redirect progress callback code into our UserAgent class. diff -r 9a2df17fe54a -r af4fe17d6ae9 bin/checklink --- a/bin/checklink Sat Jul 28 10:44:32 2007 +0000 +++ b/bin/checklink Sun Jul 29 13:25:21 2007 +0000 @@ -5,7 +5,7 @@ # (c) 1999-2007 World Wide Web Consortium # based on Renaud Bruyeron's checklink.pl # -# $Id: checklink,v 4.55 2007-07-28 10:44:32 ville Exp $ +# $Id: checklink,v 4.56 2007-07-29 13:25:21 ville Exp $ # # This program is licensed under the W3C(r) Software License: # http://www.w3.org/Consortium/Legal/copyright-software @@ -34,7 +34,6 @@ package W3C::UserAgent; use LWP::RobotUA 1.19 qw(); -# @@@ Needs also W3C::LinkChecker but can't use() it here. @W3C::UserAgent::ISA = qw(LWP::RobotUA); @@ -55,43 +54,105 @@ $self->from(undef) unless $from_ok; $self->env_proxy(); + + $self->allow_private_ips(1); + return $self; +} + +sub allow_private_ips +{ + my $self = shift; + if (@_) { + $self->{Checklink_allow_private_ips} = shift; + if (!$self->{Checklink_allow_private_ips}) { + # Pull in dependencies + require Net::IP; + require Socket; + require Net::hostent; + } + } + return $self->{Checklink_allow_private_ips}; +} + +sub redirect_progress_callback +{ + my $self = shift; + $self->{Checklink_redirect_callback} = shift if @_; + return $self->{Checklink_redirect_callback}; } sub simple_request { my $self = shift; - my $response = do { + + my $response = $self->ip_disallowed($_[0]->uri()); + + $response ||= do { local $SIG{__WARN__} = sub { # Suppress some warnings, rt.cpan.org #18902 warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/); }; + # @@@ Why not just $self->SUPER::simple_request? $self->W3C::UserAgent::SUPER::simple_request(@_); }; + if (! defined($self->{FirstResponse})) { $self->{FirstResponse} = $response->code(); $self->{FirstMessage} = $response->message() || '(no message)'; } + return $response; } sub redirect_ok { my ($self, $request, $response) = @_; - if ($self->{Checklink_verbose_progress}) { - # @@@ TODO: when an LWP internal robots.txt request gets redirected, - # this will a bit confusingly print out info about it. Would need a - # robust way of determining whether something is a LWP "internal" request. - &W3C::LinkChecker::hprintf("\n%s %s ", $request->method(),$request->uri()); + + if (my $callback = $self->redirect_progress_callback()) { + &$callback($request->method(), $request->uri()); } + return 0 unless $self->SUPER::redirect_ok($request, $response); - if (my $res = &W3C::LinkChecker::ip_allowed($request->uri())) { + + if (my $res = $self->ip_disallowed($request->uri())) { $response->previous($response->clone()); $response->request($request); $response->code($res->code()); $response->message($res->message()); return 0; } + return 1; +} + +# +# Checks whether we're allowed to retrieve the document based on it's IP +# address. Takes an URI object and returns a HTTP::Response containing the +# appropriate status and error message if the IP was disallowed, 0 +# otherwise. URIs without hostname or IP address are always allowed, +# including schemes where those make no sense (eg. data:, often javascript:). +# +sub ip_disallowed +{ + my ($self, $uri) = @_; + return 0 if $self->allow_private_ips(); # Short-circuit + + my $hostname = undef; + eval { $hostname = $uri->host() }; # Not all URIs implement host()... + return 0 unless $hostname; + + my $addr = my $iptype = my $resp = undef; + if (my $host = Net::hostent::gethostbyname($hostname)) { + $addr = Socket::inet_ntoa($host->addr()) if $host->addr(); + if ($addr && (my $ip = Net::IP->new($addr))) { + $iptype = $ip->iptype(); + } + } + if ($iptype && $iptype ne 'PUBLIC') { + $resp = HTTP::Response->new(403, + 'Checking non-public IP address disallowed by link checker configuration'); + } + return $resp; } # ----------------------------------------------------------------------------- @@ -128,7 +189,7 @@ $PROGRAM = 'W3C-checklink'; $VERSION = '4.3'; $REVISION = sprintf('version %s (c) 1999-2007 W3C', $VERSION); - my ($cvsver) = q$Revision: 4.55 $ =~ /(\d+[\d\.]*\.\d+)/; + my ($cvsver) = q$Revision: 4.56 $ =~ /(\d+[\d\.]*\.\d+)/; $AGENT = sprintf('%s/%s [%s] %s', $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent()); @@ -229,22 +290,6 @@ ); undef $cmdline; -unless ($Opts{Allow_Private_IPs}) { - eval { - require Net::IP; - require Socket; - Socket->import('inet_ntoa'); - require Net::hostent; - }; - if ($@) { - die <<".EOF."; -Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and -Net::hostent modules: -$@ -.EOF. - } -} - # Global variables # What URI's did we process? (used for recursive mode) my %processed; @@ -278,6 +323,16 @@ $ua->conn_cache({ total_capacity => 1}); # 1 keep-alive connection $ua->delay($Opts{Sleep_Time}/60); $ua->timeout($Opts{Timeout}); +eval { + $ua->allow_private_ips($Opts{Allow_Private_IPs}); +}; +if ($@) { + die <<".EOF."; +Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and +Net::hostent modules: +$@ +.EOF. +} if ($Opts{Command_Line}) { @@ -1067,15 +1122,12 @@ } } - # Tell the user agent if we want progress reports (in redirects) or not. - $ua->{Checklink_verbose_progress} = $verbose_progress; - - # Check if the IP address is allowed. - my $response = &ip_allowed($request->uri()); - return $response if $response; + # Tell the user agent if we want progress reports for redirects or not. + $ua->redirect_progress_callback(sub { &hprintf("\n%s %s ", @_); }) + if $verbose_progress; # Do the query - $response = $ua->request($request); + my $response = $ua->request($request); # Get the results # Record the very first response @@ -1860,36 +1912,6 @@ } } -# -# Checks whether we're allowed to retrieve the document based on it's IP -# address. Takes an URI object and returns a HTTP::Response containing the -# appropriate status and error message if the IP was disallowed, 0 -# otherwise. URIs without hostname or IP address are always allowed, -# including schemes where those make no sense (eg. data:, often javascript:). -# -sub ip_allowed ($) -{ - my ($uri) = @_; - return 0 if $Opts{Allow_Private_IPs}; # Short-circuit - - my $hostname = undef; - eval { $hostname = $uri->host() }; # Not all URIs implement host()... - return 0 unless $hostname; - - my $addr = my $iptype = my $resp = undef; - if (my $host = Net::hostent::gethostbyname($hostname)) { - $addr = inet_ntoa($host->addr()) if $host->addr(); - if ($addr && (my $ip = Net::IP->new($addr))) { - $iptype = $ip->iptype(); - } - } - if ($iptype && $iptype ne 'PUBLIC') { - $resp = HTTP::Response->new(403, - 'Checking non-public IP address disallowed by link checker configuration'); - } - return $resp; -} - sub links_summary (\%\%\%\%) { # Advices to fix the problems
Received on Thursday, 5 August 2010 14:47:17 UTC