From: Mercurial notifier <nobody@w3.org>
Date: Thu, 05 Aug 2010 14:46:55 +0000
To: link-checker updates <www-validator-cvs@w3.org>
changeset:   44:73bdd57cab9b
user:        ville
date:        Wed Apr 07 22:08:34 2004 +0000
files:       Makefile.PL bin/checklink docs/checklink.html
description:
Initial (partial) implementation for support for robots exclusion standard.
The UI still needs some work: the results view should be fixed and some new
configuration options (admin address, minimum delay) should be added.
As a side effect, we now require libwww-perl >= 5.60.


diff -r 53229d7ca1d2 -r 73bdd57cab9b Makefile.PL
--- a/Makefile.PL	Sun Apr 04 16:13:39 2004 +0000
+++ b/Makefile.PL	Wed Apr 07 22:08:34 2004 +0000
@@ -25,7 +25,7 @@
     CGI             => 0,
     Config::General => 2.06,
     HTML::Parser    => 3.00,
-    LWP::UserAgent  => 0,
+    LWP::RobotUA    => 0,
     Net::IP         => 0,
     Term::ReadKey   => 2.00,
     Time::HiRes     => 0,
diff -r 53229d7ca1d2 -r 73bdd57cab9b bin/checklink
--- a/bin/checklink	Sun Apr 04 16:13:39 2004 +0000
+++ b/bin/checklink	Wed Apr 07 22:08:34 2004 +0000
@@ -5,7 +5,7 @@
 # (c) 1999-2004 World Wide Web Consortium
 # based on Renaud Bruyeron's checklink.pl
 #
-# $Id: checklink,v 3.23 2004-04-04 16:13:39 ville Exp $
+# $Id: checklink,v 3.24 2004-04-07 22:08:34 ville Exp $
 #
 # This program is licensed under the W3C(r) Software License:
 # http://www.w3.org/Consortium/Legal/copyright-software
@@ -33,10 +33,30 @@
 
 package W3C::UserAgent;
 
-use LWP::UserAgent qw();
+use LWP::RobotUA qw();
 # @@@ Needs also W3C::LinkChecker but can't use() it here.
 
-@W3C::UserAgent::ISA = qw(LWP::UserAgent);
+@W3C::UserAgent::ISA = qw(LWP::RobotUA);
+
+sub new
+{
+  my $proto = shift;
+  my $class = ref($proto) || $proto;
+  my ($name, $from, $rules) = @_;
+
+  # For security/privacy reasons, if $from was not given, do not send it.
+  # Cheat by defining something for the constructor, and resetting it later.
+  my $from_ok = $from;
+  $from ||= 'www-validator@w3.org';
+  # WWW::RobotRules <= 5.78 have bugs which cause suboptimal results with
+  # User-Agent substring matching against robots.txt files; "User-Agent: *"
+  # should work ok with all though, and "User-Agent: W3C-checklink" for >= 5.77
+  my $self = $class->SUPER::new($name, $from, $rules);
+  $self->from(undef) unless $from_ok;
+
+  $self->env_proxy();
+  return $self;
+}
 
 sub simple_request
 {
@@ -68,7 +88,7 @@
 
 package W3C::LinkChecker;
 
-use vars qw($PROGRAM $AGENT $VERSION $CVS_VERSION $REVISION
+use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
             $DocType $Accept $ContentTypes %Cfg);
 
 use HTML::Entities qw();
@@ -89,12 +109,14 @@
 BEGIN
 {
   # Version info
-  $PROGRAM = 'W3C checklink';
-  ($AGENT = $PROGRAM) =~ s/\s+/-/g;
-  $VERSION = '3.9.2';
-  ($CVS_VERSION) = q$Revision: 3.23 $ =~ /(\d+[\d\.]*\.\d+)/;
+  $PACKAGE = 'W3C Link Checker';
+  $PROGRAM = 'W3C-checklink';
+  $VERSION = '3.9.3-dev';
+  my ($cvsver) = q$Revision: 3.24 $ =~ /(\d+[\d\.]*\.\d+)/;
   $REVISION = sprintf('version %s [%s] (c) 1999-2004 W3C',
-                      $VERSION, $CVS_VERSION);
+                      $VERSION, $cvsver);
+  $AGENT = sprintf('%s/%s [%s] %s',
+                   $PROGRAM, $VERSION, $cvsver,LWP::RobotUA->_agent());
 
   # Pull in mod_perl modules if applicable.
   if ($ENV{MOD_PERL}) {
@@ -223,7 +245,7 @@
   my $first = 1;
   foreach my $uri (@ARGV) {
     if (!$Opts{Summary_Only}) {
-      printf("%s %s\n", $PROGRAM, $REVISION) unless $Opts{HTML};
+      printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML};
     } else {
       $Opts{Verbose} = 0;
       $Opts{Progress} = 0;
@@ -265,7 +287,7 @@
     }
 
     # Override undefined values from the cookie, if we got one.
-    if (my %cookie = $query->cookie($AGENT)) {
+    if (my %cookie = $query->cookie($PROGRAM)) {
       while (my ($key, $value) = each %cookie) {
         $query->param($key, $value) unless defined($query->param($key));
       }
@@ -292,7 +314,7 @@
     # Save, clear or leave cookie as is.
     my $cookie = '';
     if (my $action = $query->param('cookie')) {
-      my %cookie = (-name => $AGENT);
+      my %cookie = (-name => $PROGRAM);
       if ($action eq 'clear') {
         # Clear the cookie.
         $cookie{-value} = '';
@@ -307,7 +329,7 @@
         $cookie{-value} = \%options;
       } else {
         # Use the old values.
-        $cookie{-value} = { $query->cookie($AGENT) };
+        $cookie{-value} = { $query->cookie($PROGRAM) };
       }
     }
     $cookie = $query->cookie(%cookie);
@@ -415,7 +437,7 @@
 
 sub version ()
 {
-  print "$PROGRAM $REVISION\n";
+  print "$PACKAGE $REVISION\n";
   exit 0;
 }
 
@@ -430,7 +452,7 @@
   my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only';
 
   select(STDERR) if $exitval;
-  print "$msg$PROGRAM $REVISION
+  print "$msg$PACKAGE $REVISION
 
 Usage: checklink <options> <uris>
 Options:
@@ -508,6 +530,7 @@
 ###########################################################################
 # Guess an Accept-Language header based on the $LANG environment variable #
 ###########################################################################
+
 sub guess_language ()
 {
   my $lang = $ENV{LANG} or return undef;
@@ -893,12 +916,11 @@
   $start = &get_timestamp() unless defined($start);
 
   # Prepare the query
-  my %lwpargs = ($LWP::VERSION >= 5.6) ? (keep_alive => 1) : ();
-  my $ua = W3C::UserAgent->new(%lwpargs);
+  my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address
+  # @@@ make number of keep-alive connections and delay customizable
+  $ua->conn_cache({ total_capacity => 1}); # 1 keep-alive connection
+  $ua->delay(1/60); # 1 second
   $ua->timeout($Opts{Timeout});
-  $ua->agent(sprintf('%s/%s [%s] %s',
-                     $AGENT, $VERSION, $CVS_VERSION, $ua->agent()));
-  $ua->env_proxy();
   $ua->proxy('http', 'http://' . $Opts{HTTP_Proxy}) if $Opts{HTTP_Proxy};
 
   # $ua->{fetching} contains the URI we originally wanted
@@ -966,6 +988,7 @@
   }
   # Record the redirects
   $response->{Redirects} = $ua->{Redirects};
+  # @@@ subtract robot delay from the "fetched in" time?
   &hprintf(" fetched in %ss\n", &time_diff($start, &get_timestamp()))
     if $verbose_progress;
 
@@ -1524,6 +1547,7 @@
   my $whattodo;
   my $redirect_too;
   if ($todo) {
+    my $currmsg = $results->{$u}{location}{message} || '';
     if ($u =~ m/^javascript:/) {
       if ($Opts{HTML}) {
         $whattodo =
@@ -1539,11 +1563,16 @@
       }
     } elsif ($c == 500) {
       # 500's could be a real 500 or a DNS lookup problem
-      if ($results->{$u}{location}{message} =~
-          m/Bad hostname '[^\']*'/) {
+      if ($currmsg =~ /Bad hostname '[^\']*'/) {
        $whattodo = 'The hostname could not be resolved. This link needs to be fixed.';
       } else {
        $whattodo = 'This is a server-side problem. Check the URI.';
+      }
+    } elsif ($c == 403 && $currmsg =~ /Forbidden by robots\.txt/) {
+      if ($Opts{HTML}) {
+        $whattodo = 'The link was not checked due to <a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">robots exclusion</a> rules, check the link manually.';
+      } else {
+        $whattodo = 'Check the link manually.';
       }
     } elsif ($redirect_loop) {
       $whattodo =
@@ -2047,7 +2076,7 @@
   print <<EOF;
 <div>
 <address>
-$PROGRAM $REVISION,
+$PACKAGE $REVISION,
 by <a href="http://www.w3.org/People/Hugo/">Hugo Haas</a> and others.<br>
 Please send bug reports, suggestions and comments to the
 <a href="mailto:www-validator\@w3.org?subject=checklink%3A%20">www-validator
diff -r 53229d7ca1d2 -r 73bdd57cab9b docs/checklink.html
--- a/docs/checklink.html	Sun Apr 04 16:13:39 2004 +0000
+++ b/docs/checklink.html	Wed Apr 07 22:08:34 2004 +0000
@@ -6,7 +6,7 @@
     <title>W3C Link Checker Documentation</title>
     <link rev="made" href="mailto:www-validator@w3.org" />
     <style type="text/css" media="all">@import "checklink.css";</style>
-    <meta name="revision" content="$Id: checklink.html,v 1.10 2004-03-31 19:22:19 ville Exp $" />
+    <meta name="revision" content="$Id: checklink.html,v 1.11 2004-04-07 22:08:34 ville Exp $" />
   </head>
 
   <body>
@@ -127,7 +127,7 @@
       <li><a href="http://search.cpan.org/dist/CGI.pm/">CGI.pm</a> (required for CGI mode only)</li>
       <li><a href="http://search.cpan.org/dist/Config-General/">Config-General</a> (optional, version 2.06 or newer; required only for reading the (optional) configuration file)</li>
       <li><a href="http://search.cpan.org/dist/HTML-Parser/">HTML-Parser</a> (version 3.00 or newer)</li>
-      <li><a href="http://search.cpan.org/dist/libwww-perl/">libwww-perl</a> (version 5.60 or newer if you want HTTP/1.1 with <code>Keep-Alive</code>; version 5.70 or newer recommended, except for 5.76 which has a bug that causes the link checker to detect many spurious redirect loops and to follow redirects to <code>file:</code> URLs)</li>
+      <li><a href="http://search.cpan.org/dist/libwww-perl/">libwww-perl</a> (version 5.60 or newer; version 5.70 or newer recommended, except for 5.76 which has a bug that causes the link checker to detect many spurious redirect loops and to follow redirects to <code>file:</code> URLs)</li>
       <li><a href="http://search.cpan.org/dist/Net-IP/">Net-IP</a></li>
       <li><a href="http://search.cpan.org/dist/TermReadKey/">TermReadKey</a> (optional but recommended; required only in command line mode for password input)</li>
       <li><a href="http://search.cpan.org/dist/Time-HiRes/">Time-HiRes</a></li>
@@ -206,7 +206,7 @@
         alt="Valid XHTML 1.0!" /></a>
       <a title="Send Feedback for the W3C Link Checker"
         href="http://validator.w3.org/feedback.html">The W3C Validator Team</a><br />
-      $Date: 2004-03-31 19:22:19 $
+      $Date: 2004-04-07 22:08:34 $
     </address>
     <p class="copyright">
       <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> © 1994-2004
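
The essence of the patch above is the switch from LWP::UserAgent to LWP::RobotUA, which fetches and honours robots.txt for each host and enforces a per-host request delay. The standalone Perl sketch below is not checklink itself; the agent name, contact address, and URL are illustrative placeholders. It exercises the same LWP::RobotUA calls that appear in the diff (delay(), conn_cache(), timeout(), env_proxy()) and the synthetic "Forbidden by robots.txt" 403 response that the new error message reports to the user.

#!/usr/bin/perl -w
# Minimal sketch of the LWP::RobotUA usage introduced by this changeset.
# Not part of checklink; agent name, contact address and URL are placeholders.
use strict;
use LWP::RobotUA;
use HTTP::Request;

# Agent name, contact address, and (optionally) a shared WWW::RobotRules object.
my $ua = LWP::RobotUA->new('Example-linkchecker/0.1', 'webmaster@example.org');

$ua->delay(1/60);                          # delay() is in minutes: wait 1 second per host
$ua->conn_cache({ total_capacity => 1 });  # keep 1 persistent (keep-alive) connection
$ua->timeout(30);                          # give up waiting for a response after 30 seconds
$ua->env_proxy();                          # pick up http_proxy etc. from the environment

my $response = $ua->request(HTTP::Request->new(GET => 'http://www.example.org/'));

# LWP::RobotUA answers with a synthetic 403 when robots.txt disallows the URL;
# this is the "Forbidden by robots.txt" case the patch turns into advice to the user.
if ($response->code == 403 && $response->message =~ /Forbidden by robots\.txt/) {
    print "Not checked due to robots exclusion rules; check the link manually.\n";
} else {
    printf("%s %s\n", $response->code, $response->message);
}

Note that delay() takes minutes, hence 1/60 for a one-second pause, matching the hard-coded value in the patch that the commit message flags as a future "minimum delay" configuration option.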
Received on Thursday, 5 August 2010 14:46:57 UTC