- From: Ville Skytta <ville@hutz.w3.org>
- Date: Wed, 07 Apr 2004 22:08:36 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LinkChecker/bin
In directory hutz:/tmp/cvs-serv4272/bin
Modified Files:
checklink
Log Message:
Initial (partial) implementation for support for robots exclusion standard.
The UI still needs some work: the results view should be fixed and some
new configuration options (admin address, minimum delay) should be added.
As a side effect, we now require libwww-perl >= 5.60.
Index: checklink
===================================================================
RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v
retrieving revision 3.23
retrieving revision 3.24
diff -u -d -r3.23 -r3.24
--- checklink 4 Apr 2004 16:13:39 -0000 3.23
+++ checklink 7 Apr 2004 22:08:34 -0000 3.24
@@ -33,10 +33,30 @@
package W3C::UserAgent;
-use LWP::UserAgent qw();
+use LWP::RobotUA qw();
# @@@ Needs also W3C::LinkChecker but can't use() it here.
-@W3C::UserAgent::ISA = qw(LWP::UserAgent);
+@W3C::UserAgent::ISA = qw(LWP::RobotUA);
+
+sub new
+{
+ my $proto = shift;
+ my $class = ref($proto) || $proto;
+ my ($name, $from, $rules) = @_;
+
+ # For security/privacy reasons, if $from was not given, do not send it.
+ # Cheat by defining something for the constructor, and resetting it later.
+ my $from_ok = $from;
+ $from ||= 'www-validator@w3.org';
+ # WWW::RobotRules <= 5.78 have bugs which cause suboptimal results with
+ # User-Agent substring matching against robots.txt files; "User-Agent: *"
+ # should work ok with all though, and "User-Agent: W3C-checklink" for >= 5.77
+ my $self = $class->SUPER::new($name, $from, $rules);
+ $self->from(undef) unless $from_ok;
+
+ $self->env_proxy();
+ return $self;
+}
sub simple_request
{
@@ -68,7 +88,7 @@
package W3C::LinkChecker;
-use vars qw($PROGRAM $AGENT $VERSION $CVS_VERSION $REVISION
+use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
$DocType $Accept $ContentTypes %Cfg);
use HTML::Entities qw();
@@ -89,12 +109,14 @@
BEGIN
{
# Version info
- $PROGRAM = 'W3C checklink';
- ($AGENT = $PROGRAM) =~ s/\s+/-/g;
- $VERSION = '3.9.2';
- ($CVS_VERSION) = q$Revision$ =~ /(\d+[\d\.]*\.\d+)/;
+ $PACKAGE = 'W3C Link Checker';
+ $PROGRAM = 'W3C-checklink';
+ $VERSION = '3.9.3-dev';
+ my ($cvsver) = q$Revision$ =~ /(\d+[\d\.]*\.\d+)/;
$REVISION = sprintf('version %s [%s] (c) 1999-2004 W3C',
- $VERSION, $CVS_VERSION);
+ $VERSION, $cvsver);
+ $AGENT = sprintf('%s/%s [%s] %s',
+ $PROGRAM, $VERSION, $cvsver,LWP::RobotUA->_agent());
# Pull in mod_perl modules if applicable.
if ($ENV{MOD_PERL}) {
@@ -223,7 +245,7 @@
my $first = 1;
foreach my $uri (@ARGV) {
if (!$Opts{Summary_Only}) {
- printf("%s %s\n", $PROGRAM, $REVISION) unless $Opts{HTML};
+ printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML};
} else {
$Opts{Verbose} = 0;
$Opts{Progress} = 0;
@@ -265,7 +287,7 @@
}
# Override undefined values from the cookie, if we got one.
- if (my %cookie = $query->cookie($AGENT)) {
+ if (my %cookie = $query->cookie($PROGRAM)) {
while (my ($key, $value) = each %cookie) {
$query->param($key, $value) unless defined($query->param($key));
}
@@ -292,7 +314,7 @@
# Save, clear or leave cookie as is.
my $cookie = '';
if (my $action = $query->param('cookie')) {
- my %cookie = (-name => $AGENT);
+ my %cookie = (-name => $PROGRAM);
if ($action eq 'clear') {
# Clear the cookie.
$cookie{-value} = '';
@@ -307,7 +329,7 @@
$cookie{-value} = \%options;
} else {
# Use the old values.
- $cookie{-value} = { $query->cookie($AGENT) };
+ $cookie{-value} = { $query->cookie($PROGRAM) };
}
}
$cookie = $query->cookie(%cookie);
@@ -415,7 +437,7 @@
sub version ()
{
- print "$PROGRAM $REVISION\n";
+ print "$PACKAGE $REVISION\n";
exit 0;
}
@@ -430,7 +452,7 @@
my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only';
select(STDERR) if $exitval;
- print "$msg$PROGRAM $REVISION
+ print "$msg$PACKAGE $REVISION
Usage: checklink <options> <uris>
Options:
@@ -508,6 +530,7 @@
###########################################################################
# Guess an Accept-Language header based on the $LANG environment variable #
###########################################################################
+
sub guess_language ()
{
my $lang = $ENV{LANG} or return undef;
@@ -893,12 +916,11 @@
$start = &get_timestamp() unless defined($start);
# Prepare the query
- my %lwpargs = ($LWP::VERSION >= 5.6) ? (keep_alive => 1) : ();
- my $ua = W3C::UserAgent->new(%lwpargs);
+ my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address
+ # @@@ make number of keep-alive connections and delay customizable
+ $ua->conn_cache({ total_capacity => 1}); # 1 keep-alive connection
+ $ua->delay(1/60); # 1 second
$ua->timeout($Opts{Timeout});
- $ua->agent(sprintf('%s/%s [%s] %s',
- $AGENT, $VERSION, $CVS_VERSION, $ua->agent()));
- $ua->env_proxy();
$ua->proxy('http', 'http://' . $Opts{HTTP_Proxy}) if $Opts{HTTP_Proxy};
# $ua->{fetching} contains the URI we originally wanted
@@ -966,6 +988,7 @@
}
# Record the redirects
$response->{Redirects} = $ua->{Redirects};
+ # @@@ subtract robot delay from the "fetched in" time?
&hprintf(" fetched in %ss\n",
&time_diff($start, &get_timestamp())) if $verbose_progress;
@@ -1524,6 +1547,7 @@
my $whattodo;
my $redirect_too;
if ($todo) {
+ my $currmsg = $results->{$u}{location}{message} || '';
if ($u =~ m/^javascript:/) {
if ($Opts{HTML}) {
$whattodo =
@@ -1539,12 +1563,17 @@
}
} elsif ($c == 500) {
# 500's could be a real 500 or a DNS lookup problem
- if ($results->{$u}{location}{message} =~
- m/Bad hostname '[^\']*'/) {
+ if ($currmsg =~ /Bad hostname '[^\']*'/) {
$whattodo = 'The hostname could not be resolved. This link needs to be fixed.';
} else {
$whattodo = 'This is a server-side problem. Check the URI.';
}
+ } elsif ($c == 403 && $currmsg =~ /Forbidden by robots\.txt/) {
+ if ($Opts{HTML}) {
+ $whattodo = 'The link was not checked due to <a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">robots exclusion</a> rules, check the link manually.';
+ } else {
+ $whattodo = 'Check the link manually.';
+ }
} elsif ($redirect_loop) {
$whattodo =
'Retrieving the URI results in a redirect loop, that should be ' .
@@ -2047,7 +2076,7 @@
print <<EOF;
<div>
<address>
-$PROGRAM $REVISION,
+$PACKAGE $REVISION,
by <a href="http://www.w3.org/People/Hugo/">Hugo Haas</a> and others.<br>
Please send bug reports, suggestions and comments to the
<a href="mailto:www-validator\@w3.org?subject=checklink%3A%20">www-validator
Received on Wednesday, 7 April 2004 18:08:36 UTC