- From: Ville Skyttä <ville@hutz.w3.org>
- Date: Mon, 19 Apr 2004 20:10:41 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LinkChecker/bin
In directory hutz:/tmp/cvs-serv21555/bin
Modified Files:
checklink
Log Message:
Implement "better" (yet somewhat hacky) handling of non-HTTP error codes,
currently used for "forbidden by robots.txt" and "bad hostname" messages from
LWP. While at it, display "forbidden by robots.txt" with a light gray
background (CVS class "dubious") instead of screaming-red-403.
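
(To illustrate the approach: a minimal, self-contained sketch of the
sentinel-code idea, assuming stock HTTP::Response from libwww-perl. The
classify_response() helper below is hypothetical, not part of checklink;
negative integers are safe sentinels because real HTTP status codes are
always three-digit positive numbers.)

    #!/usr/bin/perl
    use strict;
    use warnings;
    use HTTP::Response;

    # Negative values can never collide with real HTTP status codes.
    use constant RC_ROBOTS_TXT => -1; # LWP's synthesized "forbidden by robots.txt"
    use constant RC_DNS_ERROR  => -2; # LWP's synthesized "bad hostname"

    # Hypothetical helper: LWP reports robots.txt refusals as a plain 403
    # and failed hostname lookups as a plain 500, so sniff the message to
    # tell them apart from real server responses; pass real codes through.
    sub classify_response
    {
        my ($response) = @_;
        my $code = $response->code();
        return RC_ROBOTS_TXT
            if $code == 403 && $response->message() =~ /Forbidden by robots\.txt/;
        return RC_DNS_ERROR
            if $code == 500 && $response->message() =~ /Bad hostname '[^\']*'/;
        return $code;
    }

    # Display code can then special-case the sentinels, e.g. show '(N/A)'
    # instead of a misleading HTTP status number.
    my $r = HTTP::Response->new(403, 'Forbidden by robots.txt');
    my $c = classify_response($r);
    print $c < 0 ? "(N/A)\n" : "$c\n";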
Index: checklink
===================================================================
RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v
retrieving revision 3.35
retrieving revision 3.36
diff -u -d -r3.35 -r3.36
--- checklink 12 Apr 2004 15:38:12 -0000 3.35
+++ checklink 19 Apr 2004 20:10:38 -0000 3.36
@@ -73,6 +73,9 @@
{
my ($self, $request, $response) = @_;
if ($self->{Checklink_verbose_progress}) {
+ # @@@ TODO: when an LWP internal robots.txt request gets redirected,
+ # this will, somewhat confusingly, print out info about it. We would need
+ # a robust way of determining whether something is an LWP "internal" request.
&W3C::LinkChecker::hprintf("\n%s %s ", $request->method(),$request->uri());
}
return $self->SUPER::redirect_ok($request, $response);
@@ -98,6 +101,9 @@
use URI::file qw();
# @@@ Also needs W3C::UserAgent but can't use() it here.
+use constant RC_ROBOTS_TXT => -1;
+use constant RC_DNS_ERROR => -2;
+
@W3C::LinkChecker::ISA = qw(HTML::Parser);
BEGIN
@@ -1022,6 +1028,12 @@
$results{$uri}{response} = $response;
$results{$uri}{method} = $method;
$results{$uri}{location}{code} = $response->code();
+ $results{$uri}{location}{code} = RC_ROBOTS_TXT()
+ if ($results{$uri}{location}{code} == 403 &&
+ $response->message() =~ /Forbidden by robots\.txt/);
+ $results{$uri}{location}{code} = RC_DNS_ERROR()
+ if ($results{$uri}{location}{code} == 500 &&
+ $response->message() =~ /Bad hostname '[^\']*'/);
$results{$uri}{location}{type} = $response->header('Content-type');
$results{$uri}{location}{display} = $results{$uri}{location}{code};
# Rewind, check for the original code and message.
@@ -1573,19 +1585,9 @@
} else {
$whattodo = 'Change this link: people using a browser without JavaScript support will not be able to follow this link.';
}
- } elsif ($c == 500) {
- # 500's could be a real 500 or a DNS lookup problem
- if ($currmsg =~ /Bad hostname '[^\']*'/) {
- $whattodo = 'The hostname could not be resolved. This link needs to be fixed.';
- } else {
- $whattodo = 'This is a server-side problem. Check the URI.';
- }
- } elsif ($c == 403 && $currmsg =~ /Forbidden by robots\.txt/) {
- if ($Opts{HTML}) {
- $whattodo = 'The link was not checked due to <a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">robots exclusion</a> rules, check the link manually.';
- } else {
- $whattodo = 'Check the link manually.';
- }
+ } elsif ($c == RC_ROBOTS_TXT()) {
+ $whattodo = 'The link was not checked due to robots exclusion ' .
+ 'rules. Check the link manually.';
} elsif ($redirect_loop) {
$whattodo =
'Retrieving the URI results in a redirect loop that should be ' .
@@ -1611,6 +1613,7 @@
my @http_codes = ($currloc->{code});
unshift(@http_codes, $currloc->{orig}) if $currloc->{orig};
+ @http_codes = map { $_ < 0 ? '(N/A)' : $_ } @http_codes;
if ($Opts{HTML}) {
# Style stuff
@@ -1787,9 +1790,13 @@
408 => 'The request timed out.',
410 => 'The resource is gone. You should remove this link.',
415 => 'The media type is not supported.',
- 500 => 'Either the hostname is incorrect or it is a server side problem. Check the detailed list.',
+ 500 => 'This is a server-side problem. Check the URI.',
501 => 'Could not check this link: method not implemented or scheme not supported.',
- 503 => 'The server cannot service the request, for some unknown reason.');
+ 503 => 'The server cannot service the request, for some unknown reason.',
+ # Non-HTTP codes:
+ RC_ROBOTS_TXT() => 'The link was not checked due to <a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">robots exclusion rules</a>. Check the link manually.',
+ RC_DNS_ERROR() => 'The hostname could not be resolved. This link needs to be fixed.',
+ );
my %priority = ( 410 => 1,
404 => 2,
403 => 5,
@@ -1880,7 +1887,7 @@
foreach my $code (sort(keys(%code_summary))) {
printf('<tr%s>', &bgcolor($code));
printf('<td><a href="#d%scode_%s">%s</a></td>',
- $doc_count, $code, $code);
+ $doc_count, $code, $code < 0 ? '(N/A)' : $code);
printf('<td>%s</td>', $code_summary{$code});
printf('<td>%s</td>', $todo{$code});
print "</tr>\n";
@@ -2033,6 +2040,9 @@
.multiple {
background-color: fuchsia;
}
+.dubious {
+ background-color: lightgray;
+}
div#settings {
font-size: smaller;
float: right;
@@ -2054,6 +2064,8 @@
my $r = HTTP::Response->new($code);
if ($r->is_success()) {
return '';
+ } elsif ($code == RC_ROBOTS_TXT()) {
+ $class = 'dubious';
} elsif ($code == 300) {
$class = 'multiple';
} elsif ($code == 401) {
Received on Monday, 19 April 2004 16:10:45 UTC