link-checker commit: Implement "better" (yet somewhat hacky) handling of non-HTTP error codes, from Mercurial notifier on 2010-08-05 (www-validator-cvs@w3.org from August 2010)

From: Mercurial notifier <nobody@w3.org>
Date: Thu, 05 Aug 2010 14:46:57 +0000
To: link-checker updates <www-validator-cvs@w3.org>
Message-Id: <E1Oh1ij-0005aw-Ko@blinky.w3.org>
changeset:   62:65e7f84fa6ff
user:        ville
date:        Mon Apr 19 20:10:38 2004 +0000
files:       bin/checklink
description:
Implement "better" (yet somewhat hacky) handling of non-HTTP error codes,
currently used for "forbidden by robots.txt" and "bad hostname" messages from
LWP.  While at it, display "forbidden by robots.txt" with a light gray
background (CSS class "dubious") instead of screaming-red-403.


diff -r 445cb94a5278 -r 65e7f84fa6ff bin/checklink
--- a/bin/checklink	Sat Apr 17 10:28:07 2004 +0000
+++ b/bin/checklink	Mon Apr 19 20:10:38 2004 +0000
@@ -5,7 +5,7 @@
 # (c) 1999-2004 World Wide Web Consortium
 # based on Renaud Bruyeron's checklink.pl
 #
-# $Id: checklink,v 3.35 2004-04-12 15:38:12 ville Exp $
+# $Id: checklink,v 3.36 2004-04-19 20:10:38 ville Exp $
 #
 # This program is licensed under the W3C(r) Software License:
 #       http://www.w3.org/Consortium/Legal/copyright-software
@@ -73,6 +73,9 @@
 {
   my ($self, $request, $response) = @_;
   if ($self->{Checklink_verbose_progress}) {
+    # @@@ TODO: when an LWP internal robots.txt request gets redirected,
+    # this will a bit confusingly print out info about it.  Would need a
+    # robust way of determining whether something is a LWP "internal" request.
     &W3C::LinkChecker::hprintf("\n%s %s ", $request->method(),$request->uri());
   }
   return $self->SUPER::redirect_ok($request, $response);
@@ -98,6 +101,9 @@
 use URI::file            qw();
 # @@@ Needs also W3C::UserAgent but can't use() it here.
 
+use constant RC_ROBOTS_TXT => -1;
+use constant RC_DNS_ERROR  => -2;
+
 @W3C::LinkChecker::ISA =  qw(HTML::Parser);
 
 BEGIN
@@ -106,7 +112,7 @@
   $PACKAGE       = 'W3C Link Checker';
   $PROGRAM       = 'W3C-checklink';
   $VERSION       = '3.9.3-dev';
-  my ($cvsver)   = q$Revision: 3.35 $ =~ /(\d+[\d\.]*\.\d+)/;
+  my ($cvsver)   = q$Revision: 3.36 $ =~ /(\d+[\d\.]*\.\d+)/;
   $REVISION      = sprintf('version %s [%s] (c) 1999-2004 W3C',
                            $VERSION, $cvsver);
   $AGENT         = sprintf('%s/%s [%s] %s',
@@ -1022,6 +1028,12 @@
   $results{$uri}{response} = $response;
   $results{$uri}{method} = $method;
   $results{$uri}{location}{code} = $response->code();
+  $results{$uri}{location}{code} = RC_ROBOTS_TXT()
+    if ($results{$uri}{location}{code} == 403 &&
+        $response->message() =~ /Forbidden by robots\.txt/);
+  $results{$uri}{location}{code} = RC_DNS_ERROR()
+    if ($results{$uri}{location}{code} == 500 &&
+        $response->message() =~ /Bad hostname '[^\']*'/);
   $results{$uri}{location}{type} = $response->header('Content-type');
   $results{$uri}{location}{display} = $results{$uri}{location}{code};
   # Rewind, check for the original code and message.
@@ -1573,19 +1585,9 @@
         } else {
           $whattodo = 'Change this link: people using a browser without Javascript support will not be able to follow this link.';
         }
-      } elsif ($c == 500) {
-        # 500's could be a real 500 or a DNS lookup problem
-        if ($currmsg =~ /Bad hostname '[^\']*'/) {
-          $whattodo = 'The hostname could not be resolved. This link needs to be fixed.';
-        } else {
-          $whattodo = 'This is a server-side problem. Check the URI.';
-        }
-      } elsif ($c == 403 && $currmsg =~ /Forbidden by robots\.txt/) {
-        if ($Opts{HTML}) {
-          $whattodo = 'The link was not checked due to <a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">robots exclusion</a> rules, check the link manually.';
-        } else {
-          $whattodo = 'Check the link manually.';
-        }
+      } elsif ($c == RC_ROBOTS_TXT()) {
+        $whattodo = 'The link was not checked due to robots exclusion ' .
+          'rules. Check the link manually.';
       } elsif ($redirect_loop) {
         $whattodo =
           'Retrieving the URI results in a redirect loop, that should be ' .
@@ -1611,6 +1613,7 @@
 
     my @http_codes = ($currloc->{code});
     unshift(@http_codes, $currloc->{orig}) if $currloc->{orig};
+    @http_codes = map { $_ < 0 ? '(N/A)' : $_ } @http_codes;
 
     if ($Opts{HTML}) {
       # Style stuff
@@ -1787,9 +1790,13 @@
                408 => 'The request timed out.',
                410 => 'The resource is gone. You should remove this link.',
                415 => 'The media type is not supported.',
-               500 => 'Either the hostname is incorrect or it is a server side problem. Check the detailed list.',
+               500 => 'This is a server side problem. Check the URI.',
                501 => 'Could not check this link: method not implemented or scheme not supported.',
-               503 => 'The server cannot service the request, for some unknown reason.');
+               503 => 'The server cannot service the request, for some unknown reason.',
+               # Non-HTTP codes:
+               RC_ROBOTS_TXT() => 'The link was not checked due to <a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">robots exclusion rules</a>. Check the link manually.',
+               RC_DNS_ERROR() => 'The hostname could not be resolved. This link needs to be fixed.',
+             );
   my %priority = ( 410 => 1,
                    404 => 2,
                    403 => 5,
@@ -1880,7 +1887,7 @@
       foreach my $code (sort(keys(%code_summary))) {
         printf('<tr%s>', &bgcolor($code));
         printf('<td><a href="#d%scode_%s">%s</a></td>',
-               $doc_count, $code, $code);
+               $doc_count, $code, $code < 0 ? '(N/A)' : $code);
         printf('<td>%s</td>', $code_summary{$code});
         printf('<td>%s</td>', $todo{$code});
         print "</tr>\n";
@@ -2033,6 +2040,9 @@
 .multiple {
   background-color: fuchsia;
 }
+.dubious {
+  background-color: lightgray;
+}
 div#settings {
   font-size: smaller;
   float: right;
@@ -2054,6 +2064,8 @@
   my $r = HTTP::Response->new($code);
   if ($r->is_success()) {
     return '';
+  } elsif ($code == RC_ROBOTS_TXT()) {
+    $class = 'dubious';
   } elsif ($code == 300) {
     $class = 'multiple';
   } elsif ($code == 401) {
Received on Thursday, 5 August 2010 14:47:05 UTC