link-checker commit: "Improve handling of URI schemes we don't want checked." (from Mercurial notifier on 2010-08-05; www-validator-cvs@w3.org, August 2010)

From: Mercurial notifier <nobody@w3.org>
Date: Thu, 05 Aug 2010 14:47:19 +0000
To: link-checker updates <www-validator-cvs@w3.org>
Message-Id: <E1Oh1j5-0006CV-L7@blinky.w3.org>
changeset:   287:47393b35eff8
user:        ville
date:        Tue Feb 10 20:11:11 2009 +0000
files:       bin/checklink
description:
Improve handling of URI schemes we don't want checked.


diff -r eda248378158 -r 47393b35eff8 bin/checklink
--- a/bin/checklink	Tue Feb 10 19:28:52 2009 +0000
+++ b/bin/checklink	Tue Feb 10 20:11:11 2009 +0000
@@ -5,7 +5,7 @@
 # (c) 1999-2009 World Wide Web Consortium
 # based on Renaud Bruyeron's checklink.pl
 #
-# $Id: checklink,v 4.145 2009-02-10 19:28:52 ville Exp $
+# $Id: checklink,v 4.146 2009-02-10 20:11:11 ville Exp $
 #
 # This program is licensed under the W3C(r) Software License:
 #       http://www.w3.org/Consortium/Legal/copyright-software
@@ -71,6 +71,9 @@
   $self->env_proxy();
 
   $self->allow_private_ips(1);
+
+  # TODO: bug 29
+  $self->protocols_forbidden([qw(mailto javascript)]);
 
   return $self;
 }
@@ -203,9 +206,10 @@
 use URI::file            qw();
 # @@@ Needs also W3C::UserAgent but can't use() it here.
 
-use constant RC_ROBOTS_TXT    => -1;
-use constant RC_DNS_ERROR     => -2;
-use constant RC_IP_DISALLOWED => -3;
+use constant RC_ROBOTS_TXT          => -1;
+use constant RC_DNS_ERROR           => -2;
+use constant RC_IP_DISALLOWED       => -3;
+use constant RC_PROTOCOL_DISALLOWED => -4;
 
 use constant LINE_UNKNOWN  => -1;
 
@@ -265,7 +269,7 @@
   $PROGRAM     = 'W3C-checklink';
   $VERSION     = '4.4';
   $REVISION    = sprintf('version %s (c) 1999-2009 W3C', $VERSION);
-  my ($cvsver) = q$Revision: 4.145 $ =~ /(\d+[\d\.]*\.\d+)/;
+  my ($cvsver) = q$Revision: 4.146 $ =~ /(\d+[\d\.]*\.\d+)/;
   $AGENT       = sprintf('%s/%s [%s] %s',
                          $PROGRAM, $VERSION, $cvsver,
                          (W3C::UserAgent::USE_ROBOT_UA
@@ -498,6 +502,11 @@
   CGI::Carp->import(qw(fatalsToBrowser));
   require CGI::Cookie;
 
+  # file: URIs are not allowed in CGI mode
+  my $forbidden = $ua->protocols_forbidden() || [];
+  push(@$forbidden, 'file');
+  $ua->protocols_forbidden($forbidden);
+
   my $query = new CGI;
   # Set a few parameters in CGI mode
   $Opts{Verbose}   = 0;
@@ -580,11 +589,7 @@
   } if (MP2() && !$ENV{HTTP_AUTHORIZATION});
 
   $uri =~ s/^\s+//g;
-  if ($uri =~ m/^file:/) {
-    # Only the http scheme is allowed
-    # TODO: bug 29
-    &file_uri($uri);
-  } elsif ($uri !~ m/:/) {
+  if ($uri !~ m/:/) {
     if ($uri =~ m|^//|) {
       $uri = 'http:'.$uri;
     } else {
@@ -1010,10 +1015,6 @@
   my %broken;
   while (my ($u, $ulinks) = each(%links)) {
 
-    # Don't check mailto: URI's
-    # TODO: bug 29
-    next if ($u =~ m/^mailto:/);
-
     if ($Opts{Summary_Only}) {
       # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896
       print ' ' if ($Opts{HTML} && !$Opts{Command_Line});
@@ -1370,6 +1371,9 @@
   $results{$uri}{location}{code} = RC_DNS_ERROR()
     if ($results{$uri}{location}{code} == 500 &&
         $response->message() =~ /Bad hostname '[^\']*'/);
+  $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED()
+    if ($results{$uri}{location}{code} == 500 &&
+        $response->message() =~ /Access to '[^\']*' URIs has been disabled/);
   $results{$uri}{location}{type} = $response->header('Content-type');
   $results{$uri}{location}{display} = $results{$uri}{location}{code};
   # Rewind, check for the original code and message.
@@ -1698,21 +1702,6 @@
   # $want_links is true if we're interested in links in the target doc
   # $links is a hash of the links in the documents checked
   # $redirects is a map of the redirects encountered
-
-  # Checking file: URI's is not allowed with a CGI
-  # TODO: bug 29
-  if ($referer ne $uri) {
-    if (!$Opts{Command_Line} && $referer !~ m/^file:/ && $uri =~ m/^file:/) {
-      my $msg = 'Error: \'file:\' URI not allowed';
-      # Can't test? Return 400 Bad request.
-      $results{$uri}{location}{code}    = 400;
-      $results{$uri}{location}{record}  = 400;
-      $results{$uri}{location}{success} = 0;
-      $results{$uri}{location}{message} = $msg;
-      &hprintf("Error: %d %s\n", 400, $msg) if $Opts{Verbose};
-      return;
-    }
-  }
 
   # Get the document with the appropriate method
   # Only use GET if there are fragments. HEAD is enough if it's not the
@@ -2187,6 +2176,7 @@
                RC_DNS_ERROR() => 'The hostname could not be resolved. Check the link for typos.',
                RC_IP_DISALLOWED() => sprintf('The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.',
                                              $Opts{HTML} ? ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') : ('') x 2),
+               RC_PROTOCOL_DISALLOWED() => 'The link checker does not support checking links with this URI scheme.',
              );
   my %priority = ( 410 => 1,
                    404 => 2,
@@ -2399,7 +2389,8 @@
   my $r = HTTP::Response->new($code);
   if ($r->is_success()) {
     $icon_type = 'error'; # if is success but reported, it's because of broken frags => error
-  } elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) {
+  } elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED() ||
+           $code == RC_PROTOCOL_DISALLOWED()) {
     $icon_type = 'info';
   } elsif ($code == 300) {
     $icon_type = 'info';
@@ -2482,18 +2473,6 @@
 </html>
 EOF
   return;
-}
-
-sub file_uri ($)
-{
-  my ($uri) = @_;
-  &html_header($uri);
-  printf(<<'EOF', &encode($uri));
-<h2>Forbidden</h2>
-<p>You cannot check such a URI (<code>%s</code>).</p>
-EOF
-  &html_footer();
-  exit;
 }
 
 sub print_form (\%$$)
Received on Thursday, 5 August 2010 14:47:33 UTC