W3C home > Mailing lists > Public > www-validator-cvs@w3.org > February 2009

perl/modules/W3C/LinkChecker/bin checklink,4.145,4.146

From: Ville Skytta via cvs-syncmail <cvsmail@w3.org>
Date: Tue, 10 Feb 2009 20:11:13 +0000
To: www-validator-cvs@w3.org
Message-Id: <E1LWywr-0007yW-D2@lionel-hutz.w3.org>

Update of /sources/public/perl/modules/W3C/LinkChecker/bin
In directory hutz:/tmp/cvs-serv30630/bin

Modified Files:
	checklink 
Log Message:
Improve handling of URI schemes we don't want checked.

Index: checklink
===================================================================
RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v
retrieving revision 4.145
retrieving revision 4.146
diff -u -d -r4.145 -r4.146
--- checklink	10 Feb 2009 19:28:52 -0000	4.145
+++ checklink	10 Feb 2009 20:11:11 -0000	4.146
@@ -72,6 +72,9 @@
 
   $self->allow_private_ips(1);
 
+  # TODO: bug 29
+  $self->protocols_forbidden([qw(mailto javascript)]);
+
   return $self;
 }
 
@@ -203,9 +206,10 @@
 use URI::file            qw();
 # @@@ Needs also W3C::UserAgent but can't use() it here.
 
-use constant RC_ROBOTS_TXT    => -1;
-use constant RC_DNS_ERROR     => -2;
-use constant RC_IP_DISALLOWED => -3;
+use constant RC_ROBOTS_TXT          => -1;
+use constant RC_DNS_ERROR           => -2;
+use constant RC_IP_DISALLOWED       => -3;
+use constant RC_PROTOCOL_DISALLOWED => -4;
 
 use constant LINE_UNKNOWN  => -1;
 
@@ -498,6 +502,11 @@
   CGI::Carp->import(qw(fatalsToBrowser));
   require CGI::Cookie;
 
+  # file: URIs are not allowed in CGI mode
+  my $forbidden = $ua->protocols_forbidden() || [];
+  push(@$forbidden, 'file');
+  $ua->protocols_forbidden($forbidden);
+
   my $query = new CGI;
   # Set a few parameters in CGI mode
   $Opts{Verbose}   = 0;
@@ -580,11 +589,7 @@
   } if (MP2() && !$ENV{HTTP_AUTHORIZATION});
 
   $uri =~ s/^\s+//g;
-  if ($uri =~ m/^file:/) {
-    # Only the http scheme is allowed
-    # TODO: bug 29
-    &file_uri($uri);
-  } elsif ($uri !~ m/:/) {
+  if ($uri !~ m/:/) {
     if ($uri =~ m|^//|) {
       $uri = 'http:'.$uri;
     } else {
@@ -1010,10 +1015,6 @@
   my %broken;
   while (my ($u, $ulinks) = each(%links)) {
 
-    # Don't check mailto: URI's
-    # TODO: bug 29
-    next if ($u =~ m/^mailto:/);
-
     if ($Opts{Summary_Only}) {
       # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896
       print ' ' if ($Opts{HTML} && !$Opts{Command_Line});
@@ -1370,6 +1371,9 @@
   $results{$uri}{location}{code} = RC_DNS_ERROR()
     if ($results{$uri}{location}{code} == 500 &&
         $response->message() =~ /Bad hostname '[^\']*'/);
+  $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED()
+    if ($results{$uri}{location}{code} == 500 &&
+        $response->message() =~ /Access to '[^\']*' URIs has been disabled/);
   $results{$uri}{location}{type} = $response->header('Content-type');
   $results{$uri}{location}{display} = $results{$uri}{location}{code};
   # Rewind, check for the original code and message.
@@ -1699,21 +1703,6 @@
   # $links is a hash of the links in the documents checked
   # $redirects is a map of the redirects encountered
 
-  # Checking file: URI's is not allowed with a CGI
-  # TODO: bug 29
-  if ($referer ne $uri) {
-    if (!$Opts{Command_Line} && $referer !~ m/^file:/ && $uri =~ m/^file:/) {
-      my $msg = 'Error: \'file:\' URI not allowed';
-      # Can't test? Return 400 Bad request.
-      $results{$uri}{location}{code}    = 400;
-      $results{$uri}{location}{record}  = 400;
-      $results{$uri}{location}{success} = 0;
-      $results{$uri}{location}{message} = $msg;
-      &hprintf("Error: %d %s\n", 400, $msg) if $Opts{Verbose};
-      return;
-    }
-  }
-
   # Get the document with the appropriate method
   # Only use GET if there are fragments. HEAD is enough if it's not the
   # case.
@@ -2187,6 +2176,7 @@
                RC_DNS_ERROR() => 'The hostname could not be resolved. Check the link for typos.',
                RC_IP_DISALLOWED() => sprintf('The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.',
                                              $Opts{HTML} ? ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') : ('') x 2),
+               RC_PROTOCOL_DISALLOWED() => 'The link checker does not support checking links with this URI scheme.',
              );
   my %priority = ( 410 => 1,
                    404 => 2,
@@ -2399,7 +2389,8 @@
   my $r = HTTP::Response->new($code);
   if ($r->is_success()) {
     $icon_type = 'error'; # if is success but reported, it's because of broken frags => error
-  } elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) {
+  } elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED() ||
+           $code == RC_PROTOCOL_DISALLOWED()) {
     $icon_type = 'info';
   } elsif ($code == 300) {
     $icon_type = 'info';
@@ -2484,18 +2475,6 @@
   return;
 }
 
-sub file_uri ($)
-{
-  my ($uri) = @_;
-  &html_header($uri);
-  printf(<<'EOF', &encode($uri));
-<h2>Forbidden</h2>
-<p>You cannot check such a URI (<code>%s</code>).</p>
-EOF
-  &html_footer();
-  exit;
-}
-
 sub print_form (\%$$)
 {
   my ($params, $cookie, $check_num) = @_;
Received on Tuesday, 10 February 2009 20:11:21 GMT

This archive was generated by hypermail 2.2.0+W3C-0.50 : Thursday, 26 April 2012 12:55:08 GMT