link-checker commit: Implement sending the Referer header (#32).

changeset:   177:2d64fbf084ef
user:        ville
date:        Fri Aug 17 21:04:42 2007 +0000
files:       bin/checklink bin/checklink.pod
description:
Implement sending the Referer header (#32).


diff -r 9783d5eafa70 -r 2d64fbf084ef bin/checklink
--- a/bin/checklink	Sun Jul 29 18:56:18 2007 +0000
+++ b/bin/checklink	Fri Aug 17 21:04:42 2007 +0000
@@ -5,7 +5,7 @@
 # (c) 1999-2007 World Wide Web Consortium
 # based on Renaud Bruyeron's checklink.pl
 #
-# $Id: checklink,v 4.61 2007-07-29 18:56:18 ville Exp $
+# $Id: checklink,v 4.62 2007-08-17 21:04:41 ville Exp $
 #
 # This program is licensed under the W3C(r) Software License:
 #       http://www.w3.org/Consortium/Legal/copyright-software
@@ -88,6 +88,11 @@
 
   my $response = $self->ip_disallowed($_[0]->uri());
 
+  # RFC 2616, section 15.1.3
+  $_[0]->remove_header("Referer")
+    if ($_[0]->referer() &&
+        (secure_scheme($_[0]->referer()) && !secure_scheme($_[0]->uri())));
+
   $response ||= do {
     local $SIG{__WARN__} = sub { # Suppress some warnings, rt.cpan.org #18902
       warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/);
@@ -158,6 +163,13 @@
   return $resp;
 }
 
+sub secure_scheme
+{
+  my $uri = shift or return 0;
+  $uri = URI->new($uri) unless ref($uri);
+  return ($uri->scheme() =~ /^(?:file|https|ldaps|sips|snews|ssh)$/i);
+}
+
 # -----------------------------------------------------------------------------
 
 package W3C::LinkChecker;
@@ -192,7 +204,7 @@
   $PROGRAM     = 'W3C-checklink';
   $VERSION     = '4.3';
   $REVISION    = sprintf('version %s (c) 1999-2007 W3C', $VERSION);
-  my ($cvsver) = q$Revision: 4.61 $ =~ /(\d+[\d\.]*\.\d+)/;
+  my ($cvsver) = q$Revision: 4.62 $ =~ /(\d+[\d\.]*\.\d+)/;
   $AGENT       = sprintf('%s/%s [%s] %s',
                          $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent());
 
@@ -275,6 +287,7 @@
     Redirects         => 1,
     Dir_Redirects     => 1,
     Accept_Language   => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE},
+    No_Referer        => 0,
     Hide_Same_Realm   => 0,
     Depth             => 0,    # < 0 means unlimited recursion.
     Sleep_Time        => 1,
@@ -362,7 +375,7 @@
     @{$Opts{Base_Locations}} = @bases;
     # Transform the parameter into a URI
     $uri = &urize($uri);
-    &check_uri($uri, $is_first, $Opts{Depth}, undef, 1);
+    &check_uri($uri, $is_first, $Opts{Depth}, undef, undef, 1);
     $is_first &&= 0;
   }
   undef $is_first;
@@ -419,6 +432,7 @@
   }
 
   $Opts{Accept_Language} = undef if $query->param('no_accept_language');
+  $Opts{No_Referer} = $query->param('no_referer');
 
   $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0);
   if (my $depth = $query->param('depth')) {
@@ -515,6 +529,7 @@
              't|timeout=i'     => \$Opts{Timeout},
              'S|sleep=i'       => \$Opts{Sleep_Time},
              'L|languages=s'   => \$Opts{Accept_Language},
+             'R|no-referer'    => \$Opts{No_Referer},
              'D|depth=i'       => sub { $Opts{Depth} = $_[1]
                                           unless $_[1] == 0; },
              'd|domain=s'      => \$Opts{Trusted},
@@ -697,15 +712,15 @@
 # Check for broken links in a resource #
 ########################################
 
-sub check_uri ($$$;$$)
+sub check_uri ($$$;$$$)
 {
-  my ($uri, $is_first, $depth, $cookie, $is_start) = @_;
+  my ($uri, $is_first, $depth, $cookie, $referer, $is_start) = @_;
   $is_start ||= $is_first;
 
   my $start = $Opts{Summary_Only} ? 0 : &get_timestamp();
 
   # Get and parse the document
-  my $response = &get_document('GET', $uri, $doc_count, \%redirects);
+  my $response = &get_document('GET', $uri, $doc_count, \%redirects, $referer);
 
   # Can we check the resource? If not, we exit here...
   return if defined($response->{Stop});
@@ -740,11 +755,13 @@
   if ($is_first && !$Opts{HTML} && !$Opts{Summary_Only}) {
     my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
     my $acclang = $Opts{Accept_Language} || '(not sent)';
-    printf(<<'EOF', $Accept, $acclang, $Opts{Sleep_Time}, $s);
+    my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
+    printf(<<'EOF', $Accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);
 
 Settings used:
 - Accept: %s
 - Accept-Language: %s
+- Referer: %s
 - Sleeping %d second%s between requests to each server
 EOF
     printf("- Excluding links matching %s\n", $Opts{Exclude})
@@ -762,13 +779,15 @@
     if (! $Opts{Summary_Only}) {
       my $accept = &encode($Accept);
       my $acclang = &encode($Opts{Accept_Language} || '(not sent)');
+      my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
       my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
-      printf(<<'EOF', $accept, $acclang, $Opts{Sleep_Time}, $s);
+      printf(<<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);
 <div class="settings">
 Settings used:
  <ul>
   <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.1">Accept</a></tt>: %s</li>
   <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4">Accept-Language</a></tt>: %s</li>
+  <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36">Referer</a></tt>: %s</li>
   <li>Sleeping %d second%s between requests to each server</li>
  </ul>
 </div>
@@ -933,7 +952,7 @@
       next unless ($results{$u}{location}{type} =~ $ContentTypes);
 
       # Have we already processed this URI?
-      next if &already_processed($u);
+      next if &already_processed($u, $uri);
 
       # Do the job
       print "\n";
@@ -951,9 +970,9 @@
         }
       }
       if ($depth < 0) {
-        &check_uri($u, 0, -1);
+        &check_uri($u, 0, -1, undef, $uri);
       } else {
-        &check_uri($u, 0, $depth-1);
+        &check_uri($u, 0, $depth-1, undef, $uri);
       }
     }
   }
@@ -964,14 +983,15 @@
 # Get and parse a resource to process #
 #######################################
 
-sub get_document ($$$;\%)
+sub get_document ($$$;\%$)
 {
-  my ($method, $uri, $in_recursion, $redirects) = @_;
+  my ($method, $uri, $in_recursion, $redirects, $referer) = @_;
   # $method contains the HTTP method the use (GET or HEAD)
   # $uri contains the identifier of the resource
   # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least
   #                        the second resource checked)
   # $redirects is a pointer to the hash containing the map of the redirects
+  # $referer is the URI of the referring document
 
   # Get the resource
   my $response;
@@ -979,7 +999,7 @@
       && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
     $response = $results{$uri}{response};
   } else {
-    $response = &get_uri($method, $uri);
+    $response = &get_uri($method, $uri, $referer);
     &record_results($uri, $method, $response);
     &record_redirects($redirects, $response);
   }
@@ -1066,14 +1086,14 @@
 # Check whether a URI has already been processed #
 ##################################################
 
-sub already_processed ($)
+sub already_processed ($$)
 {
-  my ($uri) = @_;
+  my ($uri, $referer) = @_;
   # Don't be verbose for that part...
   my $summary_value = $Opts{Summary_Only};
   $Opts{Summary_Only} = 1;
   # Do a GET: if it fails, we stop, if not, the results are cached
-  my $response = &get_document('GET', $uri, 1);
+  my $response = &get_document('GET', $uri, 1, undef, $referer);
   # ... but just for that part
   $Opts{Summary_Only} = $summary_value;
   # Can we process the resource?
@@ -1088,16 +1108,17 @@
 # Get the content of a URI #
 ############################
 
-sub get_uri ($$;$\%$$$$)
+sub get_uri ($$;$$\%$$$$)
 {
   # Here we have a lot of extra parameters in order not to lose information
   # if the function is called several times (401's)
-  my ($method, $uri, $start, $redirects, $code, $realm, $message, $auth) = @_;
+  my ($method, $uri, $referer, $start, $redirects, $code, $realm, $message,
+      $auth) = @_;
 
   # $method contains the method used
   # $uri contains the target of the request
-  # $start is a timestamp (not defined the first time the function is
-  #                        called)
+  # $referer is the URI of the referring document
+  # $start is a timestamp (not defined the first time the function is called)
   # $redirects is a map of redirects
   # $code is the first HTTP return code
   # $realm is the realm of the request
@@ -1132,6 +1153,9 @@
   $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); })
     if $verbose_progress;
 
+  # Set referer
+  $request->referer($referer) if (!$Opts{No_Referer} && $referer);
+
   # Do the query
   my $response = $ua->request($request);
 
@@ -1159,7 +1183,7 @@
     }
 
     print "\n" if $verbose_progress;
-    return &get_uri($method, $response->request()->url(),
+    return &get_uri($method, $response->request()->url(), $referer,
                     $start, $redirects, $code, $realm, $message, 1);
   }
   # @@@ subtract robot delay from the "fetched in" time?
@@ -1442,8 +1466,8 @@
 
 sub check_validity ($$$\%\%)
 {
-  my ($testing, $uri, $want_links, $links, $redirects) = @_;
-  # $testing is the URI of the document checked
+  my ($referer, $uri, $want_links, $links, $redirects) = @_;
+  # $referer is the URI of the document checked
   # $uri is the URI of the target that we are verifying
   # $want_links is true if we're interested in links in the target doc
   # $links is a hash of the links in the documents checked
@@ -1451,8 +1475,8 @@
 
   # Checking file: URI's is not allowed with a CGI
   # TODO: bug 29
-  if ($testing ne $uri) {
-    if (!$Opts{Command_Line} && $testing !~ m/^file:/ && $uri =~ m/^file:/) {
+  if ($referer ne $uri) {
+    if (!$Opts{Command_Line} && $referer !~ m/^file:/ && $uri =~ m/^file:/) {
       my $msg = 'Error: \'file:\' URI not allowed';
       # Can't test? Return 400 Bad request.
       $results{$uri}{location}{code}    = 400;
@@ -1475,7 +1499,7 @@
   if ((! defined($results{$uri}))
       || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
     $being_processed = 1;
-    $response = &get_uri($method, $uri);
+    $response = &get_uri($method, $uri, $referer);
     # Get the information back from get_uri()
     &record_results($uri, $method, $response);
     # Record the redirects
@@ -2242,6 +2266,7 @@
   my $all = ($q->param('hide_type') ne 'dir') ? $chk : '';
   my $dir = $all                              ? ''   : $chk;
   my $acc = $q->param('no_accept_language')   ? $chk : '';
+  my $ref = $q->param('no_referer')           ? $chk : '';
   my $rec = $q->param('recursive')            ? $chk : '';
   my $dep = &encode($q->param('depth')              || '');
 
@@ -2270,6 +2295,8 @@
     <label for=\"hide_type_dir\"><input type=\"radio\" id=\"hide_type_dir\" name=\"hide_type\" value=\"dir\"", $dir, "> for directories only</label>
     <br>
     <label for=\"no_accept_language\"><input type=\"checkbox\" id=\"no_accept_language\" name=\"no_accept_language\" value=\"on\"", $acc, "> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label>
+    <br>
+    <label for=\"no_referer\"><input type=\"checkbox\" id=\"no_referer\" name=\"no_referer\" value=\"on\"", $ref, "> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label>
     <br>
     <label title=\"Check linked documents recursively (maximum: ", $Opts{Max_Documents}, " documents)\" for=\"recursive\"><input type=\"checkbox\" id=\"recursive\" name=\"recursive\" value=\"on\"", $rec, "> Check linked documents recursively</label>,
     <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth\" name=\"depth\" value=\"", $dep, "\"></label>
diff -r 9783d5eafa70 -r 2d64fbf084ef bin/checklink.pod
--- a/bin/checklink.pod	Sun Jul 29 18:56:18 2007 +0000
+++ b/bin/checklink.pod	Fri Aug 17 21:04:42 2007 +0000
@@ -1,4 +1,4 @@
-$Id: checklink.pod,v 1.17 2007-07-19 16:40:16 ville Exp $
+$Id: checklink.pod,v 1.18 2007-08-17 21:04:42 ville Exp $
 
 =head1 NAME
 
@@ -86,6 +86,10 @@
 a value to be detected from the C<LANG> environment variable, and sent
 if found.  In CGI mode, the default is to send the value received from
 the client as is.
+
+=item B<-R, --no-referer>
+
+Do not send the C<Referer> HTTP header.
 
 =item B<-q, --quiet>
 

Received on Thursday, 5 August 2010 14:47:20 UTC