- From: Ville Skytta via cvs-syncmail <cvsmail@w3.org>
- Date: Fri, 17 Aug 2007 21:04:44 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LinkChecker/bin In directory hutz:/tmp/cvs-serv29978/bin Modified Files: checklink checklink.pod Log Message: Implement sending the Referer header (#32). Index: checklink =================================================================== RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v retrieving revision 4.61 retrieving revision 4.62 diff -u -d -r4.61 -r4.62 --- checklink 29 Jul 2007 18:56:18 -0000 4.61 +++ checklink 17 Aug 2007 21:04:41 -0000 4.62 @@ -88,6 +88,11 @@ my $response = $self->ip_disallowed($_[0]->uri()); + # RFC 2616, section 15.1.3 + $_[0]->remove_header("Referer") + if ($_[0]->referer() && + (secure_scheme($_[0]->referer()) && !secure_scheme($_[0]->uri()))); + $response ||= do { local $SIG{__WARN__} = sub { # Suppress some warnings, rt.cpan.org #18902 warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/); @@ -158,6 +163,13 @@ return $resp; } +sub secure_scheme +{ + my $uri = shift or return 0; + $uri = URI->new($uri) unless ref($uri); + return ($uri->scheme() =~ /^(?:file|https|ldaps|sips|snews|ssh)$/i); +} + # ----------------------------------------------------------------------------- package W3C::LinkChecker; @@ -275,6 +287,7 @@ Redirects => 1, Dir_Redirects => 1, Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE}, + No_Referer => 0, Hide_Same_Realm => 0, Depth => 0, # < 0 means unlimited recursion. Sleep_Time => 1, @@ -362,7 +375,7 @@ @{$Opts{Base_Locations}} = @bases; # Transform the parameter into a URI $uri = &urize($uri); - &check_uri($uri, $is_first, $Opts{Depth}, undef, 1); + &check_uri($uri, $is_first, $Opts{Depth}, undef, undef, 1); $is_first &&= 0; } undef $is_first; @@ -419,6 +432,7 @@ } $Opts{Accept_Language} = undef if $query->param('no_accept_language'); + $Opts{No_Referer} = $query->param('no_referer'); $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0); if (my $depth = $query->param('depth')) { @@ -515,6 +529,7 @@ 't|timeout=i' => \$Opts{Timeout}, 'S|sleep=i' => \$Opts{Sleep_Time}, 'L|languages=s' => \$Opts{Accept_Language}, + 'R|no-referer' => \$Opts{No_Referer}, 'D|depth=i' => sub { $Opts{Depth} = $_[1] unless $_[1] == 0; }, 'd|domain=s' => \$Opts{Trusted}, @@ -697,15 +712,15 @@ # Check for broken links in a resource # ######################################## -sub check_uri ($$$;$$) +sub check_uri ($$$;$$$) { - my ($uri, $is_first, $depth, $cookie, $is_start) = @_; + my ($uri, $is_first, $depth, $cookie, $referer, $is_start) = @_; $is_start ||= $is_first; my $start = $Opts{Summary_Only} ? 0 : &get_timestamp(); # Get and parse the document - my $response = &get_document('GET', $uri, $doc_count, \%redirects); + my $response = &get_document('GET', $uri, $doc_count, \%redirects, $referer); # Can we check the resource? If not, we exit here... return if defined($response->{Stop}); @@ -740,11 +755,13 @@ if ($is_first && !$Opts{HTML} && !$Opts{Summary_Only}) { my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; my $acclang = $Opts{Accept_Language} || '(not sent)'; - printf(<<'EOF', $Accept, $acclang, $Opts{Sleep_Time}, $s); + my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; + printf(<<'EOF', $Accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s); Settings used: - Accept: %s - Accept-Language: %s +- Referer: %s - Sleeping %d second%s between requests to each server EOF printf("- Excluding links matching %s\n", $Opts{Exclude}) @@ -762,13 +779,15 @@ if (! $Opts{Summary_Only}) { my $accept = &encode($Accept); my $acclang = &encode($Opts{Accept_Language} || '(not sent)'); + my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; - printf(<<'EOF', $accept, $acclang, $Opts{Sleep_Time}, $s); + printf(<<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s); <div class="settings"> Settings used: <ul> <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.1">Accept</a></tt>: %s</li> <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4">Accept-Language</a></tt>: %s</li> + <li><tt><a href="http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36">Referer</a></tt>: %s</li> <li>Sleeping %d second%s between requests to each server</li> </ul> </div> @@ -933,7 +952,7 @@ next unless ($results{$u}{location}{type} =~ $ContentTypes); # Have we already processed this URI? - next if &already_processed($u); + next if &already_processed($u, $uri); # Do the job print "\n"; @@ -951,9 +970,9 @@ } } if ($depth < 0) { - &check_uri($u, 0, -1); + &check_uri($u, 0, -1, undef, $uri); } else { - &check_uri($u, 0, $depth-1); + &check_uri($u, 0, $depth-1, undef, $uri); } } } @@ -964,14 +983,15 @@ # Get and parse a resource to process # ####################################### -sub get_document ($$$;\%) +sub get_document ($$$;\%$) { - my ($method, $uri, $in_recursion, $redirects) = @_; + my ($method, $uri, $in_recursion, $redirects, $referer) = @_; # $method contains the HTTP method the use (GET or HEAD) # $uri contains the identifier of the resource # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least # the second resource checked) # $redirects is a pointer to the hash containing the map of the redirects + # $referer is the URI of the referring document # Get the resource my $response; @@ -979,7 +999,7 @@ && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { $response = $results{$uri}{response}; } else { - $response = &get_uri($method, $uri); + $response = &get_uri($method, $uri, $referer); &record_results($uri, $method, $response); &record_redirects($redirects, $response); } @@ -1066,14 +1086,14 @@ # Check whether a URI has already been processed # ################################################## -sub already_processed ($) +sub already_processed ($$) { - my ($uri) = @_; + my ($uri, $referer) = @_; # Don't be verbose for that part... my $summary_value = $Opts{Summary_Only}; $Opts{Summary_Only} = 1; # Do a GET: if it fails, we stop, if not, the results are cached - my $response = &get_document('GET', $uri, 1); + my $response = &get_document('GET', $uri, 1, undef, $referer); # ... but just for that part $Opts{Summary_Only} = $summary_value; # Can we process the resource? @@ -1088,16 +1108,17 @@ # Get the content of a URI # ############################ -sub get_uri ($$;$\%$$$$) +sub get_uri ($$;$$\%$$$$) { # Here we have a lot of extra parameters in order not to lose information # if the function is called several times (401's) - my ($method, $uri, $start, $redirects, $code, $realm, $message, $auth) = @_; + my ($method, $uri, $referer, $start, $redirects, $code, $realm, $message, + $auth) = @_; # $method contains the method used # $uri contains the target of the request - # $start is a timestamp (not defined the first time the function is - # called) + # $referer is the URI of the referring document + # $start is a timestamp (not defined the first time the function is called) # $redirects is a map of redirects # $code is the first HTTP return code # $realm is the realm of the request @@ -1132,6 +1153,9 @@ $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); }) if $verbose_progress; + # Set referer + $request->referer($referer) if (!$Opts{No_Referer} && $referer); + # Do the query my $response = $ua->request($request); @@ -1159,7 +1183,7 @@ } print "\n" if $verbose_progress; - return &get_uri($method, $response->request()->url(), + return &get_uri($method, $response->request()->url(), $referer, $start, $redirects, $code, $realm, $message, 1); } # @@@ subtract robot delay from the "fetched in" time? @@ -1442,8 +1466,8 @@ sub check_validity ($$$\%\%) { - my ($testing, $uri, $want_links, $links, $redirects) = @_; - # $testing is the URI of the document checked + my ($referer, $uri, $want_links, $links, $redirects) = @_; + # $referer is the URI of the document checked # $uri is the URI of the target that we are verifying # $want_links is true if we're interested in links in the target doc # $links is a hash of the links in the documents checked @@ -1451,8 +1475,8 @@ # Checking file: URI's is not allowed with a CGI # TODO: bug 29 - if ($testing ne $uri) { - if (!$Opts{Command_Line} && $testing !~ m/^file:/ && $uri =~ m/^file:/) { + if ($referer ne $uri) { + if (!$Opts{Command_Line} && $referer !~ m/^file:/ && $uri =~ m/^file:/) { my $msg = 'Error: \'file:\' URI not allowed'; # Can't test? Return 400 Bad request. $results{$uri}{location}{code} = 400; @@ -1475,7 +1499,7 @@ if ((! defined($results{$uri})) || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { $being_processed = 1; - $response = &get_uri($method, $uri); + $response = &get_uri($method, $uri, $referer); # Get the information back from get_uri() &record_results($uri, $method, $response); # Record the redirects @@ -2242,6 +2266,7 @@ my $all = ($q->param('hide_type') ne 'dir') ? $chk : ''; my $dir = $all ? '' : $chk; my $acc = $q->param('no_accept_language') ? $chk : ''; + my $ref = $q->param('no_referer') ? $chk : ''; my $rec = $q->param('recursive') ? $chk : ''; my $dep = &encode($q->param('depth') || ''); @@ -2271,6 +2296,8 @@ <br> <label for=\"no_accept_language\"><input type=\"checkbox\" id=\"no_accept_language\" name=\"no_accept_language\" value=\"on\"", $acc, "> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label> <br> + <label for=\"no_referer\"><input type=\"checkbox\" id=\"no_referer\" name=\"no_referer\" value=\"on\"", $ref, "> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label> + <br> <label title=\"Check linked documents recursively (maximum: ", $Opts{Max_Documents}, " documents)\" for=\"recursive\"><input type=\"checkbox\" id=\"recursive\" name=\"recursive\" value=\"on\"", $rec, "> Check linked documents recursively</label>, <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth\" name=\"depth\" value=\"", $dep, "\"></label> <br><br>", $cookie_options, " Index: checklink.pod =================================================================== RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink.pod,v retrieving revision 1.17 retrieving revision 1.18 diff -u -d -r1.17 -r1.18 --- checklink.pod 19 Jul 2007 16:40:16 -0000 1.17 +++ checklink.pod 17 Aug 2007 21:04:42 -0000 1.18 @@ -87,6 +87,10 @@ if found. In CGI mode, the default is to send the value received from the client as is. +=item B<-R, --no-referer> + +Do not send the C<Referer> HTTP header. + =item B<-q, --quiet> No output if no errors are found. Implies B<--summary>.
Received on Friday, 17 August 2007 21:04:48 UTC