- From: Michael Ernst via cvs-syncmail <cvsmail@w3.org>
- Date: Mon, 29 Jun 2009 20:14:40 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LinkChecker/bin
In directory hutz:/tmp/cvs-serv27271
Modified Files:
checklink checklink.pod
Log Message:
Add new options:
--suppress-redirect
--suppress-redirect-prefix
--suppress-broken
--suppress-fragment
Also improve initialization code for Exclude_Docs (--exclude-docs option).
Index: checklink
===================================================================
RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v
retrieving revision 4.160
retrieving revision 4.161
diff -u -d -r4.160 -r4.161
--- checklink 3 May 2009 09:34:56 -0000 4.160
+++ checklink 29 Jun 2009 20:14:38 -0000 4.161
@@ -392,7 +392,11 @@
Password => undef,
Base_Locations => [],
Exclude => undef,
- Exclude_Docs => [],
+ Exclude_Docs => undef,
+ Suppress_Redirect => [],
+ Suppress_Redirect_Prefix => [],
+ Suppress_Broken => [],
+ Suppress_Fragment => [],
Masquerade => 0,
Masquerade_From => '',
Masquerade_To => '',
@@ -419,20 +423,6 @@
&parse_arguments() if $Opts{Command_Line};
-# Precompile/error-check regular expressions.
-if (defined($Opts{Exclude})) {
- eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; };
- &usage(1, "Error in exclude regexp: $@") if $@;
-}
-for my $i (0 .. $#{$Opts{Exclude_Docs}}) {
- eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; };
- &usage(1, "Error in exclude-docs regexp: $@") if $@;
-}
-if (defined($Opts{Trusted})) {
- eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; };
- &usage(1, "Error in trusted domains regexp: $@") if $@;
-}
-
my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address
# @@@ make number of keep-alive connections customizable
$ua->conn_cache({ total_capacity => 1}); # 1 keep-alive connection
@@ -643,6 +633,10 @@
'l|location=s' => \@locs,
'X|exclude=s' => \$Opts{Exclude},
'exclude-docs=s@' => \@{$Opts{Exclude_Docs}},
+ 'suppress-redirect=s@' => \@{$Opts{Suppress_Redirect}},
+ 'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}},
+ 'suppress-broken=s@' => \@{$Opts{Suppress_Broken}},
+ 'suppress-fragment=s@' => \@{$Opts{Suppress_Fragment}},
'u|user=s' => \$Opts{User},
'p|password=s' => \$Opts{Password},
't|timeout=i' => \$Opts{Timeout},
@@ -686,6 +680,42 @@
$Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs);
+ # Precompile/error-check regular expressions.
+ if (defined($Opts{Exclude})) {
+ eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; };
+ &usage(1, "Error in exclude regexp: $@") if $@;
+ }
+ for my $i (0 .. $#{$Opts{Exclude_Docs}}) {
+ eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; };
+ &usage(1, "Error in exclude-docs regexp: $@") if $@;
+ }
+ if (defined($Opts{Trusted})) {
+ eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; };
+ &usage(1, "Error in trusted domains regexp: $@") if $@;
+ }
+
+ # Sanity-check error-suppression arguments
+ for my $sr_arg (@{$Opts{Suppress_Redirect}}) {
+ if ($sr_arg !~ /.->./) {
+ &usage(1, "Bad suppress-redirect argument, should contain \"->\": $sr_arg");
+ }
+ }
+ for my $srp_arg (@{$Opts{Suppress_Redirect_Prefix}}) {
+ if ($srp_arg !~ /.->./) {
+ &usage(1, "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg");
+ }
+ }
+ for my $sb_arg (@{$Opts{Suppress_Broken}}) {
+ if ($sb_arg !~ /^(-1|[0-9]+):./) {
+ &usage(1, "Bad suppress-broken argument: $sb_arg");
+ }
+ }
+ for my $sf_arg (@{$Opts{Suppress_Fragment}}) {
+ if ($sf_arg !~ /.#./) {
+ &usage(1, "Bad suppress-fragment argument, should contain \"#\": $sf_arg");
+ }
+ }
+
return;
}
@@ -728,6 +758,18 @@
--exclude-docs REGEXP In recursive mode, do not check links in documents
whose full, canonical URIs match REGEXP. This
option may be specified multiple times.
+ --suppress-redirect URI->URI Do not report a redirect from the first to the
+ second URI. This option may be specified multiple
+ times.
+ --suppress-redirect-prefix URI->URI Do not report a redirect from a child of
+ the first URI to the same child of the second URI.
+ This option may be specified multiple times.
+ --suppress-broken CODE:URI Do not report a broken link with the given CODE.
+ CODE is HTTP response, or -1 for robots exclusion.
+ This option may be specified multiple times.
+ --suppress-fragment URI Do not report the given broken fragment URI.
+ A fragment URI contains \"#\". This option may be
+ specified multiple times.
-L, --languages LANGS Accept-Language header to send. The special value
'auto' causes autodetection from the environment.
-R, --no-referer Do not send the Referer HTTP header.
@@ -1403,6 +1445,19 @@
$results{$uri}{location}{orig_message} = $tmp->message() || '(no message)';
}
$results{$uri}{location}{success} = $response->is_success();
+
+ # If a suppressed broken link, fill the data structure like a typical success.
+ # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n";
+ if (! $results{$uri}{location}{success}) {
+ my $code = $results{$uri}{location}{code};
+ my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}};
+ if ($match) {
+ $results{$uri}{location}{success} = 1;
+ $results{$uri}{location}{code} = 100;
+ $results{$uri}{location}{display} = 100;
+ }
+ }
+
# Stores the authentication information
if (defined($response->{Realm})) {
$results{$uri}{location}{realm} = $response->{Realm};
@@ -1767,7 +1822,8 @@
# Check that the fragments exist
foreach my $fragment (keys %{$links->{$uri}{fragments}}) {
if (defined($p->{Anchors}{$fragment})
- || &escape_match($fragment, $p->{Anchors})) {
+ || &escape_match($fragment, $p->{Anchors})
+ || grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) {
$results{$uri}{fragments}{$fragment} = 1;
} else {
$results{$uri}{fragments}{$fragment} = 0;
@@ -1870,6 +1926,44 @@
{
my ($redirects, $response) = @_;
for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
+
+ # Check for redirect match.
+ my $from = $prev->request()->url();
+ my $to = $response->request()->url(); # same on every loop iteration
+ my $from_to = $from . '->' . $to;
+ my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}};
+ # print STDERR "Result $match of checking $from_to\n";
+ if ($match) { next; }
+
+ # Check for redirect_prefix match
+ my $prefix_match = 0;
+ my $from_len = length($from);
+ my $to_len = length($to);
+ for my $redir_prefix (@{$Opts{Suppresss_Redirect_Prefix}}) {
+ if ($redir_prefix !~ /^(.*)->(.*)$/) {
+ die "Bad suppress-redirect-prefix: $redir_prefix";
+ }
+ my $from_prefix = $1;
+ my $to_prefix = $2;
+ my $from_prefix_len = length($from_prefix);
+ my $to_prefix_len = length($to_prefix);
+ if (($from eq $from_prefix) && ($to eq $to_prefix)) {
+ $prefix_match = 1;
+ last;
+ } elsif (($from_prefix_len < $from_len)
+ && ($to_prefix_len < $to_len)
+ && ($from_prefix eq substr($from, 0, $from_prefix_len))
+ && ($to_prefix eq substr($to, 0, $to_prefix_len))
+ && (substr($from, $from_prefix_len) eq substr($to, $to_prefix_len))) {
+ $prefix_match = 1;
+ last;
+ }
+ }
+ if ($prefix_match) {
+ # print STDERR "A SUPPRESSED REDIRECT:\n $from\n $to\n";
+ next;
+ }
+
$redirects->{$prev->request()->url()} = $response->request()->url();
}
return;
@@ -2119,9 +2213,9 @@
# Response code chain
join(' <span class="redirected_to" title="redirected to">-></span> ',
- map { &encode($_) } @http_codes),
+ map { &encode($_) } @http_codes),
# HTTP final message
- $http_message,
+ $http_message,
# What to do
$whattodo,
# Redirect too?
@@ -2586,8 +2680,8 @@
</fieldset>
<p class=\"submit_button\"><input type=\"submit\" name=\"check\" value=\"Check\" /></p>
</form>
-<div class=\"intro\" id=\"don_program\"></div>
-<script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script>
+<div class=\"intro\" id=\"don_program\"></div>
+<script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script>
";
return;
}
Index: checklink.pod
===================================================================
RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink.pod,v
retrieving revision 1.25
retrieving revision 1.26
diff -u -d -r1.25 -r1.26
--- checklink.pod 3 May 2009 09:36:24 -0000 1.25
+++ checklink.pod 29 Jun 2009 20:14:38 -0000 1.26
@@ -79,6 +79,28 @@
In recursive mode, do not check links in documents whose full, canonical
URIs match I<regexp>. This option may be specified multiple times.
+=item B<--suppress-redirect> I<URI-E<gt>URI>
+
+Do not report a redirect from the first to the second URI. The "-E<gt>" is
+literal text. This option may be specified multiple times.
+
+=item B<--suppress-redirect-prefix I<URI-E<gt>URI>
+
+Do not report a redirect from a child of the first URI to the same child of
+the second URI. The \"->\" is literal text. This option may be specified
+multiple times.
+
+=item B<--suppress-broken> I<CODE:URI>
+
+Do not report a broken link with the given CODE. CODE is the HTTP
+response, or -1 for robots exclusion. The ":" is literal text. This
+option may be specified multiple times.
+
+=item B<--suppress-fragment> I<URI>
+
+Do not report the given broken fragment URI. A fragment URI contains "#".
+This option may be specified multiple times.
+
=item B<-L, --languages> I<accept-language>
The C<Accept-Language> HTTP header to send. In command line mode,
Received on Monday, 29 June 2009 20:14:48 UTC