- From: Michael Ernst <mernst@alum.mit.edu>
- Date: Mon, 20 Oct 2008 10:02:11 +0200
- To: Ville Skyttä <ville.skytta@iki.fi>
- Cc: www-validator@w3.org
- Message-ID: <18684.15107.994072.395412@swsmde.ds.mpi-sws.mpg.de>
Here's a modified patch. It turns out that this suggestion:
> I think options that can be specified multiple times should be initialized to
> an empty array ([]) instead of undef, for cleanliness reasons and because
> that way there's no need to check their definedness later on.
doesn't work, because then you get output like
% checklink --help
Invalid option linkage for "suppress-redirect=s@"
Invalid option linkage for "suppress-redirect-prefix=s@"
Invalid option linkage for "suppress-broken=s@"
Invalid option linkage for "suppress-fragment=s@"
-Michael Ernst
diff -u -b -r --exclude=CVS --exclude=.bzr --exclude=.hg --exclude=.hgtags --exclude=.svn /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl
diff -u -b -r --exclude=CVS --exclude=.bzr --exclude=.hg --exclude=.hgtags --exclude=.svn /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink
--- /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink 2008-10-20 09:55:58.871814128 +0200
+++ /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink 2008-10-20 09:55:10.929779599 +0200
@@ -370,6 +370,10 @@
Base_Locations => [],
Exclude => undef,
Exclude_Docs => undef,
+ Suppress_Redirect => undef,
+ Suppress_Redirect_Prefix => undef,
+ Suppress_Broken => undef,
+ Suppress_Fragment => undef,
Masquerade => 0,
Masquerade_From => '',
Masquerade_To => '',
@@ -613,6 +617,10 @@
'l|location=s' => \@locs,
'X|exclude=s', => \$Opts{Exclude},
'exclude-docs=s@', => \$Opts{Exclude_Docs},
+ 'suppress-redirect=s@', => \$Opts{Suppress_Redirect},
+ 'suppress-redirect-prefix=s@', => \$Opts{Suppress_Redirect_Prefix},
+ 'suppress-broken=s@', => \$Opts{Suppress_Broken},
+ 'suppress-fragment=s@', => \$Opts{Suppress_Fragment},
'u|user=s' => \$Opts{User},
'p|password=s' => \$Opts{Password},
't|timeout=i' => \$Opts{Timeout},
@@ -695,6 +703,13 @@
as --exclude-docs with the same regexp would.
--exclude-docs REGEXP In recursive mode, do not check links in documents
whose full, canonical URIs match REGEXP.
+ --suppress-redirect URI->URI Do not report a redirect from the first to the
+ second URI.
+ --suppress-redirect-prefix URI->URI Do not report a redirect from a child of
+ the first URI to the same child of the second URI.
+ --suppress-broken CODE:URI Do not report a broken link with the given CODE.
+ CODE is HTTP response, or -1 for robots exclusion.
+ --suppress-fragment URL#FRAG Do not report the given broken fragment.
-L, --languages LANGS Accept-Language header to send. The special value
'auto' causes autodetection from the environment.
-R, --no-referer Do not send the Referer HTTP header.
@@ -1360,6 +1375,19 @@
$results{$uri}{location}{orig_message} = $tmp->message() || '(no message)';
}
$results{$uri}{location}{success} = $response->is_success();
+
+ # If a suppressed broken link, fill the data structure like a typical success.
+ # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n";
+ if (! $results{$uri}{location}{success}) {
+ my $code = $results{$uri}{location}{code};
+ my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}};
+ if ($match) {
+ $results{$uri}{location}{success} = 1;
+ $results{$uri}{location}{code} = 100;
+ $results{$uri}{location}{display} = 100;
+ }
+ }
+
# Stores the authentication information
if (defined($response->{Realm})) {
$results{$uri}{location}{realm} = $response->{Realm};
@@ -1729,7 +1757,8 @@
# Check that the fragments exist
foreach my $fragment (keys %{$links->{$uri}{fragments}}) {
if (defined($p->{Anchors}{$fragment})
- || &escape_match($fragment, $p->{Anchors})) {
+ || &escape_match($fragment, $p->{Anchors})
+ || grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) {
$results{$uri}{fragments}{$fragment} = 1;
} else {
$results{$uri}{fragments}{$fragment} = 0;
@@ -1823,6 +1852,44 @@
{
my ($redirects, $response) = @_;
for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
+
+ # Check for redirect match.
+ my $from = $prev->request()->url();
+ my $to = $response->request()->url(); # same on every loop iteration
+ my $from_to = $from . '->' . $to;
+ my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}};
+ # print STDERR "Result $match of checking $from_to\n";
+ if ($match) { next; }
+
+ # Check for redirect_prefix match
+ my $prefix_match = 0;
+ my $from_len = length($from);
+ my $to_len = length($to);
+ for my $redir_prefix (@{$Opts{Suppresss_Redirect_Prefix}}) {
+ if ($redir_prefix !~ /^(.*)->(.*)$/) {
+ die "Bad suppress-redirect-prefix: $redir_prefix";
+ }
+ my $from_prefix = $1;
+ my $to_prefix = $2;
+ my $from_prefix_len = length($from_prefix);
+ my $to_prefix_len = length($to_prefix);
+ if (($from eq $from_prefix) && ($to eq $to_prefix)) {
+ $prefix_match = 1;
+ last;
+ } elsif (($from_prefix_len < $from_len)
+ && ($to_prefix_len < $to_len)
+ && ($from_prefix eq substr($from, 0, $from_prefix_len))
+ && ($to_prefix eq substr($to, 0, $to_prefix_len))
+ && (substr($from, $from_prefix_len) eq substr($to, $to_prefix_len))) {
+ $prefix_match = 1;
+ last;
+ }
+ }
+ if ($prefix_match) {
+ # print STDERR "A SUPPRESSED REDIRECT:\n $from\n $to\n";
+ next;
+ }
+
$redirects->{$prev->request()->url()} = $response->request()->url();
}
return;
diff -u -b -r --exclude=CVS --exclude=.bzr --exclude=.hg --exclude=.hgtags --exclude=.svn /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink.pod /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink.pod
--- /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink.pod 2008-10-20 09:42:35.394805133 +0200
+++ /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink.pod 2008-10-20 09:36:44.111581530 +0200
@@ -80,6 +80,32 @@
URIs match I<regexp>.
This option may be specified multiple times.
+=item B<--suppress-redirect> I<URI-E<gt>URI>
+
+Do not report a redirect from the first to the
+second URI. The "-E<gt>" is literal text.
+This option may be specified multiple times.
+
+=item B<--suppress-redirect-prefix I<URI-E<gt>URI>
+
+Do not report a redirect from a child of
+the first URI to the same child of the second
+URI. The \"->\" is literal text.
+This option may be specified multiple times.
+
+=item B<--suppress-broken> I<CODE:URI>
+
+Do not report a broken link with the given CODE.
+CODE is the HTTP response, or -1 for robots exclusion.
+The ":" is literal text.
+This option may be specified multiple times.
+
+=item B<--suppress-fragment> I<URL#FRAG>
+
+Do not report the given broken fragment.
+The \"#\" is literal text.
+This option may be specified multiple times.
+
=item B<-L, --languages> I<accept-language>
The C<Accept-Language> HTTP header to send. In command line mode,
Diff finished. Mon Oct 20 09:59:34 2008
Received on Monday, 20 October 2008 08:02:51 UTC