- From: Michael Ernst <mernst@alum.mit.edu>
- Date: Mon, 20 Oct 2008 10:02:11 +0200
- To: Ville Skyttä <ville.skytta@iki.fi>
- Cc: www-validator@w3.org
- Message-ID: <18684.15107.994072.395412@swsmde.ds.mpi-sws.mpg.de>
Here's a modified patch. It turns out that this suggestion: > I think options that can be specified multiple times should be initialized to > an empty array ([]) instead of undef, for cleanliness reasons and because > that way there's no need to check their definedness later on. doesn't work, because then you get output like % checklink --help Invalid option linkage for "suppress-redirect=s@" Invalid option linkage for "suppress-redirect-prefix=s@" Invalid option linkage for "suppress-broken=s@" Invalid option linkage for "suppress-fragment=s@" -Michael Ernst
diff -u -b -r --exclude=CVS --exclude=.bzr --exclude=.hg --exclude=.hgtags --exclude=.svn /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl diff -u -b -r --exclude=CVS --exclude=.bzr --exclude=.hg --exclude=.hgtags --exclude=.svn /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink --- /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink 2008-10-20 09:55:58.871814128 +0200 +++ /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink 2008-10-20 09:55:10.929779599 +0200 @@ -370,6 +370,10 @@ Base_Locations => [], Exclude => undef, Exclude_Docs => undef, + Suppress_Redirect => undef, + Suppress_Redirect_Prefix => undef, + Suppress_Broken => undef, + Suppress_Fragment => undef, Masquerade => 0, Masquerade_From => '', Masquerade_To => '', @@ -613,6 +617,10 @@ 'l|location=s' => \@locs, 'X|exclude=s', => \$Opts{Exclude}, 'exclude-docs=s@', => \$Opts{Exclude_Docs}, + 'suppress-redirect=s@', => \$Opts{Suppress_Redirect}, + 'suppress-redirect-prefix=s@', => \$Opts{Suppress_Redirect_Prefix}, + 'suppress-broken=s@', => \$Opts{Suppress_Broken}, + 'suppress-fragment=s@', => \$Opts{Suppress_Fragment}, 'u|user=s' => \$Opts{User}, 'p|password=s' => \$Opts{Password}, 't|timeout=i' => \$Opts{Timeout}, @@ -695,6 +703,13 @@ as --exclude-docs with the same regexp would. --exclude-docs REGEXP In recursive mode, do not check links in documents whose full, canonical URIs match REGEXP. + --suppress-redirect URI->URI Do not report a redirect from the first to the + second URI. + --suppress-redirect-prefix URI->URI Do not report a redirect from a child of + the first URI to the same child of the second URI. + --suppress-broken CODE:URI Do not report a broken link with the given CODE. + CODE is HTTP response, or -1 for robots exclusion. + --suppress-fragment URL#FRAG Do not report the given broken fragment. -L, --languages LANGS Accept-Language header to send. The special value 'auto' causes autodetection from the environment. -R, --no-referer Do not send the Referer HTTP header. @@ -1360,6 +1375,19 @@ $results{$uri}{location}{orig_message} = $tmp->message() || '(no message)'; } $results{$uri}{location}{success} = $response->is_success(); + + # If a suppressed broken link, fill the data structure like a typical success. + # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n"; + if (! $results{$uri}{location}{success}) { + my $code = $results{$uri}{location}{code}; + my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}}; + if ($match) { + $results{$uri}{location}{success} = 1; + $results{$uri}{location}{code} = 100; + $results{$uri}{location}{display} = 100; + } + } + # Stores the authentication information if (defined($response->{Realm})) { $results{$uri}{location}{realm} = $response->{Realm}; @@ -1729,7 +1757,8 @@ # Check that the fragments exist foreach my $fragment (keys %{$links->{$uri}{fragments}}) { if (defined($p->{Anchors}{$fragment}) - || &escape_match($fragment, $p->{Anchors})) { + || &escape_match($fragment, $p->{Anchors}) + || grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) { $results{$uri}{fragments}{$fragment} = 1; } else { $results{$uri}{fragments}{$fragment} = 0; @@ -1823,6 +1852,44 @@ { my ($redirects, $response) = @_; for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) { + + # Check for redirect match. + my $from = $prev->request()->url(); + my $to = $response->request()->url(); # same on every loop iteration + my $from_to = $from . '->' . $to; + my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}}; + # print STDERR "Result $match of checking $from_to\n"; + if ($match) { next; } + + # Check for redirect_prefix match + my $prefix_match = 0; + my $from_len = length($from); + my $to_len = length($to); + for my $redir_prefix (@{$Opts{Suppresss_Redirect_Prefix}}) { + if ($redir_prefix !~ /^(.*)->(.*)$/) { + die "Bad suppress-redirect-prefix: $redir_prefix"; + } + my $from_prefix = $1; + my $to_prefix = $2; + my $from_prefix_len = length($from_prefix); + my $to_prefix_len = length($to_prefix); + if (($from eq $from_prefix) && ($to eq $to_prefix)) { + $prefix_match = 1; + last; + } elsif (($from_prefix_len < $from_len) + && ($to_prefix_len < $to_len) + && ($from_prefix eq substr($from, 0, $from_prefix_len)) + && ($to_prefix eq substr($to, 0, $to_prefix_len)) + && (substr($from, $from_prefix_len) eq substr($to, $to_prefix_len))) { + $prefix_match = 1; + last; + } + } + if ($prefix_match) { + # print STDERR "A SUPPRESSED REDIRECT:\n $from\n $to\n"; + next; + } + $redirects->{$prev->request()->url()} = $response->request()->url(); } return; diff -u -b -r --exclude=CVS --exclude=.bzr --exclude=.hg --exclude=.hgtags --exclude=.svn /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink.pod /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink.pod --- /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink.pod 2008-10-20 09:42:35.394805133 +0200 +++ /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink.pod 2008-10-20 09:36:44.111581530 +0200 @@ -80,6 +80,32 @@ URIs match I<regexp>. This option may be specified multiple times. +=item B<--suppress-redirect> I<URI-E<gt>URI> + +Do not report a redirect from the first to the +second URI. The "-E<gt>" is literal text. +This option may be specified multiple times. + +=item B<--suppress-redirect-prefix I<URI-E<gt>URI> + +Do not report a redirect from a child of +the first URI to the same child of the second +URI. The \"->\" is literal text. +This option may be specified multiple times. + +=item B<--suppress-broken> I<CODE:URI> + +Do not report a broken link with the given CODE. +CODE is the HTTP response, or -1 for robots exclusion. +The ":" is literal text. +This option may be specified multiple times. + +=item B<--suppress-fragment> I<URL#FRAG> + +Do not report the given broken fragment. +The \"#\" is literal text. +This option may be specified multiple times. + =item B<-L, --languages> I<accept-language> The C<Accept-Language> HTTP header to send. In command line mode, Diff finished. Mon Oct 20 09:59:34 2008
Received on Monday, 20 October 2008 08:02:51 UTC