- From: Mercurial notifier <nobody@w3.org>
- Date: Thu, 05 Aug 2010 14:47:22 +0000
- To: link-checker updates <www-validator-cvs@w3.org>
changeset: 318:2aa66ff72527 user: mernst date: Mon Jun 29 20:14:38 2009 +0000 files: bin/checklink bin/checklink.pod description: Add new options: --suppress-redirect --suppress-redirect-prefix --suppress-broken --suppress-fragment Also improve initialization code for Exclude_Docs (--exclude-docs option). diff -r 01b9b5a0b532 -r 2aa66ff72527 bin/checklink --- a/bin/checklink Sun May 03 09:36:24 2009 +0000 +++ b/bin/checklink Mon Jun 29 20:14:38 2009 +0000 @@ -5,7 +5,7 @@ # (c) 1999-2009 World Wide Web Consortium # based on Renaud Bruyeron's checklink.pl # -# $Id: checklink,v 4.160 2009-05-03 09:34:56 ville Exp $ +# $Id: checklink,v 4.161 2009-06-29 20:14:38 mernst Exp $ # # This program is licensed under the W3C(r) Software License: # http://www.w3.org/Consortium/Legal/copyright-software @@ -275,7 +275,7 @@ $PROGRAM = 'W3C-checklink'; $VERSION = '4.5'; $REVISION = sprintf('version %s (c) 1999-2009 W3C', $VERSION); - my ($cvsver) = q$Revision: 4.160 $ =~ /(\d+[\d\.]*\.\d+)/; + my ($cvsver) = q$Revision: 4.161 $ =~ /(\d+[\d\.]*\.\d+)/; $AGENT = sprintf('%s/%s [%s] %s', $PROGRAM, $VERSION, $cvsver, (W3C::UserAgent::USE_ROBOT_UA @@ -392,7 +392,11 @@ Password => undef, Base_Locations => [], Exclude => undef, - Exclude_Docs => [], + Exclude_Docs => undef, + Suppress_Redirect => [], + Suppress_Redirect_Prefix => [], + Suppress_Broken => [], + Suppress_Fragment => [], Masquerade => 0, Masquerade_From => '', Masquerade_To => '', @@ -418,20 +422,6 @@ my $doc_header; &parse_arguments() if $Opts{Command_Line}; - -# Precompile/error-check regular expressions. -if (defined($Opts{Exclude})) { - eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; }; - &usage(1, "Error in exclude regexp: $@") if $@; -} -for my $i (0 .. $#{$Opts{Exclude_Docs}}) { - eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; }; - &usage(1, "Error in exclude-docs regexp: $@") if $@; -} -if (defined($Opts{Trusted})) { - eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; }; - &usage(1, "Error in trusted domains regexp: $@") if $@; -} my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address # @@@ make number of keep-alive connections customizable @@ -643,6 +633,10 @@ 'l|location=s' => \@locs, 'X|exclude=s' => \$Opts{Exclude}, 'exclude-docs=s@' => \@{$Opts{Exclude_Docs}}, + 'suppress-redirect=s@' => \@{$Opts{Suppress_Redirect}}, + 'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}}, + 'suppress-broken=s@' => \@{$Opts{Suppress_Broken}}, + 'suppress-fragment=s@' => \@{$Opts{Suppress_Fragment}}, 'u|user=s' => \$Opts{User}, 'p|password=s' => \$Opts{Password}, 't|timeout=i' => \$Opts{Timeout}, @@ -686,6 +680,42 @@ $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs); + # Precompile/error-check regular expressions. + if (defined($Opts{Exclude})) { + eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; }; + &usage(1, "Error in exclude regexp: $@") if $@; + } + for my $i (0 .. $#{$Opts{Exclude_Docs}}) { + eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; }; + &usage(1, "Error in exclude-docs regexp: $@") if $@; + } + if (defined($Opts{Trusted})) { + eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; }; + &usage(1, "Error in trusted domains regexp: $@") if $@; + } + + # Sanity-check error-suppression arguments + for my $sr_arg (@{$Opts{Suppress_Redirect}}) { + if ($sr_arg !~ /.->./) { + &usage(1, "Bad suppress-redirect argument, should contain \"->\": $sr_arg"); + } + } + for my $srp_arg (@{$Opts{Suppress_Redirect_Prefix}}) { + if ($srp_arg !~ /.->./) { + &usage(1, "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg"); + } + } + for my $sb_arg (@{$Opts{Suppress_Broken}}) { + if ($sb_arg !~ /^(-1|[0-9]+):./) { + &usage(1, "Bad suppress-broken argument: $sb_arg"); + } + } + for my $sf_arg (@{$Opts{Suppress_Fragment}}) { + if ($sf_arg !~ /.#./) { + &usage(1, "Bad suppress-fragment argument, should contain \"#\": $sf_arg"); + } + } + return; } @@ -728,6 +758,18 @@ --exclude-docs REGEXP In recursive mode, do not check links in documents whose full, canonical URIs match REGEXP. This option may be specified multiple times. + --suppress-redirect URI->URI Do not report a redirect from the first to the + second URI. This option may be specified multiple + times. + --suppress-redirect-prefix URI->URI Do not report a redirect from a child of + the first URI to the same child of the second URI. + This option may be specified multiple times. + --suppress-broken CODE:URI Do not report a broken link with the given CODE. + CODE is HTTP response, or -1 for robots exclusion. + This option may be specified multiple times. + --suppress-fragment URI Do not report the given broken fragment URI. + A fragment URI contains \"#\". This option may be + specified multiple times. -L, --languages LANGS Accept-Language header to send. The special value 'auto' causes autodetection from the environment. -R, --no-referer Do not send the Referer HTTP header. @@ -1403,6 +1445,19 @@ $results{$uri}{location}{orig_message} = $tmp->message() || '(no message)'; } $results{$uri}{location}{success} = $response->is_success(); + + # If a suppressed broken link, fill the data structure like a typical success. + # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n"; + if (! $results{$uri}{location}{success}) { + my $code = $results{$uri}{location}{code}; + my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}}; + if ($match) { + $results{$uri}{location}{success} = 1; + $results{$uri}{location}{code} = 100; + $results{$uri}{location}{display} = 100; + } + } + # Stores the authentication information if (defined($response->{Realm})) { $results{$uri}{location}{realm} = $response->{Realm}; @@ -1767,7 +1822,8 @@ # Check that the fragments exist foreach my $fragment (keys %{$links->{$uri}{fragments}}) { if (defined($p->{Anchors}{$fragment}) - || &escape_match($fragment, $p->{Anchors})) { + || &escape_match($fragment, $p->{Anchors}) + || grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) { $results{$uri}{fragments}{$fragment} = 1; } else { $results{$uri}{fragments}{$fragment} = 0; @@ -1870,6 +1926,44 @@ { my ($redirects, $response) = @_; for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) { + + # Check for redirect match. + my $from = $prev->request()->url(); + my $to = $response->request()->url(); # same on every loop iteration + my $from_to = $from . '->' . $to; + my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}}; + # print STDERR "Result $match of checking $from_to\n"; + if ($match) { next; } + + # Check for redirect_prefix match + my $prefix_match = 0; + my $from_len = length($from); + my $to_len = length($to); + for my $redir_prefix (@{$Opts{Suppresss_Redirect_Prefix}}) { + if ($redir_prefix !~ /^(.*)->(.*)$/) { + die "Bad suppress-redirect-prefix: $redir_prefix"; + } + my $from_prefix = $1; + my $to_prefix = $2; + my $from_prefix_len = length($from_prefix); + my $to_prefix_len = length($to_prefix); + if (($from eq $from_prefix) && ($to eq $to_prefix)) { + $prefix_match = 1; + last; + } elsif (($from_prefix_len < $from_len) + && ($to_prefix_len < $to_len) + && ($from_prefix eq substr($from, 0, $from_prefix_len)) + && ($to_prefix eq substr($to, 0, $to_prefix_len)) + && (substr($from, $from_prefix_len) eq substr($to, $to_prefix_len))) { + $prefix_match = 1; + last; + } + } + if ($prefix_match) { + # print STDERR "A SUPPRESSED REDIRECT:\n $from\n $to\n"; + next; + } + $redirects->{$prev->request()->url()} = $response->request()->url(); } return; @@ -2119,9 +2213,9 @@ # Response code chain join(' <span class="redirected_to" title="redirected to">-></span> ', - map { &encode($_) } @http_codes), + map { &encode($_) } @http_codes), # HTTP final message - $http_message, + $http_message, # What to do $whattodo, # Redirect too? @@ -2586,8 +2680,8 @@ </fieldset> <p class=\"submit_button\"><input type=\"submit\" name=\"check\" value=\"Check\" /></p> </form> -<div class=\"intro\" id=\"don_program\"></div> -<script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script> +<div class=\"intro\" id=\"don_program\"></div> +<script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script> "; return; } diff -r 01b9b5a0b532 -r 2aa66ff72527 bin/checklink.pod --- a/bin/checklink.pod Sun May 03 09:36:24 2009 +0000 +++ b/bin/checklink.pod Mon Jun 29 20:14:38 2009 +0000 @@ -1,4 +1,4 @@ -$Id: checklink.pod,v 1.25 2009-05-03 09:36:24 ville Exp $ +$Id: checklink.pod,v 1.26 2009-06-29 20:14:38 mernst Exp $ =head1 NAME @@ -78,6 +78,28 @@ In recursive mode, do not check links in documents whose full, canonical URIs match I<regexp>. This option may be specified multiple times. + +=item B<--suppress-redirect> I<URI-E<gt>URI> + +Do not report a redirect from the first to the second URI. The "-E<gt>" is +literal text. This option may be specified multiple times. + +=item B<--suppress-redirect-prefix I<URI-E<gt>URI> + +Do not report a redirect from a child of the first URI to the same child of +the second URI. The \"->\" is literal text. This option may be specified +multiple times. + +=item B<--suppress-broken> I<CODE:URI> + +Do not report a broken link with the given CODE. CODE is the HTTP +response, or -1 for robots exclusion. The ":" is literal text. This +option may be specified multiple times. + +=item B<--suppress-fragment> I<URI> + +Do not report the given broken fragment URI. A fragment URI contains "#". +This option may be specified multiple times. =item B<-L, --languages> I<accept-language>
Received on Thursday, 5 August 2010 14:47:32 UTC