W3C home > Mailing lists > Public > www-validator@w3.org > October 2008

Re: checklink: suppress expected errors to avoid false positive warnings

From: Michael Ernst <mernst@alum.mit.edu>
Date: Mon, 20 Oct 2008 10:02:11 +0200
Message-ID: <18684.15107.994072.395412@swsmde.ds.mpi-sws.mpg.de>
To: Ville Skyttä <ville.skytta@iki.fi>
Cc: www-validator@w3.org
Here's a modified patch.  It turns out that this suggestion:

> I think options that can be specified multiple times should be initialized to 
> an empty array ([]) instead of undef, for cleanliness reasons and because 
> that way there's no need to check their definedness later on.

doesn't work, because then you get output like

  % checklink --help
  Invalid option linkage for "suppress-redirect=s@"
  Invalid option linkage for "suppress-redirect-prefix=s@"
  Invalid option linkage for "suppress-broken=s@"
  Invalid option linkage for "suppress-fragment=s@"

                    -Michael Ernst


diff -u -b -r --exclude=CVS --exclude=.bzr --exclude=.hg --exclude=.hgtags --exclude=.svn /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl
diff -u -b -r --exclude=CVS --exclude=.bzr --exclude=.hg --exclude=.hgtags --exclude=.svn /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink
--- /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink	2008-10-20 09:55:58.871814128 +0200
+++ /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink	2008-10-20 09:55:10.929779599 +0200
@@ -370,6 +370,10 @@
     Base_Locations    => [],
     Exclude           => undef,
     Exclude_Docs      => undef,
+    Suppress_Redirect  => undef,
+    Suppress_Redirect_Prefix => undef,
+    Suppress_Broken    => undef,
+    Suppress_Fragment  => undef,
     Masquerade        => 0,
     Masquerade_From   => '',
     Masquerade_To     => '',
@@ -613,6 +617,10 @@
              'l|location=s'    => \@locs,
              'X|exclude=s',    => \$Opts{Exclude},
              'exclude-docs=s@', => \$Opts{Exclude_Docs},
+             'suppress-redirect=s@', => \$Opts{Suppress_Redirect},
+             'suppress-redirect-prefix=s@', => \$Opts{Suppress_Redirect_Prefix},
+             'suppress-broken=s@', => \$Opts{Suppress_Broken},
+             'suppress-fragment=s@', => \$Opts{Suppress_Fragment},
              'u|user=s'        => \$Opts{User},
              'p|password=s'    => \$Opts{Password},
              't|timeout=i'     => \$Opts{Timeout},
@@ -695,6 +703,13 @@
                             as --exclude-docs with the same regexp would.
  --exclude-docs REGEXP      In recursive mode, do not check links in documents
                             whose full, canonical URIs match REGEXP.
+ --suppress-redirect URI->URI  Do not report a redirect from the first to the
+                            second URI.
+ --suppress-redirect-prefix URI->URI  Do not report a redirect from a child of
+                            the first URI to the same child of the second URI.
+ --suppress-broken CODE:URI  Do not report a broken link with the given CODE.
+                            CODE is HTTP response, or -1 for robots exclusion.
+ --suppress-fragment URL#FRAG  Do not report the given broken fragment.
  -L, --languages LANGS      Accept-Language header to send.  The special value
                             'auto' causes autodetection from the environment.
  -R, --no-referer           Do not send the Referer HTTP header.
@@ -1360,6 +1375,19 @@
     $results{$uri}{location}{orig_message} = $tmp->message() || '(no message)';
   }
   $results{$uri}{location}{success} = $response->is_success();
+
+  # If a suppressed broken link, fill the data structure like a typical success.
+  # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n";
+  if (! $results{$uri}{location}{success}) {
+    my $code = $results{$uri}{location}{code};
+    my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}};
+    if ($match) {
+      $results{$uri}{location}{success} = 1;
+      $results{$uri}{location}{code} = 100;
+      $results{$uri}{location}{display} = 100;
+    }
+  }
+
   # Stores the authentication information
   if (defined($response->{Realm})) {
     $results{$uri}{location}{realm} = $response->{Realm};
@@ -1729,7 +1757,8 @@
   # Check that the fragments exist
   foreach my $fragment (keys %{$links->{$uri}{fragments}}) {
     if (defined($p->{Anchors}{$fragment})
-        || &escape_match($fragment, $p->{Anchors})) {
+        || &escape_match($fragment, $p->{Anchors})
+        || grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) {
       $results{$uri}{fragments}{$fragment} = 1;
     } else {
       $results{$uri}{fragments}{$fragment} = 0;
@@ -1823,6 +1852,44 @@
 {
   my ($redirects, $response) = @_;
   for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
+
+    # Check for redirect match.
+    my $from = $prev->request()->url();
+    my $to = $response->request()->url(); # same on every loop iteration
+    my $from_to = $from . '->' . $to;
+    my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}};
+    # print STDERR "Result $match of checking $from_to\n";
+    if ($match) { next; }
+
+    # Check for redirect_prefix match
+    my $prefix_match = 0;
+    my $from_len = length($from);
+    my $to_len = length($to);
+    for my $redir_prefix (@{$Opts{Suppresss_Redirect_Prefix}}) {
+      if ($redir_prefix !~ /^(.*)->(.*)$/) {
+        die "Bad suppress-redirect-prefix: $redir_prefix";
+      }
+      my $from_prefix = $1;
+      my $to_prefix = $2;
+      my $from_prefix_len = length($from_prefix);
+      my $to_prefix_len = length($to_prefix);
+      if (($from eq $from_prefix) && ($to eq $to_prefix)) {
+        $prefix_match = 1;
+        last;
+      } elsif (($from_prefix_len < $from_len)
+                 && ($to_prefix_len < $to_len)
+                 && ($from_prefix eq substr($from, 0, $from_prefix_len))
+                 && ($to_prefix eq substr($to, 0, $to_prefix_len))
+                 && (substr($from, $from_prefix_len) eq substr($to, $to_prefix_len))) {
+        $prefix_match = 1;
+        last;
+      }
+    }
+    if ($prefix_match) {
+      # print STDERR "A SUPPRESSED REDIRECT:\n  $from\n  $to\n";
+      next;
+    }
+
     $redirects->{$prev->request()->url()} = $response->request()->url();
   }
   return;
diff -u -b -r --exclude=CVS --exclude=.bzr --exclude=.hg --exclude=.hgtags --exclude=.svn /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink.pod /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink.pod
--- /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl-multi-exclude-docs/modules/W3C/LinkChecker/bin/checklink.pod	2008-10-20 09:42:35.394805133 +0200
+++ /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink.pod	2008-10-20 09:36:44.111581530 +0200
@@ -80,6 +80,32 @@
 URIs match I<regexp>.
 This option may be specified multiple times.
 
+=item B<--suppress-redirect> I<URI-E<gt>URI>
+
+Do not report a redirect from the first to the
+second URI.  The "-E<gt>" is literal text.
+This option may be specified multiple times.
+
+=item B<--suppress-redirect-prefix I<URI-E<gt>URI>
+
+Do not report a redirect from a child of
+the first URI to the same child of the second
+URI.  The \"->\" is literal text.
+This option may be specified multiple times.
+
+=item B<--suppress-broken> I<CODE:URI>
+
+Do not report a broken link with the given CODE.
+CODE is the HTTP response, or -1 for robots exclusion.
+The ":" is literal text.
+This option may be specified multiple times.
+
+=item B<--suppress-fragment> I<URL#FRAG>
+
+Do not report the given broken fragment.
+The \"#\" is literal text.
+This option may be specified multiple times.
+
 =item B<-L, --languages> I<accept-language>
 
 The C<Accept-Language> HTTP header to send.  In command line mode,

Diff finished.  Mon Oct 20 09:59:34 2008
Received on Monday, 20 October 2008 08:02:51 GMT

This archive was generated by hypermail 2.2.0+W3C-0.50 : Wednesday, 25 April 2012 12:14:32 GMT