W3C home > Mailing lists > Public > www-validator-cvs@w3.org > August 2010

link-checker commit: Add --exclude option for excluding links matching given regexp (#689).

From: Mercurial notifier <nobody@w3.org>
Date: Thu, 05 Aug 2010 14:47:07 +0000
To: link-checker updates <www-validator-cvs@w3.org>
Message-Id: <E1Oh1it-0005qB-En@blinky.w3.org>
changeset:   163:3a6812a7c653
user:        ville
date:        Mon Apr 09 22:18:21 2007 +0000
files:       bin/checklink bin/checklink.pod
description:
Add --exclude option for excluding links matching given regexp (#689).


diff -r 2137beef7996 -r 3a6812a7c653 bin/checklink
--- a/bin/checklink	Fri Mar 30 20:47:55 2007 +0000
+++ b/bin/checklink	Mon Apr 09 22:18:21 2007 +0000
@@ -5,7 +5,7 @@
 # (c) 1999-2007 World Wide Web Consortium
 # based on Renaud Bruyeron's checklink.pl
 #
-# $Id: checklink,v 4.51 2007-03-30 20:47:54 ville Exp $
+# $Id: checklink,v 4.52 2007-04-09 22:18:21 ville Exp $
 #
 # This program is licensed under the W3C(r) Software License:
 #       http://www.w3.org/Consortium/Legal/copyright-software
@@ -128,7 +128,7 @@
   $PROGRAM     = 'W3C-checklink';
   $VERSION     = '4.3';
   $REVISION    = sprintf('version %s (c) 1999-2007 W3C', $VERSION);
-  my ($cvsver) = q$Revision: 4.51 $ =~ /(\d+[\d\.]*\.\d+)/;
+  my ($cvsver) = q$Revision: 4.52 $ =~ /(\d+[\d\.]*\.\d+)/;
   $AGENT       = sprintf('%s/%s [%s] %s',
                          $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent());
 
@@ -219,6 +219,7 @@
     User              => undef,
     Password          => undef,
     Base_Locations    => [],
+    Exclude           => undef,
     Exclude_Docs      => undef,
     Masquerade        => 0,
     Masquerade_From   => '',
@@ -262,6 +263,10 @@
 &parse_arguments() if $Opts{Command_Line};
 
 # Precompile/error-check regular expressions.
+if (defined($Opts{Exclude})) {
+  eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; };
+  &usage(1, "Error in exclude regexp: $@") if $@;
+}
 if (defined($Opts{Exclude_Docs})) {
   eval { $Opts{Exclude_Docs} = qr/$Opts{Exclude_Docs}/o; };
   &usage(1, "Error in exclude-docs regexp: $@") if $@;
@@ -449,6 +454,7 @@
              'r|recursive'     => sub { $Opts{Depth} = -1
                                           if $Opts{Depth} == 0; },
              'l|location=s'    => \@locs,
+             'X|exclude=s',    => \$Opts{Exclude},
              'exclude-docs=s', => \$Opts{Exclude_Docs},
              'u|user=s'        => \$Opts{User},
              'p|password=s'    => \$Opts{Password},
@@ -537,6 +543,9 @@
                             times.  If not specified, the default eg. for
                             http://www.w3.org/TR/html4/Overview.html
                             would be http://www.w3.org/TR/html4/
+ -X, --exclude REGEXP       Do not check links whose full, canonical URIs
+                            match REGEXP; also limits recursion the same way
+                            as --exclude-docs with the same regexp would.
  --exclude-docs REGEXP      In recursive mode, do not check links in documents
                             whose full, canonical URIs match REGEXP.
  -L, --languages LANGS      Accept-Language header to send.  The special value
@@ -692,6 +701,8 @@
 - Accept-Language: %s
 - Sleeping %d second%s between requests to each server
 EOF
+    printf("- Excluding links matching %s\n", $Opts{Exclude})
+      if defined($Opts{Exclude});
     printf("- Excluding links in documents whose URIs match %s\n",
       $Opts{Exclude_Docs}) if defined($Opts{Exclude_Docs});
   }
@@ -786,10 +797,10 @@
       };
     }
     foreach my $lines (keys %{$p->{Links}{$link}}) {
-      my $canonical = URI->new($abs_link_uri->canonical());
-      my $url = $canonical->scheme().':'.$canonical->opaque();
-      my $fragment = $canonical->fragment();
-      if (! $fragment) {
+      my $url = URI->new($abs_link_uri->canonical());
+      my $fragment = $url->fragment(undef);
+      next if (defined($Opts{Exclude}) && $url =~ $Opts{Exclude});
+      if (!defined($fragment) || $fragment eq '') {
         # Document without fragment
         $links{$url}{location}{$lines} = 1;
       } else {
@@ -992,7 +1003,8 @@
   my $candidate = URI->new($uri)->canonical();
 
   return 0
-    if (defined($Opts{Exclude_Docs}) && $candidate =~ $Opts{Exclude_Docs});
+      if ((defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude}) ||
+          (defined($Opts{Exclude_Docs}) && $candidate =~ $Opts{Exclude_Docs}));
 
   foreach my $base (@{$Opts{Base_Locations}}) {
     my $rel = $candidate->rel($base);
diff -r 2137beef7996 -r 3a6812a7c653 bin/checklink.pod
--- a/bin/checklink.pod	Fri Mar 30 20:47:55 2007 +0000
+++ b/bin/checklink.pod	Mon Apr 09 22:18:21 2007 +0000
@@ -1,4 +1,4 @@
-$Id: checklink.pod,v 1.15 2007-03-30 20:47:55 ville Exp $
+$Id: checklink.pod,v 1.16 2007-04-09 22:18:21 ville Exp $
 
 =head1 NAME
 
@@ -67,6 +67,12 @@
 default is the base URI of the initial document, for example for
 L<http://www.w3.org/TR/html4/Overview.html> it would be
 L<http://www.w3.org/TR/html4/>.
+
+=item B<-X, --exclude> I<regexp>
+
+Do not check links whose full, canonical URIs match I<regexp>.  Note that
+this option limits recursion the same way as B<--exclude-docs> with the same
+regular expression would.
 
 =item B<--exclude-docs> I<regexp>
 
Received on Thursday, 5 August 2010 14:47:27 UTC

This archive was generated by hypermail 2.3.1 : Wednesday, 7 January 2015 15:17:43 UTC