perl/modules/W3C/LinkChecker/bin checklink,4.80,4.81

Update of /sources/public/perl/modules/W3C/LinkChecker/bin
In directory hutz:/tmp/cvs-serv28708

Modified Files:
	checklink 
Log Message:
Review and explicitly list things treated as links.

New cases checked:
* BLOCKQUOTE,DEL,INS,Q: cite
* BODY: background
* FRAME,IFRAME,IMG: longdesc
* HEAD: profile
* EMBED (proprietary): pluginspage, pluginurl.

Cases no longer checked:
* (any unknown element): src, href
* (any element whose src attribute is unknown to me): src
* (any element whose href attribute is unknown to me): href.

Theoretically, we could use %HTML::Tagset::linkElements, but it contains things
that are not necessarily dereferenceable URIs and/or require special treatment,
so maintaining our own lists is simpler.


Index: checklink
===================================================================
RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v
retrieving revision 4.80
retrieving revision 4.81
diff -u -d -r4.80 -r4.81
--- checklink	24 Jan 2008 01:29:38 -0000	4.80
+++ checklink	9 Feb 2008 14:49:13 -0000	4.81
@@ -195,6 +195,46 @@
 use constant MP2 =>
   (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);
 
+# Tag=>attribute mapping of things we treat as links.
+# Note: base/@href gets a special treatment, see start() for details.
+use constant LINK_ATTRS => {
+    a          => ['href'],
+    area       => ['href'],
+    blockquote => ['cite'],
+    body       => ['background'],
+    del        => ['cite'],
+    embed      => ['href', 'pluginspage', 'pluginurl', 'src'], # proprietary
+    frame      => ['longdesc', 'src'],
+    iframe     => ['longdesc', 'src'],
+    img        => ['longdesc', 'src'],
+    ins        => ['cite'],
+    input      => ['src'],
+    link       => ['href'],
+    object     => ['data'],
+    q          => ['cite'],
+    script     => ['src'],
+};
+
+# Tag=>attribute mapping of things we treat as space separated lists of links.
+use constant LINK_LIST_ATTRS => {
+    head       => ['profile'],
+};
+
+# TBD/TODO:
+# - applet/@archive, @code?
+# - applet/@codebase: does not need to be dereferenceable but HTML 4 spec says
+#   it may only point to subdirectories of the directory containing the current
+#   document
+# - bgsound/@src?
+# - object/@classid?
+# - object/@archive?
+# - object/@codebase: base URI for @classid, @data, @archive
+# - isindex/@action?
+# - layer/@background,@src?
+# - ilayer/@background?
+# - table,tr,td,th/@background?
+# - xmp/@href?
+
 @W3C::LinkChecker::ISA =  qw(HTML::Parser);
 
 BEGIN
@@ -1437,19 +1477,34 @@
 
   # Links
   if (!$self->{only_anchors}) {
-    # Here, we are checking too many things
-    # The right thing to do is to parse the DTD...
+
+    # Special case: base/@href
     if ($tag eq 'base') {
       # Treat <base> (without href) or <base href=""> as if it didn't exist.
       if (defined($attr->{href}) && $attr->{href} ne '') {
         $self->{base} = $attr->{href};
       }
-    } else {
-      $self->add_link($attr->{href}, $line);
+      # Note: base/@href intentionally not treated as a dereferenceable link:
+      # http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi
+    }
+
+    # Link attributes:
+    if (my $link_attrs = LINK_ATTRS()->{$tag}) {
+      for my $la (@$link_attrs) {
+        $self->add_link($attr->{$la}, $line);
+      }
+    }
+
+    # List of links attributes:
+    if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) {
+      for my $la (@$link_attrs) {
+        if (defined(my $value = $attr->{$la})) {
+          for my $link (split(/\s+/, $value)) {
+            $self->add_link($link, $line);
+          }
+        }
+      }
     }
-    $self->add_link($attr->{src}, $line);
-    $self->add_link($attr->{data}, $line) if ($tag eq 'object');
-    $self->add_link($attr->{cite}, $line) if ($tag eq 'blockquote');
   }
 
   $self->parse_progress($line) if $Opts{Progress};

Received on Saturday, 9 February 2008 14:49:25 UTC