perl/modules/W3C/LinkChecker/bin checklink,4.187,4.188 from Ville Skytta via cvs-syncmail on 2010-03-07 (www-validator-cvs@w3.org from March 2010)

From: Ville Skytta via cvs-syncmail <cvsmail@w3.org>
Date: Sun, 07 Mar 2010 17:16:16 +0000
To: www-validator-cvs@w3.org
Message-Id: <E1NoK5Q-0002Gq-2k@lionel-hutz.w3.org>
Update of /sources/public/perl/modules/W3C/LinkChecker/bin
In directory hutz:/tmp/cvs-serv8409/bin

Modified Files:
	checklink 
Log Message:
Add support for checking links in CSS.

Known issue: no support for line numbers yet.


Index: checklink
===================================================================
RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v
retrieving revision 4.187
retrieving revision 4.188
diff -u -d -r4.187 -r4.188
--- checklink	7 Mar 2010 16:34:34 -0000	4.187
+++ checklink	7 Mar 2010 17:16:13 -0000	4.188
@@ -219,10 +219,15 @@
 package W3C::LinkChecker;
 
 use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
-    $DocType $Head $Accept $ContentTypes %Cfg);
+    $DocType $Head $Accept $ContentTypes %Cfg $CssUrl);
 
+use CSS::DOM 0.09 qw();    # >= 0.09 for many bugfixes
+use CSS::DOM::Constants qw(:rule);
+use CSS::DOM::Style qw();
+use CSS::DOM::Util qw();
 use HTML::Entities qw();
-use HTML::Parser 3.20 qw();      # >= 3.20 for "line" argspec identifier
+use HTML::Parser 3.20 qw();    # >= 3.20 for "line" argspec identifier
+use HTTP::Headers::Util qw();
 use HTTP::Request qw();
 use HTTP::Response 1.50 qw();    # >= 1.50 for decoded_content()
 use Time::HiRes qw();
@@ -322,9 +327,13 @@
         application/vnd.wap.xhtml+xml;q=0.6
     );
     $Accept = join(', ', @content_types, '*/*;q=0.5');
+    push(@content_types, "text/css");
     my $re = join('|', map { s/;.*//; quotemeta } @content_types);
     $ContentTypes = qr{\b(?:$re)\b}io;
 
+    # Regexp for matching URL values in CSS.
+    $CssUrl = qr/(?:\s|^)url\(\s*(['"]?)(.*?)\1\s*\)(?=\s|$)/;
+
     #
     # Read configuration.  If the W3C_CHECKLINK_CFG environment variable has
     # been set or the default contains a non-empty file, read it.  Otherwise,
@@ -1145,13 +1154,16 @@
                 $result_anchor);
             my $esc_uri =
                 URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9.");
+            print "<p>For reliable link checking results, check ";
+
+            if (!$response->{IsCss}) {
+                printf("<a href=\"%s\">HTML validity</a> and ",
+                    &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)));
+            }
             printf(
-                "<p>For reliable link checking results, check
-<a href=\"%s\">HTML validity</a> first.  See also
-<a href=\"%s\">CSS validity</a>.</p>
+                "<a href=\"%s\">CSS validity</a> first.</p>
 <p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n",
-                &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)),
-                &encode(sprintf($Cfg{CSS_Validator_URI},    $esc_uri)),
+                &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
                 &encode($Opts{_Self_URI})
             );
 
@@ -1536,6 +1548,18 @@
     return 0;    # We always have at least one base location, but none matched.
 }
 
+#################################
+# Check for content type match. #
+#################################
+
+sub is_content_type ($$)
+{
+    my ($candidate, $type) = @_;
+    return 0 unless ($candidate && $type);
+    my @v = HTTP::Headers::Util::split_header_words($candidate);
+    return scalar(@v) ? $type eq lc($v[0]->[0]) : 0;
+}
+
 ##################################################
 # Check whether a URI has already been processed #
 ##################################################
@@ -1666,6 +1690,8 @@
     &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp()))
         if $verbose_progress;
 
+    $response->{IsCss} =
+        is_content_type($response->content_type(), "text/css");
     $response->{Realm} = $realm if defined($realm);
 
     return $response;
@@ -1767,9 +1793,13 @@
         return $p;
     }
 
-    my $start;
     $p = W3C::LinkChecker->new();
     $p->{base} = $base_uri;
+
+    my $stype = $response->header("Content-Style-Type");
+    $p->{style_is_css} = !$stype || is_content_type($stype, "text/css");
+
+    my $start;
     if (!$Opts{Summary_Only}) {
         $start = &get_timestamp();
         print("Parsing...\n");
@@ -1778,26 +1808,40 @@
     # Content-Encoding etc already decoded in get_document().
     my $docref = $response->content_ref();
 
-    # Count lines beforehand if needed for progress indicator.  In all cases,
-    # the actual final number of lines processed shown is populated by our
+    # Count lines beforehand if needed (for progress indicator, or CSS while
+    # we don't get any line context out of the parser).  In case of HTML, the
+    # actual final number of lines processed shown is populated by our
     # end_document handler.
-    $p->{Total} = ($$docref =~ tr/\n//) if $Opts{Progress};
+    $p->{Total} = ($$docref =~ tr/\n//)
+        if ($response->{IsCss} || $Opts{Progress});
 
     # We only look for anchors if we are not interested in the links
     # obviously, or if we are running a recursive checking because we
     # might need this information later
     $p->{only_anchors} = !($links || $rec_needs_links);
 
-    # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
-    # Processing instructions are not parsed by process, but in this case
-    # it should be. It's expensive, it's horrible, but it's the easiest way
-    # for right now.
-    $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
-        unless $p->{only_anchors};
+    if ($response->{IsCss}) {
 
-    $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
+        # Parse as CSS
+
+        $p->parse_css($$docref, LINE_UNKNOWN());
+    }
+    else {
+
+        # Parse as HTML
+
+        # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
+        # Processing instructions are not parsed by process, but in this case
+        # it should be. It's expensive, it's horrible, but it's the easiest way
+        # for right now.
+        $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
+            unless $p->{only_anchors};
+
+        $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
+
+        $p->parse($$docref)->eof();
+    }
 
-    $p->parse($$docref)->eof();
     $response->content("");
 
     if (!$Opts{Summary_Only}) {
@@ -1827,6 +1871,8 @@
     # Set up handlers
 
     $p->handler(start => 'start', 'self, tagname, attr, line');
+    $p->handler(end   => 'end',   'self, tagname, line');
+    $p->handler(text  => 'text',  'self, dtext, line');
     $p->handler(
         declaration => sub {
             my $self = shift;
@@ -2006,9 +2052,66 @@
                 }
             }
         }
+
+        # Inline CSS:
+        delete $self->{csstext};
+        if ($tag eq 'style') {
+            $self->{csstext} = ''
+                if ((!$attr->{type} && $self->{style_is_css}) ||
+                is_content_type($attr->{type}, "text/css"));
+        }
+        elsif ($self->{style_is_css} && (my $style = $attr->{style})) {
+            $style = CSS::DOM::Style::parse($style);
+            $self->parse_style($style, $line);
+        }
     }
 
     $self->parse_progress($line) if $Opts{Progress};
+    return;
+}
+
+sub end
+{
+    my ($self, $tagname, $line) = @_;
+
+    $self->parse_css($self->{csstext}, $line) if ($tagname eq 'style');
+    delete $self->{csstext};
+
+    $self->parse_progress($line) if $Opts{Progress};
+    return;
+}
+
+sub parse_css
+{
+    my ($self, $css, $line) = @_;
+    return unless $css;
+
+    my $sheet = CSS::DOM::parse($css);
+    for my $rule (@{$sheet->cssRules()}) {
+        if ($rule->type() == IMPORT_RULE()) {
+            $self->add_link($rule->href(), $self->{base}, $line);
+        }
+        elsif ($rule->type == STYLE_RULE()) {
+            $self->parse_style($rule->style(), $line);
+        }
+    }
+    return;
+}
+
+sub parse_style
+{
+    my ($self, $style, $line) = @_;
+    return unless $style;
+
+    for (my $i = 0, my $len = $style->length(); $i < $len; $i++) {
+        my $prop = $style->item($i);
+        my $val  = $style->getPropertyValue($prop);
+
+        while ($val =~ /$CssUrl/go) {
+            my $url = CSS::DOM::Util::unescape($2);
+            $self->add_link($url, $self->{base}, $line);
+        }
+    }
 
     return;
 }
@@ -2044,10 +2147,19 @@
     return;
 }
 
+sub text
+{
+    my ($self, $text, $line) = @_;
+    $self->{csstext} .= $text if defined($self->{csstext});
+    $self->parse_progress($line) if $Opts{Progress};
+    return;
+}
+
 sub end_document
 {
     my ($self, $line) = @_;
     $self->{Total} = $line;
+    delete $self->{csstext};
     return;
 }
 
@@ -2986,10 +3098,17 @@
         if ($doc_count > 0 && !$Opts{Quiet});
     if (!$doc_count) {
         print <<'EOF';
-  <div class="intro">
-      <p>This Link Checker looks for issues in links, anchors and referenced objects in a Web page, or recursively on a whole Web site.
-      For best results, it is recommended to first ensure that the documents checked use <a href="http://validator.w3.org/">Valid (X)HTML Markup</a>. The Link Checker is part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and Quality Web tools</a>.</p>
-  </div>
+<div class="intro">
+  <p>
+    This Link Checker looks for issues in links, anchors and referenced objects
+    in a Web page, CSS style sheet, or recursively on a whole Web site. For
+    best results, it is recommended to first ensure that the documents checked
+    use Valid <a href="http://validator.w3.org/">(X)HTML Markup</a> and
+    <a href="http://jigsaw.w3.org/css-validator/">CSS</a>. The Link Checker is
+    part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and
+    Quality Web tools</a>.
+  </p>
+</div>
 EOF
     }
     printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
Received on Sunday, 7 March 2010 17:16:18 UTC