- From: Mercurial notifier <nobody@w3.org>
- Date: Thu, 05 Aug 2010 14:47:26 +0000
- To: link-checker updates <www-validator-cvs@w3.org>
changeset: 352:bb26d957084a
user: ville
date: Sun Mar 07 17:16:14 2010 +0000
files: Makefile.PL bin/checklink docs/checklink.html
description:
Add support for checking links in CSS.
Known issue: no support for line numbers yet.
diff -r dfdcf2d8782d -r bb26d957084a Makefile.PL
--- a/Makefile.PL Sun Mar 07 16:34:35 2010 +0000
+++ b/Makefile.PL Sun Mar 07 17:16:14 2010 +0000
@@ -10,15 +10,20 @@
PREREQ_PM => {
# Hard dependencies:
- HTML::Entities => 0,
- HTML::Parser => 3.20,
- HTTP::Request => 0,
- HTTP::Response => 1.50,
- LWP::RobotUA => 1.19,
- LWP::UserAgent => 0,
- Time::HiRes => 0,
- URI => 1.31,
- URI::Escape => 0,
+ CSS::DOM => 0.09,
+ CSS::DOM::Constants => 0,
+ CSS::DOM::Style => 0,
+ CSS::DOM::Util => 0,
+ HTML::Entities => 0,
+ HTML::Parser => 3.20,
+ HTTP::Headers::Util => 0,
+ HTTP::Request => 0,
+ HTTP::Response => 1.50,
+ LWP::RobotUA => 1.19,
+ LWP::UserAgent => 0,
+ Time::HiRes => 0,
+ URI => 1.31,
+ URI::Escape => 0,
# Optional, but required if using a config file:
Config::General => 2.06,
diff -r dfdcf2d8782d -r bb26d957084a bin/checklink
--- a/bin/checklink Sun Mar 07 16:34:35 2010 +0000
+++ b/bin/checklink Sun Mar 07 17:16:14 2010 +0000
@@ -5,7 +5,7 @@
# (c) 1999-2010 World Wide Web Consortium
# based on Renaud Bruyeron's checklink.pl
#
-# $Id: checklink,v 4.187 2010-03-07 16:34:34 ville Exp $
+# $Id: checklink,v 4.188 2010-03-07 17:16:13 ville Exp $
#
# This program is licensed under the W3C(r) Software License:
# http://www.w3.org/Consortium/Legal/copyright-software
@@ -219,10 +219,15 @@
package W3C::LinkChecker;
use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
- $DocType $Head $Accept $ContentTypes %Cfg);
+ $DocType $Head $Accept $ContentTypes %Cfg $CssUrl);
+use CSS::DOM 0.09 qw(); # >= 0.09 for many bugfixes
+use CSS::DOM::Constants qw(:rule);
+use CSS::DOM::Style qw();
+use CSS::DOM::Util qw();
use HTML::Entities qw();
-use HTML::Parser 3.20 qw(); # >= 3.20 for "line" argspec identifier
+use HTML::Parser 3.20 qw(); # >= 3.20 for "line" argspec identifier
+use HTTP::Headers::Util qw();
use HTTP::Request qw();
use HTTP::Response 1.50 qw(); # >= 1.50 for decoded_content()
use Time::HiRes qw();
@@ -301,7 +306,7 @@
$PROGRAM = 'W3C-checklink';
$VERSION = '4.5';
$REVISION = sprintf('version %s (c) 1999-2010 W3C', $VERSION);
- my ($cvsver) = q$Revision: 4.187 $ =~ /(\d+[\d\.]*\.\d+)/;
+ my ($cvsver) = q$Revision: 4.188 $ =~ /(\d+[\d\.]*\.\d+)/;
$AGENT = sprintf(
'%s/%s [%s] %s',
$PROGRAM, $VERSION, $cvsver,
@@ -322,8 +327,12 @@
application/vnd.wap.xhtml+xml;q=0.6
);
$Accept = join(', ', @content_types, '*/*;q=0.5');
+ push(@content_types, "text/css");
my $re = join('|', map { s/;.*//; quotemeta } @content_types);
$ContentTypes = qr{\b(?:$re)\b}io;
+
+ # Regexp for matching URL values in CSS.
+ $CssUrl = qr/(?:\s|^)url\(\s*(['"]?)(.*?)\1\s*\)(?=\s|$)/;
#
# Read configuration. If the W3C_CHECKLINK_CFG environment variable has
@@ -1145,13 +1154,16 @@
$result_anchor);
my $esc_uri =
URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9.");
+ print "<p>For reliable link checking results, check ";
+
+ if (!$response->{IsCss}) {
+ printf("<a href=\"%s\">HTML validity</a> and ",
+ &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)));
+ }
printf(
- "<p>For reliable link checking results, check
-<a href=\"%s\">HTML validity</a> first. See also
-<a href=\"%s\">CSS validity</a>.</p>
+ "<a href=\"%s\">CSS validity</a> first.</p>
<p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n",
- &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)),
- &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
+ &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
&encode($Opts{_Self_URI})
);
@@ -1536,6 +1548,18 @@
return 0; # We always have at least one base location, but none matched.
}
+#################################
+# Check for content type match. #
+#################################
+
+sub is_content_type ($$) # Does header value $candidate name the media type $type?
+{
+my ($candidate, $type) = @_;
+return 0 unless ($candidate && $type); # Undefined or empty (false) arguments never match.
+my @v = HTTP::Headers::Util::split_header_words($candidate); # Parse the header value into its tokens/parameters.
+return scalar(@v) ? $type eq lc($v[0]->[0]) : 0; # Only the first token is compared, lowercased; $type is not case-folded, so pass it in lower case.
+}
+
##################################################
# Check whether a URI has already been processed #
##################################################
@@ -1666,6 +1690,8 @@
&hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp()))
if $verbose_progress;
+ $response->{IsCss} =
+ is_content_type($response->content_type(), "text/css");
$response->{Realm} = $realm if defined($realm);
return $response;
@@ -1767,9 +1793,13 @@
return $p;
}
- my $start;
$p = W3C::LinkChecker->new();
$p->{base} = $base_uri;
+
+ my $stype = $response->header("Content-Style-Type");
+ $p->{style_is_css} = !$stype || is_content_type($stype, "text/css");
+
+ my $start;
if (!$Opts{Summary_Only}) {
$start = &get_timestamp();
print("Parsing...\n");
@@ -1778,26 +1808,40 @@
# Content-Encoding etc already decoded in get_document().
my $docref = $response->content_ref();
- # Count lines beforehand if needed for progress indicator. In all cases,
- # the actual final number of lines processed shown is populated by our
+ # Count lines beforehand if needed (for progress indicator, or CSS while
+ # we don't get any line context out of the parser). In case of HTML, the
+ # actual final number of lines processed shown is populated by our
# end_document handler.
- $p->{Total} = ($$docref =~ tr/\n//) if $Opts{Progress};
+ $p->{Total} = ($$docref =~ tr/\n//)
+ if ($response->{IsCss} || $Opts{Progress});
# We only look for anchors if we are not interested in the links
# obviously, or if we are running a recursive checking because we
# might need this information later
$p->{only_anchors} = !($links || $rec_needs_links);
- # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
- # Processing instructions are not parsed by process, but in this case
- # it should be. It's expensive, it's horrible, but it's the easiest way
- # for right now.
- $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
- unless $p->{only_anchors};
+ if ($response->{IsCss}) {
- $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
+ # Parse as CSS
- $p->parse($$docref)->eof();
+ $p->parse_css($$docref, LINE_UNKNOWN());
+ }
+ else {
+
+ # Parse as HTML
+
+ # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
+ # Processing instructions are not parsed by process, but in this case
+ # it should be. It's expensive, it's horrible, but it's the easiest way
+ # for right now.
+ $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
+ unless $p->{only_anchors};
+
+ $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
+
+ $p->parse($$docref)->eof();
+ }
+
$response->content("");
if (!$Opts{Summary_Only}) {
@@ -1827,6 +1871,8 @@
# Set up handlers
$p->handler(start => 'start', 'self, tagname, attr, line');
+ $p->handler(end => 'end', 'self, tagname, line');
+ $p->handler(text => 'text', 'self, dtext, line');
$p->handler(
declaration => sub {
my $self = shift;
@@ -2006,9 +2052,66 @@
}
}
}
+
+ # Inline CSS:
+ delete $self->{csstext};
+ if ($tag eq 'style') {
+ $self->{csstext} = ''
+ if ((!$attr->{type} && $self->{style_is_css}) ||
+ is_content_type($attr->{type}, "text/css"));
+ }
+ elsif ($self->{style_is_css} && (my $style = $attr->{style})) {
+ $style = CSS::DOM::Style::parse($style);
+ $self->parse_style($style, $line);
+ }
}
$self->parse_progress($line) if $Opts{Progress};
+ return;
+}
+
+sub end # Handler for end tags; registered with argspec 'self, tagname, line'.
+{
+my ($self, $tagname, $line) = @_;
+
+$self->parse_css($self->{csstext}, $line) if ($tagname eq 'style'); # Check the text collected for this <style>; parse_css() returns early if it is empty or undef.
+delete $self->{csstext}; # Stop collecting text, whichever element just closed.
+
+$self->parse_progress($line) if $Opts{Progress};
+return;
+}
+
+sub parse_css # Extract links from CSS source text $css; $line is handed on unchanged to add_link()/parse_style().
+{
+my ($self, $css, $line) = @_;
+return unless $css; # Nothing to do for undef or empty (false) input.
+
+my $sheet = CSS::DOM::parse($css);
+for my $rule (@{$sheet->cssRules()}) { # Walks the sheet's own rule list; no recursion into nested rules is done here.
+if ($rule->type() == IMPORT_RULE()) {
+$self->add_link($rule->href(), $self->{base}, $line); # @import target
+}
+elsif ($rule->type == STYLE_RULE()) {
+$self->parse_style($rule->style(), $line); # url(...) values in the rule's declarations
+}
+} # Rules of any other type are ignored.
+return;
+}
+
+sub parse_style # Pass every url(...) found in the property values of style declaration $style to add_link().
+{
+my ($self, $style, $line) = @_;
+return unless $style; # No declaration object, nothing to scan.
+
+for (my $i = 0, my $len = $style->length(); $i < $len; $i++) {
+my $prop = $style->item($i); # presumably the property name at index $i (it is fed to getPropertyValue below)
+my $val = $style->getPropertyValue($prop);
+
+while ($val =~ /$CssUrl/go) { # /g: visit each url(...) occurrence in the value in turn
+my $url = CSS::DOM::Util::unescape($2); # $2 is what $CssUrl captures between the optional quotes
+$self->add_link($url, $self->{base}, $line);
+}
+}
return;
}
@@ -2044,10 +2147,19 @@
return;
}
+sub text # Handler for text content; registered with argspec 'self, dtext, line'.
+{
+my ($self, $text, $line) = @_;
+$self->{csstext} .= $text if defined($self->{csstext}); # Collect only while csstext is defined; the start handler sets it to '' for a <style> it treats as CSS.
+$self->parse_progress($line) if $Opts{Progress};
+return;
+}
+
sub end_document # presumably the end_document handler referred to by the line-count comment in the parsing code; its registration is not visible here
{
my ($self, $line) = @_;
$self->{Total} = $line; # Final line number; replaces any count made before parsing.
+delete $self->{csstext}; # Discard any CSS text still being collected.
return;
}
@@ -2986,10 +3098,17 @@
if ($doc_count > 0 && !$Opts{Quiet});
if (!$doc_count) {
print <<'EOF';
- <div class="intro">
- <p>This Link Checker looks for issues in links, anchors and referenced objects in a Web page, or recursively on a whole Web site.
- For best results, it is recommended to first ensure that the documents checked use <a href="http://validator.w3.org/">Valid (X)HTML Markup</a>. The Link Checker is part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and Quality Web tools</a>.</p>
- </div>
+<div class="intro">
+ <p>
+ This Link Checker looks for issues in links, anchors and referenced objects
+ in a Web page, CSS style sheet, or recursively on a whole Web site. For
+ best results, it is recommended to first ensure that the documents checked
+ use Valid <a href="http://validator.w3.org/">(X)HTML Markup</a> and
+ <a href="http://jigsaw.w3.org/css-validator/">CSS</a>. The Link Checker is
+ part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and
+ Quality Web tools</a>.
+ </p>
+</div>
EOF
}
printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
diff -r dfdcf2d8782d -r bb26d957084a docs/checklink.html
--- a/docs/checklink.html Sun Mar 07 16:34:35 2010 +0000
+++ b/docs/checklink.html Sun Mar 07 17:16:14 2010 +0000
@@ -6,7 +6,7 @@
<title>W3C Link Checker Documentation</title>
<link rev="made" href="mailto:www-validator@w3.org" />
<style type="text/css" media="all">@import "linkchecker.css";</style>
- <meta name="revision" content="$Id: checklink.html,v 1.59 2010-03-07 16:34:35 ville Exp $" />
+ <meta name="revision" content="$Id: checklink.html,v 1.60 2010-03-07 17:16:14 ville Exp $" />
</head>
<body>
@@ -54,8 +54,8 @@
<h2><a name="what" id="what">What it does</a></h2>
<p>
- The link checker reads an HTML or XHTML document and extracts a list
- of anchors and links.
+ The link checker reads an HTML or XHTML document or a CSS style sheet
+ and extracts a list of anchors and links.
</p>
<p>
@@ -149,6 +149,7 @@
<li><a href="http://search.cpan.org/dist/W3C-LinkChecker/">W3C-LinkChecker</a> (the link checker itself)</li>
<li><a href="http://search.cpan.org/dist/CGI.pm/">CGI.pm</a> (required for CGI mode only)</li>
<li><a href="http://search.cpan.org/dist/Config-General/">Config-General</a> (optional, version 2.06 or newer; required only for reading the (optional) configuration file)</li>
+ <li><a href="http://search.cpan.org/dist/CSS-DOM/">CSS-DOM</a> (version 0.09 or newer)</li>
<li><a href="http://search.cpan.org/dist/HTML-Parser/">HTML-Parser</a> (version 3.20 or newer)</li>
<li><a href="http://search.cpan.org/dist/libwww-perl/">libwww-perl</a> (version 5.802 or newer)</li>
<li><a href="http://search.cpan.org/dist/Net-IP/">Net-IP</a> (optional but recommended; required for restricting access to <a href="http://www.ietf.org/rfc/rfc1918.txt">private IP addresses</a>)</li>
@@ -325,7 +326,7 @@
<address>
<a title="Send Feedback for the W3C Link Checker"
href="http://validator.w3.org/feedback.html">The W3C Validator Team</a><br />
- $Date: 2010-03-07 16:34:35 $
+ $Date: 2010-03-07 17:16:14 $
</address>
<p class="copyright">
<a rel="Copyright" href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> © 1994-2010
Received on Thursday, 5 August 2010 14:47:49 UTC