- From: Ville Skytta via cvs-syncmail <cvsmail@w3.org>
- Date: Sun, 07 Mar 2010 17:16:16 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LinkChecker/bin In directory hutz:/tmp/cvs-serv8409/bin Modified Files: checklink Log Message: Add support for checking links in CSS. Known issue: no support for line numbers yet. Index: checklink =================================================================== RCS file: /sources/public/perl/modules/W3C/LinkChecker/bin/checklink,v retrieving revision 4.187 retrieving revision 4.188 diff -u -d -r4.187 -r4.188 --- checklink 7 Mar 2010 16:34:34 -0000 4.187 +++ checklink 7 Mar 2010 17:16:13 -0000 4.188 @@ -219,10 +219,15 @@ package W3C::LinkChecker; use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION - $DocType $Head $Accept $ContentTypes %Cfg); + $DocType $Head $Accept $ContentTypes %Cfg $CssUrl); +use CSS::DOM 0.09 qw(); # >= 0.09 for many bugfixes +use CSS::DOM::Constants qw(:rule); +use CSS::DOM::Style qw(); +use CSS::DOM::Util qw(); use HTML::Entities qw(); -use HTML::Parser 3.20 qw(); # >= 3.20 for "line" argspec identifier +use HTML::Parser 3.20 qw(); # >= 3.20 for "line" argspec identifier +use HTTP::Headers::Util qw(); use HTTP::Request qw(); use HTTP::Response 1.50 qw(); # >= 1.50 for decoded_content() use Time::HiRes qw(); @@ -322,9 +327,13 @@ application/vnd.wap.xhtml+xml;q=0.6 ); $Accept = join(', ', @content_types, '*/*;q=0.5'); + push(@content_types, "text/css"); my $re = join('|', map { s/;.*//; quotemeta } @content_types); $ContentTypes = qr{\b(?:$re)\b}io; + # Regexp for matching URL values in CSS. + $CssUrl = qr/(?:\s|^)url\(\s*(['"]?)(.*?)\1\s*\)(?=\s|$)/; + # # Read configuration. If the W3C_CHECKLINK_CFG environment variable has # been set or the default contains a non-empty file, read it. Otherwise, @@ -1145,13 +1154,16 @@ $result_anchor); my $esc_uri = URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9."); + print "<p>For reliable link checking results, check "; + + if (!$response->{IsCss}) { + printf("<a href=\"%s\">HTML validity</a> and ", + &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri))); + } printf( - "<p>For reliable link checking results, check -<a href=\"%s\">HTML validity</a> first. See also -<a href=\"%s\">CSS validity</a>.</p> + "<a href=\"%s\">CSS validity</a> first.</p> <p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n", - &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)), - &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)), + &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)), &encode($Opts{_Self_URI}) ); @@ -1536,6 +1548,18 @@ return 0; # We always have at least one base location, but none matched. } +################################# +# Check for content type match. # +################################# + +sub is_content_type ($$) +{ + my ($candidate, $type) = @_; + return 0 unless ($candidate && $type); + my @v = HTTP::Headers::Util::split_header_words($candidate); + return scalar(@v) ? $type eq lc($v[0]->[0]) : 0; +} + ################################################## # Check whether a URI has already been processed # ################################################## @@ -1666,6 +1690,8 @@ &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp())) if $verbose_progress; + $response->{IsCss} = + is_content_type($response->content_type(), "text/css"); $response->{Realm} = $realm if defined($realm); return $response; @@ -1767,9 +1793,13 @@ return $p; } - my $start; $p = W3C::LinkChecker->new(); $p->{base} = $base_uri; + + my $stype = $response->header("Content-Style-Type"); + $p->{style_is_css} = !$stype || is_content_type($stype, "text/css"); + + my $start; if (!$Opts{Summary_Only}) { $start = &get_timestamp(); print("Parsing...\n"); @@ -1778,26 +1808,40 @@ # Content-Encoding etc already decoded in get_document(). my $docref = $response->content_ref(); - # Count lines beforehand if needed for progress indicator. In all cases, - # the actual final number of lines processed shown is populated by our + # Count lines beforehand if needed (for progress indicator, or CSS while + # we don't get any line context out of the parser). In case of HTML, the + # actual final number of lines processed shown is populated by our # end_document handler. - $p->{Total} = ($$docref =~ tr/\n//) if $Opts{Progress}; + $p->{Total} = ($$docref =~ tr/\n//) + if ($response->{IsCss} || $Opts{Progress}); # We only look for anchors if we are not interested in the links # obviously, or if we are running a recursive checking because we # might need this information later $p->{only_anchors} = !($links || $rec_needs_links); - # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing - # Processing instructions are not parsed by process, but in this case - # it should be. It's expensive, it's horrible, but it's the easiest way - # for right now. - $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ - unless $p->{only_anchors}; + if ($response->{IsCss}) { - $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/); + # Parse as CSS + + $p->parse_css($$docref, LINE_UNKNOWN()); + } + else { + + # Parse as HTML + + # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing + # Processing instructions are not parsed by process, but in this case + # it should be. It's expensive, it's horrible, but it's the easiest way + # for right now. + $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ + unless $p->{only_anchors}; + + $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/); + + $p->parse($$docref)->eof(); + } - $p->parse($$docref)->eof(); $response->content(""); if (!$Opts{Summary_Only}) { @@ -1827,6 +1871,8 @@ # Set up handlers $p->handler(start => 'start', 'self, tagname, attr, line'); + $p->handler(end => 'end', 'self, tagname, line'); + $p->handler(text => 'text', 'self, dtext, line'); $p->handler( declaration => sub { my $self = shift; @@ -2006,9 +2052,66 @@ } } } + + # Inline CSS: + delete $self->{csstext}; + if ($tag eq 'style') { + $self->{csstext} = '' + if ((!$attr->{type} && $self->{style_is_css}) || + is_content_type($attr->{type}, "text/css")); + } + elsif ($self->{style_is_css} && (my $style = $attr->{style})) { + $style = CSS::DOM::Style::parse($style); + $self->parse_style($style, $line); + } } $self->parse_progress($line) if $Opts{Progress}; + return; +} + +sub end +{ + my ($self, $tagname, $line) = @_; + + $self->parse_css($self->{csstext}, $line) if ($tagname eq 'style'); + delete $self->{csstext}; + + $self->parse_progress($line) if $Opts{Progress}; + return; +} + +sub parse_css +{ + my ($self, $css, $line) = @_; + return unless $css; + + my $sheet = CSS::DOM::parse($css); + for my $rule (@{$sheet->cssRules()}) { + if ($rule->type() == IMPORT_RULE()) { + $self->add_link($rule->href(), $self->{base}, $line); + } + elsif ($rule->type == STYLE_RULE()) { + $self->parse_style($rule->style(), $line); + } + } + return; +} + +sub parse_style +{ + my ($self, $style, $line) = @_; + return unless $style; + + for (my $i = 0, my $len = $style->length(); $i < $len; $i++) { + my $prop = $style->item($i); + my $val = $style->getPropertyValue($prop); + + while ($val =~ /$CssUrl/go) { + my $url = CSS::DOM::Util::unescape($2); + $self->add_link($url, $self->{base}, $line); + } + } return; } @@ -2044,10 +2147,19 @@ return; } +sub text +{ + my ($self, $text, $line) = @_; + $self->{csstext} .= $text if defined($self->{csstext}); + $self->parse_progress($line) if $Opts{Progress}; + return; +} + sub end_document { my ($self, $line) = @_; $self->{Total} = $line; + delete $self->{csstext}; return; } @@ -2986,10 +3098,17 @@ if ($doc_count > 0 && !$Opts{Quiet}); if (!$doc_count) { print <<'EOF'; - <div class="intro"> - <p>This Link Checker looks for issues in links, anchors and referenced objects in a Web page, or recursively on a whole Web site. - For best results, it is recommended to first ensure that the documents checked use <a href="http://validator.w3.org/">Valid (X)HTML Markup</a>. The Link Checker is part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and Quality Web tools</a>.</p> - </div> +<div class="intro"> + <p> + This Link Checker looks for issues in links, anchors and referenced objects + in a Web page, CSS style sheet, or recursively on a whole Web site. For + best results, it is recommended to first ensure that the documents checked + use Valid <a href="http://validator.w3.org/">(X)HTML Markup</a> and + <a href="http://jigsaw.w3.org/css-validator/">CSS</a>. The Link Checker is + part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and + Quality Web tools</a>. + </p> +</div> EOF } printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
Received on Sunday, 7 March 2010 17:16:18 UTC