W3C home > Mailing lists > Public > www-validator-cvs@w3.org > August 2010

link-checker commit: Outsource line counting to HTML::Parser; version >= 3.20 is now required.

From: Mercurial notifier <nobody@w3.org>
Date: Thu, 05 Aug 2010 14:47:04 +0000
To: link-checker updates <www-validator-cvs@w3.org>
Message-Id: <E1Oh1iq-0005ly-Tu@blinky.w3.org>
changeset:   138:708d72ea9a48
user:        ville
date:        Sat May 06 18:24:10 2006 +0000
files:       Makefile.PL bin/checklink docs/checklink.html
description:
Outsource line counting to HTML::Parser; version >= 3.20 is now required.


diff -r f00b04f44da5 -r 708d72ea9a48 Makefile.PL
--- a/Makefile.PL	Sat May 06 18:19:49 2006 +0000
+++ b/Makefile.PL	Sat May 06 18:24:10 2006 +0000
@@ -23,7 +23,7 @@
   PREREQ_PM     => {
                      CGI             => 0,
                      Config::General => 2.06,
-                     HTML::Parser    => 3.00,
+                     HTML::Parser    => 3.20,
                      LWP             => 5.66,
                      Net::IP         => 0, # Optional, see the docs.
                      Term::ReadKey   => 2.00,
diff -r f00b04f44da5 -r 708d72ea9a48 bin/checklink
--- a/bin/checklink	Sat May 06 18:19:49 2006 +0000
+++ b/bin/checklink	Sat May 06 18:24:10 2006 +0000
@@ -5,7 +5,7 @@
 # (c) 1999-2005 World Wide Web Consortium
 # based on Renaud Bruyeron's checklink.pl
 #
-# $Id: checklink,v 4.28 2006-05-06 18:19:49 ville Exp $
+# $Id: checklink,v 4.29 2006-05-06 18:24:10 ville Exp $
 #
 # This program is licensed under the W3C(r) Software License:
 #       http://www.w3.org/Consortium/Legal/copyright-software
@@ -102,7 +102,7 @@
             $DocType $Head $Accept $ContentTypes %Cfg);
 
 use HTML::Entities       qw();
-use HTML::Parser    3.00 qw();
+use HTML::Parser    3.20 qw(); # >= 3.20 for "line" argspec identifier
 use HTTP::Request        qw();
 use HTTP::Response       qw();
 use Time::HiRes          qw();
@@ -123,7 +123,7 @@
   $PROGRAM     = 'W3C-checklink';
   $VERSION     = '4.2.1';
   $REVISION    = sprintf('version %s (c) 1999-2005 W3C', $VERSION);
-  my ($cvsver) = q$Revision: 4.28 $ =~ /(\d+[\d\.]*\.\d+)/;
+  my ($cvsver) = q$Revision: 4.29 $ =~ /(\d+[\d\.]*\.\d+)/;
   $AGENT       = sprintf('%s/%s [%s] %s',
                          $PROGRAM, $VERSION, $cvsver, LWP::RobotUA->_agent());
 
@@ -1196,17 +1196,15 @@
   my $p = HTML::Parser::new(@_, api_version => 3);
 
   # Start tags
-  $p->handler(start => 'start', 'self, tagname, attr, text');
+  $p->handler(start => 'start', 'self, tagname, attr, text, line');
   # Declarations
   $p->handler(declaration =>
               sub {
                 my $self = shift;
                 $self->declaration(substr($_[0], 2, -1));
-              }, 'self, text');
+              }, 'self, text, line');
   # Other stuff
-  $p->handler(default => 'text', 'self, text');
-  # Line count
-  $p->{Line} = 1;
+  $p->handler(default => 'parse_progress', 'self, line') if $Opts{Progress};
   # Check <a [..] name="...">?
   $p->{check_name} = 1;
   # Check <[..] id="..">?
@@ -1242,16 +1240,14 @@
   $self->xml_mode(1) if (m%^-//W3C//DTD XHTML %);
 }
 
-#######################################
-# Count the number of lines in a file #
-#######################################
+###################################
+# Print parse progress indication #
+###################################
 
-sub new_line
+sub parse_progress
 {
-  my ($self, $string) = @_;
-  my $count = ($string =~ tr/\n//);
-  $self->{Line} = $self->{Line} + $count;
-  printf("\r%4d%%", int($self->{Line}/$self->{Total}*100)) if $Opts{Progress};
+  my ($self, $line) = @_;
+  printf("\r%4d%%", int($line/$self->{Total}*100));
 }
 
 #############################
@@ -1280,17 +1276,17 @@
 
 sub add_link
 {
-  my ($self, $uri) = @_;
-  $self->{Links}{$uri}{$self->{Line}}++ if defined($uri);
+  my ($self, $uri, $line) = @_;
+  $self->{Links}{$uri}{$line}++ if defined($uri);
 }
 
 sub start
 {
-  my ($self, $tag, $attr, $text) = @_;
+  my ($self, $tag, $attr, $text, $line) = @_;
 
   # Anchors
   my $anchor = $self->get_anchor($tag, $attr);
-  $self->{Anchors}{$anchor}{$self->{Line}}++ if defined($anchor);
+  $self->{Anchors}{$anchor}{$line}++ if defined($anchor);
 
   # Links
   if (!$self->{only_anchors}) {
@@ -1302,31 +1298,19 @@
         $self->{base} = $attr->{href};
       }
     } else {
-      $self->add_link($attr->{href});
+      $self->add_link($attr->{href}, $line);
     }
-    $self->add_link($attr->{src});
-    $self->add_link($attr->{data}) if ($tag eq 'object');
-    $self->add_link($attr->{cite}) if ($tag eq 'blockquote');
+    $self->add_link($attr->{src}, $line);
+    $self->add_link($attr->{data}, $line) if ($tag eq 'object');
+    $self->add_link($attr->{cite}, $line) if ($tag eq 'blockquote');
   }
 
-  # Line counting
-  $self->new_line($text) if ($text =~ m/\n/);
-}
-
-sub text
-{
-  my ($self, $text) = @_;
-  if (!$Opts{Progress}) {
-    # If we are just extracting information about anchors,
-    # parsing this part is only cosmetic (progress indicator)
-    return unless !$self->{only_anchors};
-  }
-  $self->new_line($text) if ($text =~ /\n/);
+  $self->parse_progress($line) if $Opts{Progress};
 }
 
 sub declaration
 {
-  my ($self, $text) = @_;
+  my ($self, $text, $line) = @_;
   # Extract the doctype
   my @declaration = split(/\s+/, $text, 4);
   if (($#declaration >= 3) &&
@@ -1337,7 +1321,7 @@
     # Store the doctype
     $self->doctype($1) if $1;
     # If there is a link to the DTD, record it
-    $self->{Links}{$3}{$self->{Line}}++ if (!$self->{only_anchors} && $3);
+    $self->{Links}{$3}{$line}++ if (!$self->{only_anchors} && $3);
   }
   return unless !$self->{only_anchors};
   $self->text($text);
diff -r f00b04f44da5 -r 708d72ea9a48 docs/checklink.html
--- a/docs/checklink.html	Sat May 06 18:19:49 2006 +0000
+++ b/docs/checklink.html	Sat May 06 18:24:10 2006 +0000
@@ -6,7 +6,7 @@
     <title>W3C Link Checker Documentation</title>
     <link rev="made" href="mailto:www-validator@w3.org" />
     <style type="text/css" media="all">@import "linkchecker.css";</style>
-    <meta name="revision" content="$Id: checklink.html,v 1.32 2005-05-15 13:45:03 ville Exp $" />
+    <meta name="revision" content="$Id: checklink.html,v 1.33 2006-05-06 18:24:10 ville Exp $" />
   </head>
 
   <body>
@@ -162,7 +162,7 @@
           <li><a href="http://search.cpan.org/dist/W3C-LinkChecker/">W3C-LinkChecker</a> (the link checker itself)</li>
 	  <li><a href="http://search.cpan.org/dist/CGI.pm/">CGI.pm</a> (required for CGI mode only)</li>
           <li><a href="http://search.cpan.org/dist/Config-General/">Config-General</a> (optional, version 2.06 or newer; required only for reading the (optional) configuration file)</li>
-	  <li><a href="http://search.cpan.org/dist/HTML-Parser/">HTML-Parser</a> (version 3.00 or newer)</li>
+	  <li><a href="http://search.cpan.org/dist/HTML-Parser/">HTML-Parser</a> (version 3.20 or newer)</li>
 	  <li><a href="http://search.cpan.org/dist/libwww-perl/">libwww-perl</a> (version 5.66 or newer; version 5.70 or newer recommended, except for 5.76 which has a bug that may cause the link checker follow redirects to <code>file:</code> URLs)</li>
           <li><a href="http://search.cpan.org/dist/Net-IP/">Net-IP</a> (optional but recommended; required for restricting access to <a href="http://www.ietf.org/rfc/rfc1918.txt">private IP addresses</a>)</li>
           <li><a href="http://search.cpan.org/dist/TermReadKey/">TermReadKey</a> (optional but recommended; required only in command line mode for password input)</li>
@@ -297,7 +297,7 @@
         alt="Valid XHTML 1.0!" /></a>
       <a title="Send Feedback for the W3C Link Checker"
         href="http://validator.w3.org/feedback.html">The W3C Validator Team</a><br />
-      $Date: 2005-05-15 13:45:03 $
+      $Date: 2006-05-06 18:24:10 $
     </address>
     <p class="copyright">
       <a rel="Copyright" href="http://www.w3.org/Consortium/Legal/ipr-notice#Copyright">Copyright</a> &copy; 1994-2005
Received on Thursday, 5 August 2010 14:47:14 UTC

This archive was generated by hypermail 2.3.1 : Wednesday, 7 January 2015 15:17:44 UTC