- From: Mercurial notifier <nobody@w3.org>
- Date: Thu, 05 Aug 2010 14:47:25 +0000
- To: link-checker updates <www-validator-cvs@w3.org>
changeset: 349:d1b25a40ba7c
user: ville
date: Sun Feb 28 21:13:23 2010 +0000
files: Makefile.PL bin/checklink
description:
Run perltidy, update indentation settings.
diff -r f5d868ff8884 -r d1b25a40ba7c Makefile.PL
--- a/Makefile.PL Sun Feb 28 19:23:11 2010 +0000
+++ b/Makefile.PL Sun Feb 28 21:13:23 2010 +0000
@@ -2,69 +2,69 @@
use ExtUtils::MakeMaker;
WriteMakefile(
- NAME => 'W3C::LinkChecker',
- ABSTRACT => 'W3C Link Checker',
- AUTHOR => 'W3C QA-dev Team <public-qa-dev@w3.org>',
- LICENSE => 'open_source',
- VERSION_FROM => 'bin/checklink',
- PREREQ_PM => {
- # Hard dependencies:
- HTML::Entities => 0,
- HTML::Parser => 3.20,
- HTTP::Request => 0,
- HTTP::Response => 1.50,
- LWP::RobotUA => 1.19,
- LWP::UserAgent => 0,
- Time::HiRes => 0,
- URI => 1.31,
- URI::Escape => 0,
+ NAME => 'W3C::LinkChecker',
+ ABSTRACT => 'W3C Link Checker',
+ AUTHOR => 'W3C QA-dev Team <public-qa-dev@w3.org>',
+ LICENSE => 'open_source',
+ VERSION_FROM => 'bin/checklink',
+ PREREQ_PM => {
- # Optional, but required if using a config file:
- Config::General => 2.06,
+ # Hard dependencies:
+ HTML::Entities => 0,
+ HTML::Parser => 3.20,
+ HTTP::Request => 0,
+ HTTP::Response => 1.50,
+ LWP::RobotUA => 1.19,
+ LWP::UserAgent => 0,
+ Time::HiRes => 0,
+ URI => 1.31,
+ URI::Escape => 0,
- # Optional, but required if private IPs are disallowed:
- Net::hostent => 0,
- Net::IP => 0,
- Socket => 0,
+ # Optional, but required if using a config file:
+ Config::General => 2.06,
- # Optional, but required in command line mode:
- File::Spec => 0,
- Getopt::Long => 2.17,
- Text::Wrap => 0,
- URI::file => 0,
+ # Optional, but required if private IPs are disallowed:
+ Net::hostent => 0,
+ Net::IP => 0,
+ Socket => 0,
- # Optional, used for password input in command line mode:
- Term::ReadKey => 2.00,
+ # Optional, but required in command line mode:
+ File::Spec => 0,
+ Getopt::Long => 2.17,
+ Text::Wrap => 0,
+ URI::file => 0,
- # Optional, used for guessing language in cmd line mode:
- Locale::Country => 0,
- Locale::Language => 0,
+ # Optional, used for password input in command line mode:
+ Term::ReadKey => 2.00,
- # Optional, but required in CGI mode:
- CGI => 0,
- CGI::Carp => 0,
- CGI::Cookie => 0,
+ # Optional, used for guessing language in cmd line mode:
+ Locale::Country => 0,
+ Locale::Language => 0,
- # Optional, required if using cookies:
- HTTP::Cookies => 0,
+ # Optional, but required in CGI mode:
+ CGI => 0,
+ CGI::Carp => 0,
+ CGI::Cookie => 0,
- # Required for the test suite:
- File::Spec => 0,
- Test::More => 0,
- },
- PM => { 'lib/W3C/LinkChecker.pm' =>
- '$(INST_LIB)/W3C/LinkChecker.pm' },
- EXE_FILES => [ 'bin/checklink' ],
- MAN1PODS => { 'bin/checklink.pod' =>
- '$(INST_MAN1DIR)/checklink.$(MAN1EXT)',
- },
- depend => { distdir => 'ChangeLog lib/W3C/LinkChecker.pm' },
- dist => { TARFLAGS => '--owner=0 --group=0 -cvf' },
- clean => { FILES => 'ChangeLog.bak' },
+ # Optional, required if using cookies:
+ HTTP::Cookies => 0,
+
+ # Required for the test suite:
+ File::Spec => 0,
+ Test::More => 0,
+ },
+ PM => {'lib/W3C/LinkChecker.pm' => '$(INST_LIB)/W3C/LinkChecker.pm'},
+ EXE_FILES => ['bin/checklink'],
+ MAN1PODS =>
+ {'bin/checklink.pod' => '$(INST_MAN1DIR)/checklink.$(MAN1EXT)',},
+ depend => {distdir => 'ChangeLog lib/W3C/LinkChecker.pm'},
+ dist => {TARFLAGS => '--owner=0 --group=0 -cvf'},
+ clean => {FILES => 'ChangeLog.bak'},
);
-sub MY::postamble {
- return <<'MAKE_FRAG';
+sub MY::postamble
+{
+ return <<'MAKE_FRAG';
ChangeLog: bin/checklink bin/checklink.pod docs/checklink.html docs/linkchecker.css README t/00compile.t
cvs2cl --FSF --utc --prune \
--ignore ChangeLog --ignore cvsignore --ignore SIGNATURE \
diff -r f5d868ff8884 -r d1b25a40ba7c bin/checklink
--- a/bin/checklink Sun Feb 28 19:23:11 2010 +0000
+++ b/bin/checklink Sun Feb 28 21:13:23 2010 +0000
@@ -5,7 +5,7 @@
# (c) 1999-2010 World Wide Web Consortium
# based on Renaud Bruyeron's checklink.pl
#
-# $Id: checklink,v 4.184 2010-02-28 19:23:11 ville Exp $
+# $Id: checklink,v 4.185 2010-02-28 21:13:23 ville Exp $
#
# This program is licensed under the W3C(r) Software License:
# http://www.w3.org/Consortium/Legal/copyright-software
@@ -34,137 +34,144 @@
# http://www.mail-archive.com/cpan-testers-discuss%40perl.org/msg01064.html
use Config qw(%Config);
use lib map { /(.*)/ }
- defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB}) :
- defined($ENV{PERLLIB}) ? split(/$Config{path_sep}/, $ENV{PERLLIB}) : ();
+ defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB}) :
+ defined($ENV{PERLLIB}) ? split(/$Config{path_sep}/, $ENV{PERLLIB}) :
+ ();
# -----------------------------------------------------------------------------
package W3C::UserAgent;
-use LWP::RobotUA 1.19 qw();
-use LWP::UserAgent qw();
+use LWP::RobotUA 1.19 qw();
+use LWP::UserAgent qw();
# if 0, ignore robots exclusion (useful for testing)
use constant USE_ROBOT_UA => 1;
if (USE_ROBOT_UA) {
- @W3C::UserAgent::ISA = qw(LWP::RobotUA);
-} else {
- @W3C::UserAgent::ISA = qw(LWP::UserAgent);
+ @W3C::UserAgent::ISA = qw(LWP::RobotUA);
+}
+else {
+ @W3C::UserAgent::ISA = qw(LWP::UserAgent);
}
-BEGIN
-{
- # The 4k default line length in LWP <= 5.832 isn't enough for example to
- # accommodate 4kB cookies (RFC 2985); bump it (#6678).
- require LWP::Protocol::http;
- push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8*1024);
+BEGIN {
+
+ # The 4k default line length in LWP <= 5.832 isn't enough for example to
+ # accommodate 4kB cookies (RFC 2985); bump it (#6678).
+ require LWP::Protocol::http;
+ push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8 * 1024);
}
sub new
{
- my $proto = shift;
- my $class = ref($proto) || $proto;
- my ($name, $from, $rules) = @_;
+ my $proto = shift;
+ my $class = ref($proto) || $proto;
+ my ($name, $from, $rules) = @_;
- # For security/privacy reasons, if $from was not given, do not send it.
- # Cheat by defining something for the constructor, and resetting it later.
- my $from_ok = $from;
- $from ||= 'www-validator@w3.org';
+ # For security/privacy reasons, if $from was not given, do not send it.
+ # Cheat by defining something for the constructor, and resetting it later.
+ my $from_ok = $from;
+ $from ||= 'www-validator@w3.org';
- my $self;
- if (USE_ROBOT_UA) {
- $self = $class->SUPER::new($name, $from, $rules);
- } else {
- my %cnf;
- @cnf{qw(agent from)} = ($name, $from);
- $self = LWP::UserAgent->new(%cnf);
- $self = bless $self, $class;
- }
+ my $self;
+ if (USE_ROBOT_UA) {
+ $self = $class->SUPER::new($name, $from, $rules);
+ }
+ else {
+ my %cnf;
+ @cnf{qw(agent from)} = ($name, $from);
+ $self = LWP::UserAgent->new(%cnf);
+ $self = bless $self, $class;
+ }
- $self->from(undef) unless $from_ok;
+ $self->from(undef) unless $from_ok;
- $self->env_proxy();
+ $self->env_proxy();
- $self->allow_private_ips(1);
+ $self->allow_private_ips(1);
- # TODO: bug 29
- $self->protocols_forbidden([qw(mailto javascript)]);
+ # TODO: bug 29
+ $self->protocols_forbidden([qw(mailto javascript)]);
- return $self;
+ return $self;
}
sub allow_private_ips
{
- my $self = shift;
- if (@_) {
- $self->{Checklink_allow_private_ips} = shift;
- if (!$self->{Checklink_allow_private_ips}) {
- # Pull in dependencies
- require Net::IP;
- require Socket;
- require Net::hostent;
+ my $self = shift;
+ if (@_) {
+ $self->{Checklink_allow_private_ips} = shift;
+ if (!$self->{Checklink_allow_private_ips}) {
+
+ # Pull in dependencies
+ require Net::IP;
+ require Socket;
+ require Net::hostent;
+ }
}
- }
- return $self->{Checklink_allow_private_ips};
+ return $self->{Checklink_allow_private_ips};
}
sub redirect_progress_callback
{
- my $self = shift;
- $self->{Checklink_redirect_callback} = shift if @_;
- return $self->{Checklink_redirect_callback};
+ my $self = shift;
+ $self->{Checklink_redirect_callback} = shift if @_;
+ return $self->{Checklink_redirect_callback};
}
sub simple_request
{
- my $self = shift;
+ my $self = shift;
- my $response = $self->ip_disallowed($_[0]->uri());
+ my $response = $self->ip_disallowed($_[0]->uri());
- # RFC 2616, section 15.1.3
- $_[0]->remove_header("Referer")
- if ($_[0]->referer() &&
+ # RFC 2616, section 15.1.3
+ $_[0]->remove_header("Referer")
+ if ($_[0]->referer() &&
(secure_scheme($_[0]->referer()) && !secure_scheme($_[0]->uri())));
- $response ||= do {
- local $SIG{__WARN__} = sub { # Suppress some warnings, rt.cpan.org #18902
- warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/);
+ $response ||= do {
+ local $SIG{__WARN__} =
+ sub { # Suppress some warnings, rt.cpan.org #18902
+ warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/);
+ };
+
+ # @@@ Why not just $self->SUPER::simple_request?
+ $self->W3C::UserAgent::SUPER::simple_request(@_);
};
- # @@@ Why not just $self->SUPER::simple_request?
- $self->W3C::UserAgent::SUPER::simple_request(@_);
- };
- if (! defined($self->{FirstResponse})) {
- $self->{FirstResponse} = $response->code();
- $self->{FirstMessage} = $response->message() || '(no message)';
- }
+ if (!defined($self->{FirstResponse})) {
+ $self->{FirstResponse} = $response->code();
+ $self->{FirstMessage} = $response->message() || '(no message)';
+ }
- return $response;
+ return $response;
}
sub redirect_ok
{
- my ($self, $request, $response) = @_;
+ my ($self, $request, $response) = @_;
- if (my $callback = $self->redirect_progress_callback()) {
- # @@@ TODO: when an LWP internal robots.txt request gets redirected, this
- # will a bit confusingly fire for it too. Would need a robust way to
- # determine whether the request is such a LWP "internal robots.txt" one.
- &$callback($request->method(), $request->uri());
- }
+ if (my $callback = $self->redirect_progress_callback()) {
- return 0 unless $self->SUPER::redirect_ok($request, $response);
+ # @@@ TODO: when an LWP internal robots.txt request gets redirected, this
+ # will a bit confusingly fire for it too. Would need a robust way to
+ # determine whether the request is such a LWP "internal robots.txt" one.
+ &$callback($request->method(), $request->uri());
+ }
- if (my $res = $self->ip_disallowed($request->uri())) {
- $response->previous($response->clone());
- $response->request($request);
- $response->code($res->code());
- $response->message($res->message());
- return 0;
- }
+ return 0 unless $self->SUPER::redirect_ok($request, $response);
- return 1;
+ if (my $res = $self->ip_disallowed($request->uri())) {
+ $response->previous($response->clone());
+ $response->request($request);
+ $response->code($res->code());
+ $response->message($res->message());
+ return 0;
+ }
+
+ return 1;
}
#
@@ -176,33 +183,34 @@
#
sub ip_disallowed
{
- my ($self, $uri) = @_;
- return 0 if $self->allow_private_ips(); # Short-circuit
+ my ($self, $uri) = @_;
+ return 0 if $self->allow_private_ips(); # Short-circuit
- my $hostname = undef;
- eval { $hostname = $uri->host() }; # Not all URIs implement host()...
- return 0 unless $hostname;
+ my $hostname = undef;
+ eval { $hostname = $uri->host() }; # Not all URIs implement host()...
+ return 0 unless $hostname;
- my $addr = my $iptype = my $resp = undef;
- if (my $host = Net::hostent::gethostbyname($hostname)) {
- $addr = Socket::inet_ntoa($host->addr()) if $host->addr();
- if ($addr && (my $ip = Net::IP->new($addr))) {
- $iptype = $ip->iptype();
+ my $addr = my $iptype = my $resp = undef;
+ if (my $host = Net::hostent::gethostbyname($hostname)) {
+ $addr = Socket::inet_ntoa($host->addr()) if $host->addr();
+ if ($addr && (my $ip = Net::IP->new($addr))) {
+ $iptype = $ip->iptype();
+ }
}
- }
- if ($iptype && $iptype ne 'PUBLIC') {
- $resp = HTTP::Response->new(403,
- 'Checking non-public IP address disallowed by link checker configuration');
- $resp->header('Client-Warning', 'Internal response');
- }
- return $resp;
+ if ($iptype && $iptype ne 'PUBLIC') {
+ $resp = HTTP::Response->new(403,
+ 'Checking non-public IP address disallowed by link checker configuration'
+ );
+ $resp->header('Client-Warning', 'Internal response');
+ }
+ return $resp;
}
sub secure_scheme
{
- my $uri = shift or return 0;
- $uri = URI->new($uri) unless ref($uri);
- return ($uri->scheme() =~ /^(?:file|https|ldaps|sips|snews|ssh)$/i);
+ my $uri = shift or return 0;
+ $uri = URI->new($uri) unless ref($uri);
+ return ($uri->scheme() =~ /^(?:file|https|ldaps|sips|snews|ssh)$/i);
}
# -----------------------------------------------------------------------------
@@ -210,16 +218,17 @@
package W3C::LinkChecker;
use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION
- $DocType $Head $Accept $ContentTypes %Cfg);
+ $DocType $Head $Accept $ContentTypes %Cfg);
-use HTML::Entities qw();
-use HTML::Parser 3.20 qw(); # >= 3.20 for "line" argspec identifier
-use HTTP::Request qw();
-use HTTP::Response 1.50 qw(); # >= 1.50 for decoded_content()
-use Time::HiRes qw();
-use URI 1.31 qw(); # >= 1.31 for sip: abs/rel
-use URI::Escape qw();
-use URI::Heuristic qw();
+use HTML::Entities qw();
+use HTML::Parser 3.20 qw(); # >= 3.20 for "line" argspec identifier
+use HTTP::Request qw();
+use HTTP::Response 1.50 qw(); # >= 1.50 for decoded_content()
+use Time::HiRes qw();
+use URI 1.31 qw(); # >= 1.31 for sip: abs/rel
+use URI::Escape qw();
+use URI::Heuristic qw();
+
# @@@ Needs also W3C::UserAgent but can't use() it here.
use constant RC_ROBOTS_TXT => -1;
@@ -227,10 +236,10 @@
use constant RC_IP_DISALLOWED => -3;
use constant RC_PROTOCOL_DISALLOWED => -4;
-use constant LINE_UNKNOWN => -1;
+use constant LINE_UNKNOWN => -1;
use constant MP2 =>
- (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);
+ (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2);
# Tag=>attribute mapping of things we treat as links.
# Note: base/@href and meta/@http-equiv get special treatment, see start()
@@ -242,27 +251,31 @@
blockquote => ['cite'],
body => ['background'],
del => ['cite'],
- embed => ['href', 'pluginspage', 'pluginurl', 'src'], # proprietary
+
+ # proprietary
+ embed => ['href', 'pluginspage', 'pluginurl', 'src'],
+
# form/@action not checked (side effects)
- frame => ['longdesc', 'src'],
- iframe => ['longdesc', 'src'],
- img => ['longdesc', 'src'],
- ins => ['cite'],
+ frame => ['longdesc', 'src'],
+ iframe => ['longdesc', 'src'],
+ img => ['longdesc', 'src'],
+ ins => ['cite'],
+
# input/@action not checked (side effects)
- input => ['src'],
- link => ['href'],
- object => ['data'],
- q => ['cite'],
- script => ['src'],
- source => ['src'],
- video => ['src'],
+ input => ['src'],
+ link => ['href'],
+ object => ['data'],
+ q => ['cite'],
+ script => ['src'],
+ source => ['src'],
+ video => ['src'],
};
# Tag=>attribute mapping of things we treat as space separated lists of links.
use constant LINK_LIST_ATTRS => {
- a => ['ping'],
- area => ['ping'],
- head => ['profile'],
+ a => ['ping'],
+ area => ['ping'],
+ head => ['profile'],
};
# TBD/TODO:
@@ -276,79 +289,80 @@
# - table,tr,td,th/@background?
# - xmp/@href?
-@W3C::LinkChecker::ISA = qw(HTML::Parser);
+@W3C::LinkChecker::ISA = qw(HTML::Parser);
-BEGIN
-{
- # Version info
- $PACKAGE = 'W3C Link Checker';
- $PROGRAM = 'W3C-checklink';
- $VERSION = '4.5';
- $REVISION = sprintf('version %s (c) 1999-2010 W3C', $VERSION);
- my ($cvsver) = q$Revision: 4.184 $ =~ /(\d+[\d\.]*\.\d+)/;
- $AGENT = sprintf('%s/%s [%s] %s',
- $PROGRAM, $VERSION, $cvsver,
- (W3C::UserAgent::USE_ROBOT_UA
- ? LWP::RobotUA->_agent()
- : LWP::UserAgent->_agent()));
+BEGIN {
- # Pull in mod_perl modules if applicable.
- eval {
- local $SIG{__DIE__} = undef;
- require Apache2::RequestUtil;
- } if MP2();
+ # Version info
+ $PACKAGE = 'W3C Link Checker';
+ $PROGRAM = 'W3C-checklink';
+ $VERSION = '4.5';
+ $REVISION = sprintf('version %s (c) 1999-2010 W3C', $VERSION);
+ my ($cvsver) = q$Revision: 4.185 $ =~ /(\d+[\d\.]*\.\d+)/;
+ $AGENT = sprintf(
+ '%s/%s [%s] %s',
+ $PROGRAM, $VERSION, $cvsver,
+ ( W3C::UserAgent::USE_ROBOT_UA ? LWP::RobotUA->_agent() :
+ LWP::UserAgent->_agent()
+ )
+ );
- my @content_types = qw(
- text/html
- application/xhtml+xml;q=0.9
- application/vnd.wap.xhtml+xml;q=0.6
- );
- $Accept = join(', ', @content_types, '*/*;q=0.5');
- my $re = join('|', map { s/;.*// ; quotemeta } @content_types);
- $ContentTypes = qr{\b(?:$re)\b}io;
+ # Pull in mod_perl modules if applicable.
+ eval {
+ local $SIG{__DIE__} = undef;
+ require Apache2::RequestUtil;
+ } if MP2();
- #
- # Read configuration. If the W3C_CHECKLINK_CFG environment variable has
- # been set or the default contains a non-empty file, read it. Otherwise,
- # skip silently.
- #
- my $defaultconfig = '/etc/w3c/checklink.conf';
- if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) {
+ my @content_types = qw(
+ text/html
+ application/xhtml+xml;q=0.9
+ application/vnd.wap.xhtml+xml;q=0.6
+ );
+ $Accept = join(', ', @content_types, '*/*;q=0.5');
+ my $re = join('|', map { s/;.*//; quotemeta } @content_types);
+ $ContentTypes = qr{\b(?:$re)\b}io;
- require Config::General;
- Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy
+ #
+ # Read configuration. If the W3C_CHECKLINK_CFG environment variable has
+ # been set or the default contains a non-empty file, read it. Otherwise,
+ # skip silently.
+ #
+ my $defaultconfig = '/etc/w3c/checklink.conf';
+ if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) {
- my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig;
- eval {
- my %config_opts =
- ( -ConfigFile => $conffile,
- -SplitPolicy => 'equalsign',
- -AllowMultiOptions => 'no',
- );
- %Cfg = Config::General->new(%config_opts)->getall();
- };
- if ($@) {
- die <<"EOF";
+ require Config::General;
+ Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy
+
+ my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig;
+ eval {
+ my %config_opts = (
+ -ConfigFile => $conffile,
+ -SplitPolicy => 'equalsign',
+ -AllowMultiOptions => 'no',
+ );
+ %Cfg = Config::General->new(%config_opts)->getall();
+ };
+ if ($@) {
+ die <<"EOF";
Failed to read configuration from '$conffile':
$@
EOF
+ }
}
- }
- $Cfg{Markup_Validator_URI} ||=
- 'http://validator.w3.org/check?uri=%s';
- $Cfg{CSS_Validator_URI} ||=
- 'http://jigsaw.w3.org/css-validator/validator?uri=%s';
- $Cfg{Doc_URI} ||=
- 'http://validator.w3.org/docs/checklink.html';
+ $Cfg{Markup_Validator_URI} ||= 'http://validator.w3.org/check?uri=%s';
+ $Cfg{CSS_Validator_URI} ||=
+ 'http://jigsaw.w3.org/css-validator/validator?uri=%s';
+ $Cfg{Doc_URI} ||= 'http://validator.w3.org/docs/checklink.html';
- # Untaint config params that are used as the format argument to (s)printf(),
- # Perl 5.10 does not want to see that in taint mode.
- ($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/);
- ($Cfg{CSS_Validator_URI}) = ($Cfg{CSS_Validator_URI} =~ /^(.*)$/);
+ # Untaint config params that are used as the format argument to (s)printf(),
+ # Perl 5.10 does not want to see that in taint mode.
+ ($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/);
+ ($Cfg{CSS_Validator_URI}) = ($Cfg{CSS_Validator_URI} =~ /^(.*)$/);
- $DocType = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
- my $css_url = URI->new_abs('linkchecker.css', $Cfg{Doc_URI});
- $Head = sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url);
+ $DocType =
+ '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">';
+ my $css_url = URI->new_abs('linkchecker.css', $Cfg{Doc_URI});
+ $Head = sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url);
<meta http-equiv="Content-Script-Type" content="text/javascript" />
<meta name="generator" content="%s" />
<link rel="stylesheet" type="text/css" href="%s" />
@@ -373,97 +387,103 @@
</script>
EOF
- # Trusted environment variables that need laundering in taint mode.
- foreach (qw(NNTPSERVER NEWSHOST)) {
- ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_};
- }
+ # Trusted environment variables that need laundering in taint mode.
+ foreach (qw(NNTPSERVER NEWSHOST)) {
+ ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_};
+ }
- # Use passive FTP by default, see Net::FTP(3).
- $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
+ # Use passive FTP by default, see Net::FTP(3).
+ $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
}
# Autoflush
$| = 1;
# Different options specified by the user
-my $cmdline = ! ($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/);
-my %Opts =
- ( Command_Line => $cmdline,
- Quiet => 0,
- Summary_Only => 0,
- Verbose => 0,
- Progress => 0,
- HTML => 0,
- Timeout => 30,
- Redirects => 1,
- Dir_Redirects => 1,
- Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE},
- Cookies => undef,
- No_Referer => 0,
- Hide_Same_Realm => 0,
- Depth => 0, # < 0 means unlimited recursion.
- Sleep_Time => 1,
- Max_Documents => 150, # For the online version.
- User => undef,
- Password => undef,
- Base_Locations => [],
- Exclude => undef,
- Exclude_Docs => undef,
- Suppress_Redirect => [],
+my $cmdline = !($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/);
+my %Opts = (
+ Command_Line => $cmdline,
+ Quiet => 0,
+ Summary_Only => 0,
+ Verbose => 0,
+ Progress => 0,
+ HTML => 0,
+ Timeout => 30,
+ Redirects => 1,
+ Dir_Redirects => 1,
+ Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE},
+ Cookies => undef,
+ No_Referer => 0,
+ Hide_Same_Realm => 0,
+ Depth => 0, # < 0 means unlimited recursion.
+ Sleep_Time => 1,
+ Max_Documents => 150, # For the online version.
+ User => undef,
+ Password => undef,
+ Base_Locations => [],
+ Exclude => undef,
+ Exclude_Docs => undef,
+ Suppress_Redirect => [],
Suppress_Redirect_Prefix => [],
Suppress_Redirect_Regexp => [],
- Suppress_Temp_Redirects => 1,
- Suppress_Broken => [],
- Suppress_Fragment => [],
- Masquerade => 0,
- Masquerade_From => '',
- Masquerade_To => '',
- Trusted => $Cfg{Trusted},
+ Suppress_Temp_Redirects => 1,
+ Suppress_Broken => [],
+ Suppress_Fragment => [],
+ Masquerade => 0,
+ Masquerade_From => '',
+ Masquerade_To => '',
+ Trusted => $Cfg{Trusted},
Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ?
- $Cfg{Allow_Private_IPs} : $cmdline,
- );
+ $Cfg{Allow_Private_IPs} :
+ $cmdline,
+);
undef $cmdline;
# Global variables
# What URI's did we process? (used for recursive mode)
my %processed;
+
# Result of the HTTP query
my %results;
+
# List of redirects
my %redirects;
+
# Count of the number of documents checked
my $doc_count = 0;
# Time stamp
my $timestamp = &get_timestamp();
+
# Per-document header; undefined if already printed. See print_doc_header().
my $doc_header;
&parse_arguments() if $Opts{Command_Line};
-my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address
+my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address
+
# @@@ make number of keep-alive connections customizable
-$ua->conn_cache({ total_capacity => 1}); # 1 keep-alive connection
+$ua->conn_cache({total_capacity => 1}); # 1 keep-alive connection
if ($ua->can('delay')) {
- $ua->delay($Opts{Sleep_Time}/60);
+ $ua->delay($Opts{Sleep_Time} / 60);
}
$ua->timeout($Opts{Timeout});
+
# Set up cookie stash if requested
if (defined($Opts{Cookies})) {
- require HTTP::Cookies;
- my $cookie_file = $Opts{Cookies};
- if ($cookie_file eq 'tmp') {
- $cookie_file = undef;
- } elsif ($cookie_file =~ /^(.*)$/) {
- $cookie_file = $1; # untaint
- }
- $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1));
+ require HTTP::Cookies;
+ my $cookie_file = $Opts{Cookies};
+ if ($cookie_file eq 'tmp') {
+ $cookie_file = undef;
+ }
+ elsif ($cookie_file =~ /^(.*)$/) {
+ $cookie_file = $1; # untaint
+ }
+ $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1));
}
-eval {
- $ua->allow_private_ips($Opts{Allow_Private_IPs});
-};
+eval { $ua->allow_private_ips($Opts{Allow_Private_IPs}); };
if ($@) {
- die <<"EOF";
+ die <<"EOF";
Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and
Net::hostent modules:
$@
@@ -472,167 +492,184 @@
if ($Opts{Command_Line}) {
- require Text::Wrap;
- Text::Wrap->import('wrap');
+ require Text::Wrap;
+ Text::Wrap->import('wrap');
- require URI::file;
+ require URI::file;
- &usage(1) unless scalar(@ARGV);
+ &usage(1) unless scalar(@ARGV);
- $Opts{_Self_URI} = 'http://validator.w3.org/checklink'; # For HTML output
+ $Opts{_Self_URI} = 'http://validator.w3.org/checklink'; # For HTML output
- &ask_password() if ($Opts{User} && !$Opts{Password});
+ &ask_password() if ($Opts{User} && !$Opts{Password});
- if (!$Opts{Summary_Only}) {
- printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML};
- } else {
- $Opts{Verbose} = 0;
- $Opts{Progress} = 0;
- }
+ if (!$Opts{Summary_Only}) {
+ printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML};
+ }
+ else {
+ $Opts{Verbose} = 0;
+ $Opts{Progress} = 0;
+ }
- # Populate data for print_form()
- my %params = (
- summary => $Opts{Summary_Only},
- hide_redirects => !$Opts{Redirects},
- hide_type => $Opts{Dir_Redirects} ? 'dir' : 'all',
- no_accept_language => !(defined($Opts{Accept_Language}) &&
- $Opts{Accept_Language} eq 'auto'),
- no_referer => $Opts{No_Referer},
- recursive => ($Opts{Depth} != 0),
- depth => $Opts{Depth},
- );
+ # Populate data for print_form()
+ my %params = (
+ summary => $Opts{Summary_Only},
+ hide_redirects => !$Opts{Redirects},
+ hide_type => $Opts{Dir_Redirects} ? 'dir' : 'all',
+ no_accept_language => !(
+ defined($Opts{Accept_Language}) && $Opts{Accept_Language} eq 'auto'
+ ),
+ no_referer => $Opts{No_Referer},
+ recursive => ($Opts{Depth} != 0),
+ depth => $Opts{Depth},
+ );
- my $check_num = 1;
- my @bases = @{$Opts{Base_Locations}};
- foreach my $uri (@ARGV) {
- # Reset base locations so that previous URI's given on the command line
- # won't affect the recursion scope for this URI (see check_uri())
- @{$Opts{Base_Locations}} = @bases;
- # Transform the parameter into a URI
- $uri = &urize($uri);
- $params{uri} = $uri;
- &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1);
- $check_num++;
- }
- undef $check_num;
+ my $check_num = 1;
+ my @bases = @{$Opts{Base_Locations}};
+ foreach my $uri (@ARGV) {
- if ($Opts{HTML}) {
+ # Reset base locations so that previous URI's given on the command line
+ # won't affect the recursion scope for this URI (see check_uri())
+ @{$Opts{Base_Locations}} = @bases;
+
+ # Transform the parameter into a URI
+ $uri = &urize($uri);
+ $params{uri} = $uri;
+ &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1);
+ $check_num++;
+ }
+ undef $check_num;
+
+ if ($Opts{HTML}) {
+ &html_footer();
+ }
+ elsif (($doc_count > 0) && !$Opts{Summary_Only}) {
+ printf("\n%s\n", &global_stats());
+ }
+
+}
+else {
+
+ require CGI;
+ require CGI::Carp;
+ CGI::Carp->import(qw(fatalsToBrowser));
+ require CGI::Cookie;
+
+ # file: URIs are not allowed in CGI mode
+ my $forbidden = $ua->protocols_forbidden() || [];
+ push(@$forbidden, 'file');
+ $ua->protocols_forbidden($forbidden);
+
+ my $query = CGI->new();
+
+ # Set a few parameters in CGI mode
+ $Opts{Verbose} = 0;
+ $Opts{Progress} = 0;
+ $Opts{HTML} = 1;
+ $Opts{_Self_URI} = $query->url(-relative => 1);
+
+ # Backwards compatibility
+ my $uri = undef;
+ if ($uri = $query->param('url')) {
+ $query->param('uri', $uri) unless $query->param('uri');
+ $query->delete('url');
+ }
+ $uri = $query->param('uri');
+
+ if (!$uri) {
+ &html_header('', 1); # Set cookie only from results page.
+ my %cookies = CGI::Cookie->fetch();
+ &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1);
+ &html_footer();
+ exit;
+ }
+
+ # Backwards compatibility
+ if ($query->param('hide_dir_redirects')) {
+ $query->param('hide_redirects', 'on');
+ $query->param('hide_type', 'dir');
+ $query->delete('hide_dir_redirects');
+ }
+
+ $Opts{Summary_Only} = 1 if $query->param('summary');
+
+ if ($query->param('hide_redirects')) {
+ $Opts{Dir_Redirects} = 0;
+ if (my $type = $query->param('hide_type')) {
+ $Opts{Redirects} = 0 if ($type ne 'dir');
+ }
+ else {
+ $Opts{Redirects} = 0;
+ }
+ }
+
+ $Opts{Accept_Language} = undef if $query->param('no_accept_language');
+ $Opts{No_Referer} = $query->param('no_referer');
+
+ $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0);
+ if (my $depth = $query->param('depth')) {
+
+ # @@@ Ignore invalid depth silently for now.
+ $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/);
+ }
+
+ # Save, clear or leave cookie as is.
+ my $cookie = undef;
+ if (my $action = $query->param('cookie')) {
+ if ($action eq 'clear') {
+
+ # Clear the cookie.
+ $cookie = CGI::Cookie->new(-name => $PROGRAM);
+ $cookie->value({clear => 1});
+ $cookie->expires('-1M');
+ }
+ elsif ($action eq 'set') {
+
+ # Set the options.
+ $cookie = CGI::Cookie->new(-name => $PROGRAM);
+ my %options = $query->Vars();
+ delete($options{$_})
+ for qw(url uri check cookie); # Non-persistent.
+ $cookie->value(\%options);
+ }
+ }
+ if (!$cookie) {
+ my %cookies = CGI::Cookie->fetch();
+ $cookie = $cookies{$PROGRAM};
+ }
+
+ # Always refresh cookie expiration time.
+ $cookie->expires('+1M') if ($cookie && !$cookie->expires());
+
+ # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
+ # If we're under mod_perl, there is a way around it...
+ eval {
+ local $SIG{__DIE__} = undef;
+ my $auth =
+ Apache2::RequestUtil->request()->headers_in()->{Authorization};
+ $ENV{HTTP_AUTHORIZATION} = $auth if $auth;
+ } if (MP2() && !$ENV{HTTP_AUTHORIZATION});
+
+ $uri =~ s/^\s+//g;
+ if ($uri !~ m/:/) {
+ if ($uri =~ m|^//|) {
+ $uri = 'http:' . $uri;
+ }
+ else {
+ local $ENV{URL_GUESS_PATTERN} = '';
+ my $guess = URI::Heuristic::uf_uri($uri);
+ if ($guess->scheme() && $ua->is_protocol_supported($guess)) {
+ $uri = $guess->as_string();
+ }
+ else {
+ $uri = 'http://' . $uri;
+ }
+ }
+ }
+
+ &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie);
+ undef $query; # Not needed any more.
&html_footer();
- } elsif (($doc_count > 0) && !$Opts{Summary_Only}) {
- printf("\n%s\n", &global_stats());
- }
-
-} else {
-
- require CGI;
- require CGI::Carp;
- CGI::Carp->import(qw(fatalsToBrowser));
- require CGI::Cookie;
-
- # file: URIs are not allowed in CGI mode
- my $forbidden = $ua->protocols_forbidden() || [];
- push(@$forbidden, 'file');
- $ua->protocols_forbidden($forbidden);
-
- my $query = CGI->new();
- # Set a few parameters in CGI mode
- $Opts{Verbose} = 0;
- $Opts{Progress} = 0;
- $Opts{HTML} = 1;
- $Opts{_Self_URI} = $query->url(-relative => 1);
-
- # Backwards compatibility
- my $uri = undef;
- if ($uri = $query->param('url')) {
- $query->param('uri', $uri) unless $query->param('uri');
- $query->delete('url');
- }
- $uri = $query->param('uri');
-
- if (! $uri) {
- &html_header('', 1); # Set cookie only from results page.
- my %cookies = CGI::Cookie->fetch();
- &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1);
- &html_footer();
- exit;
- }
-
- # Backwards compatibility
- if ($query->param('hide_dir_redirects')) {
- $query->param('hide_redirects', 'on');
- $query->param('hide_type', 'dir');
- $query->delete('hide_dir_redirects');
- }
-
- $Opts{Summary_Only} = 1 if $query->param('summary');
-
- if ($query->param('hide_redirects')) {
- $Opts{Dir_Redirects} = 0;
- if (my $type = $query->param('hide_type')) {
- $Opts{Redirects} = 0 if ($type ne 'dir');
- } else {
- $Opts{Redirects} = 0;
- }
- }
-
- $Opts{Accept_Language} = undef if $query->param('no_accept_language');
- $Opts{No_Referer} = $query->param('no_referer');
-
- $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0);
- if (my $depth = $query->param('depth')) {
- # @@@ Ignore invalid depth silently for now.
- $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/);
- }
-
- # Save, clear or leave cookie as is.
- my $cookie = undef;
- if (my $action = $query->param('cookie')) {
- if ($action eq 'clear') {
- # Clear the cookie.
- $cookie = CGI::Cookie->new(-name => $PROGRAM);
- $cookie->value({ clear => 1 });
- $cookie->expires('-1M');
- } elsif ($action eq 'set') {
- # Set the options.
- $cookie = CGI::Cookie->new(-name => $PROGRAM);
- my %options = $query->Vars();
- delete($options{$_}) for qw(url uri check cookie); # Non-persistent.
- $cookie->value(\%options);
- }
- }
- if (!$cookie) {
- my %cookies = CGI::Cookie->fetch();
- $cookie = $cookies{$PROGRAM};
- }
- # Always refresh cookie expiration time.
- $cookie->expires('+1M') if ($cookie && !$cookie->expires());
-
- # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
- # If we're under mod_perl, there is a way around it...
- eval {
- local $SIG{__DIE__} = undef;
- my $auth = Apache2::RequestUtil->request()->headers_in()->{Authorization};
- $ENV{HTTP_AUTHORIZATION} = $auth if $auth;
- } if (MP2() && !$ENV{HTTP_AUTHORIZATION});
-
- $uri =~ s/^\s+//g;
- if ($uri !~ m/:/) {
- if ($uri =~ m|^//|) {
- $uri = 'http:'.$uri;
- } else {
- local $ENV{URL_GUESS_PATTERN} = '';
- my $guess = URI::Heuristic::uf_uri($uri);
- if ($guess->scheme() && $ua->is_protocol_supported($guess)) {
- $uri = $guess->as_string();
- } else {
- $uri = 'http://'.$uri;
- }
- }
- }
-
- &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie);
- undef $query; # Not needed any more.
- &html_footer();
}
###############################################################################
@@ -644,144 +681,169 @@
sub parse_arguments ()
{
- require Getopt::Long;
- Getopt::Long->require_version(2.17);
- Getopt::Long->import('GetOptions');
- Getopt::Long::Configure('bundling', 'no_ignore_case');
- my $masq = '';
- my @locs = ();
+ require Getopt::Long;
+ Getopt::Long->require_version(2.17);
+ Getopt::Long->import('GetOptions');
+ Getopt::Long::Configure('bundling', 'no_ignore_case');
+ my $masq = '';
+ my @locs = ();
- GetOptions('help|h|?' => sub { usage(0) },
- 'q|quiet' => sub { $Opts{Quiet} = 1;
- $Opts{Summary_Only} = 1;
- },
- 's|summary' => \$Opts{Summary_Only},
- 'b|broken' => sub { $Opts{Redirects} = 0;
- $Opts{Dir_Redirects} = 0;
- },
- 'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; },
- 'v|verbose' => \$Opts{Verbose},
- 'i|indicator' => \$Opts{Progress},
- 'H|html' => \$Opts{HTML},
- 'r|recursive' => sub { $Opts{Depth} = -1
- if $Opts{Depth} == 0; },
- 'l|location=s' => \@locs,
- 'X|exclude=s' => \$Opts{Exclude},
- 'exclude-docs=s@' => \@{$Opts{Exclude_Docs}},
- 'suppress-redirect=s@' => \@{$Opts{Suppress_Redirect}},
- 'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}},
- 'suppress-temp-redirects' => \$Opts{Suppress_Temp_Redirects},
- 'suppress-broken=s@' => \@{$Opts{Suppress_Broken}},
- 'suppress-fragment=s@' => \@{$Opts{Suppress_Fragment}},
- 'u|user=s' => \$Opts{User},
- 'p|password=s' => \$Opts{Password},
- 't|timeout=i' => \$Opts{Timeout},
- 'S|sleep=i' => \$Opts{Sleep_Time},
- 'L|languages=s' => \$Opts{Accept_Language},
- 'c|cookies=s' => \$Opts{Cookies},
- 'R|no-referer' => \$Opts{No_Referer},
- 'D|depth=i' => sub { $Opts{Depth} = $_[1]
- unless $_[1] == 0; },
- 'd|domain=s' => \$Opts{Trusted},
- 'masquerade=s' => \$masq,
- 'hide-same-realm' => \$Opts{Hide_Same_Realm},
- 'V|version' => \&version,
- )
- || usage(1);
+ GetOptions(
+ 'help|h|?' => sub { usage(0) },
+ 'q|quiet' => sub {
+ $Opts{Quiet} = 1;
+ $Opts{Summary_Only} = 1;
+ },
+ 's|summary' => \$Opts{Summary_Only},
+ 'b|broken' => sub {
+ $Opts{Redirects} = 0;
+ $Opts{Dir_Redirects} = 0;
+ },
+ 'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; },
+ 'v|verbose' => \$Opts{Verbose},
+ 'i|indicator' => \$Opts{Progress},
+ 'H|html' => \$Opts{HTML},
+ 'r|recursive' => sub {
+ $Opts{Depth} = -1
+ if $Opts{Depth} == 0;
+ },
+ 'l|location=s' => \@locs,
+ 'X|exclude=s' => \$Opts{Exclude},
+ 'exclude-docs=s@' => \@{$Opts{Exclude_Docs}},
+ 'suppress-redirect=s@' => \@{$Opts{Suppress_Redirect}},
+ 'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}},
+ 'suppress-temp-redirects' => \$Opts{Suppress_Temp_Redirects},
+ 'suppress-broken=s@' => \@{$Opts{Suppress_Broken}},
+ 'suppress-fragment=s@' => \@{$Opts{Suppress_Fragment}},
+ 'u|user=s' => \$Opts{User},
+ 'p|password=s' => \$Opts{Password},
+ 't|timeout=i' => \$Opts{Timeout},
+ 'S|sleep=i' => \$Opts{Sleep_Time},
+ 'L|languages=s' => \$Opts{Accept_Language},
+ 'c|cookies=s' => \$Opts{Cookies},
+ 'R|no-referer' => \$Opts{No_Referer},
+ 'D|depth=i' => sub {
+ $Opts{Depth} = $_[1]
+ unless $_[1] == 0;
+ },
+ 'd|domain=s' => \$Opts{Trusted},
+ 'masquerade=s' => \$masq,
+ 'hide-same-realm' => \$Opts{Hide_Same_Realm},
+ 'V|version' => \&version,
+ ) ||
+ usage(1);
- if ($masq) {
- $Opts{Masquerade} = 1;
- my @masq = split(/\s+/, $masq);
- if (scalar(@masq) != 2 ||
- !defined($masq[0]) || $masq[0] !~ /\S/ ||
- !defined($masq[1]) || $masq[1] !~ /\S/) {
- usage(1, "Error: --masquerade takes two whitespace separated URIs.");
- } else {
- require URI::file;
- $Opts{Masquerade_From} = $masq[0];
- my $u = URI->new($masq[1]);
- $Opts{Masquerade_To} = $u->scheme() ? $u : URI::file->new_abs($masq[1]);
+ if ($masq) {
+ $Opts{Masquerade} = 1;
+ my @masq = split(/\s+/, $masq);
+ if (scalar(@masq) != 2 ||
+ !defined($masq[0]) ||
+ $masq[0] !~ /\S/ ||
+ !defined($masq[1]) ||
+ $masq[1] !~ /\S/)
+ {
+ usage(1,
+ "Error: --masquerade takes two whitespace separated URIs.");
+ }
+ else {
+ require URI::file;
+ $Opts{Masquerade_From} = $masq[0];
+ my $u = URI->new($masq[1]);
+ $Opts{Masquerade_To} =
+ $u->scheme() ? $u : URI::file->new_abs($masq[1]);
+ }
}
- }
- if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') {
- $Opts{Accept_Language} = &guess_language();
- }
+ if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') {
+ $Opts{Accept_Language} = &guess_language();
+ }
- if (($Opts{Sleep_Time} || 0) < 1) {
- warn("*** Warning: minimum allowed sleep time is 1 second, resetting.\n");
- $Opts{Sleep_Time} = 1;
- }
+ if (($Opts{Sleep_Time} || 0) < 1) {
+ warn(
+ "*** Warning: minimum allowed sleep time is 1 second, resetting.\n"
+ );
+ $Opts{Sleep_Time} = 1;
+ }
- push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs);
+ push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs);
- $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs);
+ $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs);
- # Precompile/error-check regular expressions.
- if (defined($Opts{Exclude})) {
- eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; };
- &usage(1, "Error in exclude regexp: $@") if $@;
- }
- for my $i (0 .. $#{$Opts{Exclude_Docs}}) {
- eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; };
- &usage(1, "Error in exclude-docs regexp: $@") if $@;
- }
- if (defined($Opts{Trusted})) {
- eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; };
- &usage(1, "Error in trusted domains regexp: $@") if $@;
- }
+ # Precompile/error-check regular expressions.
+ if (defined($Opts{Exclude})) {
+ eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; };
+ &usage(1, "Error in exclude regexp: $@") if $@;
+ }
+ for my $i (0 .. $#{$Opts{Exclude_Docs}}) {
+ eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; };
+ &usage(1, "Error in exclude-docs regexp: $@") if $@;
+ }
+ if (defined($Opts{Trusted})) {
+ eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; };
+ &usage(1, "Error in trusted domains regexp: $@") if $@;
+ }
- # Sanity-check error-suppression arguments
- for my $i (0..$#{$Opts{Suppress_Redirect}}) {
- ${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/;
- my $sr_arg = ${$Opts{Suppress_Redirect}}[$i];
- if ($sr_arg !~ /.->./) {
- &usage(1, "Bad suppress-redirect argument, should contain \"->\": $sr_arg");
+ # Sanity-check error-suppression arguments
+ for my $i (0 .. $#{$Opts{Suppress_Redirect}}) {
+ ${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/;
+ my $sr_arg = ${$Opts{Suppress_Redirect}}[$i];
+ if ($sr_arg !~ /.->./) {
+ &usage(1,
+ "Bad suppress-redirect argument, should contain \"->\": $sr_arg"
+ );
+ }
}
- }
- for my $i (0..$#{$Opts{Suppress_Redirect_Prefix}}) {
- my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i];
- $srp_arg =~ s/ /->/;
- if ($srp_arg !~ /^(.*)->(.*)$/) {
- &usage(1, "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg");
+ for my $i (0 .. $#{$Opts{Suppress_Redirect_Prefix}}) {
+ my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i];
+ $srp_arg =~ s/ /->/;
+ if ($srp_arg !~ /^(.*)->(.*)$/) {
+ &usage(1,
+ "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg"
+ );
+ }
+
+ # Turn prefixes into a regexp.
+ ${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism;
}
- # Turn prefixes into a regexp.
- ${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism;
- }
- for my $i (0..$#{$Opts{Suppress_Broken}}) {
- ${$Opts{Suppress_Broken}}[$i] =~ s/ /:/;
- my $sb_arg = ${$Opts{Suppress_Broken}}[$i];
- if ($sb_arg !~ /^(-1|[0-9]+):./) {
- &usage(1, "Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg");
+ for my $i (0 .. $#{$Opts{Suppress_Broken}}) {
+ ${$Opts{Suppress_Broken}}[$i] =~ s/ /:/;
+ my $sb_arg = ${$Opts{Suppress_Broken}}[$i];
+ if ($sb_arg !~ /^(-1|[0-9]+):./) {
+ &usage(1,
+ "Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg"
+ );
+ }
}
- }
- for my $sf_arg (@{$Opts{Suppress_Fragment}}) {
- if ($sf_arg !~ /.#./) {
- &usage(1, "Bad suppress-fragment argument, should contain \"#\": $sf_arg");
+ for my $sf_arg (@{$Opts{Suppress_Fragment}}) {
+ if ($sf_arg !~ /.#./) {
+ &usage(1,
+ "Bad suppress-fragment argument, should contain \"#\": $sf_arg"
+ );
+ }
}
- }
- return;
+ return;
}
sub version ()
{
- print "$PACKAGE $REVISION\n";
- exit 0;
+ print "$PACKAGE $REVISION\n";
+ exit 0;
}
sub usage ()
{
- my ($exitval, $msg) = @_;
- $exitval = 0 unless defined($exitval);
- $msg ||= ''; $msg =~ s/[\r\n]*$/\n\n/ if $msg;
+ my ($exitval, $msg) = @_;
+ $exitval = 0 unless defined($exitval);
+ $msg ||= '';
+ $msg =~ s/[\r\n]*$/\n\n/ if $msg;
- die($msg) unless $Opts{Command_Line};
+ die($msg) unless $Opts{Command_Line};
- my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only';
+ my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only';
- select(STDERR) if $exitval;
- print "$msg$PACKAGE $REVISION
+ select(STDERR) if $exitval;
+ print "$msg$PACKAGE $REVISION
Usage: checklink <options> <uris>
Options:
@@ -854,28 +916,28 @@
www-validator\@w3.org (with 'checklink' in the subject)
Archives are at: http://lists.w3.org/Archives/Public/www-validator/
";
- exit $exitval;
+ exit $exitval;
}
sub ask_password ()
{
- eval {
- local $SIG{__DIE__} = undef;
- require Term::ReadKey;
- Term::ReadKey->require_version(2.00);
- Term::ReadKey->import(qw(ReadMode));
- };
- if ($@) {
- warn('Warning: Term::ReadKey 2.00 or newer not available, ' .
- "password input disabled.\n");
+ eval {
+ local $SIG{__DIE__} = undef;
+ require Term::ReadKey;
+ Term::ReadKey->require_version(2.00);
+ Term::ReadKey->import(qw(ReadMode));
+ };
+ if ($@) {
+ warn('Warning: Term::ReadKey 2.00 or newer not available, ' .
+ "password input disabled.\n");
+ return;
+ }
+ printf(STDERR 'Enter the password for user %s: ', $Opts{User});
+ ReadMode('noecho', *STDIN);
+ chomp($Opts{Password} = <STDIN>);
+ ReadMode('restore', *STDIN);
+ print(STDERR "ok.\n");
return;
- }
- printf(STDERR 'Enter the password for user %s: ', $Opts{User});
- ReadMode('noecho', *STDIN);
- chomp($Opts{Password} = <STDIN>);
- ReadMode('restore', *STDIN);
- print(STDERR "ok.\n");
- return;
}
###############################################################################
@@ -886,29 +948,29 @@
sub guess_language ()
{
- my $lang = $ENV{LANG} or return;
+ my $lang = $ENV{LANG} or return;
- $lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro...
+ $lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro...
- return 'en' if ($lang eq 'C' || $lang eq 'POSIX');
+ return 'en' if ($lang eq 'C' || $lang eq 'POSIX');
- my $res = undef;
- eval {
- require Locale::Language;
- if (my $tmp = Locale::Language::language2code($lang)) {
- $lang = $tmp;
- }
- if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) {
- if (Locale::Language::code2language($l)) {
- $res = $l;
- if ($c) {
- require Locale::Country;
- $res .= "-$c" if Locale::Country::code2country($c);
+ my $res = undef;
+ eval {
+ require Locale::Language;
+ if (my $tmp = Locale::Language::language2code($lang)) {
+ $lang = $tmp;
}
- }
- }
- };
- return $res;
+ if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) {
+ if (Locale::Language::code2language($l)) {
+ $res = $l;
+ if ($c) {
+ require Locale::Country;
+ $res .= "-$c" if Locale::Country::code2country($c);
+ }
+ }
+ }
+ };
+ return $res;
}
############################
@@ -917,34 +979,40 @@
sub urize ($)
{
- my $arg = shift;
- my $uarg = URI::Escape::uri_unescape($arg);
- my $uri;
- if (-d $uarg) {
- # look for an "index" file in dir, return it if found
- require File::Spec;
- for my $index (map { File::Spec->catfile($uarg, $_) }
- qw(index.html index.xhtml index.htm index.xhtm)) {
- if (-e $index) {
- $uri = URI::file->new_abs($index);
- last;
- }
+ my $arg = shift;
+ my $uarg = URI::Escape::uri_unescape($arg);
+ my $uri;
+ if (-d $uarg) {
+
+ # look for an "index" file in dir, return it if found
+ require File::Spec;
+ for my $index (map { File::Spec->catfile($uarg, $_) }
+ qw(index.html index.xhtml index.htm index.xhtm))
+ {
+ if (-e $index) {
+ $uri = URI::file->new_abs($index);
+ last;
+ }
+ }
+
+ # return dir itself if an index file was not found
+ $uri ||= URI::file->new_abs($uarg);
}
- # return dir itself if an index file was not found
- $uri ||= URI::file->new_abs($uarg);
- } elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) {
- $uri = URI::file->new_abs($uarg);
- } else {
- my $newuri = URI->new($arg);
- if ($newuri->scheme()) {
- $uri = $newuri;
- } else {
- local $ENV{URL_GUESS_PATTERN} = '';
- $uri = URI::Heuristic::uf_uri($arg);
- $uri = URI::file->new_abs($uri) unless $uri->scheme();
+ elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) {
+ $uri = URI::file->new_abs($uarg);
}
- }
- return $uri->as_string();
+ else {
+ my $newuri = URI->new($arg);
+ if ($newuri->scheme()) {
+ $uri = $newuri;
+ }
+ else {
+ local $ENV{URL_GUESS_PATTERN} = '';
+ $uri = URI::Heuristic::uf_uri($arg);
+ $uri = URI::file->new_abs($uri) unless $uri->scheme();
+ }
+ }
+ return $uri->as_string();
}
########################################
@@ -953,83 +1021,90 @@
sub check_uri (\%$$$$;$$)
{
- my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_;
- $is_start ||= ($check_num == 1);
+ my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_;
+ $is_start ||= ($check_num == 1);
- my $start = $Opts{Summary_Only} ? 0 : &get_timestamp();
+ my $start = $Opts{Summary_Only} ? 0 : &get_timestamp();
- # Get and parse the document
- my $response = &get_document('GET', $uri, $doc_count, \%redirects, $referer,
- $cookie, $params, $check_num, $is_start);
+ # Get and parse the document
+ my $response = &get_document(
+ 'GET', $uri, $doc_count, \%redirects, $referer,
+ $cookie, $params, $check_num, $is_start
+ );
- # Can we check the resource? If not, we exit here...
- return if defined($response->{Stop});
+ # Can we check the resource? If not, we exit here...
+ return if defined($response->{Stop});
- if ($Opts{HTML}) {
- &html_header($uri, 0, $cookie) if ($check_num == 1);
- &print_form($params, $cookie, $check_num) if $is_start;
- }
+ if ($Opts{HTML}) {
+ &html_header($uri, 0, $cookie) if ($check_num == 1);
+ &print_form($params, $cookie, $check_num) if $is_start;
+ }
- if ($is_start) { # Starting point of a new check, eg. from the command line
- # Use the first URI as the recursion base unless specified otherwise.
- push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical())
- unless @{$Opts{Base_Locations}};
- } else {
- # Before fetching the document, we don't know if we'll be within the
- # recursion scope or not (think redirects).
- if (!&in_recursion_scope($response->{absolute_uri})) {
- hprintf("Not in recursion scope: %s\n")
- if ($Opts{Verbose});
- $response->content("");
- return;
+ if ($is_start) { # Starting point of a new check, eg. from the command line
+ # Use the first URI as the recursion base unless specified otherwise.
+ push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical())
+ unless @{$Opts{Base_Locations}};
}
- }
+ else {
- # Define the document header, and perhaps print it.
- # (It might still be defined if the previous document had no errors;
- # just redefine it in that case.)
+ # Before fetching the document, we don't know if we'll be within the
+ # recursion scope or not (think redirects).
+ if (!&in_recursion_scope($response->{absolute_uri})) {
+ hprintf("Not in recursion scope: %s\n")
+ if ($Opts{Verbose});
+ $response->content("");
+ return;
+ }
+ }
- if ($check_num != 1) {
+ # Define the document header, and perhaps print it.
+ # (It might still be defined if the previous document had no errors;
+ # just redefine it in that case.)
+
+ if ($check_num != 1) {
+ if ($Opts{HTML}) {
+ $doc_header = "\n<hr />\n";
+ }
+ else {
+ $doc_header = "\n" . ('-' x 40) . "\n";
+ }
+ }
+
+ my $absolute_uri = $response->{absolute_uri}->as_string();
+
if ($Opts{HTML}) {
- $doc_header = "\n<hr />\n";
- } else {
- $doc_header = "\n" . ('-' x 40) . "\n";
+ $doc_header .=
+ ("<h2>\nProcessing\t" . &show_url($absolute_uri) . "\n</h2>\n\n");
}
- }
+ else {
+ $doc_header .= "\nProcessing\t$absolute_uri\n\n";
+ }
- my $absolute_uri = $response->{absolute_uri}->as_string();
+ if (!$Opts{Quiet}) {
+ print_doc_header();
+ }
- if ($Opts{HTML}) {
- $doc_header .= ("<h2>\nProcessing\t"
- . &show_url($absolute_uri)
- . "\n</h2>\n\n");
- } else {
- $doc_header .= "\nProcessing\t$absolute_uri\n\n";
- }
+ # We are checking a new document
+ $doc_count++;
- if (! $Opts{Quiet}) {
- print_doc_header();
- }
+ my $result_anchor = 'results' . $doc_count;
- # We are checking a new document
- $doc_count++;
-
- my $result_anchor = 'results'.$doc_count;
-
- if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) {
- my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
- my $acclang = $Opts{Accept_Language} || '(not sent)';
- my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
- my $cookies = 'not used';
- if (defined($Opts{Cookies})) {
- $cookies = 'used, ';
- if ($Opts{Cookies} eq 'tmp') {
- $cookies .= 'non-persistent';
- } else {
- $cookies .= "file $Opts{Cookies}";
- }
- }
- printf(<<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s);
+ if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) {
+ my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
+ my $acclang = $Opts{Accept_Language} || '(not sent)';
+ my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
+ my $cookies = 'not used';
+ if (defined($Opts{Cookies})) {
+ $cookies = 'used, ';
+ if ($Opts{Cookies} eq 'tmp') {
+ $cookies .= 'non-persistent';
+ }
+ else {
+ $cookies .= "file $Opts{Cookies}";
+ }
+ }
+ printf(
+ <<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s);
Settings used:
- Accept: %s
@@ -1038,19 +1113,21 @@
- Cookies: %s
- Sleeping %d second%s between requests to each server
EOF
- printf("- Excluding links matching %s\n", $Opts{Exclude})
- if defined($Opts{Exclude});
- printf("- Excluding links in documents whose URIs match %s\n",
- join(', ', @{$Opts{Exclude_Docs}})) if @{$Opts{Exclude_Docs}};
- }
+ printf("- Excluding links matching %s\n", $Opts{Exclude})
+ if defined($Opts{Exclude});
+ printf("- Excluding links in documents whose URIs match %s\n",
+ join(', ', @{$Opts{Exclude_Docs}}))
+ if @{$Opts{Exclude_Docs}};
+ }
- if ($Opts{HTML}) {
- if (! $Opts{Summary_Only}) {
- my $accept = &encode($Accept);
- my $acclang = &encode($Opts{Accept_Language} || '(not sent)');
- my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
- my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
- printf(<<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);
+ if ($Opts{HTML}) {
+ if (!$Opts{Summary_Only}) {
+ my $accept = &encode($Accept);
+ my $acclang = &encode($Opts{Accept_Language} || '(not sent)');
+ my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending';
+ my $s = $Opts{Sleep_Time} == 1 ? '' : 's';
+ printf(
+ <<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s);
<div class="settings">
Settings used:
<ul>
@@ -1061,212 +1138,245 @@
</ul>
</div>
EOF
- printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
- $result_anchor);
- my $esc_uri = URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9.");
- printf("<p>For reliable link checking results, check
+ printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
+ $result_anchor);
+ my $esc_uri =
+ URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9.");
+ printf(
+ "<p>For reliable link checking results, check
<a href=\"%s\">HTML validity</a> first. See also
<a href=\"%s\">CSS validity</a>.</p>
<p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n",
- &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)),
- &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
- &encode($Opts{_Self_URI}));
+ &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)),
+ &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)),
+ &encode($Opts{_Self_URI})
+ );
- printf(<<'EOF', $result_anchor);
+ printf(<<'EOF', $result_anchor);
<div class="progress" id="progress%s">
<h3>Status: <span></span></h3>
<div class="progressbar"><div></div></div>
<pre>
EOF
- }
- }
-
- if ($Opts{Summary_Only} && !$Opts{Quiet}) {
- print '<p>' if $Opts{HTML};
- print 'This may take some time';
- print "... (<a href=\"$Cfg{Doc_URI}#wait\">why?</a>)</p>" if $Opts{HTML};
- print " if the document has many links to check.\n" unless $Opts{HTML};
- }
- # Record that we have processed this resource
- $processed{$absolute_uri} = 1;
- # Parse the document
- my $p = &parse_document($uri, $absolute_uri, $response, 1, ($depth != 0));
- my $base = URI->new($p->{base});
-
- # Check anchors
- ###############
-
- print "Checking anchors...\n" unless $Opts{Summary_Only};
-
- my %errors;
- while (my ($anchor, $lines) = each(%{$p->{Anchors}})) {
- if (!length($anchor)) {
- # Empty IDREF's are not allowed
- $errors{$anchor} = 1;
- } else {
- my $times = 0;
- $times += $_ for values(%$lines);
- # They should appear only once
- $errors{$anchor} = 1 if ($times > 1);
- }
- }
- print " done.\n" unless $Opts{Summary_Only};
-
- # Check links
- #############
-
- &hprintf("Recording all the links found: %d\n", scalar (keys %{$p->{Links}}))
- if ($Opts{Verbose});
- my %links;
- # Record all the links found
- while (my ($link, $lines) = each(%{$p->{Links}})) {
- my $link_uri = URI->new($link);
- my $abs_link_uri = URI->new_abs($link_uri, $base);
-
- if ($Opts{Masquerade}) {
- if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) {
- print_doc_header();
- printf("processing %s in base %s\n",
- $abs_link_uri, $Opts{Masquerade_To});
- my $nlink = $abs_link_uri;
- $nlink =~
- s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|;
- $abs_link_uri = URI->new($nlink);
- }
+ }
}
- my $canon_uri = URI->new($abs_link_uri->canonical());
- my $fragment = $canon_uri->fragment(undef);
- if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) {
- foreach my $line_num (keys(%$lines)) {
- if (!defined($fragment) || !length($fragment)) {
- # Document without fragment
- $links{$canon_uri}{location}{$line_num} = 1;
- } else {
- # Resource with a fragment
- $links{$canon_uri}{fragments}{$fragment}{$line_num} = 1;
+ if ($Opts{Summary_Only} && !$Opts{Quiet}) {
+ print '<p>' if $Opts{HTML};
+ print 'This may take some time';
+ print "... (<a href=\"$Cfg{Doc_URI}#wait\">why?</a>)</p>"
+ if $Opts{HTML};
+ print " if the document has many links to check.\n" unless $Opts{HTML};
+ }
+
+ # Record that we have processed this resource
+ $processed{$absolute_uri} = 1;
+
+ # Parse the document
+ my $p = &parse_document($uri, $absolute_uri, $response, 1, ($depth != 0));
+ my $base = URI->new($p->{base});
+
+ # Check anchors
+ ###############
+
+ print "Checking anchors...\n" unless $Opts{Summary_Only};
+
+ my %errors;
+ while (my ($anchor, $lines) = each(%{$p->{Anchors}})) {
+ if (!length($anchor)) {
+
+ # Empty IDREF's are not allowed
+ $errors{$anchor} = 1;
}
- }
+ else {
+ my $times = 0;
+ $times += $_ for values(%$lines);
+
+ # They should appear only once
+ $errors{$anchor} = 1 if ($times > 1);
+ }
}
- }
+ print " done.\n" unless $Opts{Summary_Only};
- # Build the list of broken URI's
+ # Check links
+ #############
- my $nlinks = scalar(keys(%links));
+ &hprintf("Recording all the links found: %d\n",
+ scalar(keys %{$p->{Links}}))
+ if ($Opts{Verbose});
+ my %links;
- &hprintf("Checking %d links to build list of broken URI's\n", $nlinks)
- if ($Opts{Verbose});
+ # Record all the links found
+ while (my ($link, $lines) = each(%{$p->{Links}})) {
+ my $link_uri = URI->new($link);
+ my $abs_link_uri = URI->new_abs($link_uri, $base);
- my %broken;
- my $link_num = 0;
- while (my ($u, $ulinks) = each(%links)) {
+ if ($Opts{Masquerade}) {
+ if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) {
+ print_doc_header();
+ printf("processing %s in base %s\n",
+ $abs_link_uri, $Opts{Masquerade_To});
+ my $nlink = $abs_link_uri;
+ $nlink =~ s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|;
+ $abs_link_uri = URI->new($nlink);
+ }
+ }
- if ($Opts{Summary_Only}) {
- # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896
- print ' ' if ($Opts{HTML} && !$Opts{Command_Line});
- } else {
- &hprintf("\nChecking link %s\n", $u);
- my $progress = ($link_num/$nlinks) * 100;
- printf('<script type="text/javascript">show_progress("%s", "Checking link %s", "%.1f%%");</script>', $result_anchor, &encode($u), $progress)
- if (!$Opts{Command_Line} && $Opts{HTML} && !$Opts{Summary_Only});
+ my $canon_uri = URI->new($abs_link_uri->canonical());
+ my $fragment = $canon_uri->fragment(undef);
+ if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) {
+ foreach my $line_num (keys(%$lines)) {
+ if (!defined($fragment) || !length($fragment)) {
+
+ # Document without fragment
+ $links{$canon_uri}{location}{$line_num} = 1;
+ }
+ else {
+
+ # Resource with a fragment
+ $links{$canon_uri}{fragments}{$fragment}{$line_num} = 1;
+ }
+ }
+ }
}
- $link_num++;
- # Check that a link is valid
- &check_validity($uri, $u,
- ($depth != 0 && &in_recursion_scope($u)),
- \%links, \%redirects);
- &hprintf("\tReturn code: %s\n", $results{$u}{location}{code})
- if ($Opts{Verbose});
- if ($results{$u}{location}{success}) {
+ # Build the list of broken URI's
- # Even though it was not broken, we might want to display it
- # on the results page (e.g. because it required authentication)
- $broken{$u}{location} = 1 if ($results{$u}{location}{display} >= 400);
+ my $nlinks = scalar(keys(%links));
- # List the broken fragments
- while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) {
- if ($Opts{Verbose}) {
- my @line_nums = sort { $a <=> $b } keys(%$lines);
- &hprintf("\t\t%s %s - Line%s: %s\n",
- $fragment,
- ($results{$u}{fragments}{$fragment}) ? 'OK' : 'Not found',
- (scalar(@line_nums) > 1) ? 's' : '',
- join(', ', @line_nums)
- );
+ &hprintf("Checking %d links to build list of broken URI's\n", $nlinks)
+ if ($Opts{Verbose});
+
+ my %broken;
+ my $link_num = 0;
+ while (my ($u, $ulinks) = each(%links)) {
+
+ if ($Opts{Summary_Only}) {
+
+ # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896
+ print ' ' if ($Opts{HTML} && !$Opts{Command_Line});
}
- # A broken fragment?
- if ($results{$u}{fragments}{$fragment} == 0) {
- $broken{$u}{fragments}{$fragment} += 2;
+ else {
+ &hprintf("\nChecking link %s\n", $u);
+ my $progress = ($link_num / $nlinks) * 100;
+ printf(
+ '<script type="text/javascript">show_progress("%s", "Checking link %s", "%.1f%%");</script>',
+ $result_anchor, &encode($u), $progress)
+ if (!$Opts{Command_Line} &&
+ $Opts{HTML} &&
+ !$Opts{Summary_Only});
}
- }
- } elsif (!($Opts{Quiet} && &informational($results{$u}{location}{code}))) {
- # Couldn't find the document
- $broken{$u}{location} = 1;
- # All the fragments associated are hence broken
- foreach my $fragment (keys %{$ulinks->{fragments}}) {
- $broken{$u}{fragments}{$fragment}++;
- }
+ $link_num++;
+
+ # Check that a link is valid
+ &check_validity($uri, $u, ($depth != 0 && &in_recursion_scope($u)),
+ \%links, \%redirects);
+ &hprintf("\tReturn code: %s\n", $results{$u}{location}{code})
+ if ($Opts{Verbose});
+ if ($results{$u}{location}{success}) {
+
+ # Even though it was not broken, we might want to display it
+ # on the results page (e.g. because it required authentication)
+ $broken{$u}{location} = 1
+ if ($results{$u}{location}{display} >= 400);
+
+ # List the broken fragments
+ while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) {
+ if ($Opts{Verbose}) {
+ my @line_nums = sort { $a <=> $b } keys(%$lines);
+ &hprintf(
+ "\t\t%s %s - Line%s: %s\n",
+ $fragment,
+ ($results{$u}{fragments}{$fragment}) ? 'OK' :
+ 'Not found',
+ (scalar(@line_nums) > 1) ? 's' : '',
+ join(', ', @line_nums)
+ );
+ }
+
+ # A broken fragment?
+ if ($results{$u}{fragments}{$fragment} == 0) {
+ $broken{$u}{fragments}{$fragment} += 2;
+ }
+ }
+ }
+ elsif (!($Opts{Quiet} && &informational($results{$u}{location}{code})))
+ {
+
+ # Couldn't find the document
+ $broken{$u}{location} = 1;
+
+ # All the fragments associated are hence broken
+ foreach my $fragment (keys %{$ulinks->{fragments}}) {
+ $broken{$u}{fragments}{$fragment}++;
+ }
+ }
}
- }
- &hprintf("\nProcessed in %s seconds.\n", &time_diff($start, &get_timestamp()))
- unless $Opts{Summary_Only};
- printf('<script type="text/javascript">show_progress("%s", "Done. Document processed in %s seconds.", "100%%");</script>', $result_anchor, &time_diff($start, &get_timestamp()))
- if ($Opts{HTML} && !$Opts{Summary_Only});
+ &hprintf(
+ "\nProcessed in %s seconds.\n",
+ &time_diff($start, &get_timestamp())
+ ) unless $Opts{Summary_Only};
+ printf(
+ '<script type="text/javascript">show_progress("%s", "Done. Document processed in %s seconds.", "100%%");</script>',
+ $result_anchor, &time_diff($start, &get_timestamp()))
+ if ($Opts{HTML} && !$Opts{Summary_Only});
- # Display results
- if ($Opts{HTML} && !$Opts{Summary_Only}) {
- print("</pre>\n</div>\n");
- printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor);
- }
- print "\n" unless $Opts{Quiet};
+ # Display results
+ if ($Opts{HTML} && !$Opts{Summary_Only}) {
+ print("</pre>\n</div>\n");
+ printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor);
+ }
+ print "\n" unless $Opts{Quiet};
- &links_summary(\%links, \%results, \%broken, \%redirects);
- &anchors_summary($p->{Anchors}, \%errors);
+ &links_summary(\%links, \%results, \%broken, \%redirects);
+ &anchors_summary($p->{Anchors}, \%errors);
- # Do we want to process other documents?
- if ($depth != 0) {
+ # Do we want to process other documents?
+ if ($depth != 0) {
- foreach my $u (keys %links) {
+ foreach my $u (keys %links) {
- next unless $results{$u}{location}{success}; # Broken link?
+ next unless $results{$u}{location}{success}; # Broken link?
- next unless &in_recursion_scope($u);
+ next unless &in_recursion_scope($u);
- # Do we understand its content type?
- next unless ($results{$u}{location}{type} =~ $ContentTypes);
+ # Do we understand its content type?
+ next unless ($results{$u}{location}{type} =~ $ContentTypes);
- # Have we already processed this URI?
- next if &already_processed($u, $uri);
+ # Have we already processed this URI?
+ next if &already_processed($u, $uri);
- # Do the job
- print "\n" unless $Opts{Quiet};
- if ($Opts{HTML}) {
- if (!$Opts{Command_Line}) {
- if ($doc_count == $Opts{Max_Documents}) {
- print("<hr />\n<p><strong>Maximum number of documents ($Opts{Max_Documents}) reached!</strong></p>\n");
- }
- if ($doc_count >= $Opts{Max_Documents}) {
- $doc_count++;
- print("<p>Not checking <strong>$u</strong></p>\n");
- $processed{$u} = 1;
- next;
- }
+ # Do the job
+ print "\n" unless $Opts{Quiet};
+ if ($Opts{HTML}) {
+ if (!$Opts{Command_Line}) {
+ if ($doc_count == $Opts{Max_Documents}) {
+ print(
+ "<hr />\n<p><strong>Maximum number of documents ($Opts{Max_Documents}) reached!</strong></p>\n"
+ );
+ }
+ if ($doc_count >= $Opts{Max_Documents}) {
+ $doc_count++;
+ print("<p>Not checking <strong>$u</strong></p>\n");
+ $processed{$u} = 1;
+ next;
+ }
+ }
+ }
+
+ # This is an inherently recursive algorithm, so Perl's warning is not
+ # helpful. You may wish to comment this out when debugging, though.
+ no warnings 'recursion';
+
+ if ($depth < 0) {
+ &check_uri($params, $u, 0, -1, $cookie, $uri);
+ }
+ else {
+ &check_uri($params, $u, 0, $depth - 1, $cookie, $uri);
+ }
}
- }
-
- # This is an inherently recursive algorithm, so Perl's warning is not
- # helpful. You may wish to comment this out when debugging, though.
- no warnings 'recursion';
-
- if ($depth < 0) {
- &check_uri($params, $u, 0, -1, $cookie, $uri);
- } else {
- &check_uri($params, $u, 0, $depth-1, $cookie, $uri);
- }
}
- }
- return;
+ return;
}
##########################################
@@ -1275,29 +1385,34 @@
sub decode_content ($)
{
- my $response = shift;
- my $error = undef;
+ my $response = shift;
+ my $error = undef;
- my $docref = $response->decoded_content(ref => 1);
- if (defined($docref)) {
- utf8::encode($$docref);
- $response->content_ref($docref);
- # Remove Content-Encoding so it won't be decoded again later.
- $response->remove_header('Content-Encoding')
- } else {
- my $ce = $response->header('Content-Encoding');
- $ce = defined($ce) ? "'$ce'" : 'undefined';
- my $ct = $response->header('Content-Type');
- $ct = defined($ct) ? "'$ct'" : 'undefined';
- my $request_uri = $response->request->url;
- # content_charset() is available in LWP >= 5.827
- my $cs = $response->can('content_charset') ?
- $response->content_charset() : undef;
- $cs = defined($cs) ? "'$cs'" : 'unknown';
- $error = "Error decoding document at <$request_uri>, Content-Type $ct, " .
- "Content-Encoding $ce, content charset $cs: '$@'";
- }
- return $error;
+ my $docref = $response->decoded_content(ref => 1);
+ if (defined($docref)) {
+ utf8::encode($$docref);
+ $response->content_ref($docref);
+
+ # Remove Content-Encoding so it won't be decoded again later.
+ $response->remove_header('Content-Encoding');
+ }
+ else {
+ my $ce = $response->header('Content-Encoding');
+ $ce = defined($ce) ? "'$ce'" : 'undefined';
+ my $ct = $response->header('Content-Type');
+ $ct = defined($ct) ? "'$ct'" : 'undefined';
+ my $request_uri = $response->request->url;
+
+ # content_charset() is available in LWP >= 5.827
+ my $cs =
+ $response->can('content_charset') ? $response->content_charset() :
+ undef;
+ $cs = defined($cs) ? "'$cs'" : 'unknown';
+ $error =
+ "Error decoding document at <$request_uri>, Content-Type $ct, " .
+ "Content-Encoding $ce, content charset $cs: '$@'";
+ }
+ return $error;
}
#######################################
@@ -1306,80 +1421,89 @@
sub get_document ($$$;\%$$$$$)
{
- my ($method, $uri, $in_recursion, $redirects, $referer, $cookie, $params,
- $check_num, $is_start) = @_;
- # $method contains the HTTP method the use (GET or HEAD)
- # $uri contains the identifier of the resource
- # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least
- # the second resource checked)
- # $redirects is a pointer to the hash containing the map of the redirects
- # $referer is the URI of the referring document
- # $cookie, $params, $check_num, and $is_start are for printing HTTP headers
- # and the form if $in_recursion == 0 and not authenticating
+ my ($method, $uri, $in_recursion, $redirects, $referer,
+ $cookie, $params, $check_num, $is_start
+ ) = @_;
- # Get the resource
- my $response;
- if (defined($results{$uri}{response})
- && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
- $response = $results{$uri}{response};
- } else {
- $response = &get_uri($method, $uri, $referer);
- &record_results($uri, $method, $response);
- &record_redirects($redirects, $response);
- }
- if (! $response->is_success()) {
- if (! $in_recursion) {
- # Is it too late to request authentication?
- if ($response->code() == 401) {
- &authentication($response, $cookie, $params, $check_num, $is_start);
- } else {
- if ($Opts{HTML}) {
- &html_header($uri, 0, $cookie) if ($check_num == 1);
- &print_form($params, $cookie, $check_num) if $is_start;
- print "<p>", &status_icon($response->code());
+ # $method contains the HTTP method the use (GET or HEAD)
+ # $uri contains the identifier of the resource
+ # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least
+ # the second resource checked)
+ # $redirects is a pointer to the hash containing the map of the redirects
+ # $referer is the URI of the referring document
+ # $cookie, $params, $check_num, and $is_start are for printing HTTP headers
+ # and the form if $in_recursion == 0 and not authenticating
+
+ # Get the resource
+ my $response;
+ if (defined($results{$uri}{response}) &&
+ !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD')))
+ {
+ $response = $results{$uri}{response};
+ }
+ else {
+ $response = &get_uri($method, $uri, $referer);
+ &record_results($uri, $method, $response);
+ &record_redirects($redirects, $response);
+ }
+ if (!$response->is_success()) {
+ if (!$in_recursion) {
+
+ # Is it too late to request authentication?
+ if ($response->code() == 401) {
+ &authentication($response, $cookie, $params, $check_num,
+ $is_start);
+ }
+ else {
+ if ($Opts{HTML}) {
+ &html_header($uri, 0, $cookie) if ($check_num == 1);
+ &print_form($params, $cookie, $check_num) if $is_start;
+ print "<p>", &status_icon($response->code());
+ }
+ &hprintf("\nError: %d %s\n",
+ $response->code(), $response->message() || '(no message)');
+ print "</p>\n" if $Opts{HTML};
+ }
}
- &hprintf("\nError: %d %s\n",
- $response->code(), $response->message() || '(no message)');
- print "</p>\n" if $Opts{HTML};
- }
+ $response->{Stop} = 1;
+ $response->content("");
+ return ($response);
}
- $response->{Stop} = 1;
- $response->content("");
- return($response);
- }
- # What is the URI of the resource that we are processing by the way?
- my $base_uri = $response->base();
- my $request_uri = URI->new($response->request->url);
- $response->{absolute_uri} = $request_uri->abs($base_uri);
+ # What is the URI of the resource that we are processing by the way?
+ my $base_uri = $response->base();
+ my $request_uri = URI->new($response->request->url);
+ $response->{absolute_uri} = $request_uri->abs($base_uri);
- # Can we parse the document?
- my $failed_reason;
- my $ct = $response->header('Content-Type');
- if (!$ct || $ct !~ $ContentTypes) {
- $failed_reason = "Content-Type for <$request_uri> is " .
- (defined($ct) ? "'$ct'" : 'undefined');
- } else {
- $failed_reason = decode_content($response);
- }
- if ($failed_reason) {
- # No, there is a problem...
- if (! $in_recursion) {
- if ($Opts{HTML}) {
- &html_header($uri, 0, $cookie) if ($check_num == 1);
- &print_form($params, $cookie, $check_num) if $is_start;
- print "<p>", &status_icon(406);
+ # Can we parse the document?
+ my $failed_reason;
+ my $ct = $response->header('Content-Type');
+ if (!$ct || $ct !~ $ContentTypes) {
+ $failed_reason = "Content-Type for <$request_uri> is " .
+ (defined($ct) ? "'$ct'" : 'undefined');
+ }
+ else {
+ $failed_reason = decode_content($response);
+ }
+ if ($failed_reason) {
- }
- &hprintf("Can't check links: %s.\n", $failed_reason);
- print "</p>\n" if $Opts{HTML};
+ # No, there is a problem...
+ if (!$in_recursion) {
+ if ($Opts{HTML}) {
+ &html_header($uri, 0, $cookie) if ($check_num == 1);
+ &print_form($params, $cookie, $check_num) if $is_start;
+ print "<p>", &status_icon(406);
+
+ }
+ &hprintf("Can't check links: %s.\n", $failed_reason);
+ print "</p>\n" if $Opts{HTML};
+ }
+ $response->{Stop} = 1;
+ $response->content("");
}
- $response->{Stop} = 1;
- $response->content("");
- }
- # Ok, return the information
- return($response);
+ # Ok, return the information
+ return ($response);
}
#########################################################
@@ -1388,25 +1512,25 @@
sub in_recursion_scope ($)
{
- my ($uri) = @_;
- return 0 unless $uri;
+ my ($uri) = @_;
+ return 0 unless $uri;
- my $candidate = URI->new($uri)->canonical();
+ my $candidate = URI->new($uri)->canonical();
- return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude});
+ return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude});
- for my $excluded_doc (@{$Opts{Exclude_Docs}}) {
- return 0 if ($candidate =~ $excluded_doc);
- }
+ for my $excluded_doc (@{$Opts{Exclude_Docs}}) {
+ return 0 if ($candidate =~ $excluded_doc);
+ }
- foreach my $base (@{$Opts{Base_Locations}}) {
- my $rel = $candidate->rel($base);
- next if ($candidate eq $rel); # Relative path not possible?
- next if ($rel =~ m|^(\.\.)?/|); # Relative path upwards?
- return 1;
- }
+ foreach my $base (@{$Opts{Base_Locations}}) {
+ my $rel = $candidate->rel($base);
+ next if ($candidate eq $rel); # Relative path not possible?
+ next if ($rel =~ m|^(\.\.)?/|); # Relative path upwards?
+ return 1;
+ }
- return 0; # We always have at least one base location, but none matched.
+ return 0; # We always have at least one base location, but none matched.
}
##################################################
@@ -1415,20 +1539,26 @@
sub already_processed ($$)
{
- my ($uri, $referer) = @_;
- # Don't be verbose for that part...
- my $summary_value = $Opts{Summary_Only};
- $Opts{Summary_Only} = 1;
- # Do a GET: if it fails, we stop, if not, the results are cached
- my $response = &get_document('GET', $uri, 1, undef, $referer);
- # ... but just for that part
- $Opts{Summary_Only} = $summary_value;
- # Can we process the resource?
- return -1 if defined($response->{Stop});
- # Have we already processed it?
- return 1 if defined($processed{$response->{absolute_uri}->as_string()});
- # It's not processed yet and it is processable: return 0
- return 0;
+ my ($uri, $referer) = @_;
+
+ # Don't be verbose for that part...
+ my $summary_value = $Opts{Summary_Only};
+ $Opts{Summary_Only} = 1;
+
+ # Do a GET: if it fails, we stop, if not, the results are cached
+ my $response = &get_document('GET', $uri, 1, undef, $referer);
+
+ # ... but just for that part
+ $Opts{Summary_Only} = $summary_value;
+
+ # Can we process the resource?
+ return -1 if defined($response->{Stop});
+
+ # Have we already processed it?
+ return 1 if defined($processed{$response->{absolute_uri}->as_string()});
+
+ # It's not processed yet and it is processable: return 0
+ return 0;
}
############################
@@ -1437,96 +1567,105 @@
sub get_uri ($$;$$\%$$$$)
{
- # Here we have a lot of extra parameters in order not to lose information
- # if the function is called several times (401's)
- my ($method, $uri, $referer, $start, $redirects, $code, $realm, $message,
- $auth) = @_;
- # $method contains the method used
- # $uri contains the target of the request
- # $referer is the URI of the referring document
- # $start is a timestamp (not defined the first time the function is called)
- # $redirects is a map of redirects
- # $code is the first HTTP return code
- # $realm is the realm of the request
- # $message is the HTTP message received
- # $auth equals 1 if we want to send out authentication information
+ # Here we have a lot of extra parameters in order not to lose information
+ # if the function is called several times (401's)
+ my ($method, $uri, $referer, $start, $redirects,
+ $code, $realm, $message, $auth
+ ) = @_;
- # For timing purposes
- $start = &get_timestamp() unless defined($start);
+ # $method contains the method used
+ # $uri contains the target of the request
+ # $referer is the URI of the referring document
+ # $start is a timestamp (not defined the first time the function is called)
+ # $redirects is a map of redirects
+ # $code is the first HTTP return code
+ # $realm is the realm of the request
+ # $message is the HTTP message received
+ # $auth equals 1 if we want to send out authentication information
- # Prepare the query
+ # For timing purposes
+ $start = &get_timestamp() unless defined($start);
- # Do we want printouts of progress?
- my $verbose_progress =
- ! ($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}));
+ # Prepare the query
- &hprintf("%s %s ", $method, $uri) if $verbose_progress;
+ # Do we want printouts of progress?
+ my $verbose_progress =
+ !($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}));
- my $request = HTTP::Request->new($method, $uri);
+ &hprintf("%s %s ", $method, $uri) if $verbose_progress;
- $request->header('Accept-Language' => $Opts{Accept_Language})
- if $Opts{Accept_Language};
- $request->header('Accept', $Accept);
- # accept_decodable() was added in LWP 5.814
- $request->accept_decodable() if $request->can('accept_decodable');
+ my $request = HTTP::Request->new($method, $uri);
- # Are we providing authentication info?
- if ($auth && $request->url()->host() =~ $Opts{Trusted}) {
- if (defined($ENV{HTTP_AUTHORIZATION})) {
- $request->header(Authorization => $ENV{HTTP_AUTHORIZATION});
- } elsif (defined($Opts{User}) && defined($Opts{Password})) {
- $request->authorization_basic($Opts{User}, $Opts{Password});
- }
- }
+ $request->header('Accept-Language' => $Opts{Accept_Language})
+ if $Opts{Accept_Language};
+ $request->header('Accept', $Accept);
- # Tell the user agent if we want progress reports for redirects or not.
- $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); })
- if $verbose_progress;
+ # accept_decodable() was added in LWP 5.814
+ $request->accept_decodable() if $request->can('accept_decodable');
- # Set referer
- $request->referer($referer) if (!$Opts{No_Referer} && $referer);
-
- # Telling caches in the middle we want a fresh copy (Bug 4998)
- $request->header(Cache_Control => "max-age=0");
-
- # Do the query
- my $response = $ua->request($request);
-
- # Get the results
- # Record the very first response
- if (! defined($code)) {
- ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)});
- }
- # Authentication requested?
- if ($response->code() == 401 &&
- !defined($auth) &&
- (defined($ENV{HTTP_AUTHORIZATION})
- || (defined($Opts{User}) && defined($Opts{Password})))) {
-
- # Set host as trusted domain unless we already have one.
- if (!$Opts{Trusted}) {
- my $re = sprintf('^%s$', quotemeta($response->base()->host()));
- $Opts{Trusted} = qr/$re/io;
+ # Are we providing authentication info?
+ if ($auth && $request->url()->host() =~ $Opts{Trusted}) {
+ if (defined($ENV{HTTP_AUTHORIZATION})) {
+ $request->header(Authorization => $ENV{HTTP_AUTHORIZATION});
+ }
+ elsif (defined($Opts{User}) && defined($Opts{Password})) {
+ $request->authorization_basic($Opts{User}, $Opts{Password});
+ }
}
- # Deal with authentication and avoid loops
- if (!defined($realm) &&
- $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) {
- $realm = $1;
+ # Tell the user agent if we want progress reports for redirects or not.
+ $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); })
+ if $verbose_progress;
+
+ # Set referer
+ $request->referer($referer) if (!$Opts{No_Referer} && $referer);
+
+ # Telling caches in the middle we want a fresh copy (Bug 4998)
+ $request->header(Cache_Control => "max-age=0");
+
+ # Do the query
+ my $response = $ua->request($request);
+
+ # Get the results
+ # Record the very first response
+ if (!defined($code)) {
+ ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)});
}
- print "\n" if $verbose_progress;
- return &get_uri($method, $response->request()->url(), $referer,
- $start, $redirects, $code, $realm, $message, 1);
- }
- # @@@ subtract robot delay from the "fetched in" time?
- &hprintf(" fetched in %s seconds\n",
- &time_diff($start, &get_timestamp())) if $verbose_progress;
+ # Authentication requested?
+ if ($response->code() == 401 &&
+ !defined($auth) &&
+ (defined($ENV{HTTP_AUTHORIZATION}) ||
+ (defined($Opts{User}) && defined($Opts{Password})))
+ )
+ {
- $response->{Realm} = $realm if defined($realm);
+ # Set host as trusted domain unless we already have one.
+ if (!$Opts{Trusted}) {
+ my $re = sprintf('^%s$', quotemeta($response->base()->host()));
+ $Opts{Trusted} = qr/$re/io;
+ }
- return $response;
+ # Deal with authentication and avoid loops
+ if (!defined($realm) &&
+ $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/)
+ {
+ $realm = $1;
+ }
+
+ print "\n" if $verbose_progress;
+ return &get_uri($method, $response->request()->url(),
+ $referer, $start, $redirects, $code, $realm, $message, 1);
+ }
+
+ # @@@ subtract robot delay from the "fetched in" time?
+ &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp()))
+ if $verbose_progress;
+
+ $response->{Realm} = $realm if defined($realm);
+
+ return $response;
}
#########################################
@@ -1535,65 +1674,72 @@
sub record_results ($$$)
{
- my ($uri, $method, $response) = @_;
- $results{$uri}{response} = $response;
- $results{$uri}{method} = $method;
- $results{$uri}{location}{code} = $response->code();
- $results{$uri}{location}{code} = RC_ROBOTS_TXT()
- if ($results{$uri}{location}{code} == 403 &&
+ my ($uri, $method, $response) = @_;
+ $results{$uri}{response} = $response;
+ $results{$uri}{method} = $method;
+ $results{$uri}{location}{code} = $response->code();
+ $results{$uri}{location}{code} = RC_ROBOTS_TXT()
+ if ($results{$uri}{location}{code} == 403 &&
$response->message() =~ /Forbidden by robots\.txt/);
- $results{$uri}{location}{code} = RC_IP_DISALLOWED()
- if ($results{$uri}{location}{code} == 403 &&
+ $results{$uri}{location}{code} = RC_IP_DISALLOWED()
+ if ($results{$uri}{location}{code} == 403 &&
$response->message() =~ /non-public IP/);
- $results{$uri}{location}{code} = RC_DNS_ERROR()
- if ($results{$uri}{location}{code} == 500 &&
+ $results{$uri}{location}{code} = RC_DNS_ERROR()
+ if ($results{$uri}{location}{code} == 500 &&
$response->message() =~ /Bad hostname '[^\']*'/);
- $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED()
- if ($results{$uri}{location}{code} == 500 &&
+ $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED()
+ if ($results{$uri}{location}{code} == 500 &&
$response->message() =~ /Access to '[^\']*' URIs has been disabled/);
- $results{$uri}{location}{type} = $response->header('Content-type');
- $results{$uri}{location}{display} = $results{$uri}{location}{code};
- # Rewind, check for the original code and message.
- for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) {
- $results{$uri}{location}{orig} = $tmp->code();
- $results{$uri}{location}{orig_message} = $tmp->message() || '(no message)';
- }
- $results{$uri}{location}{success} = $response->is_success();
+ $results{$uri}{location}{type} = $response->header('Content-type');
+ $results{$uri}{location}{display} = $results{$uri}{location}{code};
- # If a suppressed broken link, fill the data structure like a typical success.
- # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n";
- if (! $results{$uri}{location}{success}) {
- my $code = $results{$uri}{location}{code};
- my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}};
- if ($match) {
- $results{$uri}{location}{success} = 1;
- $results{$uri}{location}{code} = 100;
- $results{$uri}{location}{display} = 100;
+ # Rewind, check for the original code and message.
+ for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) {
+ $results{$uri}{location}{orig} = $tmp->code();
+ $results{$uri}{location}{orig_message} = $tmp->message() ||
+ '(no message)';
}
- }
+ $results{$uri}{location}{success} = $response->is_success();
- # Stores the authentication information
- if (defined($response->{Realm})) {
- $results{$uri}{location}{realm} = $response->{Realm};
- $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm};
- }
- # What type of broken link is it? (stored in {record} - the {display}
- # information is just for visual use only)
- if (($results{$uri}{location}{display} == 401)
- && ($results{$uri}{location}{code} == 404)) {
- $results{$uri}{location}{record} = 404;
- } else {
- $results{$uri}{location}{record} = $results{$uri}{location}{display};
- }
- # Did it fail?
- $results{$uri}{location}{message} = $response->message() || '(no message)';
- if (! $results{$uri}{location}{success}) {
- &hprintf("Error: %d %s\n",
- $results{$uri}{location}{code},
- $results{$uri}{location}{message})
- if ($Opts{Verbose});
- }
- return;
+ # If a suppressed broken link, fill the data structure like a typical success.
+ # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n";
+ if (!$results{$uri}{location}{success}) {
+ my $code = $results{$uri}{location}{code};
+ my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}};
+ if ($match) {
+ $results{$uri}{location}{success} = 1;
+ $results{$uri}{location}{code} = 100;
+ $results{$uri}{location}{display} = 100;
+ }
+ }
+
+ # Stores the authentication information
+ if (defined($response->{Realm})) {
+ $results{$uri}{location}{realm} = $response->{Realm};
+ $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm};
+ }
+
+ # What type of broken link is it? (stored in {record} - the {display}
+ # information is just for visual use only)
+ if (($results{$uri}{location}{display} == 401) &&
+ ($results{$uri}{location}{code} == 404))
+ {
+ $results{$uri}{location}{record} = 404;
+ }
+ else {
+ $results{$uri}{location}{record} = $results{$uri}{location}{display};
+ }
+
+ # Did it fail?
+ $results{$uri}{location}{message} = $response->message() || '(no message)';
+ if (!$results{$uri}{location}{success}) {
+ &hprintf(
+ "Error: %d %s\n",
+ $results{$uri}{location}{code},
+ $results{$uri}{location}{message}
+ ) if ($Opts{Verbose});
+ }
+ return;
}
####################
@@ -1602,66 +1748,68 @@
sub parse_document ($$$$$)
{
- my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_;
+ my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_;
- print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n")
- if $Opts{Verbose};
+ print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n")
+ if $Opts{Verbose};
- my $p;
+ my $p;
- if (defined($results{$uri}{parsing})) {
- # We have already done the job. Woohoo!
- $p->{base} = $results{$uri}{parsing}{base};
- $p->{Anchors} = $results{$uri}{parsing}{Anchors};
- $p->{Links} = $results{$uri}{parsing}{Links};
+ if (defined($results{$uri}{parsing})) {
+
+ # We have already done the job. Woohoo!
+ $p->{base} = $results{$uri}{parsing}{base};
+ $p->{Anchors} = $results{$uri}{parsing}{Anchors};
+ $p->{Links} = $results{$uri}{parsing}{Links};
+ return $p;
+ }
+
+ my $start;
+ $p = W3C::LinkChecker->new();
+ $p->{base} = $base_uri;
+ if (!$Opts{Summary_Only}) {
+ $start = &get_timestamp();
+ print("Parsing...\n");
+ }
+
+ # Content-Encoding etc already decoded in get_document().
+ my $docref = $response->content_ref();
+
+ # Count lines beforehand if needed for progress indicator. In all cases,
+ # the actual final number of lines processed shown is populated by our
+ # end_document handler.
+ $p->{Total} = ($$docref =~ tr/\n//) if $Opts{Progress};
+
+ # We only look for anchors if we are not interested in the links
+ # obviously, or if we are running a recursive checking because we
+ # might need this information later
+ $p->{only_anchors} = !($links || $rec_needs_links);
+
+ # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
+ # Processing instructions are not parsed by process, but in this case
+ # it should be. It's expensive, it's horrible, but it's the easiest way
+ # for right now.
+ $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/
+ unless $p->{only_anchors};
+
+ $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
+
+ $p->parse($$docref)->eof();
+ $response->content("");
+
+ if (!$Opts{Summary_Only}) {
+ my $stop = &get_timestamp();
+ print "\r" if $Opts{Progress};
+ &hprintf(" done (%d lines in %s seconds).\n",
+ $p->{Total}, &time_diff($start, $stop));
+ }
+
+ # Save the results before exiting
+ $results{$uri}{parsing}{base} = $p->{base};
+ $results{$uri}{parsing}{Anchors} = $p->{Anchors};
+ $results{$uri}{parsing}{Links} = $p->{Links};
+
return $p;
- }
-
- my $start;
- $p = W3C::LinkChecker->new();
- $p->{base} = $base_uri;
- if (! $Opts{Summary_Only}) {
- $start = &get_timestamp();
- print("Parsing...\n");
- }
-
- # Content-Encoding etc already decoded in get_document().
- my $docref = $response->content_ref();
-
- # Count lines beforehand if needed for progress indicator. In all cases,
- # the actual final number of lines processed shown is populated by our
- # end_document handler.
- $p->{Total} = ($$docref =~ tr/\n//) if $Opts{Progress};
-
- # We only look for anchors if we are not interested in the links
- # obviously, or if we are running a recursive checking because we
- # might need this information later
- $p->{only_anchors} = !($links || $rec_needs_links);
-
- # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
- # Processing instructions are not parsed by process, but in this case
- # it should be. It's expensive, it's horrible, but it's the easiest way
- # for right now.
- $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors};
-
- $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/);
-
- $p->parse($$docref)->eof();
- $response->content("");
-
- if (! $Opts{Summary_Only}) {
- my $stop = &get_timestamp();
- print "\r" if $Opts{Progress};
- &hprintf(" done (%d lines in %s seconds).\n",
- $p->{Total}, &time_diff($start, $stop));
- }
-
- # Save the results before exiting
- $results{$uri}{parsing}{base} = $p->{base};
- $results{$uri}{parsing}{Anchors} = $p->{Anchors};
- $results{$uri}{parsing}{Links} = $p->{Links};
-
- return $p;
}
####################################
@@ -1670,31 +1818,35 @@
sub new
{
- my $p = HTML::Parser::new(@_, api_version => 3);
- $p->utf8_mode(1);
+ my $p = HTML::Parser::new(@_, api_version => 3);
+ $p->utf8_mode(1);
- # Set up handlers
+ # Set up handlers
- $p->handler(start => 'start', 'self, tagname, attr, line');
- $p->handler(declaration =>
- sub {
- my $self = shift;
- $self->declaration(substr($_[0], 2, -1));
- }, 'self, text, line');
- $p->handler(end_document => 'end_document', 'self, line');
- if ($Opts{Progress}) {
- $p->handler(default => 'parse_progress', 'self, line');
- $p->{last_percentage} = 0;
- }
+ $p->handler(start => 'start', 'self, tagname, attr, line');
+ $p->handler(
+ declaration => sub {
+ my $self = shift;
+ $self->declaration(substr($_[0], 2, -1));
+ },
+ 'self, text, line'
+ );
+ $p->handler(end_document => 'end_document', 'self, line');
+ if ($Opts{Progress}) {
+ $p->handler(default => 'parse_progress', 'self, line');
+ $p->{last_percentage} = 0;
+ }
- # Check <a [..] name="...">?
- $p->{check_name} = 1;
- # Check <[..] id="..">?
- $p->{check_id} = 1;
- # Don't interpret comment loosely
- $p->strict_comment(1);
+ # Check <a [..] name="...">?
+ $p->{check_name} = 1;
- return $p;
+ # Check <[..] id="..">?
+ $p->{check_id} = 1;
+
+ # Don't interpret comment loosely
+ $p->strict_comment(1);
+
+ return $p;
}
#################################################
@@ -1703,24 +1855,24 @@
sub doctype
{
- my ($self, $dc) = @_;
- return $self->{doctype} unless $dc;
- $_ = $self->{doctype} = $dc;
+ my ($self, $dc) = @_;
+ return $self->{doctype} unless $dc;
+ $_ = $self->{doctype} = $dc;
- # What to look for depending on the doctype
+ # What to look for depending on the doctype
- # Check for <a name="...">?
- $self->{check_name} = 0
- if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %;
+ # Check for <a name="...">?
+ $self->{check_name} = 0
+ if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %;
- # Check for <* id="...">?
- $self->{check_id} = 0
- if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%);
+ # Check for <* id="...">?
+ $self->{check_id} = 0
+ if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%);
- # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...)
- $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%);
+ # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...)
+ $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%);
- return;
+ return;
}
###################################
@@ -1729,16 +1881,16 @@
sub parse_progress
{
- my ($self, $line) = @_;
- return unless defined($line) && $line > 0 && $self->{Total} > 0;
+ my ($self, $line) = @_;
+ return unless defined($line) && $line > 0 && $self->{Total} > 0;
- my $percentage = int($line/$self->{Total}*100);
- if ($percentage != $self->{last_percentage}) {
- printf("\r%4d%%", $percentage);
- $self->{last_percentage} = $percentage;
- }
+ my $percentage = int($line / $self->{Total} * 100);
+ if ($percentage != $self->{last_percentage}) {
+ printf("\r%4d%%", $percentage);
+ $self->{last_percentage} = $percentage;
+ }
- return;
+ return;
}
#############################
@@ -1747,18 +1899,19 @@
sub get_anchor
{
- my ($self, $tag, $attr) = @_;
+ my ($self, $tag, $attr) = @_;
- my $anchor = $self->{check_id} ? $attr->{id} : undef;
- if ($self->{check_name} && ($tag eq 'a')) {
- # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
- # Force an error if it's not the case (or if id's and name's values
- # are different)
- # If id is defined, name if defined must have the same value
- $anchor ||= $attr->{name};
- }
+ my $anchor = $self->{check_id} ? $attr->{id} : undef;
+ if ($self->{check_name} && ($tag eq 'a')) {
- return $anchor;
+ # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
+ # Force an error if it's not the case (or if id's and name's values
+ # are different)
+ # If id is defined, name if defined must have the same value
+ $anchor ||= $attr->{name};
+ }
+
+ return $anchor;
}
#############################
@@ -1767,116 +1920,132 @@
sub add_link
{
- my ($self, $uri, $base, $line) = @_;
- if (defined($uri)) {
- # Remove repeated slashes after the . or .. in relative links, to avoid
- # duplicated checking or infinite recursion.
- $uri =~ s|^(\.\.?/)/+|$1|o;
- $uri = URI->new_abs($uri, $base) if defined($base);
- $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++;
- }
- return;
+ my ($self, $uri, $base, $line) = @_;
+ if (defined($uri)) {
+
+ # Remove repeated slashes after the . or .. in relative links, to avoid
+ # duplicated checking or infinite recursion.
+ $uri =~ s|^(\.\.?/)/+|$1|o;
+ $uri = URI->new_abs($uri, $base) if defined($base);
+ $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++;
+ }
+ return;
}
sub start
{
- my ($self, $tag, $attr, $line) = @_;
- $line = LINE_UNKNOWN() unless defined($line);
+ my ($self, $tag, $attr, $line) = @_;
+ $line = LINE_UNKNOWN() unless defined($line);
- # Anchors
- my $anchor = $self->get_anchor($tag, $attr);
- $self->{Anchors}{$anchor}{$line}++ if defined($anchor);
+ # Anchors
+ my $anchor = $self->get_anchor($tag, $attr);
+ $self->{Anchors}{$anchor}{$line}++ if defined($anchor);
- # Links
- if (!$self->{only_anchors}) {
+ # Links
+ if (!$self->{only_anchors}) {
- my $tag_local_base = undef;
+ my $tag_local_base = undef;
- # Special case: base/@href
- # TODO: This should go away as soon as LWP::Protocol::collect() invokes
- # HTML::HeadParser (thus taking care of it in $response->base()
- # transparently) for application/xhtml+xml and
- # application/vnd.wap.xhtml+xml documents
- # --> it does in LWP >= 5.810
- if ($tag eq 'base') {
- # Treat <base> (without href) or <base href=""> as if it didn't exist.
- if (defined($attr->{href}) && length($attr->{href})) {
- $self->{base} = $attr->{href};
- }
- # Note: base/@href intentionally not treated as a dereferenceable link:
- # http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi
- }
- # Special case: meta[@http-equiv=Refresh]/@content
- elsif ($tag eq 'meta') {
- if ($attr->{'http-equiv'} && lc($attr->{'http-equiv'}) eq 'refresh') {
- my $content = $attr->{content};
- if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) {
- $self->add_link($1, undef, $line);
+ # Special case: base/@href
+ # TODO: This should go away as soon as LWP::Protocol::collect() invokes
+ # HTML::HeadParser (thus taking care of it in $response->base()
+ # transparently) for application/xhtml+xml and
+ # application/vnd.wap.xhtml+xml documents
+ # --> it does in LWP >= 5.810
+ if ($tag eq 'base') {
+
+ # Treat <base> (without href) or <base href=""> as if it didn't exist.
+ if (defined($attr->{href}) && length($attr->{href})) {
+ $self->{base} = $attr->{href};
+ }
+
+ # Note: base/@href intentionally not treated as a dereferenceable link:
+ # http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi
}
- }
- }
- # Special case: tags that have "local base"
- elsif ($tag eq 'applet' || $tag eq 'object') {
- if (my $codebase = $attr->{codebase}) {
- # TODO: HTML 4 spec says applet/@codebase may only point to subdirs of
- # the directory containing the current document. Should we do
- # something about that?
- $tag_local_base = URI->new_abs($codebase, $self->{base});
- }
+
+ # Special case: meta[@http-equiv=Refresh]/@content
+ elsif ($tag eq 'meta') {
+ if ($attr->{'http-equiv'} &&
+ lc($attr->{'http-equiv'}) eq 'refresh')
+ {
+ my $content = $attr->{content};
+ if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) {
+ $self->add_link($1, undef, $line);
+ }
+ }
+ }
+
+ # Special case: tags that have "local base"
+ elsif ($tag eq 'applet' || $tag eq 'object') {
+ if (my $codebase = $attr->{codebase}) {
+
+ # TODO: HTML 4 spec says applet/@codebase may only point to subdirs of
+ # the directory containing the current document. Should we do
+ # something about that?
+ $tag_local_base = URI->new_abs($codebase, $self->{base});
+ }
+ }
+
+ # Link attributes:
+ if (my $link_attrs = LINK_ATTRS()->{$tag}) {
+ for my $la (@$link_attrs) {
+ $self->add_link($attr->{$la}, $tag_local_base, $line);
+ }
+ }
+
+ # List of links attributes:
+ if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) {
+ for my $la (@$link_attrs) {
+ if (defined(my $value = $attr->{$la})) {
+ for my $link (split(/\s+/, $value)) {
+ $self->add_link($link, $tag_local_base, $line);
+ }
+ }
+ }
+ }
}
- # Link attributes:
- if (my $link_attrs = LINK_ATTRS()->{$tag}) {
- for my $la (@$link_attrs) {
- $self->add_link($attr->{$la}, $tag_local_base, $line);
- }
- }
+ $self->parse_progress($line) if $Opts{Progress};
- # List of links attributes:
- if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) {
- for my $la (@$link_attrs) {
- if (defined(my $value = $attr->{$la})) {
- for my $link (split(/\s+/, $value)) {
- $self->add_link($link, $tag_local_base, $line);
- }
- }
- }
- }
- }
-
- $self->parse_progress($line) if $Opts{Progress};
-
- return;
+ return;
}
sub declaration
{
- my ($self, $text, $line) = @_;
+ my ($self, $text, $line) = @_;
- # Extract the doctype
- my @declaration = split(/\s+/, $text, 4);
- if (($#declaration >= 3) &&
- ($declaration[0] eq 'DOCTYPE') &&
- (lc($declaration[1]) eq 'html')) {
- # Parse the doctype declaration
- if ($text =~ m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i) {
- # Store the doctype
- $self->doctype($1) if $1;
- # If there is a link to the DTD, record it
- $self->add_link($3, undef, $line) if (!$self->{only_anchors} && $3);
+ # Extract the doctype
+ my @declaration = split(/\s+/, $text, 4);
+ if (($#declaration >= 3) &&
+ ($declaration[0] eq 'DOCTYPE') &&
+ (lc($declaration[1]) eq 'html'))
+ {
+
+ # Parse the doctype declaration
+ if ($text =~
+ m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i
+ )
+ {
+
+ # Store the doctype
+ $self->doctype($1) if $1;
+
+ # If there is a link to the DTD, record it
+ $self->add_link($3, undef, $line)
+ if (!$self->{only_anchors} && $3);
+ }
}
- }
- $self->text($text) unless $self->{only_anchors};
+ $self->text($text) unless $self->{only_anchors};
- return;
+ return;
}
sub end_document
{
- my ($self, $line) = @_;
- $self->{Total} = $line;
- return;
+ my ($self, $line) = @_;
+ $self->{Total} = $line;
+ return;
}
################################
@@ -1885,77 +2054,91 @@
sub check_validity ($$$\%\%)
{
- my ($referer, $uri, $want_links, $links, $redirects) = @_;
- # $referer is the URI of the document checked
- # $uri is the URI of the target that we are verifying
- # $want_links is true if we're interested in links in the target doc
- # $links is a hash of the links in the documents checked
- # $redirects is a map of the redirects encountered
+ my ($referer, $uri, $want_links, $links, $redirects) = @_;
- # Get the document with the appropriate method
- # Only use GET if there are fragments. HEAD is enough if it's not the
- # case.
- my @fragments = keys %{$links->{$uri}{fragments}};
- my $method = scalar(@fragments) ? 'GET' : 'HEAD';
+ # $referer is the URI of the document checked
+ # $uri is the URI of the target that we are verifying
+ # $want_links is true if we're interested in links in the target doc
+ # $links is a hash of the links in the documents checked
+ # $redirects is a map of the redirects encountered
- my $response;
- my $being_processed = 0;
- if ((! defined($results{$uri}))
- || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
- $being_processed = 1;
- $response = &get_uri($method, $uri, $referer);
- # Get the information back from get_uri()
- &record_results($uri, $method, $response);
- # Record the redirects
- &record_redirects($redirects, $response);
- }
+ # Get the document with the appropriate method
+ # Only use GET if there are fragments. HEAD is enough if it's not the
+ # case.
+ my @fragments = keys %{$links->{$uri}{fragments}};
+ my $method = scalar(@fragments) ? 'GET' : 'HEAD';
- # We got the response of the HTTP request. Stop here if it was a HEAD.
- return if ($method eq 'HEAD');
+ my $response;
+ my $being_processed = 0;
+ if ((!defined($results{$uri})) ||
+ (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD')))
+ {
+ $being_processed = 1;
+ $response = &get_uri($method, $uri, $referer);
- # There are fragments. Parse the document.
- my $p;
- if ($being_processed) {
- # Can we really parse the document?
- if (!defined($results{$uri}{location}{type}) ||
- $results{$uri}{location}{type} !~ $ContentTypes)
- {
- &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
- $uri, $results{$uri}{location}{type})
- if ($Opts{Verbose});
- $response->content("");
- return;
+ # Get the information back from get_uri()
+ &record_results($uri, $method, $response);
+
+ # Record the redirects
+ &record_redirects($redirects, $response);
}
- # Do it then
- if (my $error = decode_content($response)) {
- &hprintf("%s\n.", $error);
+
+ # We got the response of the HTTP request. Stop here if it was a HEAD.
+ return if ($method eq 'HEAD');
+
+ # There are fragments. Parse the document.
+ my $p;
+ if ($being_processed) {
+
+ # Can we really parse the document?
+ if (!defined($results{$uri}{location}{type}) ||
+ $results{$uri}{location}{type} !~ $ContentTypes)
+ {
+ &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
+ $uri, $results{$uri}{location}{type})
+ if ($Opts{Verbose});
+ $response->content("");
+ return;
+ }
+
+ # Do it then
+ if (my $error = decode_content($response)) {
+ &hprintf("%s\n.", $error);
+ }
+
+ # @@@TODO: this isn't the best thing to do if a decode error occurred
+ $p =
+ &parse_document($uri, $response->base(), $response, 0,
+ $want_links);
}
- # @@@TODO: this isn't the best thing to do if a decode error occurred
- $p = &parse_document($uri, $response->base(), $response, 0, $want_links);
- } else {
- # We already had the information
- $p->{Anchors} = $results{$uri}{parsing}{Anchors};
- }
- # Check that the fragments exist
- foreach my $fragment (keys %{$links->{$uri}{fragments}}) {
- if (defined($p->{Anchors}{$fragment})
- || &escape_match($fragment, $p->{Anchors})
- || grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) {
- $results{$uri}{fragments}{$fragment} = 1;
- } else {
- $results{$uri}{fragments}{$fragment} = 0;
+ else {
+
+ # We already had the information
+ $p->{Anchors} = $results{$uri}{parsing}{Anchors};
}
- }
- return;
+
+ # Check that the fragments exist
+ foreach my $fragment (keys %{$links->{$uri}{fragments}}) {
+ if (defined($p->{Anchors}{$fragment}) ||
+ &escape_match($fragment, $p->{Anchors}) ||
+ grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}})
+ {
+ $results{$uri}{fragments}{$fragment} = 1;
+ }
+ else {
+ $results{$uri}{fragments}{$fragment} = 0;
+ }
+ }
+ return;
}
sub escape_match ($\%)
{
- my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
- foreach my $b (keys %$hash) {
- return 1 if ($a eq URI::Escape::uri_unescape($b));
- }
- return 0;
+ my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
+ foreach my $b (keys %$hash) {
+ return 1 if ($a eq URI::Escape::uri_unescape($b));
+ }
+ return 0;
}
##########################
@@ -1964,55 +2147,61 @@
sub authentication ($;$$$$)
{
- my ($response, $cookie, $params, $check_num, $is_start) = @_;
+ my ($response, $cookie, $params, $check_num, $is_start) = @_;
- my $realm = '';
- if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) {
- $realm = $1;
- }
+ my $realm = '';
+ if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) {
+ $realm = $1;
+ }
- if ($Opts{Command_Line}) {
- printf STDERR <<'EOF', $response->request()->url(), $realm;
+ if ($Opts{Command_Line}) {
+ printf STDERR <<'EOF', $response->request()->url(), $realm;
Authentication is required for %s.
The realm is "%s".
Use the -u and -p options to specify a username and password and the -d option
to specify trusted domains.
EOF
- } else {
+ }
+ else {
- printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n",
- $response->www_authenticate(),
- $cookie ? "Set-Cookie: $cookie\n" : "",
- );
+ printf(
+ "Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n",
+ $response->www_authenticate(),
+ $cookie ? "Set-Cookie: $cookie\n" : "",
+ );
- printf("%s
+ printf(
+ "%s
<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
<head>
<title>W3C Link Checker: 401 Authorization Required</title>
%s</head>
-<body>", $DocType, $Head);
- &banner(': 401 Authorization Required');
- &print_form($params, $cookie, $check_num) if $is_start;
- printf('<p>
+<body>", $DocType, $Head
+ );
+ &banner(': 401 Authorization Required');
+ &print_form($params, $cookie, $check_num) if $is_start;
+ printf(
+ '<p>
%s
You need "%s" access to <a href="%s">%s</a> to perform link checking.<br />
',
- &status_icon(401),
- &encode($realm), (&encode($response->request()->url())) x 2);
+ &status_icon(401),
+ &encode($realm), (&encode($response->request()->url())) x 2
+ );
- my $host = $response->request()->url()->host();
- if ($Opts{Trusted} && $host !~ $Opts{Trusted}) {
- printf <<'EOF', &encode($Opts{Trusted}), &encode($host);
+ my $host = $response->request()->url()->host();
+ if ($Opts{Trusted} && $host !~ $Opts{Trusted}) {
+ printf <<'EOF', &encode($Opts{Trusted}), &encode($host);
This service has been configured to send authentication only to hostnames
matching the regular expression <code>%s</code>, but the hostname
<code>%s</code> does not match it.
EOF
+ }
+
+ print "</p>\n";
}
-
- print "</p>\n";
- }
- return;
+ return;
}
##################
@@ -2021,17 +2210,17 @@
sub get_timestamp ()
{
- return pack('LL', Time::HiRes::gettimeofday());
+ return pack('LL', Time::HiRes::gettimeofday());
}
sub time_diff ($$)
{
- my @start = unpack('LL', $_[0]);
- my @stop = unpack('LL', $_[1]);
- for ($start[1], $stop[1]) {
- $_ /= 1_000_000;
- }
- return(sprintf("%.2f", ($stop[0]+$stop[1])-($start[0]+$start[1])));
+ my @start = unpack('LL', $_[0]);
+ my @stop = unpack('LL', $_[1]);
+ for ($start[1], $stop[1]) {
+ $_ /= 1_000_000;
+ }
+ return (sprintf("%.2f", ($stop[0] + $stop[1]) - ($start[0] + $start[1])));
}
########################
@@ -2041,56 +2230,59 @@
# Record the redirects in a hash
sub record_redirects (\%$)
{
- my ($redirects, $response) = @_;
- for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
+ my ($redirects, $response) = @_;
+ for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) {
- # Check for redirect match.
- my $from = $prev->request()->url();
- my $to = $response->request()->url(); # same on every loop iteration
- my $from_to = $from . '->' . $to;
- my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}};
- # print STDERR "Result $match of redirect checking $from_to\n";
- if ($match) { next; }
+ # Check for redirect match.
+ my $from = $prev->request()->url();
+ my $to = $response->request()->url(); # same on every loop iteration
+ my $from_to = $from . '->' . $to;
+ my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}};
- $match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}};
- # print STDERR "Result $match of regexp checking $from_to\n";
- if ($match) { next; }
+ # print STDERR "Result $match of redirect checking $from_to\n";
+ if ($match) { next; }
- my $c = $prev->code();
- if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) {
- next;
+ $match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}};
+
+ # print STDERR "Result $match of regexp checking $from_to\n";
+ if ($match) { next; }
+
+ my $c = $prev->code();
+ if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) {
+ next;
+ }
+
+ $redirects->{$prev->request()->url()} = $response->request()->url();
}
-
- $redirects->{$prev->request()->url()} = $response->request()->url();
- }
- return;
+ return;
}
# Determine if a request is redirected
sub is_redirected ($%)
{
- my ($uri, %redirects) = @_;
- return(defined($redirects{$uri}));
+ my ($uri, %redirects) = @_;
+ return (defined($redirects{$uri}));
}
# Get a list of redirects for a URI
sub get_redirects ($%)
{
- my ($uri, %redirects) = @_;
- my @history = ($uri);
- my %seen = ($uri => 1); # for tracking redirect loops
- my $loop = 0;
- while ($redirects{$uri}) {
- $uri = $redirects{$uri};
- push(@history, $uri);
- if ($seen{$uri}) {
- $loop = 1;
- last;
- } else {
- $seen{$uri}++;
+ my ($uri, %redirects) = @_;
+ my @history = ($uri);
+ my %seen = ($uri => 1); # for tracking redirect loops
+ my $loop = 0;
+ while ($redirects{$uri}) {
+ $uri = $redirects{$uri};
+ push(@history, $uri);
+ if ($seen{$uri}) {
+ $loop = 1;
+ last;
+ }
+ else {
+ $seen{$uri}++;
+ }
}
- }
- return ($loop, @history);
+ return ($loop, @history);
}
####################################################
@@ -2099,9 +2291,9 @@
sub sort_unique (@)
{
- my %saw;
- @saw{@_} = ();
- return (sort { $a <=> $b } keys %saw);
+ my %saw;
+ @saw{@_} = ();
+ return (sort { $a <=> $b } keys %saw);
}
#####################
@@ -2110,53 +2302,57 @@
sub line_number ($)
{
- my $line = shift;
- return $line if ($line >= 0);
- return "(N/A)";
+ my $line = shift;
+ return $line if ($line >= 0);
+ return "(N/A)";
}
sub http_rc ($)
{
- my $rc = shift;
- return $rc if ($rc >= 0);
- return "(N/A)";
+ my $rc = shift;
+ return $rc if ($rc >= 0);
+ return "(N/A)";
}
# returns true if the given code is informational
sub informational ($)
{
- my $rc = shift;
- return $rc == RC_ROBOTS_TXT() || $rc == RC_IP_DISALLOWED() ||
- $rc == RC_PROTOCOL_DISALLOWED();
+ my $rc = shift;
+ return $rc == RC_ROBOTS_TXT() ||
+ $rc == RC_IP_DISALLOWED() ||
+ $rc == RC_PROTOCOL_DISALLOWED();
}
sub anchors_summary (\%\%)
{
- my ($anchors, $errors) = @_;
+ my ($anchors, $errors) = @_;
- # Number of anchors found.
- my $n = scalar(keys(%$anchors));
- if (! $Opts{Quiet}) {
- if ($Opts{HTML}) {
- print("<h3>Anchors</h3>\n<p>");
- } else {
- print("Anchors\n\n");
+ # Number of anchors found.
+ my $n = scalar(keys(%$anchors));
+ if (!$Opts{Quiet}) {
+ if ($Opts{HTML}) {
+ print("<h3>Anchors</h3>\n<p>");
+ }
+ else {
+ print("Anchors\n\n");
+ }
+ &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's');
+ print("</p>\n") if $Opts{HTML};
}
- &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's');
- print("</p>\n") if $Opts{HTML};
- }
- # List of the duplicates, if any.
- my @errors = keys %{$errors};
- if (! scalar(@errors)) {
- print("<p>Valid anchors!</p>\n") if (! $Opts{Quiet} && $Opts{HTML} && $n);
- return;
- }
- undef $n;
- print_doc_header();
- print('<p>') if $Opts{HTML};
- print('List of duplicate and empty anchors');
- print <<'EOF' if $Opts{HTML};
+ # List of the duplicates, if any.
+ my @errors = keys %{$errors};
+ if (!scalar(@errors)) {
+ print("<p>Valid anchors!</p>\n")
+ if (!$Opts{Quiet} && $Opts{HTML} && $n);
+ return;
+ }
+ undef $n;
+
+ print_doc_header();
+ print('<p>') if $Opts{HTML};
+ print('List of duplicate and empty anchors');
+ print <<'EOF' if $Opts{HTML};
</p>
<table class="report" border="1" summary="List of duplicate and empty anchors.">
<thead>
@@ -2167,320 +2363,406 @@
</thead>
<tbody>
EOF
- print("\n");
+ print("\n");
- foreach my $anchor (@errors) {
- my $format;
- my @unique = &sort_unique(map { line_number($_) }
- keys %{$anchors->{$anchor}});
- if ($Opts{HTML}) {
- $format = "<tr><td class=\"broken\">%s</td><td>%s</td></tr>\n";
- } else {
- my $s = (scalar(@unique) > 1) ? 's' : '';
- $format = "\t%s\tLine$s: %s\n";
+ foreach my $anchor (@errors) {
+ my $format;
+ my @unique = &sort_unique(
+ map { line_number($_) }
+ keys %{$anchors->{$anchor}}
+ );
+ if ($Opts{HTML}) {
+ $format = "<tr><td class=\"broken\">%s</td><td>%s</td></tr>\n";
+ }
+ else {
+ my $s = (scalar(@unique) > 1) ? 's' : '';
+ $format = "\t%s\tLine$s: %s\n";
+ }
+ printf($format,
+ &encode(length($anchor) ? $anchor : 'Empty anchor'),
+ join(', ', @unique));
}
- printf($format,
- &encode(length($anchor) ? $anchor : 'Empty anchor'),
- join(', ', @unique));
- }
- print("</tbody>\n</table>\n") if $Opts{HTML};
+ print("</tbody>\n</table>\n") if $Opts{HTML};
- return;
+ return;
}
sub show_link_report (\%\%\%\%\@;$\%)
{
- my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
+ my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
- print("\n<dl class=\"report\">") if $Opts{HTML};
- print("\n") if (! $Opts{Quiet});
+ print("\n<dl class=\"report\">") if $Opts{HTML};
+ print("\n") if (!$Opts{Quiet});
- # Process each URL
- my ($c, $previous_c);
- foreach my $u (@$urls) {
- my @fragments = keys %{$broken->{$u}{fragments}};
- # Did we get a redirect?
- my $redirected = &is_redirected($u, %$redirects);
- # List of lines
- my @total_lines;
- push(@total_lines, keys(%{$links->{$u}{location}}));
- foreach my $f (@fragments) {
- push(@total_lines, keys(%{$links->{$u}{fragments}{$f}}))
- unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()}));
- }
+ # Process each URL
+ my ($c, $previous_c);
+ foreach my $u (@$urls) {
+ my @fragments = keys %{$broken->{$u}{fragments}};
- my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects);
- my $currloc = $results->{$u}{location};
+ # Did we get a redirect?
+ my $redirected = &is_redirected($u, %$redirects);
- # Error type
- $c = &code_shown($u, $results);
- # What to do
- my $whattodo;
- my $redirect_too;
- if ($todo) {
- if ($u =~ m/^javascript:/) {
- if ($Opts{HTML}) {
- $whattodo =
-'You must change this link: people using a browser without JavaScript support
+ # List of lines
+ my @total_lines;
+ push(@total_lines, keys(%{$links->{$u}{location}}));
+ foreach my $f (@fragments) {
+ push(@total_lines, keys(%{$links->{$u}{fragments}{$f}}))
+ unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()}));
+ }
+
+ my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects);
+ my $currloc = $results->{$u}{location};
+
+ # Error type
+ $c = &code_shown($u, $results);
+
+ # What to do
+ my $whattodo;
+ my $redirect_too;
+ if ($todo) {
+ if ($u =~ m/^javascript:/) {
+ if ($Opts{HTML}) {
+ $whattodo =
+ 'You must change this link: people using a browser without JavaScript support
will <em>not</em> be able to follow this link. See the
<a href="http://www.w3.org/TR/WAI-WEBCONTENT/#tech-scripts">Web Content
Accessibility Guidelines on the use of scripting on the Web</a> and the
<a href="http://www.w3.org/TR/WCAG10-HTML-TECHS/#directly-accessible-scripts">techniques
on how to solve this</a>.';
- } else {
- $whattodo = 'Change this link: people using a browser without JavaScript support will not be able to follow this link.';
+ }
+ else {
+ $whattodo =
+ 'Change this link: people using a browser without JavaScript support will not be able to follow this link.';
+ }
+ }
+ elsif ($c == RC_ROBOTS_TXT()) {
+ $whattodo =
+ 'The link was not checked due to robots exclusion ' .
+ 'rules. Check the link manually.';
+ }
+ elsif ($redirect_loop) {
+ $whattodo =
+ 'Retrieving the URI results in a redirect loop, that should be '
+ . 'fixed. Examine the redirect sequence to see where the loop '
+ . 'occurs.';
+ }
+ else {
+ $whattodo = $todo->{$c};
+ }
}
- } elsif ($c == RC_ROBOTS_TXT()) {
- $whattodo = 'The link was not checked due to robots exclusion ' .
- 'rules. Check the link manually.';
- } elsif ($redirect_loop) {
- $whattodo =
- 'Retrieving the URI results in a redirect loop, that should be ' .
- 'fixed. Examine the redirect sequence to see where the loop ' .
- 'occurs.';
- } else {
- $whattodo = $todo->{$c};
- }
- } elsif (defined($redirects{$u})) {
- # Redirects
- if (($u.'/') eq $redirects{$u}) {
- $whattodo = 'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.';
- }
- elsif ($c == 307 || $c == 302) {
- $whattodo = 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.';
- }
- elsif ($c == 301) {
- $whattodo = 'This is a permanent redirect. The link should be updated.';
- }
- }
+ elsif (defined($redirects{$u})) {
- my @unique = &sort_unique(map { line_number($_) } @total_lines);
- my $lines_list = join(', ', @unique);
- my $s = (scalar(@unique) > 1) ? 's' : '';
- undef @unique;
+ # Redirects
+ if (($u . '/') eq $redirects{$u}) {
+ $whattodo =
+ 'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.';
+ }
+ elsif ($c == 307 || $c == 302) {
+ $whattodo =
+ 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.';
+ }
+ elsif ($c == 301) {
+ $whattodo =
+ 'This is a permanent redirect. The link should be updated.';
+ }
+ }
- my @http_codes = ($currloc->{code});
- unshift(@http_codes, $currloc->{orig}) if $currloc->{orig};
- @http_codes = map { http_rc($_) } @http_codes;
+ my @unique = &sort_unique(map { line_number($_) } @total_lines);
+ my $lines_list = join(', ', @unique);
+ my $s = (scalar(@unique) > 1) ? 's' : '';
+ undef @unique;
- if ($Opts{HTML}) {
- # Style stuff
- my $idref = '';
- if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
- $idref = ' id="d'.$doc_count.'code_'.$c.'"';
- $previous_c = $c;
- }
- # Main info
- for (@redirects_urls) {
- $_ = &show_url($_);
- }
- # HTTP message
- my $http_message;
- if ($currloc->{message}) {
- $http_message = &encode($currloc->{message});
- if ($c == 404 || $c == 500) {
- $http_message = '<span class="broken">'.
- $http_message.'</span>';
- }
- }
- my $redirmsg =
- $redirect_loop ? ' <em>redirect loop detected</em>' : '';
- printf("
+ my @http_codes = ($currloc->{code});
+ unshift(@http_codes, $currloc->{orig}) if $currloc->{orig};
+ @http_codes = map { http_rc($_) } @http_codes;
+
+ if ($Opts{HTML}) {
+
+ # Style stuff
+ my $idref = '';
+ if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
+ $idref = ' id="d' . $doc_count . 'code_' . $c . '"';
+ $previous_c = $c;
+ }
+
+ # Main info
+ for (@redirects_urls) {
+ $_ = &show_url($_);
+ }
+
+ # HTTP message
+ my $http_message;
+ if ($currloc->{message}) {
+ $http_message = &encode($currloc->{message});
+ if ($c == 404 || $c == 500) {
+ $http_message =
+ '<span class="broken">' . $http_message . '</span>';
+ }
+ }
+ my $redirmsg =
+ $redirect_loop ? ' <em>redirect loop detected</em>' : '';
+ printf("
<dt%s>%s <span class='msg_loc'>Line%s: %s</span> %s</dt>
<dd class='responsecode'><strong>Status</strong>: %s %s %s</dd>
<dd class='message_explanation'><p>%s %s</p></dd>\n",
- # Anchor for return codes
- $idref,
- # Color
- &status_icon($c),
- $s,
- # List of lines
- $lines_list,
- # List of redirects
- $redirected ? join(' redirected to ', @redirects_urls) . $redirmsg : &show_url($u),
- # Realm
- defined($currloc->{realm})
- ? sprintf('Realm: %s<br />', &encode($currloc->{realm})) : '',
- # HTTP original message
- # defined($currloc->{orig_message})
- # ? &encode($currloc->{orig_message}).
- # ' <span title="redirected to">-></span> '
- # : '',
- # Response code chain
- join(' <span class="redirected_to" title="redirected to">-></span> ',
- map { &encode($_) } @http_codes),
- # HTTP final message
- $http_message,
- # What to do
- $whattodo,
- # Redirect too?
- $redirect_too ?
- sprintf(' <span %s>%s</span>', &bgcolor(301), $redirect_too) : '',
- );
- if ($#fragments >= 0) {
- printf("<dd>Broken fragments: <ul>\n");
- }
- } else {
- my $redirmsg = $redirect_loop ? ' redirect loop detected' : '';
- printf("\n%s\t%s\n Code: %s %s\n%s\n",
- # List of redirects
- $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u,
- # List of lines
- $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '',
- # Response code chain
- join(' -> ', @http_codes),
- # HTTP message
- $currloc->{message} || '',
- # What to do
- wrap(' To do: ', ' ', $whattodo));
- if ($#fragments >= 0) {
- if ($currloc->{code} == 200) {
- print("The following fragments need to be fixed:\n");
- } else {
- print("Fragments:\n");
+ # Anchor for return codes
+ $idref,
+
+ # Color
+ &status_icon($c),
+ $s,
+
+ # List of lines
+ $lines_list,
+
+ # List of redirects
+ $redirected ?
+ join(' redirected to ', @redirects_urls) . $redirmsg :
+ &show_url($u),
+
+ # Realm
+ defined($currloc->{realm}) ?
+ sprintf('Realm: %s<br />', &encode($currloc->{realm})) :
+ '',
+
+ # HTTP original message
+ # defined($currloc->{orig_message})
+ # ? &encode($currloc->{orig_message}).
+ # ' <span title="redirected to">-></span> '
+ # : '',
+
+ # Response code chain
+ join(
+ ' <span class="redirected_to" title="redirected to">-></span> ',
+ map { &encode($_) } @http_codes),
+
+ # HTTP final message
+ $http_message,
+
+ # What to do
+ $whattodo,
+
+ # Redirect too?
+ $redirect_too ?
+ sprintf(' <span %s>%s</span>',
+ &bgcolor(301), $redirect_too) :
+ '',
+ );
+ if ($#fragments >= 0) {
+ printf("<dd>Broken fragments: <ul>\n");
+ }
}
- }
- }
- # Fragments
- foreach my $f (@fragments) {
- my @unique_lines = &sort_unique(keys %{$links->{$u}{fragments}{$f}});
- my $plural = (scalar(@unique_lines) > 1) ? 's' : '';
- my $unique_lines = join(', ', @unique_lines);
- if ($Opts{HTML}) {
- printf("<li>%s<em>#%s</em> (line%s %s)</li>\n",
- &encode($u), &encode($f), $plural, $unique_lines);
- } else {
- printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines);
- }
+ else {
+ my $redirmsg = $redirect_loop ? ' redirect loop detected' : '';
+ printf(
+ "\n%s\t%s\n Code: %s %s\n%s\n",
+
+ # List of redirects
+ $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u,
+
+ # List of lines
+ $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '',
+
+ # Response code chain
+ join(' -> ', @http_codes),
+
+ # HTTP message
+ $currloc->{message} || '',
+
+ # What to do
+ wrap(' To do: ', ' ', $whattodo)
+ );
+ if ($#fragments >= 0) {
+ if ($currloc->{code} == 200) {
+ print("The following fragments need to be fixed:\n");
+ }
+ else {
+ print("Fragments:\n");
+ }
+ }
+ }
+
+ # Fragments
+ foreach my $f (@fragments) {
+ my @unique_lines =
+ &sort_unique(keys %{$links->{$u}{fragments}{$f}});
+ my $plural = (scalar(@unique_lines) > 1) ? 's' : '';
+ my $unique_lines = join(', ', @unique_lines);
+ if ($Opts{HTML}) {
+ printf("<li>%s<em>#%s</em> (line%s %s)</li>\n",
+ &encode($u), &encode($f), $plural, $unique_lines);
+ }
+ else {
+ printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines);
+ }
+ }
+
+ print("</ul></dd>\n") if ($Opts{HTML} && scalar(@fragments));
}
- print("</ul></dd>\n") if ($Opts{HTML} && scalar(@fragments));
- }
+ # End of the table
+ print("</dl>\n") if $Opts{HTML};
- # End of the table
- print("</dl>\n") if $Opts{HTML};
-
- return;
+ return;
}
sub code_shown ($$)
{
- my ($u, $results) = @_;
+ my ($u, $results) = @_;
- if ($results->{$u}{location}{record} == 200) {
- return $results->{$u}{location}{orig} || $results->{$u}{location}{record};
- } else {
- return $results->{$u}{location}{record};
- }
+ if ($results->{$u}{location}{record} == 200) {
+ return $results->{$u}{location}{orig} ||
+ $results->{$u}{location}{record};
+ }
+ else {
+ return $results->{$u}{location}{record};
+ }
}
sub links_summary (\%\%\%\%)
{
- # Advices to fix the problems
- my %todo = ( 200 => 'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).',
- 300 => 'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.',
- 301 => 'This is a permanent redirect. The link should be updated to point to the more recent URI.',
- 302 => 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
- 303 => 'This rare status code points to a "See Other" resource. There is generally nothing to be done.',
- 307 => 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
- 400 => 'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.',
- 401 => "The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.",
- 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
- 404 => 'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.',
- 405 => 'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically. Check the link manually.',
- 406 => "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.",
- 407 => 'The link is a proxy, but requires Authentication.',
- 408 => 'The request timed out.',
- 410 => 'The resource is gone. You should remove this link.',
- 415 => 'The media type is not supported.',
- 500 => 'This is a server side problem. Check the URI.',
- 501 => 'Could not check this link: method not implemented or scheme not supported.',
- 503 => 'The server cannot service the request, for some unknown reason.',
- # Non-HTTP codes:
- RC_ROBOTS_TXT() => sprintf('The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.',
- $Opts{HTML} ? ('<a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">', '</a>', "<a href=\"$Cfg{Doc_URI}#bot\">", '</a>') : ('') x 4),
- RC_DNS_ERROR() => 'The hostname could not be resolved. Check the link for typos.',
- RC_IP_DISALLOWED() => sprintf('The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.',
- $Opts{HTML} ? ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') : ('') x 2),
- RC_PROTOCOL_DISALLOWED() => 'Accessing links with this URI scheme has been disabled in link checker.',
- );
- my %priority = ( 410 => 1,
- 404 => 2,
- 403 => 5,
- 200 => 10,
- 300 => 15,
- 401 => 20
- );
+ # Advices to fix the problems
- my ($links, $results, $broken, $redirects) = @_;
+ my %todo = (
+ 200 =>
+ 'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).',
+ 300 =>
+ 'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.',
+ 301 =>
+ 'This is a permanent redirect. The link should be updated to point to the more recent URI.',
+ 302 =>
+ 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
+ 303 =>
+ 'This rare status code points to a "See Other" resource. There is generally nothing to be done.',
+ 307 =>
+ 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.',
+ 400 =>
+ 'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.',
+ 401 =>
+ "The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.",
+ 403 =>
+ 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
+ 404 =>
+ 'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.',
+ 405 =>
+ 'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically. Check the link manually.',
+ 406 =>
+ "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.",
+ 407 => 'The link is a proxy, but requires Authentication.',
+ 408 => 'The request timed out.',
+ 410 => 'The resource is gone. You should remove this link.',
+ 415 => 'The media type is not supported.',
+ 500 => 'This is a server side problem. Check the URI.',
+ 501 =>
+ 'Could not check this link: method not implemented or scheme not supported.',
+ 503 =>
+ 'The server cannot service the request, for some unknown reason.',
- # List of the broken links
- my @urls = keys %{$broken};
- my @dir_redirect_urls = ();
- if ($Opts{Redirects}) {
- # Add the redirected URI's to the report
- for my $l (keys %$redirects) {
- next unless (defined($results->{$l})
- && defined($links->{$l})
- && !defined($broken->{$l}));
- # Check whether we have a "directory redirect"
- # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/
- my ($redirect_loop, @redirects) = get_redirects($l, %$redirects);
- if ($#redirects == 1) {
- push(@dir_redirect_urls, $l);
- next;
- }
- push(@urls, $l);
+ # Non-HTTP codes:
+ RC_ROBOTS_TXT() => sprintf(
+ 'The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.',
+ $Opts{HTML} ? (
+ '<a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">',
+ '</a>',
+ "<a href=\"$Cfg{Doc_URI}#bot\">",
+ '</a>'
+ ) : ('') x 4
+ ),
+ RC_DNS_ERROR() =>
+ 'The hostname could not be resolved. Check the link for typos.',
+ RC_IP_DISALLOWED() =>
+ sprintf(
+ 'The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.',
+ $Opts{HTML} ?
+ ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') :
+ ('') x 2),
+ RC_PROTOCOL_DISALLOWED() =>
+ 'Accessing links with this URI scheme has been disabled in link checker.',
+ );
+ my %priority = (
+ 410 => 1,
+ 404 => 2,
+ 403 => 5,
+ 200 => 10,
+ 300 => 15,
+ 401 => 20
+ );
+
+ my ($links, $results, $broken, $redirects) = @_;
+
+ # List of the broken links
+ my @urls = keys %{$broken};
+ my @dir_redirect_urls = ();
+ if ($Opts{Redirects}) {
+
+ # Add the redirected URI's to the report
+ for my $l (keys %$redirects) {
+ next
+ unless (defined($results->{$l}) &&
+ defined($links->{$l}) &&
+ !defined($broken->{$l}));
+
+ # Check whether we have a "directory redirect"
+ # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/
+ my ($redirect_loop, @redirects) = get_redirects($l, %$redirects);
+ if ($#redirects == 1) {
+ push(@dir_redirect_urls, $l);
+ next;
+ }
+ push(@urls, $l);
+ }
}
- }
- # Broken links and redirects
- if ($#urls < 0) {
- if (! $Opts{Quiet}) {
- print_doc_header();
- if ($Opts{HTML}) {
- print "<h3>Links</h3>\n<p>Valid links!</p>\n";
- } else {
- print "\nValid links.\n";
- }
+ # Broken links and redirects
+ if ($#urls < 0) {
+ if (!$Opts{Quiet}) {
+ print_doc_header();
+ if ($Opts{HTML}) {
+ print "<h3>Links</h3>\n<p>Valid links!</p>\n";
+ }
+ else {
+ print "\nValid links.\n";
+ }
+ }
}
- } else {
- print_doc_header();
- print('<h3>') if $Opts{HTML};
- print("\nList of broken links and other issues");
- #print(' and redirects') if $Opts{Redirects};
+ else {
+ print_doc_header();
+ print('<h3>') if $Opts{HTML};
+ print("\nList of broken links and other issues");
- # Sort the URI's by HTTP Code
- my %code_summary;
- my @idx;
- foreach my $u (@urls) {
- if (defined($results->{$u}{location}{record})) {
- my $c = &code_shown($u, $results);
- $code_summary{$c}++;
- push(@idx, $c);
- }
- }
- my @sorted = @urls[
- sort {
- defined($priority{$idx[$a]}) ?
- defined($priority{$idx[$b]}) ?
- $priority{$idx[$a]}
- <=> $priority{$idx[$b]} :
- -1 :
- defined($priority{$idx[$b]}) ?
- 1 :
- $idx[$a] <=> $idx[$b]
- } 0 .. $#idx
- ];
- @urls = @sorted;
- undef(@sorted); undef(@idx);
+ #print(' and redirects') if $Opts{Redirects};
- if ($Opts{HTML}) {
- # Print a summary
- print <<'EOF';
+ # Sort the URI's by HTTP Code
+ my %code_summary;
+ my @idx;
+ foreach my $u (@urls) {
+ if (defined($results->{$u}{location}{record})) {
+ my $c = &code_shown($u, $results);
+ $code_summary{$c}++;
+ push(@idx, $c);
+ }
+ }
+ my @sorted = @urls[
+ sort {
+ defined($priority{$idx[$a]}) ?
+ defined($priority{$idx[$b]}) ?
+ $priority{$idx[$a]} <=> $priority{$idx[$b]} :
+ -1 :
+ defined($priority{$idx[$b]}) ? 1 :
+ $idx[$a] <=> $idx[$b]
+ } 0 .. $#idx
+ ];
+ @urls = @sorted;
+ undef(@sorted);
+ undef(@idx);
+
+ if ($Opts{HTML}) {
+
+ # Print a summary
+ print <<'EOF';
</h3>
<p><em>There are issues with the URLs listed below. The table summarizes the
issues and suggested actions by HTTP response status code.</em></p>
@@ -2494,33 +2776,36 @@
</thead>
<tbody>
EOF
- foreach my $code (sort(keys(%code_summary))) {
- printf('<tr%s>', &bgcolor($code));
- printf('<td><a href="#d%scode_%s">%s</a></td>',
- $doc_count, $code, http_rc($code));
- printf('<td>%s</td>', $code_summary{$code});
- printf('<td>%s</td>', $todo{$code});
- print "</tr>\n";
- }
- print "</tbody>\n</table>\n";
- } else {
- print(':');
+ foreach my $code (sort(keys(%code_summary))) {
+ printf('<tr%s>', &bgcolor($code));
+ printf('<td><a href="#d%scode_%s">%s</a></td>',
+ $doc_count, $code, http_rc($code));
+ printf('<td>%s</td>', $code_summary{$code});
+ printf('<td>%s</td>', $todo{$code});
+ print "</tr>\n";
+ }
+ print "</tbody>\n</table>\n";
+ }
+ else {
+ print(':');
+ }
+ &show_link_report($links, $results, $broken, $redirects, \@urls, 1,
+ \%todo);
}
- &show_link_report($links, $results, $broken, $redirects,
- \@urls, 1, \%todo);
- }
- # Show directory redirects
- if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) {
- print_doc_header();
- print('<h3>') if $Opts{HTML};
- print("\nList of redirects");
- print("</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>") if $Opts{HTML};
- &show_link_report($links, $results, $broken, $redirects,
- \@dir_redirect_urls);
- }
+ # Show directory redirects
+ if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) {
+ print_doc_header();
+ print('<h3>') if $Opts{HTML};
+ print("\nList of redirects");
+ print(
+ "</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>"
+ ) if $Opts{HTML};
+ &show_link_report($links, $results, $broken, $redirects,
+ \@dir_redirect_urls);
+ }
- return;
+ return;
}
###############################################################################
@@ -2531,13 +2816,16 @@
sub global_stats ()
{
- my $stop = &get_timestamp();
- my $n_docs =
- ($doc_count <= $Opts{Max_Documents}) ? $doc_count : $Opts{Max_Documents};
- return sprintf('Checked %d document%s in %s seconds.',
- $n_docs,
- ($n_docs == 1) ? '' : 's',
- &time_diff($timestamp, $stop));
+ my $stop = &get_timestamp();
+ my $n_docs =
+ ($doc_count <= $Opts{Max_Documents}) ? $doc_count :
+ $Opts{Max_Documents};
+ return sprintf(
+ 'Checked %d document%s in %s seconds.',
+ $n_docs,
+ ($n_docs == 1) ? '' : 's',
+ &time_diff($timestamp, $stop)
+ );
}
##################
@@ -2546,25 +2834,25 @@
sub html_header ($;$$)
{
- my ($uri, $doform, $cookie) = @_;
+ my ($uri, $doform, $cookie) = @_;
- my $title = defined($uri) ? $uri : '';
- $title = ': ' . $title if ($title =~ /\S/);
+ my $title = defined($uri) ? $uri : '';
+ $title = ': ' . $title if ($title =~ /\S/);
- my $headers = '';
- if (! $Opts{Command_Line}) {
- $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $doform;
- $headers .= "Content-Type: text/html; charset=utf-8\n";
- $headers .= "Set-Cookie: $cookie\n" if $cookie;
+ my $headers = '';
+ if (!$Opts{Command_Line}) {
+ $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $doform;
+ $headers .= "Content-Type: text/html; charset=utf-8\n";
+ $headers .= "Set-Cookie: $cookie\n" if $cookie;
- # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same
- # print() statement as the last header
- $headers .= "Content-Language: en\n\n";
- }
+ # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same
+ # print() statement as the last header
+ $headers .= "Content-Language: en\n\n";
+ }
- my $script = my $onload = '';
- if ($doform) {
- $script = <<'EOF';
+ my $script = my $onload = '';
+ if ($doform) {
+ $script = <<'EOF';
<script type="text/javascript">
function uriOk(num)
{
@@ -2587,106 +2875,121 @@
}
</script>
EOF
- $onload = ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"';
- }
+ $onload =
+ ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"';
+ }
- print $headers, $DocType, "
+ print $headers, $DocType, "
<html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\">
<head>
<title>W3C Link Checker", &encode($title), "</title>
", $Head, $script, "</head>
<body", $onload, '>';
- &banner($title);
- return;
+ &banner($title);
+ return;
}
sub banner ($)
{
- my $tagline ="Check links and anchors in Web pages or full Web sites";
+ my $tagline = "Check links and anchors in Web pages or full Web sites";
- printf(<<'EOF', URI->new_abs("../images/no_w3c.png", $Cfg{Doc_URI}), $tagline);
+ printf(
+ <<'EOF', URI->new_abs("../images/no_w3c.png", $Cfg{Doc_URI}), $tagline);
<div id="banner"><h1 id="title"><a href="http://www.w3.org/" title="W3C"><img alt="W3C" id="logo" src="%s" /></a>
<a href="checklink"><span>Link Checker</span></a></h1>
<p id="tagline">%s</p></div>
<div id="main">
EOF
- return;
+ return;
}
sub status_icon($)
{
- my ($code) = @_;
- my $icon_type;
- my $r = HTTP::Response->new($code);
- if ($r->is_success()) {
- $icon_type = 'error'; # if is success but reported, it's because of broken frags => error
- } elsif (&informational($code)) {
- $icon_type = 'info';
- } elsif ($code == 300) {
- $icon_type = 'info';
- } elsif ($code == 401) {
- $icon_type = 'error';
- } elsif ($r->is_redirect()) {
- $icon_type = 'warning';
- } elsif ($r->is_error()) {
- $icon_type = 'error';
- } else {
- $icon_type = 'error';
- }
- return sprintf('<span class="err_type"><img src="%s" alt="%s" /></span>',
- URI->new_abs("../images/info_icons/$icon_type.png",
- $Cfg{Doc_URI}),
- $icon_type);
+ my ($code) = @_;
+ my $icon_type;
+ my $r = HTTP::Response->new($code);
+ if ($r->is_success()) {
+ $icon_type = 'error'
+ ; # if is success but reported, it's because of broken frags => error
+ }
+ elsif (&informational($code)) {
+ $icon_type = 'info';
+ }
+ elsif ($code == 300) {
+ $icon_type = 'info';
+ }
+ elsif ($code == 401) {
+ $icon_type = 'error';
+ }
+ elsif ($r->is_redirect()) {
+ $icon_type = 'warning';
+ }
+ elsif ($r->is_error()) {
+ $icon_type = 'error';
+ }
+ else {
+ $icon_type = 'error';
+ }
+ return sprintf('<span class="err_type"><img src="%s" alt="%s" /></span>',
+ URI->new_abs("../images/info_icons/$icon_type.png", $Cfg{Doc_URI}),
+ $icon_type);
}
sub bgcolor ($)
{
- my ($code) = @_;
- my $class;
- my $r = HTTP::Response->new($code);
- if ($r->is_success()) {
- return '';
- } elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) {
- $class = 'dubious';
- } elsif ($code == 300) {
- $class = 'multiple';
- } elsif ($code == 401) {
- $class = 'unauthorized';
- } elsif ($r->is_redirect()) {
- $class = 'redirect';
- } elsif ($r->is_error()) {
- $class = 'broken';
- } else {
- $class = 'broken';
- }
- return(' class="'.$class.'"');
+ my ($code) = @_;
+ my $class;
+ my $r = HTTP::Response->new($code);
+ if ($r->is_success()) {
+ return '';
+ }
+ elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) {
+ $class = 'dubious';
+ }
+ elsif ($code == 300) {
+ $class = 'multiple';
+ }
+ elsif ($code == 401) {
+ $class = 'unauthorized';
+ }
+ elsif ($r->is_redirect()) {
+ $class = 'redirect';
+ }
+ elsif ($r->is_error()) {
+ $class = 'broken';
+ }
+ else {
+ $class = 'broken';
+ }
+ return (' class="' . $class . '"');
}
sub show_url ($;$)
{
- my ($url, $fragment) = @_;
- if (defined($fragment)) {
- my $u = URI->new($url);
- $u->fragment($fragment);
- $url = $u->as_string();
- }
- $url = &encode($url);
- return sprintf('<a href="%s">%s</a>',
- $url, defined($fragment) ? &encode($fragment) : $url);
+ my ($url, $fragment) = @_;
+ if (defined($fragment)) {
+ my $u = URI->new($url);
+ $u->fragment($fragment);
+ $url = $u->as_string();
+ }
+ $url = &encode($url);
+ return sprintf('<a href="%s">%s</a>',
+ $url, defined($fragment) ? &encode($fragment) : $url);
}
sub html_footer ()
{
- printf("<p>%s</p>\n", &global_stats()) if ($doc_count > 0 && !$Opts{Quiet});
- if (! $doc_count) {
- print <<'EOF';
+ printf("<p>%s</p>\n", &global_stats())
+ if ($doc_count > 0 && !$Opts{Quiet});
+ if (!$doc_count) {
+ print <<'EOF';
<div class="intro">
<p>This Link Checker looks for issues in links, anchors and referenced objects in a Web page, or recursively on a whole Web site.
For best results, it is recommended to first ensure that the documents checked use <a href="http://validator.w3.org/">Valid (X)HTML Markup</a>. The Link Checker is part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and Quality Web tools</a>.</p>
</div>
EOF
- }
- printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
+ }
+ printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION);
</div><!-- main -->
<ul class="navbar" id="menu">
<li><a href="%s" accesskey="3" title="Documentation for this Link Checker Service">Docs</a></li>
@@ -2702,77 +3005,94 @@
</body>
</html>
EOF
- return;
+ return;
}
sub print_form (\%$$)
{
- my ($params, $cookie, $check_num) = @_;
+ my ($params, $cookie, $check_num) = @_;
- # Split params on \0, see CGI's docs on Vars()
- while (my ($key, $value) = each(%$params)) {
- if ($value) {
- my @vals = split(/\0/, $value, 2);
- $params->{$key} = $vals[0];
+ # Split params on \0, see CGI's docs on Vars()
+ while (my ($key, $value) = each(%$params)) {
+ if ($value) {
+ my @vals = split(/\0/, $value, 2);
+ $params->{$key} = $vals[0];
+ }
}
- }
- # Override undefined values from the cookie, if we got one.
- my $valid_cookie = 0;
- if ($cookie) {
- my %cookie_values = $cookie->value();
- if (!$cookie_values{clear}) { # XXX no easy way to check if cookie expired?
- $valid_cookie = 1;
- while (my ($key, $value) = each(%cookie_values)) {
- $params->{$key} = $value unless defined($params->{$key});
- }
+ # Override undefined values from the cookie, if we got one.
+ my $valid_cookie = 0;
+ if ($cookie) {
+ my %cookie_values = $cookie->value();
+ if (!$cookie_values{clear})
+ { # XXX no easy way to check if cookie expired?
+ $valid_cookie = 1;
+ while (my ($key, $value) = each(%cookie_values)) {
+ $params->{$key} = $value unless defined($params->{$key});
+ }
+ }
}
- }
- my $chk = ' checked="checked"';
- $params->{hide_type} = 'all' unless $params->{hide_type};
+ my $chk = ' checked="checked"';
+ $params->{hide_type} = 'all' unless $params->{hide_type};
- my $requested_uri = &encode($params->{uri} || '');
- my $sum = $params->{summary} ? $chk : '';
- my $red = $params->{hide_redirects} ? $chk : '';
- my $all = ($params->{hide_type} ne 'dir') ? $chk : '';
- my $dir = $all ? '' : $chk;
- my $acc = $params->{no_accept_language} ? $chk : '';
- my $ref = $params->{no_referer} ? $chk : '';
- my $rec = $params->{recursive} ? $chk : '';
- my $dep = &encode($params->{depth} || '');
+ my $requested_uri = &encode($params->{uri} || '');
+ my $sum = $params->{summary} ? $chk : '';
+ my $red = $params->{hide_redirects} ? $chk : '';
+ my $all = ($params->{hide_type} ne 'dir') ? $chk : '';
+ my $dir = $all ? '' : $chk;
+ my $acc = $params->{no_accept_language} ? $chk : '';
+ my $ref = $params->{no_referer} ? $chk : '';
+ my $rec = $params->{recursive} ? $chk : '';
+ my $dep = &encode($params->{depth} || '');
- my $cookie_options = '';
- if ($valid_cookie) {
- $cookie_options = "
+ my $cookie_options = '';
+ if ($valid_cookie) {
+ $cookie_options = "
<label for=\"cookie1_$check_num\"><input type=\"radio\" id=\"cookie1_$check_num\" name=\"cookie\" value=\"nochanges\" checked=\"checked\" /> Don't modify saved options</label>
<label for=\"cookie2_$check_num\"><input type=\"radio\" id=\"cookie2_$check_num\" name=\"cookie\" value=\"set\" /> Save these options</label>
<label for=\"cookie3_$check_num\"><input type=\"radio\" id=\"cookie3_$check_num\" name=\"cookie\" value=\"clear\" /> Clear saved options</label>";
- } else {
- $cookie_options = "
+ }
+ else {
+ $cookie_options = "
<label for=\"cookie_$check_num\"><input type=\"checkbox\" id=\"cookie_$check_num\" name=\"cookie\" value=\"set\" /> Save options in a <a href=\"http://www.w3.org/Protocols/rfc2109/rfc2109\">cookie</a></label>";
- }
+ }
- print "<form action=\"", $Opts{_Self_URI}, "\" method=\"get\" onsubmit=\"return uriOk($check_num)\">
+ print "<form action=\"", $Opts{_Self_URI},
+ "\" method=\"get\" onsubmit=\"return uriOk($check_num)\">
<p><label for=\"uri_$check_num\">Enter the address (<a href=\"http://www.w3.org/Addressing/\">URL</a>)
of a document that you would like to check:</label></p>
-<p><input type=\"text\" size=\"50\" id=\"uri_$check_num\" name=\"uri\" value=\"",$requested_uri,"\" /></p>
+<p><input type=\"text\" size=\"50\" id=\"uri_$check_num\" name=\"uri\" value=\"",
+ $requested_uri, "\" /></p>
<fieldset id=\"extra_opt_uri_$check_num\" class=\"moreoptions\">
<legend class=\"toggletext\">More Options</legend>
<div class=\"options\">
<p>
- <label for=\"summary_$check_num\"><input type=\"checkbox\" id=\"summary_$check_num\" name=\"summary\" value=\"on\"", $sum, " /> Summary only</label>
+ <label for=\"summary_$check_num\"><input type=\"checkbox\" id=\"summary_$check_num\" name=\"summary\" value=\"on\"",
+ $sum, " /> Summary only</label>
<br />
- <label for=\"hide_redirects_$check_num\"><input type=\"checkbox\" id=\"hide_redirects_$check_num\" name=\"hide_redirects\" value=\"on\"", $red, " /> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label>
- <label for=\"hide_type_all_$check_num\"><input type=\"radio\" id=\"hide_type_all_$check_num\" name=\"hide_type\" value=\"all\"", $all, " /> all</label>
- <label for=\"hide_type_dir_$check_num\"><input type=\"radio\" id=\"hide_type_dir_$check_num\" name=\"hide_type\" value=\"dir\"", $dir, " /> for directories only</label>
+ <label for=\"hide_redirects_$check_num\"><input type=\"checkbox\" id=\"hide_redirects_$check_num\" name=\"hide_redirects\" value=\"on\"",
+ $red,
+ " /> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label>
+ <label for=\"hide_type_all_$check_num\"><input type=\"radio\" id=\"hide_type_all_$check_num\" name=\"hide_type\" value=\"all\"",
+ $all, " /> all</label>
+ <label for=\"hide_type_dir_$check_num\"><input type=\"radio\" id=\"hide_type_dir_$check_num\" name=\"hide_type\" value=\"dir\"",
+ $dir, " /> for directories only</label>
<br />
- <label for=\"no_accept_language_$check_num\"><input type=\"checkbox\" id=\"no_accept_language_$check_num\" name=\"no_accept_language\" value=\"on\"", $acc, " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label>
+ <label for=\"no_accept_language_$check_num\"><input type=\"checkbox\" id=\"no_accept_language_$check_num\" name=\"no_accept_language\" value=\"on\"",
+ $acc,
+ " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label>
<br />
- <label for=\"no_referer_$check_num\"><input type=\"checkbox\" id=\"no_referer_$check_num\" name=\"no_referer\" value=\"on\"", $ref, " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label>
+ <label for=\"no_referer_$check_num\"><input type=\"checkbox\" id=\"no_referer_$check_num\" name=\"no_referer\" value=\"on\"",
+ $ref,
+ " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label>
<br />
- <label title=\"Check linked documents recursively (maximum: ", $Opts{Max_Documents}, " documents)\" for=\"recursive_$check_num\"><input type=\"checkbox\" id=\"recursive_$check_num\" name=\"recursive\" value=\"on\"", $rec, " /> Check linked documents recursively</label>,
- <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth_$check_num\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth_$check_num\" name=\"depth\" value=\"", $dep, "\" /></label>
+ <label title=\"Check linked documents recursively (maximum: ",
+ $Opts{Max_Documents},
+ " documents)\" for=\"recursive_$check_num\"><input type=\"checkbox\" id=\"recursive_$check_num\" name=\"recursive\" value=\"on\"",
+ $rec, " /> Check linked documents recursively</label>,
+ <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth_$check_num\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth_$check_num\" name=\"depth\" value=\"",
+ $dep, "\" /></label>
<br /><br />", $cookie_options, "
</p>
</div>
@@ -2782,23 +3102,24 @@
<div class=\"intro\" id=\"don_program\"></div>
<script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script>
";
- return;
+ return;
}
sub encode (@)
{
- return $Opts{HTML} ? HTML::Entities::encode(@_) : @_;
+ return $Opts{HTML} ? HTML::Entities::encode(@_) : @_;
}
sub hprintf (@)
{
- print_doc_header();
- if (! $Opts{HTML}) {
- printf(@_);
- } else {
- print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1]));
- }
- return;
+ print_doc_header();
+ if (!$Opts{HTML}) {
+ printf(@_);
+ }
+ else {
+ print HTML::Entities::encode(sprintf($_[0], @_[1 .. @_ - 1]));
+ }
+ return;
}
# Print the document header, if it hasn't been printed already.
@@ -2807,17 +3128,18 @@
# "Processing..." messages when nothing else will be reported.
sub print_doc_header ()
{
- if (defined($doc_header)) {
- print $doc_header;
- undef($doc_header);
- }
+ if (defined($doc_header)) {
+ print $doc_header;
+ undef($doc_header);
+ }
}
-
# Local Variables:
# mode: perl
# indent-tabs-mode: nil
-# tab-width: 2
-# perl-indent-level: 2
+# cperl-indent-level: 4
+# cperl-continued-statement-offset: 4
+# cperl-brace-offset: -4
+# perl-indent-level: 4
# End:
-# ex: ts=2 sw=2 et
+# ex: ts=4 sw=4 et
Received on Thursday, 5 August 2010 14:47:41 UTC