- From: Mercurial notifier <nobody@w3.org>
- Date: Thu, 05 Aug 2010 14:47:25 +0000
- To: link-checker updates <www-validator-cvs@w3.org>
changeset: 349:d1b25a40ba7c user: ville date: Sun Feb 28 21:13:23 2010 +0000 files: Makefile.PL bin/checklink description: Run perltidy, update indentation settings. diff -r f5d868ff8884 -r d1b25a40ba7c Makefile.PL --- a/Makefile.PL Sun Feb 28 19:23:11 2010 +0000 +++ b/Makefile.PL Sun Feb 28 21:13:23 2010 +0000 @@ -2,69 +2,69 @@ use ExtUtils::MakeMaker; WriteMakefile( - NAME => 'W3C::LinkChecker', - ABSTRACT => 'W3C Link Checker', - AUTHOR => 'W3C QA-dev Team <public-qa-dev@w3.org>', - LICENSE => 'open_source', - VERSION_FROM => 'bin/checklink', - PREREQ_PM => { - # Hard dependencies: - HTML::Entities => 0, - HTML::Parser => 3.20, - HTTP::Request => 0, - HTTP::Response => 1.50, - LWP::RobotUA => 1.19, - LWP::UserAgent => 0, - Time::HiRes => 0, - URI => 1.31, - URI::Escape => 0, + NAME => 'W3C::LinkChecker', + ABSTRACT => 'W3C Link Checker', + AUTHOR => 'W3C QA-dev Team <public-qa-dev@w3.org>', + LICENSE => 'open_source', + VERSION_FROM => 'bin/checklink', + PREREQ_PM => { - # Optional, but required if using a config file: - Config::General => 2.06, + # Hard dependencies: + HTML::Entities => 0, + HTML::Parser => 3.20, + HTTP::Request => 0, + HTTP::Response => 1.50, + LWP::RobotUA => 1.19, + LWP::UserAgent => 0, + Time::HiRes => 0, + URI => 1.31, + URI::Escape => 0, - # Optional, but required if private IPs are disallowed: - Net::hostent => 0, - Net::IP => 0, - Socket => 0, + # Optional, but required if using a config file: + Config::General => 2.06, - # Optional, but required in command line mode: - File::Spec => 0, - Getopt::Long => 2.17, - Text::Wrap => 0, - URI::file => 0, + # Optional, but required if private IPs are disallowed: + Net::hostent => 0, + Net::IP => 0, + Socket => 0, - # Optional, used for password input in command line mode: - Term::ReadKey => 2.00, + # Optional, but required in command line mode: + File::Spec => 0, + Getopt::Long => 2.17, + Text::Wrap => 0, + URI::file => 0, - # Optional, used for guessing language in cmd line mode: - Locale::Country => 0, - Locale::Language => 0, + # Optional, used for password input in command line mode: + Term::ReadKey => 2.00, - # Optional, but required in CGI mode: - CGI => 0, - CGI::Carp => 0, - CGI::Cookie => 0, + # Optional, used for guessing language in cmd line mode: + Locale::Country => 0, + Locale::Language => 0, - # Optional, required if using cookies: - HTTP::Cookies => 0, + # Optional, but required in CGI mode: + CGI => 0, + CGI::Carp => 0, + CGI::Cookie => 0, - # Required for the test suite: - File::Spec => 0, - Test::More => 0, - }, - PM => { 'lib/W3C/LinkChecker.pm' => - '$(INST_LIB)/W3C/LinkChecker.pm' }, - EXE_FILES => [ 'bin/checklink' ], - MAN1PODS => { 'bin/checklink.pod' => - '$(INST_MAN1DIR)/checklink.$(MAN1EXT)', - }, - depend => { distdir => 'ChangeLog lib/W3C/LinkChecker.pm' }, - dist => { TARFLAGS => '--owner=0 --group=0 -cvf' }, - clean => { FILES => 'ChangeLog.bak' }, + # Optional, required if using cookies: + HTTP::Cookies => 0, + + # Required for the test suite: + File::Spec => 0, + Test::More => 0, + }, + PM => {'lib/W3C/LinkChecker.pm' => '$(INST_LIB)/W3C/LinkChecker.pm'}, + EXE_FILES => ['bin/checklink'], + MAN1PODS => + {'bin/checklink.pod' => '$(INST_MAN1DIR)/checklink.$(MAN1EXT)',}, + depend => {distdir => 'ChangeLog lib/W3C/LinkChecker.pm'}, + dist => {TARFLAGS => '--owner=0 --group=0 -cvf'}, + clean => {FILES => 'ChangeLog.bak'}, ); -sub MY::postamble { - return <<'MAKE_FRAG'; +sub MY::postamble +{ + return <<'MAKE_FRAG'; ChangeLog: bin/checklink bin/checklink.pod docs/checklink.html docs/linkchecker.css README t/00compile.t cvs2cl --FSF --utc --prune \ --ignore ChangeLog --ignore cvsignore --ignore SIGNATURE \ diff -r f5d868ff8884 -r d1b25a40ba7c bin/checklink --- a/bin/checklink Sun Feb 28 19:23:11 2010 +0000 +++ b/bin/checklink Sun Feb 28 21:13:23 2010 +0000 @@ -5,7 +5,7 @@ # (c) 1999-2010 World Wide Web Consortium # based on Renaud Bruyeron's checklink.pl # -# $Id: checklink,v 4.184 2010-02-28 19:23:11 ville Exp $ +# $Id: checklink,v 4.185 2010-02-28 21:13:23 ville Exp $ # # This program is licensed under the W3C(r) Software License: # http://www.w3.org/Consortium/Legal/copyright-software @@ -34,137 +34,144 @@ # http://www.mail-archive.com/cpan-testers-discuss%40perl.org/msg01064.html use Config qw(%Config); use lib map { /(.*)/ } - defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB}) : - defined($ENV{PERLLIB}) ? split(/$Config{path_sep}/, $ENV{PERLLIB}) : (); + defined($ENV{PERL5LIB}) ? split(/$Config{path_sep}/, $ENV{PERL5LIB}) : + defined($ENV{PERLLIB}) ? split(/$Config{path_sep}/, $ENV{PERLLIB}) : + (); # ----------------------------------------------------------------------------- package W3C::UserAgent; -use LWP::RobotUA 1.19 qw(); -use LWP::UserAgent qw(); +use LWP::RobotUA 1.19 qw(); +use LWP::UserAgent qw(); # if 0, ignore robots exclusion (useful for testing) use constant USE_ROBOT_UA => 1; if (USE_ROBOT_UA) { - @W3C::UserAgent::ISA = qw(LWP::RobotUA); -} else { - @W3C::UserAgent::ISA = qw(LWP::UserAgent); + @W3C::UserAgent::ISA = qw(LWP::RobotUA); +} +else { + @W3C::UserAgent::ISA = qw(LWP::UserAgent); } -BEGIN -{ - # The 4k default line length in LWP <= 5.832 isn't enough for example to - # accommodate 4kB cookies (RFC 2985); bump it (#6678). - require LWP::Protocol::http; - push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8*1024); +BEGIN { + + # The 4k default line length in LWP <= 5.832 isn't enough for example to + # accommodate 4kB cookies (RFC 2985); bump it (#6678). + require LWP::Protocol::http; + push(@LWP::Protocol::http::EXTRA_SOCK_OPTS, MaxLineLength => 8 * 1024); } sub new { - my $proto = shift; - my $class = ref($proto) || $proto; - my ($name, $from, $rules) = @_; + my $proto = shift; + my $class = ref($proto) || $proto; + my ($name, $from, $rules) = @_; - # For security/privacy reasons, if $from was not given, do not send it. - # Cheat by defining something for the constructor, and resetting it later. - my $from_ok = $from; - $from ||= 'www-validator@w3.org'; + # For security/privacy reasons, if $from was not given, do not send it. + # Cheat by defining something for the constructor, and resetting it later. + my $from_ok = $from; + $from ||= 'www-validator@w3.org'; - my $self; - if (USE_ROBOT_UA) { - $self = $class->SUPER::new($name, $from, $rules); - } else { - my %cnf; - @cnf{qw(agent from)} = ($name, $from); - $self = LWP::UserAgent->new(%cnf); - $self = bless $self, $class; - } + my $self; + if (USE_ROBOT_UA) { + $self = $class->SUPER::new($name, $from, $rules); + } + else { + my %cnf; + @cnf{qw(agent from)} = ($name, $from); + $self = LWP::UserAgent->new(%cnf); + $self = bless $self, $class; + } - $self->from(undef) unless $from_ok; + $self->from(undef) unless $from_ok; - $self->env_proxy(); + $self->env_proxy(); - $self->allow_private_ips(1); + $self->allow_private_ips(1); - # TODO: bug 29 - $self->protocols_forbidden([qw(mailto javascript)]); + # TODO: bug 29 + $self->protocols_forbidden([qw(mailto javascript)]); - return $self; + return $self; } sub allow_private_ips { - my $self = shift; - if (@_) { - $self->{Checklink_allow_private_ips} = shift; - if (!$self->{Checklink_allow_private_ips}) { - # Pull in dependencies - require Net::IP; - require Socket; - require Net::hostent; + my $self = shift; + if (@_) { + $self->{Checklink_allow_private_ips} = shift; + if (!$self->{Checklink_allow_private_ips}) { + + # Pull in dependencies + require Net::IP; + require Socket; + require Net::hostent; + } } - } - return $self->{Checklink_allow_private_ips}; + return $self->{Checklink_allow_private_ips}; } sub redirect_progress_callback { - my $self = shift; - $self->{Checklink_redirect_callback} = shift if @_; - return $self->{Checklink_redirect_callback}; + my $self = shift; + $self->{Checklink_redirect_callback} = shift if @_; + return $self->{Checklink_redirect_callback}; } sub simple_request { - my $self = shift; + my $self = shift; - my $response = $self->ip_disallowed($_[0]->uri()); + my $response = $self->ip_disallowed($_[0]->uri()); - # RFC 2616, section 15.1.3 - $_[0]->remove_header("Referer") - if ($_[0]->referer() && + # RFC 2616, section 15.1.3 + $_[0]->remove_header("Referer") + if ($_[0]->referer() && (secure_scheme($_[0]->referer()) && !secure_scheme($_[0]->uri()))); - $response ||= do { - local $SIG{__WARN__} = sub { # Suppress some warnings, rt.cpan.org #18902 - warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/); + $response ||= do { + local $SIG{__WARN__} = + sub { # Suppress some warnings, rt.cpan.org #18902 + warn($_[0]) if ($_[0] && $_[0] !~ /^RobotRules/); + }; + + # @@@ Why not just $self->SUPER::simple_request? + $self->W3C::UserAgent::SUPER::simple_request(@_); }; - # @@@ Why not just $self->SUPER::simple_request? - $self->W3C::UserAgent::SUPER::simple_request(@_); - }; - if (! defined($self->{FirstResponse})) { - $self->{FirstResponse} = $response->code(); - $self->{FirstMessage} = $response->message() || '(no message)'; - } + if (!defined($self->{FirstResponse})) { + $self->{FirstResponse} = $response->code(); + $self->{FirstMessage} = $response->message() || '(no message)'; + } - return $response; + return $response; } sub redirect_ok { - my ($self, $request, $response) = @_; + my ($self, $request, $response) = @_; - if (my $callback = $self->redirect_progress_callback()) { - # @@@ TODO: when an LWP internal robots.txt request gets redirected, this - # will a bit confusingly fire for it too. Would need a robust way to - # determine whether the request is such a LWP "internal robots.txt" one. - &$callback($request->method(), $request->uri()); - } + if (my $callback = $self->redirect_progress_callback()) { - return 0 unless $self->SUPER::redirect_ok($request, $response); + # @@@ TODO: when an LWP internal robots.txt request gets redirected, this + # will a bit confusingly fire for it too. Would need a robust way to + # determine whether the request is such a LWP "internal robots.txt" one. + &$callback($request->method(), $request->uri()); + } - if (my $res = $self->ip_disallowed($request->uri())) { - $response->previous($response->clone()); - $response->request($request); - $response->code($res->code()); - $response->message($res->message()); - return 0; - } + return 0 unless $self->SUPER::redirect_ok($request, $response); - return 1; + if (my $res = $self->ip_disallowed($request->uri())) { + $response->previous($response->clone()); + $response->request($request); + $response->code($res->code()); + $response->message($res->message()); + return 0; + } + + return 1; } # @@ -176,33 +183,34 @@ # sub ip_disallowed { - my ($self, $uri) = @_; - return 0 if $self->allow_private_ips(); # Short-circuit + my ($self, $uri) = @_; + return 0 if $self->allow_private_ips(); # Short-circuit - my $hostname = undef; - eval { $hostname = $uri->host() }; # Not all URIs implement host()... - return 0 unless $hostname; + my $hostname = undef; + eval { $hostname = $uri->host() }; # Not all URIs implement host()... + return 0 unless $hostname; - my $addr = my $iptype = my $resp = undef; - if (my $host = Net::hostent::gethostbyname($hostname)) { - $addr = Socket::inet_ntoa($host->addr()) if $host->addr(); - if ($addr && (my $ip = Net::IP->new($addr))) { - $iptype = $ip->iptype(); + my $addr = my $iptype = my $resp = undef; + if (my $host = Net::hostent::gethostbyname($hostname)) { + $addr = Socket::inet_ntoa($host->addr()) if $host->addr(); + if ($addr && (my $ip = Net::IP->new($addr))) { + $iptype = $ip->iptype(); + } } - } - if ($iptype && $iptype ne 'PUBLIC') { - $resp = HTTP::Response->new(403, - 'Checking non-public IP address disallowed by link checker configuration'); - $resp->header('Client-Warning', 'Internal response'); - } - return $resp; + if ($iptype && $iptype ne 'PUBLIC') { + $resp = HTTP::Response->new(403, + 'Checking non-public IP address disallowed by link checker configuration' + ); + $resp->header('Client-Warning', 'Internal response'); + } + return $resp; } sub secure_scheme { - my $uri = shift or return 0; - $uri = URI->new($uri) unless ref($uri); - return ($uri->scheme() =~ /^(?:file|https|ldaps|sips|snews|ssh)$/i); + my $uri = shift or return 0; + $uri = URI->new($uri) unless ref($uri); + return ($uri->scheme() =~ /^(?:file|https|ldaps|sips|snews|ssh)$/i); } # ----------------------------------------------------------------------------- @@ -210,16 +218,17 @@ package W3C::LinkChecker; use vars qw($AGENT $PACKAGE $PROGRAM $VERSION $REVISION - $DocType $Head $Accept $ContentTypes %Cfg); + $DocType $Head $Accept $ContentTypes %Cfg); -use HTML::Entities qw(); -use HTML::Parser 3.20 qw(); # >= 3.20 for "line" argspec identifier -use HTTP::Request qw(); -use HTTP::Response 1.50 qw(); # >= 1.50 for decoded_content() -use Time::HiRes qw(); -use URI 1.31 qw(); # >= 1.31 for sip: abs/rel -use URI::Escape qw(); -use URI::Heuristic qw(); +use HTML::Entities qw(); +use HTML::Parser 3.20 qw(); # >= 3.20 for "line" argspec identifier +use HTTP::Request qw(); +use HTTP::Response 1.50 qw(); # >= 1.50 for decoded_content() +use Time::HiRes qw(); +use URI 1.31 qw(); # >= 1.31 for sip: abs/rel +use URI::Escape qw(); +use URI::Heuristic qw(); + # @@@ Needs also W3C::UserAgent but can't use() it here. use constant RC_ROBOTS_TXT => -1; @@ -227,10 +236,10 @@ use constant RC_IP_DISALLOWED => -3; use constant RC_PROTOCOL_DISALLOWED => -4; -use constant LINE_UNKNOWN => -1; +use constant LINE_UNKNOWN => -1; use constant MP2 => - (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); + (exists($ENV{MOD_PERL_API_VERSION}) && $ENV{MOD_PERL_API_VERSION} >= 2); # Tag=>attribute mapping of things we treat as links. # Note: base/@href and meta/@http-equiv get special treatment, see start() @@ -242,27 +251,31 @@ blockquote => ['cite'], body => ['background'], del => ['cite'], - embed => ['href', 'pluginspage', 'pluginurl', 'src'], # proprietary + + # proprietary + embed => ['href', 'pluginspage', 'pluginurl', 'src'], + # form/@action not checked (side effects) - frame => ['longdesc', 'src'], - iframe => ['longdesc', 'src'], - img => ['longdesc', 'src'], - ins => ['cite'], + frame => ['longdesc', 'src'], + iframe => ['longdesc', 'src'], + img => ['longdesc', 'src'], + ins => ['cite'], + # input/@action not checked (side effects) - input => ['src'], - link => ['href'], - object => ['data'], - q => ['cite'], - script => ['src'], - source => ['src'], - video => ['src'], + input => ['src'], + link => ['href'], + object => ['data'], + q => ['cite'], + script => ['src'], + source => ['src'], + video => ['src'], }; # Tag=>attribute mapping of things we treat as space separated lists of links. use constant LINK_LIST_ATTRS => { - a => ['ping'], - area => ['ping'], - head => ['profile'], + a => ['ping'], + area => ['ping'], + head => ['profile'], }; # TBD/TODO: @@ -276,79 +289,80 @@ # - table,tr,td,th/@background? # - xmp/@href? -@W3C::LinkChecker::ISA = qw(HTML::Parser); +@W3C::LinkChecker::ISA = qw(HTML::Parser); -BEGIN -{ - # Version info - $PACKAGE = 'W3C Link Checker'; - $PROGRAM = 'W3C-checklink'; - $VERSION = '4.5'; - $REVISION = sprintf('version %s (c) 1999-2010 W3C', $VERSION); - my ($cvsver) = q$Revision: 4.184 $ =~ /(\d+[\d\.]*\.\d+)/; - $AGENT = sprintf('%s/%s [%s] %s', - $PROGRAM, $VERSION, $cvsver, - (W3C::UserAgent::USE_ROBOT_UA - ? LWP::RobotUA->_agent() - : LWP::UserAgent->_agent())); +BEGIN { - # Pull in mod_perl modules if applicable. - eval { - local $SIG{__DIE__} = undef; - require Apache2::RequestUtil; - } if MP2(); + # Version info + $PACKAGE = 'W3C Link Checker'; + $PROGRAM = 'W3C-checklink'; + $VERSION = '4.5'; + $REVISION = sprintf('version %s (c) 1999-2010 W3C', $VERSION); + my ($cvsver) = q$Revision: 4.185 $ =~ /(\d+[\d\.]*\.\d+)/; + $AGENT = sprintf( + '%s/%s [%s] %s', + $PROGRAM, $VERSION, $cvsver, + ( W3C::UserAgent::USE_ROBOT_UA ? LWP::RobotUA->_agent() : + LWP::UserAgent->_agent() + ) + ); - my @content_types = qw( - text/html - application/xhtml+xml;q=0.9 - application/vnd.wap.xhtml+xml;q=0.6 - ); - $Accept = join(', ', @content_types, '*/*;q=0.5'); - my $re = join('|', map { s/;.*// ; quotemeta } @content_types); - $ContentTypes = qr{\b(?:$re)\b}io; + # Pull in mod_perl modules if applicable. + eval { + local $SIG{__DIE__} = undef; + require Apache2::RequestUtil; + } if MP2(); - # - # Read configuration. If the W3C_CHECKLINK_CFG environment variable has - # been set or the default contains a non-empty file, read it. Otherwise, - # skip silently. - # - my $defaultconfig = '/etc/w3c/checklink.conf'; - if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) { + my @content_types = qw( + text/html + application/xhtml+xml;q=0.9 + application/vnd.wap.xhtml+xml;q=0.6 + ); + $Accept = join(', ', @content_types, '*/*;q=0.5'); + my $re = join('|', map { s/;.*//; quotemeta } @content_types); + $ContentTypes = qr{\b(?:$re)\b}io; - require Config::General; - Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy + # + # Read configuration. If the W3C_CHECKLINK_CFG environment variable has + # been set or the default contains a non-empty file, read it. Otherwise, + # skip silently. + # + my $defaultconfig = '/etc/w3c/checklink.conf'; + if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) { - my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig; - eval { - my %config_opts = - ( -ConfigFile => $conffile, - -SplitPolicy => 'equalsign', - -AllowMultiOptions => 'no', - ); - %Cfg = Config::General->new(%config_opts)->getall(); - }; - if ($@) { - die <<"EOF"; + require Config::General; + Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy + + my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig; + eval { + my %config_opts = ( + -ConfigFile => $conffile, + -SplitPolicy => 'equalsign', + -AllowMultiOptions => 'no', + ); + %Cfg = Config::General->new(%config_opts)->getall(); + }; + if ($@) { + die <<"EOF"; Failed to read configuration from '$conffile': $@ EOF + } } - } - $Cfg{Markup_Validator_URI} ||= - 'http://validator.w3.org/check?uri=%s'; - $Cfg{CSS_Validator_URI} ||= - 'http://jigsaw.w3.org/css-validator/validator?uri=%s'; - $Cfg{Doc_URI} ||= - 'http://validator.w3.org/docs/checklink.html'; + $Cfg{Markup_Validator_URI} ||= 'http://validator.w3.org/check?uri=%s'; + $Cfg{CSS_Validator_URI} ||= + 'http://jigsaw.w3.org/css-validator/validator?uri=%s'; + $Cfg{Doc_URI} ||= 'http://validator.w3.org/docs/checklink.html'; - # Untaint config params that are used as the format argument to (s)printf(), - # Perl 5.10 does not want to see that in taint mode. - ($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/); - ($Cfg{CSS_Validator_URI}) = ($Cfg{CSS_Validator_URI} =~ /^(.*)$/); + # Untaint config params that are used as the format argument to (s)printf(), + # Perl 5.10 does not want to see that in taint mode. + ($Cfg{Markup_Validator_URI}) = ($Cfg{Markup_Validator_URI} =~ /^(.*)$/); + ($Cfg{CSS_Validator_URI}) = ($Cfg{CSS_Validator_URI} =~ /^(.*)$/); - $DocType = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'; - my $css_url = URI->new_abs('linkchecker.css', $Cfg{Doc_URI}); - $Head = sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url); + $DocType = + '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">'; + my $css_url = URI->new_abs('linkchecker.css', $Cfg{Doc_URI}); + $Head = sprintf(<<'EOF', HTML::Entities::encode($AGENT), $css_url); <meta http-equiv="Content-Script-Type" content="text/javascript" /> <meta name="generator" content="%s" /> <link rel="stylesheet" type="text/css" href="%s" /> @@ -373,97 +387,103 @@ </script> EOF - # Trusted environment variables that need laundering in taint mode. - foreach (qw(NNTPSERVER NEWSHOST)) { - ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_}; - } + # Trusted environment variables that need laundering in taint mode. + foreach (qw(NNTPSERVER NEWSHOST)) { + ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_}; + } - # Use passive FTP by default, see Net::FTP(3). - $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); + # Use passive FTP by default, see Net::FTP(3). + $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE}); } # Autoflush $| = 1; # Different options specified by the user -my $cmdline = ! ($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/); -my %Opts = - ( Command_Line => $cmdline, - Quiet => 0, - Summary_Only => 0, - Verbose => 0, - Progress => 0, - HTML => 0, - Timeout => 30, - Redirects => 1, - Dir_Redirects => 1, - Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE}, - Cookies => undef, - No_Referer => 0, - Hide_Same_Realm => 0, - Depth => 0, # < 0 means unlimited recursion. - Sleep_Time => 1, - Max_Documents => 150, # For the online version. - User => undef, - Password => undef, - Base_Locations => [], - Exclude => undef, - Exclude_Docs => undef, - Suppress_Redirect => [], +my $cmdline = !($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/); +my %Opts = ( + Command_Line => $cmdline, + Quiet => 0, + Summary_Only => 0, + Verbose => 0, + Progress => 0, + HTML => 0, + Timeout => 30, + Redirects => 1, + Dir_Redirects => 1, + Accept_Language => $cmdline ? undef : $ENV{HTTP_ACCEPT_LANGUAGE}, + Cookies => undef, + No_Referer => 0, + Hide_Same_Realm => 0, + Depth => 0, # < 0 means unlimited recursion. + Sleep_Time => 1, + Max_Documents => 150, # For the online version. + User => undef, + Password => undef, + Base_Locations => [], + Exclude => undef, + Exclude_Docs => undef, + Suppress_Redirect => [], Suppress_Redirect_Prefix => [], Suppress_Redirect_Regexp => [], - Suppress_Temp_Redirects => 1, - Suppress_Broken => [], - Suppress_Fragment => [], - Masquerade => 0, - Masquerade_From => '', - Masquerade_To => '', - Trusted => $Cfg{Trusted}, + Suppress_Temp_Redirects => 1, + Suppress_Broken => [], + Suppress_Fragment => [], + Masquerade => 0, + Masquerade_From => '', + Masquerade_To => '', + Trusted => $Cfg{Trusted}, Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ? - $Cfg{Allow_Private_IPs} : $cmdline, - ); + $Cfg{Allow_Private_IPs} : + $cmdline, +); undef $cmdline; # Global variables # What URI's did we process? (used for recursive mode) my %processed; + # Result of the HTTP query my %results; + # List of redirects my %redirects; + # Count of the number of documents checked my $doc_count = 0; # Time stamp my $timestamp = &get_timestamp(); + # Per-document header; undefined if already printed. See print_doc_header(). my $doc_header; &parse_arguments() if $Opts{Command_Line}; -my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address +my $ua = W3C::UserAgent->new($AGENT); # @@@ TODO: admin address + # @@@ make number of keep-alive connections customizable -$ua->conn_cache({ total_capacity => 1}); # 1 keep-alive connection +$ua->conn_cache({total_capacity => 1}); # 1 keep-alive connection if ($ua->can('delay')) { - $ua->delay($Opts{Sleep_Time}/60); + $ua->delay($Opts{Sleep_Time} / 60); } $ua->timeout($Opts{Timeout}); + # Set up cookie stash if requested if (defined($Opts{Cookies})) { - require HTTP::Cookies; - my $cookie_file = $Opts{Cookies}; - if ($cookie_file eq 'tmp') { - $cookie_file = undef; - } elsif ($cookie_file =~ /^(.*)$/) { - $cookie_file = $1; # untaint - } - $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1)); + require HTTP::Cookies; + my $cookie_file = $Opts{Cookies}; + if ($cookie_file eq 'tmp') { + $cookie_file = undef; + } + elsif ($cookie_file =~ /^(.*)$/) { + $cookie_file = $1; # untaint + } + $ua->cookie_jar(HTTP::Cookies->new(file => $cookie_file, autosave => 1)); } -eval { - $ua->allow_private_ips($Opts{Allow_Private_IPs}); -}; +eval { $ua->allow_private_ips($Opts{Allow_Private_IPs}); }; if ($@) { - die <<"EOF"; + die <<"EOF"; Allow_Private_IPs is false; this feature requires the Net::IP, Socket, and Net::hostent modules: $@ @@ -472,167 +492,184 @@ if ($Opts{Command_Line}) { - require Text::Wrap; - Text::Wrap->import('wrap'); + require Text::Wrap; + Text::Wrap->import('wrap'); - require URI::file; + require URI::file; - &usage(1) unless scalar(@ARGV); + &usage(1) unless scalar(@ARGV); - $Opts{_Self_URI} = 'http://validator.w3.org/checklink'; # For HTML output + $Opts{_Self_URI} = 'http://validator.w3.org/checklink'; # For HTML output - &ask_password() if ($Opts{User} && !$Opts{Password}); + &ask_password() if ($Opts{User} && !$Opts{Password}); - if (!$Opts{Summary_Only}) { - printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML}; - } else { - $Opts{Verbose} = 0; - $Opts{Progress} = 0; - } + if (!$Opts{Summary_Only}) { + printf("%s %s\n", $PACKAGE, $REVISION) unless $Opts{HTML}; + } + else { + $Opts{Verbose} = 0; + $Opts{Progress} = 0; + } - # Populate data for print_form() - my %params = ( - summary => $Opts{Summary_Only}, - hide_redirects => !$Opts{Redirects}, - hide_type => $Opts{Dir_Redirects} ? 'dir' : 'all', - no_accept_language => !(defined($Opts{Accept_Language}) && - $Opts{Accept_Language} eq 'auto'), - no_referer => $Opts{No_Referer}, - recursive => ($Opts{Depth} != 0), - depth => $Opts{Depth}, - ); + # Populate data for print_form() + my %params = ( + summary => $Opts{Summary_Only}, + hide_redirects => !$Opts{Redirects}, + hide_type => $Opts{Dir_Redirects} ? 'dir' : 'all', + no_accept_language => !( + defined($Opts{Accept_Language}) && $Opts{Accept_Language} eq 'auto' + ), + no_referer => $Opts{No_Referer}, + recursive => ($Opts{Depth} != 0), + depth => $Opts{Depth}, + ); - my $check_num = 1; - my @bases = @{$Opts{Base_Locations}}; - foreach my $uri (@ARGV) { - # Reset base locations so that previous URI's given on the command line - # won't affect the recursion scope for this URI (see check_uri()) - @{$Opts{Base_Locations}} = @bases; - # Transform the parameter into a URI - $uri = &urize($uri); - $params{uri} = $uri; - &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1); - $check_num++; - } - undef $check_num; + my $check_num = 1; + my @bases = @{$Opts{Base_Locations}}; + foreach my $uri (@ARGV) { - if ($Opts{HTML}) { + # Reset base locations so that previous URI's given on the command line + # won't affect the recursion scope for this URI (see check_uri()) + @{$Opts{Base_Locations}} = @bases; + + # Transform the parameter into a URI + $uri = &urize($uri); + $params{uri} = $uri; + &check_uri(\%params, $uri, $check_num, $Opts{Depth}, undef, undef, 1); + $check_num++; + } + undef $check_num; + + if ($Opts{HTML}) { + &html_footer(); + } + elsif (($doc_count > 0) && !$Opts{Summary_Only}) { + printf("\n%s\n", &global_stats()); + } + +} +else { + + require CGI; + require CGI::Carp; + CGI::Carp->import(qw(fatalsToBrowser)); + require CGI::Cookie; + + # file: URIs are not allowed in CGI mode + my $forbidden = $ua->protocols_forbidden() || []; + push(@$forbidden, 'file'); + $ua->protocols_forbidden($forbidden); + + my $query = CGI->new(); + + # Set a few parameters in CGI mode + $Opts{Verbose} = 0; + $Opts{Progress} = 0; + $Opts{HTML} = 1; + $Opts{_Self_URI} = $query->url(-relative => 1); + + # Backwards compatibility + my $uri = undef; + if ($uri = $query->param('url')) { + $query->param('uri', $uri) unless $query->param('uri'); + $query->delete('url'); + } + $uri = $query->param('uri'); + + if (!$uri) { + &html_header('', 1); # Set cookie only from results page. + my %cookies = CGI::Cookie->fetch(); + &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1); + &html_footer(); + exit; + } + + # Backwards compatibility + if ($query->param('hide_dir_redirects')) { + $query->param('hide_redirects', 'on'); + $query->param('hide_type', 'dir'); + $query->delete('hide_dir_redirects'); + } + + $Opts{Summary_Only} = 1 if $query->param('summary'); + + if ($query->param('hide_redirects')) { + $Opts{Dir_Redirects} = 0; + if (my $type = $query->param('hide_type')) { + $Opts{Redirects} = 0 if ($type ne 'dir'); + } + else { + $Opts{Redirects} = 0; + } + } + + $Opts{Accept_Language} = undef if $query->param('no_accept_language'); + $Opts{No_Referer} = $query->param('no_referer'); + + $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0); + if (my $depth = $query->param('depth')) { + + # @@@ Ignore invalid depth silently for now. + $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/); + } + + # Save, clear or leave cookie as is. + my $cookie = undef; + if (my $action = $query->param('cookie')) { + if ($action eq 'clear') { + + # Clear the cookie. + $cookie = CGI::Cookie->new(-name => $PROGRAM); + $cookie->value({clear => 1}); + $cookie->expires('-1M'); + } + elsif ($action eq 'set') { + + # Set the options. + $cookie = CGI::Cookie->new(-name => $PROGRAM); + my %options = $query->Vars(); + delete($options{$_}) + for qw(url uri check cookie); # Non-persistent. + $cookie->value(\%options); + } + } + if (!$cookie) { + my %cookies = CGI::Cookie->fetch(); + $cookie = $cookies{$PROGRAM}; + } + + # Always refresh cookie expiration time. + $cookie->expires('+1M') if ($cookie && !$cookie->expires()); + + # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. + # If we're under mod_perl, there is a way around it... + eval { + local $SIG{__DIE__} = undef; + my $auth = + Apache2::RequestUtil->request()->headers_in()->{Authorization}; + $ENV{HTTP_AUTHORIZATION} = $auth if $auth; + } if (MP2() && !$ENV{HTTP_AUTHORIZATION}); + + $uri =~ s/^\s+//g; + if ($uri !~ m/:/) { + if ($uri =~ m|^//|) { + $uri = 'http:' . $uri; + } + else { + local $ENV{URL_GUESS_PATTERN} = ''; + my $guess = URI::Heuristic::uf_uri($uri); + if ($guess->scheme() && $ua->is_protocol_supported($guess)) { + $uri = $guess->as_string(); + } + else { + $uri = 'http://' . $uri; + } + } + } + + &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie); + undef $query; # Not needed any more. &html_footer(); - } elsif (($doc_count > 0) && !$Opts{Summary_Only}) { - printf("\n%s\n", &global_stats()); - } - -} else { - - require CGI; - require CGI::Carp; - CGI::Carp->import(qw(fatalsToBrowser)); - require CGI::Cookie; - - # file: URIs are not allowed in CGI mode - my $forbidden = $ua->protocols_forbidden() || []; - push(@$forbidden, 'file'); - $ua->protocols_forbidden($forbidden); - - my $query = CGI->new(); - # Set a few parameters in CGI mode - $Opts{Verbose} = 0; - $Opts{Progress} = 0; - $Opts{HTML} = 1; - $Opts{_Self_URI} = $query->url(-relative => 1); - - # Backwards compatibility - my $uri = undef; - if ($uri = $query->param('url')) { - $query->param('uri', $uri) unless $query->param('uri'); - $query->delete('url'); - } - $uri = $query->param('uri'); - - if (! $uri) { - &html_header('', 1); # Set cookie only from results page. - my %cookies = CGI::Cookie->fetch(); - &print_form(scalar($query->Vars()), $cookies{$PROGRAM}, 1); - &html_footer(); - exit; - } - - # Backwards compatibility - if ($query->param('hide_dir_redirects')) { - $query->param('hide_redirects', 'on'); - $query->param('hide_type', 'dir'); - $query->delete('hide_dir_redirects'); - } - - $Opts{Summary_Only} = 1 if $query->param('summary'); - - if ($query->param('hide_redirects')) { - $Opts{Dir_Redirects} = 0; - if (my $type = $query->param('hide_type')) { - $Opts{Redirects} = 0 if ($type ne 'dir'); - } else { - $Opts{Redirects} = 0; - } - } - - $Opts{Accept_Language} = undef if $query->param('no_accept_language'); - $Opts{No_Referer} = $query->param('no_referer'); - - $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0); - if (my $depth = $query->param('depth')) { - # @@@ Ignore invalid depth silently for now. - $Opts{Depth} = $1 if ($depth =~ /(-?\d+)/); - } - - # Save, clear or leave cookie as is. - my $cookie = undef; - if (my $action = $query->param('cookie')) { - if ($action eq 'clear') { - # Clear the cookie. - $cookie = CGI::Cookie->new(-name => $PROGRAM); - $cookie->value({ clear => 1 }); - $cookie->expires('-1M'); - } elsif ($action eq 'set') { - # Set the options. - $cookie = CGI::Cookie->new(-name => $PROGRAM); - my %options = $query->Vars(); - delete($options{$_}) for qw(url uri check cookie); # Non-persistent. - $cookie->value(\%options); - } - } - if (!$cookie) { - my %cookies = CGI::Cookie->fetch(); - $cookie = $cookies{$PROGRAM}; - } - # Always refresh cookie expiration time. - $cookie->expires('+1M') if ($cookie && !$cookie->expires()); - - # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts. - # If we're under mod_perl, there is a way around it... - eval { - local $SIG{__DIE__} = undef; - my $auth = Apache2::RequestUtil->request()->headers_in()->{Authorization}; - $ENV{HTTP_AUTHORIZATION} = $auth if $auth; - } if (MP2() && !$ENV{HTTP_AUTHORIZATION}); - - $uri =~ s/^\s+//g; - if ($uri !~ m/:/) { - if ($uri =~ m|^//|) { - $uri = 'http:'.$uri; - } else { - local $ENV{URL_GUESS_PATTERN} = ''; - my $guess = URI::Heuristic::uf_uri($uri); - if ($guess->scheme() && $ua->is_protocol_supported($guess)) { - $uri = $guess->as_string(); - } else { - $uri = 'http://'.$uri; - } - } - } - - &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie); - undef $query; # Not needed any more. - &html_footer(); } ############################################################################### @@ -644,144 +681,169 @@ sub parse_arguments () { - require Getopt::Long; - Getopt::Long->require_version(2.17); - Getopt::Long->import('GetOptions'); - Getopt::Long::Configure('bundling', 'no_ignore_case'); - my $masq = ''; - my @locs = (); + require Getopt::Long; + Getopt::Long->require_version(2.17); + Getopt::Long->import('GetOptions'); + Getopt::Long::Configure('bundling', 'no_ignore_case'); + my $masq = ''; + my @locs = (); - GetOptions('help|h|?' => sub { usage(0) }, - 'q|quiet' => sub { $Opts{Quiet} = 1; - $Opts{Summary_Only} = 1; - }, - 's|summary' => \$Opts{Summary_Only}, - 'b|broken' => sub { $Opts{Redirects} = 0; - $Opts{Dir_Redirects} = 0; - }, - 'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; }, - 'v|verbose' => \$Opts{Verbose}, - 'i|indicator' => \$Opts{Progress}, - 'H|html' => \$Opts{HTML}, - 'r|recursive' => sub { $Opts{Depth} = -1 - if $Opts{Depth} == 0; }, - 'l|location=s' => \@locs, - 'X|exclude=s' => \$Opts{Exclude}, - 'exclude-docs=s@' => \@{$Opts{Exclude_Docs}}, - 'suppress-redirect=s@' => \@{$Opts{Suppress_Redirect}}, - 'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}}, - 'suppress-temp-redirects' => \$Opts{Suppress_Temp_Redirects}, - 'suppress-broken=s@' => \@{$Opts{Suppress_Broken}}, - 'suppress-fragment=s@' => \@{$Opts{Suppress_Fragment}}, - 'u|user=s' => \$Opts{User}, - 'p|password=s' => \$Opts{Password}, - 't|timeout=i' => \$Opts{Timeout}, - 'S|sleep=i' => \$Opts{Sleep_Time}, - 'L|languages=s' => \$Opts{Accept_Language}, - 'c|cookies=s' => \$Opts{Cookies}, - 'R|no-referer' => \$Opts{No_Referer}, - 'D|depth=i' => sub { $Opts{Depth} = $_[1] - unless $_[1] == 0; }, - 'd|domain=s' => \$Opts{Trusted}, - 'masquerade=s' => \$masq, - 'hide-same-realm' => \$Opts{Hide_Same_Realm}, - 'V|version' => \&version, - ) - || usage(1); + GetOptions( + 'help|h|?' => sub { usage(0) }, + 'q|quiet' => sub { + $Opts{Quiet} = 1; + $Opts{Summary_Only} = 1; + }, + 's|summary' => \$Opts{Summary_Only}, + 'b|broken' => sub { + $Opts{Redirects} = 0; + $Opts{Dir_Redirects} = 0; + }, + 'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; }, + 'v|verbose' => \$Opts{Verbose}, + 'i|indicator' => \$Opts{Progress}, + 'H|html' => \$Opts{HTML}, + 'r|recursive' => sub { + $Opts{Depth} = -1 + if $Opts{Depth} == 0; + }, + 'l|location=s' => \@locs, + 'X|exclude=s' => \$Opts{Exclude}, + 'exclude-docs=s@' => \@{$Opts{Exclude_Docs}}, + 'suppress-redirect=s@' => \@{$Opts{Suppress_Redirect}}, + 'suppress-redirect-prefix=s@' => \@{$Opts{Suppress_Redirect_Prefix}}, + 'suppress-temp-redirects' => \$Opts{Suppress_Temp_Redirects}, + 'suppress-broken=s@' => \@{$Opts{Suppress_Broken}}, + 'suppress-fragment=s@' => \@{$Opts{Suppress_Fragment}}, + 'u|user=s' => \$Opts{User}, + 'p|password=s' => \$Opts{Password}, + 't|timeout=i' => \$Opts{Timeout}, + 'S|sleep=i' => \$Opts{Sleep_Time}, + 'L|languages=s' => \$Opts{Accept_Language}, + 'c|cookies=s' => \$Opts{Cookies}, + 'R|no-referer' => \$Opts{No_Referer}, + 'D|depth=i' => sub { + $Opts{Depth} = $_[1] + unless $_[1] == 0; + }, + 'd|domain=s' => \$Opts{Trusted}, + 'masquerade=s' => \$masq, + 'hide-same-realm' => \$Opts{Hide_Same_Realm}, + 'V|version' => \&version, + ) || + usage(1); - if ($masq) { - $Opts{Masquerade} = 1; - my @masq = split(/\s+/, $masq); - if (scalar(@masq) != 2 || - !defined($masq[0]) || $masq[0] !~ /\S/ || - !defined($masq[1]) || $masq[1] !~ /\S/) { - usage(1, "Error: --masquerade takes two whitespace separated URIs."); - } else { - require URI::file; - $Opts{Masquerade_From} = $masq[0]; - my $u = URI->new($masq[1]); - $Opts{Masquerade_To} = $u->scheme() ? $u : URI::file->new_abs($masq[1]); + if ($masq) { + $Opts{Masquerade} = 1; + my @masq = split(/\s+/, $masq); + if (scalar(@masq) != 2 || + !defined($masq[0]) || + $masq[0] !~ /\S/ || + !defined($masq[1]) || + $masq[1] !~ /\S/) + { + usage(1, + "Error: --masquerade takes two whitespace separated URIs."); + } + else { + require URI::file; + $Opts{Masquerade_From} = $masq[0]; + my $u = URI->new($masq[1]); + $Opts{Masquerade_To} = + $u->scheme() ? $u : URI::file->new_abs($masq[1]); + } } - } - if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') { - $Opts{Accept_Language} = &guess_language(); - } + if ($Opts{Accept_Language} && $Opts{Accept_Language} eq 'auto') { + $Opts{Accept_Language} = &guess_language(); + } - if (($Opts{Sleep_Time} || 0) < 1) { - warn("*** Warning: minimum allowed sleep time is 1 second, resetting.\n"); - $Opts{Sleep_Time} = 1; - } + if (($Opts{Sleep_Time} || 0) < 1) { + warn( + "*** Warning: minimum allowed sleep time is 1 second, resetting.\n" + ); + $Opts{Sleep_Time} = 1; + } - push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs); + push(@{$Opts{Base_Locations}}, map { URI->new($_)->canonical() } @locs); - $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs); + $Opts{Depth} = -1 if ($Opts{Depth} == 0 && @locs); - # Precompile/error-check regular expressions. - if (defined($Opts{Exclude})) { - eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; }; - &usage(1, "Error in exclude regexp: $@") if $@; - } - for my $i (0 .. $#{$Opts{Exclude_Docs}}) { - eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; }; - &usage(1, "Error in exclude-docs regexp: $@") if $@; - } - if (defined($Opts{Trusted})) { - eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; }; - &usage(1, "Error in trusted domains regexp: $@") if $@; - } + # Precompile/error-check regular expressions. + if (defined($Opts{Exclude})) { + eval { $Opts{Exclude} = qr/$Opts{Exclude}/o; }; + &usage(1, "Error in exclude regexp: $@") if $@; + } + for my $i (0 .. $#{$Opts{Exclude_Docs}}) { + eval { $Opts{Exclude_Docs}->[$i] = qr/$Opts{Exclude_Docs}->[$i]/; }; + &usage(1, "Error in exclude-docs regexp: $@") if $@; + } + if (defined($Opts{Trusted})) { + eval { $Opts{Trusted} = qr/$Opts{Trusted}/io; }; + &usage(1, "Error in trusted domains regexp: $@") if $@; + } - # Sanity-check error-suppression arguments - for my $i (0..$#{$Opts{Suppress_Redirect}}) { - ${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/; - my $sr_arg = ${$Opts{Suppress_Redirect}}[$i]; - if ($sr_arg !~ /.->./) { - &usage(1, "Bad suppress-redirect argument, should contain \"->\": $sr_arg"); + # Sanity-check error-suppression arguments + for my $i (0 .. $#{$Opts{Suppress_Redirect}}) { + ${$Opts{Suppress_Redirect}}[$i] =~ s/ /->/; + my $sr_arg = ${$Opts{Suppress_Redirect}}[$i]; + if ($sr_arg !~ /.->./) { + &usage(1, + "Bad suppress-redirect argument, should contain \"->\": $sr_arg" + ); + } } - } - for my $i (0..$#{$Opts{Suppress_Redirect_Prefix}}) { - my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i]; - $srp_arg =~ s/ /->/; - if ($srp_arg !~ /^(.*)->(.*)$/) { - &usage(1, "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg"); + for my $i (0 .. $#{$Opts{Suppress_Redirect_Prefix}}) { + my $srp_arg = ${$Opts{Suppress_Redirect_Prefix}}[$i]; + $srp_arg =~ s/ /->/; + if ($srp_arg !~ /^(.*)->(.*)$/) { + &usage(1, + "Bad suppress-redirect-prefix argument, should contain \"->\": $srp_arg" + ); + } + + # Turn prefixes into a regexp. + ${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism; } - # Turn prefixes into a regexp. - ${$Opts{Suppress_Redirect_Prefix}}[$i] = qr/^\Q$1\E(.*)->\Q$2\E\1$/ism; - } - for my $i (0..$#{$Opts{Suppress_Broken}}) { - ${$Opts{Suppress_Broken}}[$i] =~ s/ /:/; - my $sb_arg = ${$Opts{Suppress_Broken}}[$i]; - if ($sb_arg !~ /^(-1|[0-9]+):./) { - &usage(1, "Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg"); + for my $i (0 .. $#{$Opts{Suppress_Broken}}) { + ${$Opts{Suppress_Broken}}[$i] =~ s/ /:/; + my $sb_arg = ${$Opts{Suppress_Broken}}[$i]; + if ($sb_arg !~ /^(-1|[0-9]+):./) { + &usage(1, + "Bad suppress-broken argument, should be prefixed by a numeric response code: $sb_arg" + ); + } } - } - for my $sf_arg (@{$Opts{Suppress_Fragment}}) { - if ($sf_arg !~ /.#./) { - &usage(1, "Bad suppress-fragment argument, should contain \"#\": $sf_arg"); + for my $sf_arg (@{$Opts{Suppress_Fragment}}) { + if ($sf_arg !~ /.#./) { + &usage(1, + "Bad suppress-fragment argument, should contain \"#\": $sf_arg" + ); + } } - } - return; + return; } sub version () { - print "$PACKAGE $REVISION\n"; - exit 0; + print "$PACKAGE $REVISION\n"; + exit 0; } sub usage () { - my ($exitval, $msg) = @_; - $exitval = 0 unless defined($exitval); - $msg ||= ''; $msg =~ s/[\r\n]*$/\n\n/ if $msg; + my ($exitval, $msg) = @_; + $exitval = 0 unless defined($exitval); + $msg ||= ''; + $msg =~ s/[\r\n]*$/\n\n/ if $msg; - die($msg) unless $Opts{Command_Line}; + die($msg) unless $Opts{Command_Line}; - my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only'; + my $trust = defined($Cfg{Trusted}) ? $Cfg{Trusted} : 'same host only'; - select(STDERR) if $exitval; - print "$msg$PACKAGE $REVISION + select(STDERR) if $exitval; + print "$msg$PACKAGE $REVISION Usage: checklink <options> <uris> Options: @@ -854,28 +916,28 @@ www-validator\@w3.org (with 'checklink' in the subject) Archives are at: http://lists.w3.org/Archives/Public/www-validator/ "; - exit $exitval; + exit $exitval; } sub ask_password () { - eval { - local $SIG{__DIE__} = undef; - require Term::ReadKey; - Term::ReadKey->require_version(2.00); - Term::ReadKey->import(qw(ReadMode)); - }; - if ($@) { - warn('Warning: Term::ReadKey 2.00 or newer not available, ' . - "password input disabled.\n"); + eval { + local $SIG{__DIE__} = undef; + require Term::ReadKey; + Term::ReadKey->require_version(2.00); + Term::ReadKey->import(qw(ReadMode)); + }; + if ($@) { + warn('Warning: Term::ReadKey 2.00 or newer not available, ' . + "password input disabled.\n"); + return; + } + printf(STDERR 'Enter the password for user %s: ', $Opts{User}); + ReadMode('noecho', *STDIN); + chomp($Opts{Password} = <STDIN>); + ReadMode('restore', *STDIN); + print(STDERR "ok.\n"); return; - } - printf(STDERR 'Enter the password for user %s: ', $Opts{User}); - ReadMode('noecho', *STDIN); - chomp($Opts{Password} = <STDIN>); - ReadMode('restore', *STDIN); - print(STDERR "ok.\n"); - return; } ############################################################################### @@ -886,29 +948,29 @@ sub guess_language () { - my $lang = $ENV{LANG} or return; + my $lang = $ENV{LANG} or return; - $lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro... + $lang =~ s/[\.@].*$//; # en_US.UTF-8, fi_FI@euro... - return 'en' if ($lang eq 'C' || $lang eq 'POSIX'); + return 'en' if ($lang eq 'C' || $lang eq 'POSIX'); - my $res = undef; - eval { - require Locale::Language; - if (my $tmp = Locale::Language::language2code($lang)) { - $lang = $tmp; - } - if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) { - if (Locale::Language::code2language($l)) { - $res = $l; - if ($c) { - require Locale::Country; - $res .= "-$c" if Locale::Country::code2country($c); + my $res = undef; + eval { + require Locale::Language; + if (my $tmp = Locale::Language::language2code($lang)) { + $lang = $tmp; } - } - } - }; - return $res; + if (my ($l, $c) = (lc($lang) =~ /^([a-z]+)(?:[-_]([a-z]+))?/)) { + if (Locale::Language::code2language($l)) { + $res = $l; + if ($c) { + require Locale::Country; + $res .= "-$c" if Locale::Country::code2country($c); + } + } + } + }; + return $res; } ############################ @@ -917,34 +979,40 @@ sub urize ($) { - my $arg = shift; - my $uarg = URI::Escape::uri_unescape($arg); - my $uri; - if (-d $uarg) { - # look for an "index" file in dir, return it if found - require File::Spec; - for my $index (map { File::Spec->catfile($uarg, $_) } - qw(index.html index.xhtml index.htm index.xhtm)) { - if (-e $index) { - $uri = URI::file->new_abs($index); - last; - } + my $arg = shift; + my $uarg = URI::Escape::uri_unescape($arg); + my $uri; + if (-d $uarg) { + + # look for an "index" file in dir, return it if found + require File::Spec; + for my $index (map { File::Spec->catfile($uarg, $_) } + qw(index.html index.xhtml index.htm index.xhtm)) + { + if (-e $index) { + $uri = URI::file->new_abs($index); + last; + } + } + + # return dir itself if an index file was not found + $uri ||= URI::file->new_abs($uarg); } - # return dir itself if an index file was not found - $uri ||= URI::file->new_abs($uarg); - } elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) { - $uri = URI::file->new_abs($uarg); - } else { - my $newuri = URI->new($arg); - if ($newuri->scheme()) { - $uri = $newuri; - } else { - local $ENV{URL_GUESS_PATTERN} = ''; - $uri = URI::Heuristic::uf_uri($arg); - $uri = URI::file->new_abs($uri) unless $uri->scheme(); + elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) { + $uri = URI::file->new_abs($uarg); } - } - return $uri->as_string(); + else { + my $newuri = URI->new($arg); + if ($newuri->scheme()) { + $uri = $newuri; + } + else { + local $ENV{URL_GUESS_PATTERN} = ''; + $uri = URI::Heuristic::uf_uri($arg); + $uri = URI::file->new_abs($uri) unless $uri->scheme(); + } + } + return $uri->as_string(); } ######################################## @@ -953,83 +1021,90 @@ sub check_uri (\%$$$$;$$) { - my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_; - $is_start ||= ($check_num == 1); + my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_; + $is_start ||= ($check_num == 1); - my $start = $Opts{Summary_Only} ? 0 : &get_timestamp(); + my $start = $Opts{Summary_Only} ? 0 : &get_timestamp(); - # Get and parse the document - my $response = &get_document('GET', $uri, $doc_count, \%redirects, $referer, - $cookie, $params, $check_num, $is_start); + # Get and parse the document + my $response = &get_document( + 'GET', $uri, $doc_count, \%redirects, $referer, + $cookie, $params, $check_num, $is_start + ); - # Can we check the resource? If not, we exit here... - return if defined($response->{Stop}); + # Can we check the resource? If not, we exit here... + return if defined($response->{Stop}); - if ($Opts{HTML}) { - &html_header($uri, 0, $cookie) if ($check_num == 1); - &print_form($params, $cookie, $check_num) if $is_start; - } + if ($Opts{HTML}) { + &html_header($uri, 0, $cookie) if ($check_num == 1); + &print_form($params, $cookie, $check_num) if $is_start; + } - if ($is_start) { # Starting point of a new check, eg. from the command line - # Use the first URI as the recursion base unless specified otherwise. - push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical()) - unless @{$Opts{Base_Locations}}; - } else { - # Before fetching the document, we don't know if we'll be within the - # recursion scope or not (think redirects). - if (!&in_recursion_scope($response->{absolute_uri})) { - hprintf("Not in recursion scope: %s\n") - if ($Opts{Verbose}); - $response->content(""); - return; + if ($is_start) { # Starting point of a new check, eg. from the command line + # Use the first URI as the recursion base unless specified otherwise. + push(@{$Opts{Base_Locations}}, $response->{absolute_uri}->canonical()) + unless @{$Opts{Base_Locations}}; } - } + else { - # Define the document header, and perhaps print it. - # (It might still be defined if the previous document had no errors; - # just redefine it in that case.) + # Before fetching the document, we don't know if we'll be within the + # recursion scope or not (think redirects). + if (!&in_recursion_scope($response->{absolute_uri})) { + hprintf("Not in recursion scope: %s\n") + if ($Opts{Verbose}); + $response->content(""); + return; + } + } - if ($check_num != 1) { + # Define the document header, and perhaps print it. + # (It might still be defined if the previous document had no errors; + # just redefine it in that case.) + + if ($check_num != 1) { + if ($Opts{HTML}) { + $doc_header = "\n<hr />\n"; + } + else { + $doc_header = "\n" . ('-' x 40) . "\n"; + } + } + + my $absolute_uri = $response->{absolute_uri}->as_string(); + if ($Opts{HTML}) { - $doc_header = "\n<hr />\n"; - } else { - $doc_header = "\n" . ('-' x 40) . "\n"; + $doc_header .= + ("<h2>\nProcessing\t" . &show_url($absolute_uri) . "\n</h2>\n\n"); } - } + else { + $doc_header .= "\nProcessing\t$absolute_uri\n\n"; + } - my $absolute_uri = $response->{absolute_uri}->as_string(); + if (!$Opts{Quiet}) { + print_doc_header(); + } - if ($Opts{HTML}) { - $doc_header .= ("<h2>\nProcessing\t" - . &show_url($absolute_uri) - . "\n</h2>\n\n"); - } else { - $doc_header .= "\nProcessing\t$absolute_uri\n\n"; - } + # We are checking a new document + $doc_count++; - if (! $Opts{Quiet}) { - print_doc_header(); - } + my $result_anchor = 'results' . $doc_count; - # We are checking a new document - $doc_count++; - - my $result_anchor = 'results'.$doc_count; - - if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) { - my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; - my $acclang = $Opts{Accept_Language} || '(not sent)'; - my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; - my $cookies = 'not used'; - if (defined($Opts{Cookies})) { - $cookies = 'used, '; - if ($Opts{Cookies} eq 'tmp') { - $cookies .= 'non-persistent'; - } else { - $cookies .= "file $Opts{Cookies}"; - } - } - printf(<<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s); + if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) { + my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; + my $acclang = $Opts{Accept_Language} || '(not sent)'; + my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; + my $cookies = 'not used'; + if (defined($Opts{Cookies})) { + $cookies = 'used, '; + if ($Opts{Cookies} eq 'tmp') { + $cookies .= 'non-persistent'; + } + else { + $cookies .= "file $Opts{Cookies}"; + } + } + printf( + <<'EOF', $Accept, $acclang, $send_referer, $cookies, $Opts{Sleep_Time}, $s); Settings used: - Accept: %s @@ -1038,19 +1113,21 @@ - Cookies: %s - Sleeping %d second%s between requests to each server EOF - printf("- Excluding links matching %s\n", $Opts{Exclude}) - if defined($Opts{Exclude}); - printf("- Excluding links in documents whose URIs match %s\n", - join(', ', @{$Opts{Exclude_Docs}})) if @{$Opts{Exclude_Docs}}; - } + printf("- Excluding links matching %s\n", $Opts{Exclude}) + if defined($Opts{Exclude}); + printf("- Excluding links in documents whose URIs match %s\n", + join(', ', @{$Opts{Exclude_Docs}})) + if @{$Opts{Exclude_Docs}}; + } - if ($Opts{HTML}) { - if (! $Opts{Summary_Only}) { - my $accept = &encode($Accept); - my $acclang = &encode($Opts{Accept_Language} || '(not sent)'); - my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; - my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; - printf(<<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s); + if ($Opts{HTML}) { + if (!$Opts{Summary_Only}) { + my $accept = &encode($Accept); + my $acclang = &encode($Opts{Accept_Language} || '(not sent)'); + my $send_referer = $Opts{No_Referer} ? 'not sent' : 'sending'; + my $s = $Opts{Sleep_Time} == 1 ? '' : 's'; + printf( + <<'EOF', $accept, $acclang, $send_referer, $Opts{Sleep_Time}, $s); <div class="settings"> Settings used: <ul> @@ -1061,212 +1138,245 @@ </ul> </div> EOF - printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n", - $result_anchor); - my $esc_uri = URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9."); - printf("<p>For reliable link checking results, check + printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n", + $result_anchor); + my $esc_uri = + URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9."); + printf( + "<p>For reliable link checking results, check <a href=\"%s\">HTML validity</a> first. See also <a href=\"%s\">CSS validity</a>.</p> <p>Back to the <a accesskey=\"1\" href=\"%s\">link checker</a>.</p>\n", - &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)), - &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)), - &encode($Opts{_Self_URI})); + &encode(sprintf($Cfg{Markup_Validator_URI}, $esc_uri)), + &encode(sprintf($Cfg{CSS_Validator_URI}, $esc_uri)), + &encode($Opts{_Self_URI}) + ); - printf(<<'EOF', $result_anchor); + printf(<<'EOF', $result_anchor); <div class="progress" id="progress%s"> <h3>Status: <span></span></h3> <div class="progressbar"><div></div></div> <pre> EOF - } - } - - if ($Opts{Summary_Only} && !$Opts{Quiet}) { - print '<p>' if $Opts{HTML}; - print 'This may take some time'; - print "... (<a href=\"$Cfg{Doc_URI}#wait\">why?</a>)</p>" if $Opts{HTML}; - print " if the document has many links to check.\n" unless $Opts{HTML}; - } - # Record that we have processed this resource - $processed{$absolute_uri} = 1; - # Parse the document - my $p = &parse_document($uri, $absolute_uri, $response, 1, ($depth != 0)); - my $base = URI->new($p->{base}); - - # Check anchors - ############### - - print "Checking anchors...\n" unless $Opts{Summary_Only}; - - my %errors; - while (my ($anchor, $lines) = each(%{$p->{Anchors}})) { - if (!length($anchor)) { - # Empty IDREF's are not allowed - $errors{$anchor} = 1; - } else { - my $times = 0; - $times += $_ for values(%$lines); - # They should appear only once - $errors{$anchor} = 1 if ($times > 1); - } - } - print " done.\n" unless $Opts{Summary_Only}; - - # Check links - ############# - - &hprintf("Recording all the links found: %d\n", scalar (keys %{$p->{Links}})) - if ($Opts{Verbose}); - my %links; - # Record all the links found - while (my ($link, $lines) = each(%{$p->{Links}})) { - my $link_uri = URI->new($link); - my $abs_link_uri = URI->new_abs($link_uri, $base); - - if ($Opts{Masquerade}) { - if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) { - print_doc_header(); - printf("processing %s in base %s\n", - $abs_link_uri, $Opts{Masquerade_To}); - my $nlink = $abs_link_uri; - $nlink =~ - s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|; - $abs_link_uri = URI->new($nlink); - } + } } - my $canon_uri = URI->new($abs_link_uri->canonical()); - my $fragment = $canon_uri->fragment(undef); - if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) { - foreach my $line_num (keys(%$lines)) { - if (!defined($fragment) || !length($fragment)) { - # Document without fragment - $links{$canon_uri}{location}{$line_num} = 1; - } else { - # Resource with a fragment - $links{$canon_uri}{fragments}{$fragment}{$line_num} = 1; + if ($Opts{Summary_Only} && !$Opts{Quiet}) { + print '<p>' if $Opts{HTML}; + print 'This may take some time'; + print "... (<a href=\"$Cfg{Doc_URI}#wait\">why?</a>)</p>" + if $Opts{HTML}; + print " if the document has many links to check.\n" unless $Opts{HTML}; + } + + # Record that we have processed this resource + $processed{$absolute_uri} = 1; + + # Parse the document + my $p = &parse_document($uri, $absolute_uri, $response, 1, ($depth != 0)); + my $base = URI->new($p->{base}); + + # Check anchors + ############### + + print "Checking anchors...\n" unless $Opts{Summary_Only}; + + my %errors; + while (my ($anchor, $lines) = each(%{$p->{Anchors}})) { + if (!length($anchor)) { + + # Empty IDREF's are not allowed + $errors{$anchor} = 1; } - } + else { + my $times = 0; + $times += $_ for values(%$lines); + + # They should appear only once + $errors{$anchor} = 1 if ($times > 1); + } } - } + print " done.\n" unless $Opts{Summary_Only}; - # Build the list of broken URI's + # Check links + ############# - my $nlinks = scalar(keys(%links)); + &hprintf("Recording all the links found: %d\n", + scalar(keys %{$p->{Links}})) + if ($Opts{Verbose}); + my %links; - &hprintf("Checking %d links to build list of broken URI's\n", $nlinks) - if ($Opts{Verbose}); + # Record all the links found + while (my ($link, $lines) = each(%{$p->{Links}})) { + my $link_uri = URI->new($link); + my $abs_link_uri = URI->new_abs($link_uri, $base); - my %broken; - my $link_num = 0; - while (my ($u, $ulinks) = each(%links)) { + if ($Opts{Masquerade}) { + if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) { + print_doc_header(); + printf("processing %s in base %s\n", + $abs_link_uri, $Opts{Masquerade_To}); + my $nlink = $abs_link_uri; + $nlink =~ s|^\Q$Opts{Masquerade_From}\E|$Opts{Masquerade_To}|; + $abs_link_uri = URI->new($nlink); + } + } - if ($Opts{Summary_Only}) { - # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896 - print ' ' if ($Opts{HTML} && !$Opts{Command_Line}); - } else { - &hprintf("\nChecking link %s\n", $u); - my $progress = ($link_num/$nlinks) * 100; - printf('<script type="text/javascript">show_progress("%s", "Checking link %s", "%.1f%%");</script>', $result_anchor, &encode($u), $progress) - if (!$Opts{Command_Line} && $Opts{HTML} && !$Opts{Summary_Only}); + my $canon_uri = URI->new($abs_link_uri->canonical()); + my $fragment = $canon_uri->fragment(undef); + if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) { + foreach my $line_num (keys(%$lines)) { + if (!defined($fragment) || !length($fragment)) { + + # Document without fragment + $links{$canon_uri}{location}{$line_num} = 1; + } + else { + + # Resource with a fragment + $links{$canon_uri}{fragments}{$fragment}{$line_num} = 1; + } + } + } } - $link_num++; - # Check that a link is valid - &check_validity($uri, $u, - ($depth != 0 && &in_recursion_scope($u)), - \%links, \%redirects); - &hprintf("\tReturn code: %s\n", $results{$u}{location}{code}) - if ($Opts{Verbose}); - if ($results{$u}{location}{success}) { + # Build the list of broken URI's - # Even though it was not broken, we might want to display it - # on the results page (e.g. because it required authentication) - $broken{$u}{location} = 1 if ($results{$u}{location}{display} >= 400); + my $nlinks = scalar(keys(%links)); - # List the broken fragments - while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) { - if ($Opts{Verbose}) { - my @line_nums = sort { $a <=> $b } keys(%$lines); - &hprintf("\t\t%s %s - Line%s: %s\n", - $fragment, - ($results{$u}{fragments}{$fragment}) ? 'OK' : 'Not found', - (scalar(@line_nums) > 1) ? 's' : '', - join(', ', @line_nums) - ); + &hprintf("Checking %d links to build list of broken URI's\n", $nlinks) + if ($Opts{Verbose}); + + my %broken; + my $link_num = 0; + while (my ($u, $ulinks) = each(%links)) { + + if ($Opts{Summary_Only}) { + + # Hack: avoid browser/server timeouts in summary only CGI mode, bug 896 + print ' ' if ($Opts{HTML} && !$Opts{Command_Line}); } - # A broken fragment? - if ($results{$u}{fragments}{$fragment} == 0) { - $broken{$u}{fragments}{$fragment} += 2; + else { + &hprintf("\nChecking link %s\n", $u); + my $progress = ($link_num / $nlinks) * 100; + printf( + '<script type="text/javascript">show_progress("%s", "Checking link %s", "%.1f%%");</script>', + $result_anchor, &encode($u), $progress) + if (!$Opts{Command_Line} && + $Opts{HTML} && + !$Opts{Summary_Only}); } - } - } elsif (!($Opts{Quiet} && &informational($results{$u}{location}{code}))) { - # Couldn't find the document - $broken{$u}{location} = 1; - # All the fragments associated are hence broken - foreach my $fragment (keys %{$ulinks->{fragments}}) { - $broken{$u}{fragments}{$fragment}++; - } + $link_num++; + + # Check that a link is valid + &check_validity($uri, $u, ($depth != 0 && &in_recursion_scope($u)), + \%links, \%redirects); + &hprintf("\tReturn code: %s\n", $results{$u}{location}{code}) + if ($Opts{Verbose}); + if ($results{$u}{location}{success}) { + + # Even though it was not broken, we might want to display it + # on the results page (e.g. because it required authentication) + $broken{$u}{location} = 1 + if ($results{$u}{location}{display} >= 400); + + # List the broken fragments + while (my ($fragment, $lines) = each(%{$ulinks->{fragments}})) { + if ($Opts{Verbose}) { + my @line_nums = sort { $a <=> $b } keys(%$lines); + &hprintf( + "\t\t%s %s - Line%s: %s\n", + $fragment, + ($results{$u}{fragments}{$fragment}) ? 'OK' : + 'Not found', + (scalar(@line_nums) > 1) ? 's' : '', + join(', ', @line_nums) + ); + } + + # A broken fragment? + if ($results{$u}{fragments}{$fragment} == 0) { + $broken{$u}{fragments}{$fragment} += 2; + } + } + } + elsif (!($Opts{Quiet} && &informational($results{$u}{location}{code}))) + { + + # Couldn't find the document + $broken{$u}{location} = 1; + + # All the fragments associated are hence broken + foreach my $fragment (keys %{$ulinks->{fragments}}) { + $broken{$u}{fragments}{$fragment}++; + } + } } - } - &hprintf("\nProcessed in %s seconds.\n", &time_diff($start, &get_timestamp())) - unless $Opts{Summary_Only}; - printf('<script type="text/javascript">show_progress("%s", "Done. Document processed in %s seconds.", "100%%");</script>', $result_anchor, &time_diff($start, &get_timestamp())) - if ($Opts{HTML} && !$Opts{Summary_Only}); + &hprintf( + "\nProcessed in %s seconds.\n", + &time_diff($start, &get_timestamp()) + ) unless $Opts{Summary_Only}; + printf( + '<script type="text/javascript">show_progress("%s", "Done. Document processed in %s seconds.", "100%%");</script>', + $result_anchor, &time_diff($start, &get_timestamp())) + if ($Opts{HTML} && !$Opts{Summary_Only}); - # Display results - if ($Opts{HTML} && !$Opts{Summary_Only}) { - print("</pre>\n</div>\n"); - printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor); - } - print "\n" unless $Opts{Quiet}; + # Display results + if ($Opts{HTML} && !$Opts{Summary_Only}) { + print("</pre>\n</div>\n"); + printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor); + } + print "\n" unless $Opts{Quiet}; - &links_summary(\%links, \%results, \%broken, \%redirects); - &anchors_summary($p->{Anchors}, \%errors); + &links_summary(\%links, \%results, \%broken, \%redirects); + &anchors_summary($p->{Anchors}, \%errors); - # Do we want to process other documents? - if ($depth != 0) { + # Do we want to process other documents? + if ($depth != 0) { - foreach my $u (keys %links) { + foreach my $u (keys %links) { - next unless $results{$u}{location}{success}; # Broken link? + next unless $results{$u}{location}{success}; # Broken link? - next unless &in_recursion_scope($u); + next unless &in_recursion_scope($u); - # Do we understand its content type? - next unless ($results{$u}{location}{type} =~ $ContentTypes); + # Do we understand its content type? + next unless ($results{$u}{location}{type} =~ $ContentTypes); - # Have we already processed this URI? - next if &already_processed($u, $uri); + # Have we already processed this URI? + next if &already_processed($u, $uri); - # Do the job - print "\n" unless $Opts{Quiet}; - if ($Opts{HTML}) { - if (!$Opts{Command_Line}) { - if ($doc_count == $Opts{Max_Documents}) { - print("<hr />\n<p><strong>Maximum number of documents ($Opts{Max_Documents}) reached!</strong></p>\n"); - } - if ($doc_count >= $Opts{Max_Documents}) { - $doc_count++; - print("<p>Not checking <strong>$u</strong></p>\n"); - $processed{$u} = 1; - next; - } + # Do the job + print "\n" unless $Opts{Quiet}; + if ($Opts{HTML}) { + if (!$Opts{Command_Line}) { + if ($doc_count == $Opts{Max_Documents}) { + print( + "<hr />\n<p><strong>Maximum number of documents ($Opts{Max_Documents}) reached!</strong></p>\n" + ); + } + if ($doc_count >= $Opts{Max_Documents}) { + $doc_count++; + print("<p>Not checking <strong>$u</strong></p>\n"); + $processed{$u} = 1; + next; + } + } + } + + # This is an inherently recursive algorithm, so Perl's warning is not + # helpful. You may wish to comment this out when debugging, though. + no warnings 'recursion'; + + if ($depth < 0) { + &check_uri($params, $u, 0, -1, $cookie, $uri); + } + else { + &check_uri($params, $u, 0, $depth - 1, $cookie, $uri); + } } - } - - # This is an inherently recursive algorithm, so Perl's warning is not - # helpful. You may wish to comment this out when debugging, though. - no warnings 'recursion'; - - if ($depth < 0) { - &check_uri($params, $u, 0, -1, $cookie, $uri); - } else { - &check_uri($params, $u, 0, $depth-1, $cookie, $uri); - } } - } - return; + return; } ########################################## @@ -1275,29 +1385,34 @@ sub decode_content ($) { - my $response = shift; - my $error = undef; + my $response = shift; + my $error = undef; - my $docref = $response->decoded_content(ref => 1); - if (defined($docref)) { - utf8::encode($$docref); - $response->content_ref($docref); - # Remove Content-Encoding so it won't be decoded again later. - $response->remove_header('Content-Encoding') - } else { - my $ce = $response->header('Content-Encoding'); - $ce = defined($ce) ? "'$ce'" : 'undefined'; - my $ct = $response->header('Content-Type'); - $ct = defined($ct) ? "'$ct'" : 'undefined'; - my $request_uri = $response->request->url; - # content_charset() is available in LWP >= 5.827 - my $cs = $response->can('content_charset') ? - $response->content_charset() : undef; - $cs = defined($cs) ? "'$cs'" : 'unknown'; - $error = "Error decoding document at <$request_uri>, Content-Type $ct, " . - "Content-Encoding $ce, content charset $cs: '$@'"; - } - return $error; + my $docref = $response->decoded_content(ref => 1); + if (defined($docref)) { + utf8::encode($$docref); + $response->content_ref($docref); + + # Remove Content-Encoding so it won't be decoded again later. + $response->remove_header('Content-Encoding'); + } + else { + my $ce = $response->header('Content-Encoding'); + $ce = defined($ce) ? "'$ce'" : 'undefined'; + my $ct = $response->header('Content-Type'); + $ct = defined($ct) ? "'$ct'" : 'undefined'; + my $request_uri = $response->request->url; + + # content_charset() is available in LWP >= 5.827 + my $cs = + $response->can('content_charset') ? $response->content_charset() : + undef; + $cs = defined($cs) ? "'$cs'" : 'unknown'; + $error = + "Error decoding document at <$request_uri>, Content-Type $ct, " . + "Content-Encoding $ce, content charset $cs: '$@'"; + } + return $error; } ####################################### @@ -1306,80 +1421,89 @@ sub get_document ($$$;\%$$$$$) { - my ($method, $uri, $in_recursion, $redirects, $referer, $cookie, $params, - $check_num, $is_start) = @_; - # $method contains the HTTP method the use (GET or HEAD) - # $uri contains the identifier of the resource - # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least - # the second resource checked) - # $redirects is a pointer to the hash containing the map of the redirects - # $referer is the URI of the referring document - # $cookie, $params, $check_num, and $is_start are for printing HTTP headers - # and the form if $in_recursion == 0 and not authenticating + my ($method, $uri, $in_recursion, $redirects, $referer, + $cookie, $params, $check_num, $is_start + ) = @_; - # Get the resource - my $response; - if (defined($results{$uri}{response}) - && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { - $response = $results{$uri}{response}; - } else { - $response = &get_uri($method, $uri, $referer); - &record_results($uri, $method, $response); - &record_redirects($redirects, $response); - } - if (! $response->is_success()) { - if (! $in_recursion) { - # Is it too late to request authentication? - if ($response->code() == 401) { - &authentication($response, $cookie, $params, $check_num, $is_start); - } else { - if ($Opts{HTML}) { - &html_header($uri, 0, $cookie) if ($check_num == 1); - &print_form($params, $cookie, $check_num) if $is_start; - print "<p>", &status_icon($response->code()); + # $method contains the HTTP method the use (GET or HEAD) + # $uri contains the identifier of the resource + # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least + # the second resource checked) + # $redirects is a pointer to the hash containing the map of the redirects + # $referer is the URI of the referring document + # $cookie, $params, $check_num, and $is_start are for printing HTTP headers + # and the form if $in_recursion == 0 and not authenticating + + # Get the resource + my $response; + if (defined($results{$uri}{response}) && + !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) + { + $response = $results{$uri}{response}; + } + else { + $response = &get_uri($method, $uri, $referer); + &record_results($uri, $method, $response); + &record_redirects($redirects, $response); + } + if (!$response->is_success()) { + if (!$in_recursion) { + + # Is it too late to request authentication? + if ($response->code() == 401) { + &authentication($response, $cookie, $params, $check_num, + $is_start); + } + else { + if ($Opts{HTML}) { + &html_header($uri, 0, $cookie) if ($check_num == 1); + &print_form($params, $cookie, $check_num) if $is_start; + print "<p>", &status_icon($response->code()); + } + &hprintf("\nError: %d %s\n", + $response->code(), $response->message() || '(no message)'); + print "</p>\n" if $Opts{HTML}; + } } - &hprintf("\nError: %d %s\n", - $response->code(), $response->message() || '(no message)'); - print "</p>\n" if $Opts{HTML}; - } + $response->{Stop} = 1; + $response->content(""); + return ($response); } - $response->{Stop} = 1; - $response->content(""); - return($response); - } - # What is the URI of the resource that we are processing by the way? - my $base_uri = $response->base(); - my $request_uri = URI->new($response->request->url); - $response->{absolute_uri} = $request_uri->abs($base_uri); + # What is the URI of the resource that we are processing by the way? + my $base_uri = $response->base(); + my $request_uri = URI->new($response->request->url); + $response->{absolute_uri} = $request_uri->abs($base_uri); - # Can we parse the document? - my $failed_reason; - my $ct = $response->header('Content-Type'); - if (!$ct || $ct !~ $ContentTypes) { - $failed_reason = "Content-Type for <$request_uri> is " . - (defined($ct) ? "'$ct'" : 'undefined'); - } else { - $failed_reason = decode_content($response); - } - if ($failed_reason) { - # No, there is a problem... - if (! $in_recursion) { - if ($Opts{HTML}) { - &html_header($uri, 0, $cookie) if ($check_num == 1); - &print_form($params, $cookie, $check_num) if $is_start; - print "<p>", &status_icon(406); + # Can we parse the document? + my $failed_reason; + my $ct = $response->header('Content-Type'); + if (!$ct || $ct !~ $ContentTypes) { + $failed_reason = "Content-Type for <$request_uri> is " . + (defined($ct) ? "'$ct'" : 'undefined'); + } + else { + $failed_reason = decode_content($response); + } + if ($failed_reason) { - } - &hprintf("Can't check links: %s.\n", $failed_reason); - print "</p>\n" if $Opts{HTML}; + # No, there is a problem... + if (!$in_recursion) { + if ($Opts{HTML}) { + &html_header($uri, 0, $cookie) if ($check_num == 1); + &print_form($params, $cookie, $check_num) if $is_start; + print "<p>", &status_icon(406); + + } + &hprintf("Can't check links: %s.\n", $failed_reason); + print "</p>\n" if $Opts{HTML}; + } + $response->{Stop} = 1; + $response->content(""); } - $response->{Stop} = 1; - $response->content(""); - } - # Ok, return the information - return($response); + # Ok, return the information + return ($response); } ######################################################### @@ -1388,25 +1512,25 @@ sub in_recursion_scope ($) { - my ($uri) = @_; - return 0 unless $uri; + my ($uri) = @_; + return 0 unless $uri; - my $candidate = URI->new($uri)->canonical(); + my $candidate = URI->new($uri)->canonical(); - return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude}); + return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude}); - for my $excluded_doc (@{$Opts{Exclude_Docs}}) { - return 0 if ($candidate =~ $excluded_doc); - } + for my $excluded_doc (@{$Opts{Exclude_Docs}}) { + return 0 if ($candidate =~ $excluded_doc); + } - foreach my $base (@{$Opts{Base_Locations}}) { - my $rel = $candidate->rel($base); - next if ($candidate eq $rel); # Relative path not possible? - next if ($rel =~ m|^(\.\.)?/|); # Relative path upwards? - return 1; - } + foreach my $base (@{$Opts{Base_Locations}}) { + my $rel = $candidate->rel($base); + next if ($candidate eq $rel); # Relative path not possible? + next if ($rel =~ m|^(\.\.)?/|); # Relative path upwards? + return 1; + } - return 0; # We always have at least one base location, but none matched. + return 0; # We always have at least one base location, but none matched. } ################################################## @@ -1415,20 +1539,26 @@ sub already_processed ($$) { - my ($uri, $referer) = @_; - # Don't be verbose for that part... - my $summary_value = $Opts{Summary_Only}; - $Opts{Summary_Only} = 1; - # Do a GET: if it fails, we stop, if not, the results are cached - my $response = &get_document('GET', $uri, 1, undef, $referer); - # ... but just for that part - $Opts{Summary_Only} = $summary_value; - # Can we process the resource? - return -1 if defined($response->{Stop}); - # Have we already processed it? - return 1 if defined($processed{$response->{absolute_uri}->as_string()}); - # It's not processed yet and it is processable: return 0 - return 0; + my ($uri, $referer) = @_; + + # Don't be verbose for that part... + my $summary_value = $Opts{Summary_Only}; + $Opts{Summary_Only} = 1; + + # Do a GET: if it fails, we stop, if not, the results are cached + my $response = &get_document('GET', $uri, 1, undef, $referer); + + # ... but just for that part + $Opts{Summary_Only} = $summary_value; + + # Can we process the resource? + return -1 if defined($response->{Stop}); + + # Have we already processed it? + return 1 if defined($processed{$response->{absolute_uri}->as_string()}); + + # It's not processed yet and it is processable: return 0 + return 0; } ############################ @@ -1437,96 +1567,105 @@ sub get_uri ($$;$$\%$$$$) { - # Here we have a lot of extra parameters in order not to lose information - # if the function is called several times (401's) - my ($method, $uri, $referer, $start, $redirects, $code, $realm, $message, - $auth) = @_; - # $method contains the method used - # $uri contains the target of the request - # $referer is the URI of the referring document - # $start is a timestamp (not defined the first time the function is called) - # $redirects is a map of redirects - # $code is the first HTTP return code - # $realm is the realm of the request - # $message is the HTTP message received - # $auth equals 1 if we want to send out authentication information + # Here we have a lot of extra parameters in order not to lose information + # if the function is called several times (401's) + my ($method, $uri, $referer, $start, $redirects, + $code, $realm, $message, $auth + ) = @_; - # For timing purposes - $start = &get_timestamp() unless defined($start); + # $method contains the method used + # $uri contains the target of the request + # $referer is the URI of the referring document + # $start is a timestamp (not defined the first time the function is called) + # $redirects is a map of redirects + # $code is the first HTTP return code + # $realm is the realm of the request + # $message is the HTTP message received + # $auth equals 1 if we want to send out authentication information - # Prepare the query + # For timing purposes + $start = &get_timestamp() unless defined($start); - # Do we want printouts of progress? - my $verbose_progress = - ! ($Opts{Summary_Only} || (!$doc_count && $Opts{HTML})); + # Prepare the query - &hprintf("%s %s ", $method, $uri) if $verbose_progress; + # Do we want printouts of progress? + my $verbose_progress = + !($Opts{Summary_Only} || (!$doc_count && $Opts{HTML})); - my $request = HTTP::Request->new($method, $uri); + &hprintf("%s %s ", $method, $uri) if $verbose_progress; - $request->header('Accept-Language' => $Opts{Accept_Language}) - if $Opts{Accept_Language}; - $request->header('Accept', $Accept); - # accept_decodable() was added in LWP 5.814 - $request->accept_decodable() if $request->can('accept_decodable'); + my $request = HTTP::Request->new($method, $uri); - # Are we providing authentication info? - if ($auth && $request->url()->host() =~ $Opts{Trusted}) { - if (defined($ENV{HTTP_AUTHORIZATION})) { - $request->header(Authorization => $ENV{HTTP_AUTHORIZATION}); - } elsif (defined($Opts{User}) && defined($Opts{Password})) { - $request->authorization_basic($Opts{User}, $Opts{Password}); - } - } + $request->header('Accept-Language' => $Opts{Accept_Language}) + if $Opts{Accept_Language}; + $request->header('Accept', $Accept); - # Tell the user agent if we want progress reports for redirects or not. - $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); }) - if $verbose_progress; + # accept_decodable() was added in LWP 5.814 + $request->accept_decodable() if $request->can('accept_decodable'); - # Set referer - $request->referer($referer) if (!$Opts{No_Referer} && $referer); - - # Telling caches in the middle we want a fresh copy (Bug 4998) - $request->header(Cache_Control => "max-age=0"); - - # Do the query - my $response = $ua->request($request); - - # Get the results - # Record the very first response - if (! defined($code)) { - ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)}); - } - # Authentication requested? - if ($response->code() == 401 && - !defined($auth) && - (defined($ENV{HTTP_AUTHORIZATION}) - || (defined($Opts{User}) && defined($Opts{Password})))) { - - # Set host as trusted domain unless we already have one. - if (!$Opts{Trusted}) { - my $re = sprintf('^%s$', quotemeta($response->base()->host())); - $Opts{Trusted} = qr/$re/io; + # Are we providing authentication info? + if ($auth && $request->url()->host() =~ $Opts{Trusted}) { + if (defined($ENV{HTTP_AUTHORIZATION})) { + $request->header(Authorization => $ENV{HTTP_AUTHORIZATION}); + } + elsif (defined($Opts{User}) && defined($Opts{Password})) { + $request->authorization_basic($Opts{User}, $Opts{Password}); + } } - # Deal with authentication and avoid loops - if (!defined($realm) && - $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) { - $realm = $1; + # Tell the user agent if we want progress reports for redirects or not. + $ua->redirect_progress_callback(sub { &hprintf("\n-> %s %s ", @_); }) + if $verbose_progress; + + # Set referer + $request->referer($referer) if (!$Opts{No_Referer} && $referer); + + # Telling caches in the middle we want a fresh copy (Bug 4998) + $request->header(Cache_Control => "max-age=0"); + + # Do the query + my $response = $ua->request($request); + + # Get the results + # Record the very first response + if (!defined($code)) { + ($code, $message) = delete(@$ua{qw(FirstResponse FirstMessage)}); } - print "\n" if $verbose_progress; - return &get_uri($method, $response->request()->url(), $referer, - $start, $redirects, $code, $realm, $message, 1); - } - # @@@ subtract robot delay from the "fetched in" time? - &hprintf(" fetched in %s seconds\n", - &time_diff($start, &get_timestamp())) if $verbose_progress; + # Authentication requested? + if ($response->code() == 401 && + !defined($auth) && + (defined($ENV{HTTP_AUTHORIZATION}) || + (defined($Opts{User}) && defined($Opts{Password}))) + ) + { - $response->{Realm} = $realm if defined($realm); + # Set host as trusted domain unless we already have one. + if (!$Opts{Trusted}) { + my $re = sprintf('^%s$', quotemeta($response->base()->host())); + $Opts{Trusted} = qr/$re/io; + } - return $response; + # Deal with authentication and avoid loops + if (!defined($realm) && + $response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) + { + $realm = $1; + } + + print "\n" if $verbose_progress; + return &get_uri($method, $response->request()->url(), + $referer, $start, $redirects, $code, $realm, $message, 1); + } + + # @@@ subtract robot delay from the "fetched in" time? + &hprintf(" fetched in %s seconds\n", &time_diff($start, &get_timestamp())) + if $verbose_progress; + + $response->{Realm} = $realm if defined($realm); + + return $response; } ######################################### @@ -1535,65 +1674,72 @@ sub record_results ($$$) { - my ($uri, $method, $response) = @_; - $results{$uri}{response} = $response; - $results{$uri}{method} = $method; - $results{$uri}{location}{code} = $response->code(); - $results{$uri}{location}{code} = RC_ROBOTS_TXT() - if ($results{$uri}{location}{code} == 403 && + my ($uri, $method, $response) = @_; + $results{$uri}{response} = $response; + $results{$uri}{method} = $method; + $results{$uri}{location}{code} = $response->code(); + $results{$uri}{location}{code} = RC_ROBOTS_TXT() + if ($results{$uri}{location}{code} == 403 && $response->message() =~ /Forbidden by robots\.txt/); - $results{$uri}{location}{code} = RC_IP_DISALLOWED() - if ($results{$uri}{location}{code} == 403 && + $results{$uri}{location}{code} = RC_IP_DISALLOWED() + if ($results{$uri}{location}{code} == 403 && $response->message() =~ /non-public IP/); - $results{$uri}{location}{code} = RC_DNS_ERROR() - if ($results{$uri}{location}{code} == 500 && + $results{$uri}{location}{code} = RC_DNS_ERROR() + if ($results{$uri}{location}{code} == 500 && $response->message() =~ /Bad hostname '[^\']*'/); - $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED() - if ($results{$uri}{location}{code} == 500 && + $results{$uri}{location}{code} = RC_PROTOCOL_DISALLOWED() + if ($results{$uri}{location}{code} == 500 && $response->message() =~ /Access to '[^\']*' URIs has been disabled/); - $results{$uri}{location}{type} = $response->header('Content-type'); - $results{$uri}{location}{display} = $results{$uri}{location}{code}; - # Rewind, check for the original code and message. - for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) { - $results{$uri}{location}{orig} = $tmp->code(); - $results{$uri}{location}{orig_message} = $tmp->message() || '(no message)'; - } - $results{$uri}{location}{success} = $response->is_success(); + $results{$uri}{location}{type} = $response->header('Content-type'); + $results{$uri}{location}{display} = $results{$uri}{location}{code}; - # If a suppressed broken link, fill the data structure like a typical success. - # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n"; - if (! $results{$uri}{location}{success}) { - my $code = $results{$uri}{location}{code}; - my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}}; - if ($match) { - $results{$uri}{location}{success} = 1; - $results{$uri}{location}{code} = 100; - $results{$uri}{location}{display} = 100; + # Rewind, check for the original code and message. + for (my $tmp = $response->previous(); $tmp; $tmp = $tmp->previous()) { + $results{$uri}{location}{orig} = $tmp->code(); + $results{$uri}{location}{orig_message} = $tmp->message() || + '(no message)'; } - } + $results{$uri}{location}{success} = $response->is_success(); - # Stores the authentication information - if (defined($response->{Realm})) { - $results{$uri}{location}{realm} = $response->{Realm}; - $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm}; - } - # What type of broken link is it? (stored in {record} - the {display} - # information is just for visual use only) - if (($results{$uri}{location}{display} == 401) - && ($results{$uri}{location}{code} == 404)) { - $results{$uri}{location}{record} = 404; - } else { - $results{$uri}{location}{record} = $results{$uri}{location}{display}; - } - # Did it fail? - $results{$uri}{location}{message} = $response->message() || '(no message)'; - if (! $results{$uri}{location}{success}) { - &hprintf("Error: %d %s\n", - $results{$uri}{location}{code}, - $results{$uri}{location}{message}) - if ($Opts{Verbose}); - } - return; + # If a suppressed broken link, fill the data structure like a typical success. + # print STDERR "success? " . $results{$uri}{location}{success} . ": $uri\n"; + if (!$results{$uri}{location}{success}) { + my $code = $results{$uri}{location}{code}; + my $match = grep { $_ eq "$code:$uri" } @{$Opts{Suppress_Broken}}; + if ($match) { + $results{$uri}{location}{success} = 1; + $results{$uri}{location}{code} = 100; + $results{$uri}{location}{display} = 100; + } + } + + # Stores the authentication information + if (defined($response->{Realm})) { + $results{$uri}{location}{realm} = $response->{Realm}; + $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm}; + } + + # What type of broken link is it? (stored in {record} - the {display} + # information is just for visual use only) + if (($results{$uri}{location}{display} == 401) && + ($results{$uri}{location}{code} == 404)) + { + $results{$uri}{location}{record} = 404; + } + else { + $results{$uri}{location}{record} = $results{$uri}{location}{display}; + } + + # Did it fail? + $results{$uri}{location}{message} = $response->message() || '(no message)'; + if (!$results{$uri}{location}{success}) { + &hprintf( + "Error: %d %s\n", + $results{$uri}{location}{code}, + $results{$uri}{location}{message} + ) if ($Opts{Verbose}); + } + return; } #################### @@ -1602,66 +1748,68 @@ sub parse_document ($$$$$) { - my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_; + my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_; - print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n") - if $Opts{Verbose}; + print("parse_document($uri, $base_uri, ..., $links, $rec_needs_links)\n") + if $Opts{Verbose}; - my $p; + my $p; - if (defined($results{$uri}{parsing})) { - # We have already done the job. Woohoo! - $p->{base} = $results{$uri}{parsing}{base}; - $p->{Anchors} = $results{$uri}{parsing}{Anchors}; - $p->{Links} = $results{$uri}{parsing}{Links}; + if (defined($results{$uri}{parsing})) { + + # We have already done the job. Woohoo! + $p->{base} = $results{$uri}{parsing}{base}; + $p->{Anchors} = $results{$uri}{parsing}{Anchors}; + $p->{Links} = $results{$uri}{parsing}{Links}; + return $p; + } + + my $start; + $p = W3C::LinkChecker->new(); + $p->{base} = $base_uri; + if (!$Opts{Summary_Only}) { + $start = &get_timestamp(); + print("Parsing...\n"); + } + + # Content-Encoding etc already decoded in get_document(). + my $docref = $response->content_ref(); + + # Count lines beforehand if needed for progress indicator. In all cases, + # the actual final number of lines processed shown is populated by our + # end_document handler. + $p->{Total} = ($$docref =~ tr/\n//) if $Opts{Progress}; + + # We only look for anchors if we are not interested in the links + # obviously, or if we are running a recursive checking because we + # might need this information later + $p->{only_anchors} = !($links || $rec_needs_links); + + # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing + # Processing instructions are not parsed by process, but in this case + # it should be. It's expensive, it's horrible, but it's the easiest way + # for right now. + $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ + unless $p->{only_anchors}; + + $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/); + + $p->parse($$docref)->eof(); + $response->content(""); + + if (!$Opts{Summary_Only}) { + my $stop = &get_timestamp(); + print "\r" if $Opts{Progress}; + &hprintf(" done (%d lines in %s seconds).\n", + $p->{Total}, &time_diff($start, $stop)); + } + + # Save the results before exiting + $results{$uri}{parsing}{base} = $p->{base}; + $results{$uri}{parsing}{Anchors} = $p->{Anchors}; + $results{$uri}{parsing}{Links} = $p->{Links}; + return $p; - } - - my $start; - $p = W3C::LinkChecker->new(); - $p->{base} = $base_uri; - if (! $Opts{Summary_Only}) { - $start = &get_timestamp(); - print("Parsing...\n"); - } - - # Content-Encoding etc already decoded in get_document(). - my $docref = $response->content_ref(); - - # Count lines beforehand if needed for progress indicator. In all cases, - # the actual final number of lines processed shown is populated by our - # end_document handler. - $p->{Total} = ($$docref =~ tr/\n//) if $Opts{Progress}; - - # We only look for anchors if we are not interested in the links - # obviously, or if we are running a recursive checking because we - # might need this information later - $p->{only_anchors} = !($links || $rec_needs_links); - - # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing - # Processing instructions are not parsed by process, but in this case - # it should be. It's expensive, it's horrible, but it's the easiest way - # for right now. - $$docref =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors}; - - $p->xml_mode(1) if ($response->content_type() =~ /\+xml$/); - - $p->parse($$docref)->eof(); - $response->content(""); - - if (! $Opts{Summary_Only}) { - my $stop = &get_timestamp(); - print "\r" if $Opts{Progress}; - &hprintf(" done (%d lines in %s seconds).\n", - $p->{Total}, &time_diff($start, $stop)); - } - - # Save the results before exiting - $results{$uri}{parsing}{base} = $p->{base}; - $results{$uri}{parsing}{Anchors} = $p->{Anchors}; - $results{$uri}{parsing}{Links} = $p->{Links}; - - return $p; } #################################### @@ -1670,31 +1818,35 @@ sub new { - my $p = HTML::Parser::new(@_, api_version => 3); - $p->utf8_mode(1); + my $p = HTML::Parser::new(@_, api_version => 3); + $p->utf8_mode(1); - # Set up handlers + # Set up handlers - $p->handler(start => 'start', 'self, tagname, attr, line'); - $p->handler(declaration => - sub { - my $self = shift; - $self->declaration(substr($_[0], 2, -1)); - }, 'self, text, line'); - $p->handler(end_document => 'end_document', 'self, line'); - if ($Opts{Progress}) { - $p->handler(default => 'parse_progress', 'self, line'); - $p->{last_percentage} = 0; - } + $p->handler(start => 'start', 'self, tagname, attr, line'); + $p->handler( + declaration => sub { + my $self = shift; + $self->declaration(substr($_[0], 2, -1)); + }, + 'self, text, line' + ); + $p->handler(end_document => 'end_document', 'self, line'); + if ($Opts{Progress}) { + $p->handler(default => 'parse_progress', 'self, line'); + $p->{last_percentage} = 0; + } - # Check <a [..] name="...">? - $p->{check_name} = 1; - # Check <[..] id="..">? - $p->{check_id} = 1; - # Don't interpret comment loosely - $p->strict_comment(1); + # Check <a [..] name="...">? + $p->{check_name} = 1; - return $p; + # Check <[..] id="..">? + $p->{check_id} = 1; + + # Don't interpret comment loosely + $p->strict_comment(1); + + return $p; } ################################################# @@ -1703,24 +1855,24 @@ sub doctype { - my ($self, $dc) = @_; - return $self->{doctype} unless $dc; - $_ = $self->{doctype} = $dc; + my ($self, $dc) = @_; + return $self->{doctype} unless $dc; + $_ = $self->{doctype} = $dc; - # What to look for depending on the doctype + # What to look for depending on the doctype - # Check for <a name="...">? - $self->{check_name} = 0 - if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %; + # Check for <a name="...">? + $self->{check_name} = 0 + if m%^-//(W3C|WAPFORUM)//DTD XHTML (Basic|Mobile) %; - # Check for <* id="...">? - $self->{check_id} = 0 - if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%); + # Check for <* id="...">? + $self->{check_id} = 0 + if (m%^-//IETF//DTD HTML [23]\.0//% || m%^-//W3C//DTD HTML 3\.2//%); - # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...) - $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%); + # Enable XML mode (XHTML, XHTML Mobile, XHTML-Print, XHTML+RDFa, ...) + $self->xml_mode(1) if (m%^-//(W3C|WAPFORUM)//DTD XHTML[ \-\+]%); - return; + return; } ################################### @@ -1729,16 +1881,16 @@ sub parse_progress { - my ($self, $line) = @_; - return unless defined($line) && $line > 0 && $self->{Total} > 0; + my ($self, $line) = @_; + return unless defined($line) && $line > 0 && $self->{Total} > 0; - my $percentage = int($line/$self->{Total}*100); - if ($percentage != $self->{last_percentage}) { - printf("\r%4d%%", $percentage); - $self->{last_percentage} = $percentage; - } + my $percentage = int($line / $self->{Total} * 100); + if ($percentage != $self->{last_percentage}) { + printf("\r%4d%%", $percentage); + $self->{last_percentage} = $percentage; + } - return; + return; } ############################# @@ -1747,18 +1899,19 @@ sub get_anchor { - my ($self, $tag, $attr) = @_; + my ($self, $tag, $attr) = @_; - my $anchor = $self->{check_id} ? $attr->{id} : undef; - if ($self->{check_name} && ($tag eq 'a')) { - # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory - # Force an error if it's not the case (or if id's and name's values - # are different) - # If id is defined, name if defined must have the same value - $anchor ||= $attr->{name}; - } + my $anchor = $self->{check_id} ? $attr->{id} : undef; + if ($self->{check_name} && ($tag eq 'a')) { - return $anchor; + # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory + # Force an error if it's not the case (or if id's and name's values + # are different) + # If id is defined, name if defined must have the same value + $anchor ||= $attr->{name}; + } + + return $anchor; } ############################# @@ -1767,116 +1920,132 @@ sub add_link { - my ($self, $uri, $base, $line) = @_; - if (defined($uri)) { - # Remove repeated slashes after the . or .. in relative links, to avoid - # duplicated checking or infinite recursion. - $uri =~ s|^(\.\.?/)/+|$1|o; - $uri = URI->new_abs($uri, $base) if defined($base); - $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++; - } - return; + my ($self, $uri, $base, $line) = @_; + if (defined($uri)) { + + # Remove repeated slashes after the . or .. in relative links, to avoid + # duplicated checking or infinite recursion. + $uri =~ s|^(\.\.?/)/+|$1|o; + $uri = URI->new_abs($uri, $base) if defined($base); + $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++; + } + return; } sub start { - my ($self, $tag, $attr, $line) = @_; - $line = LINE_UNKNOWN() unless defined($line); + my ($self, $tag, $attr, $line) = @_; + $line = LINE_UNKNOWN() unless defined($line); - # Anchors - my $anchor = $self->get_anchor($tag, $attr); - $self->{Anchors}{$anchor}{$line}++ if defined($anchor); + # Anchors + my $anchor = $self->get_anchor($tag, $attr); + $self->{Anchors}{$anchor}{$line}++ if defined($anchor); - # Links - if (!$self->{only_anchors}) { + # Links + if (!$self->{only_anchors}) { - my $tag_local_base = undef; + my $tag_local_base = undef; - # Special case: base/@href - # TODO: This should go away as soon as LWP::Protocol::collect() invokes - # HTML::HeadParser (thus taking care of it in $response->base() - # transparently) for application/xhtml+xml and - # application/vnd.wap.xhtml+xml documents - # --> it does in LWP >= 5.810 - if ($tag eq 'base') { - # Treat <base> (without href) or <base href=""> as if it didn't exist. - if (defined($attr->{href}) && length($attr->{href})) { - $self->{base} = $attr->{href}; - } - # Note: base/@href intentionally not treated as a dereferenceable link: - # http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi - } - # Special case: meta[@http-equiv=Refresh]/@content - elsif ($tag eq 'meta') { - if ($attr->{'http-equiv'} && lc($attr->{'http-equiv'}) eq 'refresh') { - my $content = $attr->{content}; - if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) { - $self->add_link($1, undef, $line); + # Special case: base/@href + # TODO: This should go away as soon as LWP::Protocol::collect() invokes + # HTML::HeadParser (thus taking care of it in $response->base() + # transparently) for application/xhtml+xml and + # application/vnd.wap.xhtml+xml documents + # --> it does in LWP >= 5.810 + if ($tag eq 'base') { + + # Treat <base> (without href) or <base href=""> as if it didn't exist. + if (defined($attr->{href}) && length($attr->{href})) { + $self->{base} = $attr->{href}; + } + + # Note: base/@href intentionally not treated as a dereferenceable link: + # http://www.w3.org/mid/200802091439.27764.ville.skytta%40iki.fi } - } - } - # Special case: tags that have "local base" - elsif ($tag eq 'applet' || $tag eq 'object') { - if (my $codebase = $attr->{codebase}) { - # TODO: HTML 4 spec says applet/@codebase may only point to subdirs of - # the directory containing the current document. Should we do - # something about that? - $tag_local_base = URI->new_abs($codebase, $self->{base}); - } + + # Special case: meta[@http-equiv=Refresh]/@content + elsif ($tag eq 'meta') { + if ($attr->{'http-equiv'} && + lc($attr->{'http-equiv'}) eq 'refresh') + { + my $content = $attr->{content}; + if ($content && $content =~ /.*?;\s*(?:url=)?(.+)/i) { + $self->add_link($1, undef, $line); + } + } + } + + # Special case: tags that have "local base" + elsif ($tag eq 'applet' || $tag eq 'object') { + if (my $codebase = $attr->{codebase}) { + + # TODO: HTML 4 spec says applet/@codebase may only point to subdirs of + # the directory containing the current document. Should we do + # something about that? + $tag_local_base = URI->new_abs($codebase, $self->{base}); + } + } + + # Link attributes: + if (my $link_attrs = LINK_ATTRS()->{$tag}) { + for my $la (@$link_attrs) { + $self->add_link($attr->{$la}, $tag_local_base, $line); + } + } + + # List of links attributes: + if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) { + for my $la (@$link_attrs) { + if (defined(my $value = $attr->{$la})) { + for my $link (split(/\s+/, $value)) { + $self->add_link($link, $tag_local_base, $line); + } + } + } + } } - # Link attributes: - if (my $link_attrs = LINK_ATTRS()->{$tag}) { - for my $la (@$link_attrs) { - $self->add_link($attr->{$la}, $tag_local_base, $line); - } - } + $self->parse_progress($line) if $Opts{Progress}; - # List of links attributes: - if (my $link_attrs = LINK_LIST_ATTRS()->{$tag}) { - for my $la (@$link_attrs) { - if (defined(my $value = $attr->{$la})) { - for my $link (split(/\s+/, $value)) { - $self->add_link($link, $tag_local_base, $line); - } - } - } - } - } - - $self->parse_progress($line) if $Opts{Progress}; - - return; + return; } sub declaration { - my ($self, $text, $line) = @_; + my ($self, $text, $line) = @_; - # Extract the doctype - my @declaration = split(/\s+/, $text, 4); - if (($#declaration >= 3) && - ($declaration[0] eq 'DOCTYPE') && - (lc($declaration[1]) eq 'html')) { - # Parse the doctype declaration - if ($text =~ m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i) { - # Store the doctype - $self->doctype($1) if $1; - # If there is a link to the DTD, record it - $self->add_link($3, undef, $line) if (!$self->{only_anchors} && $3); + # Extract the doctype + my @declaration = split(/\s+/, $text, 4); + if (($#declaration >= 3) && + ($declaration[0] eq 'DOCTYPE') && + (lc($declaration[1]) eq 'html')) + { + + # Parse the doctype declaration + if ($text =~ + m/^DOCTYPE\s+html\s+(?:PUBLIC\s+"([^"]+)"|SYSTEM)(\s+"([^"]+)")?\s*$/i + ) + { + + # Store the doctype + $self->doctype($1) if $1; + + # If there is a link to the DTD, record it + $self->add_link($3, undef, $line) + if (!$self->{only_anchors} && $3); + } } - } - $self->text($text) unless $self->{only_anchors}; + $self->text($text) unless $self->{only_anchors}; - return; + return; } sub end_document { - my ($self, $line) = @_; - $self->{Total} = $line; - return; + my ($self, $line) = @_; + $self->{Total} = $line; + return; } ################################ @@ -1885,77 +2054,91 @@ sub check_validity ($$$\%\%) { - my ($referer, $uri, $want_links, $links, $redirects) = @_; - # $referer is the URI of the document checked - # $uri is the URI of the target that we are verifying - # $want_links is true if we're interested in links in the target doc - # $links is a hash of the links in the documents checked - # $redirects is a map of the redirects encountered + my ($referer, $uri, $want_links, $links, $redirects) = @_; - # Get the document with the appropriate method - # Only use GET if there are fragments. HEAD is enough if it's not the - # case. - my @fragments = keys %{$links->{$uri}{fragments}}; - my $method = scalar(@fragments) ? 'GET' : 'HEAD'; + # $referer is the URI of the document checked + # $uri is the URI of the target that we are verifying + # $want_links is true if we're interested in links in the target doc + # $links is a hash of the links in the documents checked + # $redirects is a map of the redirects encountered - my $response; - my $being_processed = 0; - if ((! defined($results{$uri})) - || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) { - $being_processed = 1; - $response = &get_uri($method, $uri, $referer); - # Get the information back from get_uri() - &record_results($uri, $method, $response); - # Record the redirects - &record_redirects($redirects, $response); - } + # Get the document with the appropriate method + # Only use GET if there are fragments. HEAD is enough if it's not the + # case. + my @fragments = keys %{$links->{$uri}{fragments}}; + my $method = scalar(@fragments) ? 'GET' : 'HEAD'; - # We got the response of the HTTP request. Stop here if it was a HEAD. - return if ($method eq 'HEAD'); + my $response; + my $being_processed = 0; + if ((!defined($results{$uri})) || + (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) + { + $being_processed = 1; + $response = &get_uri($method, $uri, $referer); - # There are fragments. Parse the document. - my $p; - if ($being_processed) { - # Can we really parse the document? - if (!defined($results{$uri}{location}{type}) || - $results{$uri}{location}{type} !~ $ContentTypes) - { - &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n", - $uri, $results{$uri}{location}{type}) - if ($Opts{Verbose}); - $response->content(""); - return; + # Get the information back from get_uri() + &record_results($uri, $method, $response); + + # Record the redirects + &record_redirects($redirects, $response); } - # Do it then - if (my $error = decode_content($response)) { - &hprintf("%s\n.", $error); + + # We got the response of the HTTP request. Stop here if it was a HEAD. + return if ($method eq 'HEAD'); + + # There are fragments. Parse the document. + my $p; + if ($being_processed) { + + # Can we really parse the document? + if (!defined($results{$uri}{location}{type}) || + $results{$uri}{location}{type} !~ $ContentTypes) + { + &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n", + $uri, $results{$uri}{location}{type}) + if ($Opts{Verbose}); + $response->content(""); + return; + } + + # Do it then + if (my $error = decode_content($response)) { + &hprintf("%s\n.", $error); + } + + # @@@TODO: this isn't the best thing to do if a decode error occurred + $p = + &parse_document($uri, $response->base(), $response, 0, + $want_links); } - # @@@TODO: this isn't the best thing to do if a decode error occurred - $p = &parse_document($uri, $response->base(), $response, 0, $want_links); - } else { - # We already had the information - $p->{Anchors} = $results{$uri}{parsing}{Anchors}; - } - # Check that the fragments exist - foreach my $fragment (keys %{$links->{$uri}{fragments}}) { - if (defined($p->{Anchors}{$fragment}) - || &escape_match($fragment, $p->{Anchors}) - || grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) { - $results{$uri}{fragments}{$fragment} = 1; - } else { - $results{$uri}{fragments}{$fragment} = 0; + else { + + # We already had the information + $p->{Anchors} = $results{$uri}{parsing}{Anchors}; } - } - return; + + # Check that the fragments exist + foreach my $fragment (keys %{$links->{$uri}{fragments}}) { + if (defined($p->{Anchors}{$fragment}) || + &escape_match($fragment, $p->{Anchors}) || + grep { $_ eq "$uri#$fragment" } @{$Opts{Suppress_Fragment}}) + { + $results{$uri}{fragments}{$fragment} = 1; + } + else { + $results{$uri}{fragments}{$fragment} = 0; + } + } + return; } sub escape_match ($\%) { - my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]); - foreach my $b (keys %$hash) { - return 1 if ($a eq URI::Escape::uri_unescape($b)); - } - return 0; + my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]); + foreach my $b (keys %$hash) { + return 1 if ($a eq URI::Escape::uri_unescape($b)); + } + return 0; } ########################## @@ -1964,55 +2147,61 @@ sub authentication ($;$$$$) { - my ($response, $cookie, $params, $check_num, $is_start) = @_; + my ($response, $cookie, $params, $check_num, $is_start) = @_; - my $realm = ''; - if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) { - $realm = $1; - } + my $realm = ''; + if ($response->www_authenticate() =~ /Basic realm=\"([^\"]+)\"/) { + $realm = $1; + } - if ($Opts{Command_Line}) { - printf STDERR <<'EOF', $response->request()->url(), $realm; + if ($Opts{Command_Line}) { + printf STDERR <<'EOF', $response->request()->url(), $realm; Authentication is required for %s. The realm is "%s". Use the -u and -p options to specify a username and password and the -d option to specify trusted domains. EOF - } else { + } + else { - printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n", - $response->www_authenticate(), - $cookie ? "Set-Cookie: $cookie\n" : "", - ); + printf( + "Status: 401 Authorization Required\nWWW-Authenticate: %s\n%sConnection: close\nContent-Language: en\nContent-Type: text/html; charset=utf-8\n\n", + $response->www_authenticate(), + $cookie ? "Set-Cookie: $cookie\n" : "", + ); - printf("%s + printf( + "%s <html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\"> <head> <title>W3C Link Checker: 401 Authorization Required</title> %s</head> -<body>", $DocType, $Head); - &banner(': 401 Authorization Required'); - &print_form($params, $cookie, $check_num) if $is_start; - printf('<p> +<body>", $DocType, $Head + ); + &banner(': 401 Authorization Required'); + &print_form($params, $cookie, $check_num) if $is_start; + printf( + '<p> %s You need "%s" access to <a href="%s">%s</a> to perform link checking.<br /> ', - &status_icon(401), - &encode($realm), (&encode($response->request()->url())) x 2); + &status_icon(401), + &encode($realm), (&encode($response->request()->url())) x 2 + ); - my $host = $response->request()->url()->host(); - if ($Opts{Trusted} && $host !~ $Opts{Trusted}) { - printf <<'EOF', &encode($Opts{Trusted}), &encode($host); + my $host = $response->request()->url()->host(); + if ($Opts{Trusted} && $host !~ $Opts{Trusted}) { + printf <<'EOF', &encode($Opts{Trusted}), &encode($host); This service has been configured to send authentication only to hostnames matching the regular expression <code>%s</code>, but the hostname <code>%s</code> does not match it. EOF + } + + print "</p>\n"; } - - print "</p>\n"; - } - return; + return; } ################## @@ -2021,17 +2210,17 @@ sub get_timestamp () { - return pack('LL', Time::HiRes::gettimeofday()); + return pack('LL', Time::HiRes::gettimeofday()); } sub time_diff ($$) { - my @start = unpack('LL', $_[0]); - my @stop = unpack('LL', $_[1]); - for ($start[1], $stop[1]) { - $_ /= 1_000_000; - } - return(sprintf("%.2f", ($stop[0]+$stop[1])-($start[0]+$start[1]))); + my @start = unpack('LL', $_[0]); + my @stop = unpack('LL', $_[1]); + for ($start[1], $stop[1]) { + $_ /= 1_000_000; + } + return (sprintf("%.2f", ($stop[0] + $stop[1]) - ($start[0] + $start[1]))); } ######################## @@ -2041,56 +2230,59 @@ # Record the redirects in a hash sub record_redirects (\%$) { - my ($redirects, $response) = @_; - for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) { + my ($redirects, $response) = @_; + for (my $prev = $response->previous(); $prev; $prev = $prev->previous()) { - # Check for redirect match. - my $from = $prev->request()->url(); - my $to = $response->request()->url(); # same on every loop iteration - my $from_to = $from . '->' . $to; - my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}}; - # print STDERR "Result $match of redirect checking $from_to\n"; - if ($match) { next; } + # Check for redirect match. + my $from = $prev->request()->url(); + my $to = $response->request()->url(); # same on every loop iteration + my $from_to = $from . '->' . $to; + my $match = grep { $_ eq $from_to } @{$Opts{Suppress_Redirect}}; - $match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}}; - # print STDERR "Result $match of regexp checking $from_to\n"; - if ($match) { next; } + # print STDERR "Result $match of redirect checking $from_to\n"; + if ($match) { next; } - my $c = $prev->code(); - if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) { - next; + $match = grep { $from_to =~ /$_/ } @{$Opts{Suppress_Redirect_Prefix}}; + + # print STDERR "Result $match of regexp checking $from_to\n"; + if ($match) { next; } + + my $c = $prev->code(); + if ($Opts{Suppress_Temp_Redirects} && ($c == 307 || $c == 302)) { + next; + } + + $redirects->{$prev->request()->url()} = $response->request()->url(); } - - $redirects->{$prev->request()->url()} = $response->request()->url(); - } - return; + return; } # Determine if a request is redirected sub is_redirected ($%) { - my ($uri, %redirects) = @_; - return(defined($redirects{$uri})); + my ($uri, %redirects) = @_; + return (defined($redirects{$uri})); } # Get a list of redirects for a URI sub get_redirects ($%) { - my ($uri, %redirects) = @_; - my @history = ($uri); - my %seen = ($uri => 1); # for tracking redirect loops - my $loop = 0; - while ($redirects{$uri}) { - $uri = $redirects{$uri}; - push(@history, $uri); - if ($seen{$uri}) { - $loop = 1; - last; - } else { - $seen{$uri}++; + my ($uri, %redirects) = @_; + my @history = ($uri); + my %seen = ($uri => 1); # for tracking redirect loops + my $loop = 0; + while ($redirects{$uri}) { + $uri = $redirects{$uri}; + push(@history, $uri); + if ($seen{$uri}) { + $loop = 1; + last; + } + else { + $seen{$uri}++; + } } - } - return ($loop, @history); + return ($loop, @history); } #################################################### @@ -2099,9 +2291,9 @@ sub sort_unique (@) { - my %saw; - @saw{@_} = (); - return (sort { $a <=> $b } keys %saw); + my %saw; + @saw{@_} = (); + return (sort { $a <=> $b } keys %saw); } ##################### @@ -2110,53 +2302,57 @@ sub line_number ($) { - my $line = shift; - return $line if ($line >= 0); - return "(N/A)"; + my $line = shift; + return $line if ($line >= 0); + return "(N/A)"; } sub http_rc ($) { - my $rc = shift; - return $rc if ($rc >= 0); - return "(N/A)"; + my $rc = shift; + return $rc if ($rc >= 0); + return "(N/A)"; } # returns true if the given code is informational sub informational ($) { - my $rc = shift; - return $rc == RC_ROBOTS_TXT() || $rc == RC_IP_DISALLOWED() || - $rc == RC_PROTOCOL_DISALLOWED(); + my $rc = shift; + return $rc == RC_ROBOTS_TXT() || + $rc == RC_IP_DISALLOWED() || + $rc == RC_PROTOCOL_DISALLOWED(); } sub anchors_summary (\%\%) { - my ($anchors, $errors) = @_; + my ($anchors, $errors) = @_; - # Number of anchors found. - my $n = scalar(keys(%$anchors)); - if (! $Opts{Quiet}) { - if ($Opts{HTML}) { - print("<h3>Anchors</h3>\n<p>"); - } else { - print("Anchors\n\n"); + # Number of anchors found. + my $n = scalar(keys(%$anchors)); + if (!$Opts{Quiet}) { + if ($Opts{HTML}) { + print("<h3>Anchors</h3>\n<p>"); + } + else { + print("Anchors\n\n"); + } + &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's'); + print("</p>\n") if $Opts{HTML}; } - &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's'); - print("</p>\n") if $Opts{HTML}; - } - # List of the duplicates, if any. - my @errors = keys %{$errors}; - if (! scalar(@errors)) { - print("<p>Valid anchors!</p>\n") if (! $Opts{Quiet} && $Opts{HTML} && $n); - return; - } - undef $n; - print_doc_header(); - print('<p>') if $Opts{HTML}; - print('List of duplicate and empty anchors'); - print <<'EOF' if $Opts{HTML}; + # List of the duplicates, if any. + my @errors = keys %{$errors}; + if (!scalar(@errors)) { + print("<p>Valid anchors!</p>\n") + if (!$Opts{Quiet} && $Opts{HTML} && $n); + return; + } + undef $n; + + print_doc_header(); + print('<p>') if $Opts{HTML}; + print('List of duplicate and empty anchors'); + print <<'EOF' if $Opts{HTML}; </p> <table class="report" border="1" summary="List of duplicate and empty anchors."> <thead> @@ -2167,320 +2363,406 @@ </thead> <tbody> EOF - print("\n"); + print("\n"); - foreach my $anchor (@errors) { - my $format; - my @unique = &sort_unique(map { line_number($_) } - keys %{$anchors->{$anchor}}); - if ($Opts{HTML}) { - $format = "<tr><td class=\"broken\">%s</td><td>%s</td></tr>\n"; - } else { - my $s = (scalar(@unique) > 1) ? 's' : ''; - $format = "\t%s\tLine$s: %s\n"; + foreach my $anchor (@errors) { + my $format; + my @unique = &sort_unique( + map { line_number($_) } + keys %{$anchors->{$anchor}} + ); + if ($Opts{HTML}) { + $format = "<tr><td class=\"broken\">%s</td><td>%s</td></tr>\n"; + } + else { + my $s = (scalar(@unique) > 1) ? 's' : ''; + $format = "\t%s\tLine$s: %s\n"; + } + printf($format, + &encode(length($anchor) ? $anchor : 'Empty anchor'), + join(', ', @unique)); } - printf($format, - &encode(length($anchor) ? $anchor : 'Empty anchor'), - join(', ', @unique)); - } - print("</tbody>\n</table>\n") if $Opts{HTML}; + print("</tbody>\n</table>\n") if $Opts{HTML}; - return; + return; } sub show_link_report (\%\%\%\%\@;$\%) { - my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_; + my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_; - print("\n<dl class=\"report\">") if $Opts{HTML}; - print("\n") if (! $Opts{Quiet}); + print("\n<dl class=\"report\">") if $Opts{HTML}; + print("\n") if (!$Opts{Quiet}); - # Process each URL - my ($c, $previous_c); - foreach my $u (@$urls) { - my @fragments = keys %{$broken->{$u}{fragments}}; - # Did we get a redirect? - my $redirected = &is_redirected($u, %$redirects); - # List of lines - my @total_lines; - push(@total_lines, keys(%{$links->{$u}{location}})); - foreach my $f (@fragments) { - push(@total_lines, keys(%{$links->{$u}{fragments}{$f}})) - unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()})); - } + # Process each URL + my ($c, $previous_c); + foreach my $u (@$urls) { + my @fragments = keys %{$broken->{$u}{fragments}}; - my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects); - my $currloc = $results->{$u}{location}; + # Did we get a redirect? + my $redirected = &is_redirected($u, %$redirects); - # Error type - $c = &code_shown($u, $results); - # What to do - my $whattodo; - my $redirect_too; - if ($todo) { - if ($u =~ m/^javascript:/) { - if ($Opts{HTML}) { - $whattodo = -'You must change this link: people using a browser without JavaScript support + # List of lines + my @total_lines; + push(@total_lines, keys(%{$links->{$u}{location}})); + foreach my $f (@fragments) { + push(@total_lines, keys(%{$links->{$u}{fragments}{$f}})) + unless ($f eq $u && defined($links->{$u}{$u}{LINE_UNKNOWN()})); + } + + my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects); + my $currloc = $results->{$u}{location}; + + # Error type + $c = &code_shown($u, $results); + + # What to do + my $whattodo; + my $redirect_too; + if ($todo) { + if ($u =~ m/^javascript:/) { + if ($Opts{HTML}) { + $whattodo = + 'You must change this link: people using a browser without JavaScript support will <em>not</em> be able to follow this link. See the <a href="http://www.w3.org/TR/WAI-WEBCONTENT/#tech-scripts">Web Content Accessibility Guidelines on the use of scripting on the Web</a> and the <a href="http://www.w3.org/TR/WCAG10-HTML-TECHS/#directly-accessible-scripts">techniques on how to solve this</a>.'; - } else { - $whattodo = 'Change this link: people using a browser without JavaScript support will not be able to follow this link.'; + } + else { + $whattodo = + 'Change this link: people using a browser without JavaScript support will not be able to follow this link.'; + } + } + elsif ($c == RC_ROBOTS_TXT()) { + $whattodo = + 'The link was not checked due to robots exclusion ' . + 'rules. Check the link manually.'; + } + elsif ($redirect_loop) { + $whattodo = + 'Retrieving the URI results in a redirect loop, that should be ' + . 'fixed. Examine the redirect sequence to see where the loop ' + . 'occurs.'; + } + else { + $whattodo = $todo->{$c}; + } } - } elsif ($c == RC_ROBOTS_TXT()) { - $whattodo = 'The link was not checked due to robots exclusion ' . - 'rules. Check the link manually.'; - } elsif ($redirect_loop) { - $whattodo = - 'Retrieving the URI results in a redirect loop, that should be ' . - 'fixed. Examine the redirect sequence to see where the loop ' . - 'occurs.'; - } else { - $whattodo = $todo->{$c}; - } - } elsif (defined($redirects{$u})) { - # Redirects - if (($u.'/') eq $redirects{$u}) { - $whattodo = 'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.'; - } - elsif ($c == 307 || $c == 302) { - $whattodo = 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.'; - } - elsif ($c == 301) { - $whattodo = 'This is a permanent redirect. The link should be updated.'; - } - } + elsif (defined($redirects{$u})) { - my @unique = &sort_unique(map { line_number($_) } @total_lines); - my $lines_list = join(', ', @unique); - my $s = (scalar(@unique) > 1) ? 's' : ''; - undef @unique; + # Redirects + if (($u . '/') eq $redirects{$u}) { + $whattodo = + 'The link is missing a trailing slash, and caused a redirect. Adding the trailing slash would speed up browsing.'; + } + elsif ($c == 307 || $c == 302) { + $whattodo = + 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.'; + } + elsif ($c == 301) { + $whattodo = + 'This is a permanent redirect. The link should be updated.'; + } + } - my @http_codes = ($currloc->{code}); - unshift(@http_codes, $currloc->{orig}) if $currloc->{orig}; - @http_codes = map { http_rc($_) } @http_codes; + my @unique = &sort_unique(map { line_number($_) } @total_lines); + my $lines_list = join(', ', @unique); + my $s = (scalar(@unique) > 1) ? 's' : ''; + undef @unique; - if ($Opts{HTML}) { - # Style stuff - my $idref = ''; - if ($codes && (!defined($previous_c) || ($c != $previous_c))) { - $idref = ' id="d'.$doc_count.'code_'.$c.'"'; - $previous_c = $c; - } - # Main info - for (@redirects_urls) { - $_ = &show_url($_); - } - # HTTP message - my $http_message; - if ($currloc->{message}) { - $http_message = &encode($currloc->{message}); - if ($c == 404 || $c == 500) { - $http_message = '<span class="broken">'. - $http_message.'</span>'; - } - } - my $redirmsg = - $redirect_loop ? ' <em>redirect loop detected</em>' : ''; - printf(" + my @http_codes = ($currloc->{code}); + unshift(@http_codes, $currloc->{orig}) if $currloc->{orig}; + @http_codes = map { http_rc($_) } @http_codes; + + if ($Opts{HTML}) { + + # Style stuff + my $idref = ''; + if ($codes && (!defined($previous_c) || ($c != $previous_c))) { + $idref = ' id="d' . $doc_count . 'code_' . $c . '"'; + $previous_c = $c; + } + + # Main info + for (@redirects_urls) { + $_ = &show_url($_); + } + + # HTTP message + my $http_message; + if ($currloc->{message}) { + $http_message = &encode($currloc->{message}); + if ($c == 404 || $c == 500) { + $http_message = + '<span class="broken">' . $http_message . '</span>'; + } + } + my $redirmsg = + $redirect_loop ? ' <em>redirect loop detected</em>' : ''; + printf(" <dt%s>%s <span class='msg_loc'>Line%s: %s</span> %s</dt> <dd class='responsecode'><strong>Status</strong>: %s %s %s</dd> <dd class='message_explanation'><p>%s %s</p></dd>\n", - # Anchor for return codes - $idref, - # Color - &status_icon($c), - $s, - # List of lines - $lines_list, - # List of redirects - $redirected ? join(' redirected to ', @redirects_urls) . $redirmsg : &show_url($u), - # Realm - defined($currloc->{realm}) - ? sprintf('Realm: %s<br />', &encode($currloc->{realm})) : '', - # HTTP original message - # defined($currloc->{orig_message}) - # ? &encode($currloc->{orig_message}). - # ' <span title="redirected to">-></span> ' - # : '', - # Response code chain - join(' <span class="redirected_to" title="redirected to">-></span> ', - map { &encode($_) } @http_codes), - # HTTP final message - $http_message, - # What to do - $whattodo, - # Redirect too? - $redirect_too ? - sprintf(' <span %s>%s</span>', &bgcolor(301), $redirect_too) : '', - ); - if ($#fragments >= 0) { - printf("<dd>Broken fragments: <ul>\n"); - } - } else { - my $redirmsg = $redirect_loop ? ' redirect loop detected' : ''; - printf("\n%s\t%s\n Code: %s %s\n%s\n", - # List of redirects - $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u, - # List of lines - $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '', - # Response code chain - join(' -> ', @http_codes), - # HTTP message - $currloc->{message} || '', - # What to do - wrap(' To do: ', ' ', $whattodo)); - if ($#fragments >= 0) { - if ($currloc->{code} == 200) { - print("The following fragments need to be fixed:\n"); - } else { - print("Fragments:\n"); + # Anchor for return codes + $idref, + + # Color + &status_icon($c), + $s, + + # List of lines + $lines_list, + + # List of redirects + $redirected ? + join(' redirected to ', @redirects_urls) . $redirmsg : + &show_url($u), + + # Realm + defined($currloc->{realm}) ? + sprintf('Realm: %s<br />', &encode($currloc->{realm})) : + '', + + # HTTP original message + # defined($currloc->{orig_message}) + # ? &encode($currloc->{orig_message}). + # ' <span title="redirected to">-></span> ' + # : '', + + # Response code chain + join( + ' <span class="redirected_to" title="redirected to">-></span> ', + map { &encode($_) } @http_codes), + + # HTTP final message + $http_message, + + # What to do + $whattodo, + + # Redirect too? + $redirect_too ? + sprintf(' <span %s>%s</span>', + &bgcolor(301), $redirect_too) : + '', + ); + if ($#fragments >= 0) { + printf("<dd>Broken fragments: <ul>\n"); + } } - } - } - # Fragments - foreach my $f (@fragments) { - my @unique_lines = &sort_unique(keys %{$links->{$u}{fragments}{$f}}); - my $plural = (scalar(@unique_lines) > 1) ? 's' : ''; - my $unique_lines = join(', ', @unique_lines); - if ($Opts{HTML}) { - printf("<li>%s<em>#%s</em> (line%s %s)</li>\n", - &encode($u), &encode($f), $plural, $unique_lines); - } else { - printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines); - } + else { + my $redirmsg = $redirect_loop ? ' redirect loop detected' : ''; + printf( + "\n%s\t%s\n Code: %s %s\n%s\n", + + # List of redirects + $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u, + + # List of lines + $lines_list ? sprintf("\n%6s: %s", "Line$s", $lines_list) : '', + + # Response code chain + join(' -> ', @http_codes), + + # HTTP message + $currloc->{message} || '', + + # What to do + wrap(' To do: ', ' ', $whattodo) + ); + if ($#fragments >= 0) { + if ($currloc->{code} == 200) { + print("The following fragments need to be fixed:\n"); + } + else { + print("Fragments:\n"); + } + } + } + + # Fragments + foreach my $f (@fragments) { + my @unique_lines = + &sort_unique(keys %{$links->{$u}{fragments}{$f}}); + my $plural = (scalar(@unique_lines) > 1) ? 's' : ''; + my $unique_lines = join(', ', @unique_lines); + if ($Opts{HTML}) { + printf("<li>%s<em>#%s</em> (line%s %s)</li>\n", + &encode($u), &encode($f), $plural, $unique_lines); + } + else { + printf("\t%-30s\tLine%s: %s\n", $f, $plural, $unique_lines); + } + } + + print("</ul></dd>\n") if ($Opts{HTML} && scalar(@fragments)); } - print("</ul></dd>\n") if ($Opts{HTML} && scalar(@fragments)); - } + # End of the table + print("</dl>\n") if $Opts{HTML}; - # End of the table - print("</dl>\n") if $Opts{HTML}; - - return; + return; } sub code_shown ($$) { - my ($u, $results) = @_; + my ($u, $results) = @_; - if ($results->{$u}{location}{record} == 200) { - return $results->{$u}{location}{orig} || $results->{$u}{location}{record}; - } else { - return $results->{$u}{location}{record}; - } + if ($results->{$u}{location}{record} == 200) { + return $results->{$u}{location}{orig} || + $results->{$u}{location}{record}; + } + else { + return $results->{$u}{location}{record}; + } } sub links_summary (\%\%\%\%) { - # Advices to fix the problems - my %todo = ( 200 => 'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).', - 300 => 'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.', - 301 => 'This is a permanent redirect. The link should be updated to point to the more recent URI.', - 302 => 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.', - 303 => 'This rare status code points to a "See Other" resource. There is generally nothing to be done.', - 307 => 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.', - 400 => 'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.', - 401 => "The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.", - 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.', - 404 => 'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.', - 405 => 'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically. Check the link manually.', - 406 => "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.", - 407 => 'The link is a proxy, but requires Authentication.', - 408 => 'The request timed out.', - 410 => 'The resource is gone. You should remove this link.', - 415 => 'The media type is not supported.', - 500 => 'This is a server side problem. Check the URI.', - 501 => 'Could not check this link: method not implemented or scheme not supported.', - 503 => 'The server cannot service the request, for some unknown reason.', - # Non-HTTP codes: - RC_ROBOTS_TXT() => sprintf('The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.', - $Opts{HTML} ? ('<a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">', '</a>', "<a href=\"$Cfg{Doc_URI}#bot\">", '</a>') : ('') x 4), - RC_DNS_ERROR() => 'The hostname could not be resolved. Check the link for typos.', - RC_IP_DISALLOWED() => sprintf('The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.', - $Opts{HTML} ? ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') : ('') x 2), - RC_PROTOCOL_DISALLOWED() => 'Accessing links with this URI scheme has been disabled in link checker.', - ); - my %priority = ( 410 => 1, - 404 => 2, - 403 => 5, - 200 => 10, - 300 => 15, - 401 => 20 - ); + # Advices to fix the problems - my ($links, $results, $broken, $redirects) = @_; + my %todo = ( + 200 => + 'Some of the links to this resource point to broken URI fragments (such as index.html#fragment).', + 300 => + 'This often happens when a typo in the link gets corrected automatically by the server. For the sake of performance, the link should be fixed.', + 301 => + 'This is a permanent redirect. The link should be updated to point to the more recent URI.', + 302 => + 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.', + 303 => + 'This rare status code points to a "See Other" resource. There is generally nothing to be done.', + 307 => + 'This is a temporary redirect. Update the link if you believe it makes sense, or leave it as is.', + 400 => + 'This is usually the sign of a malformed URL that cannot be parsed by the server. Check the syntax of the link.', + 401 => + "The link is not public and the actual resource is only available behind authentication. If not already done, you could specify it.", + 403 => + 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.', + 404 => + 'The link is broken. Double-check that you have not made any typo, or mistake in copy-pasting. If the link points to a resource that no longer exists, you may want to remove or fix the link.', + 405 => + 'The server does not allow HTTP HEAD requests, which prevents the Link Checker to check the link automatically. Check the link manually.', + 406 => + "The server isn't capable of responding according to the Accept* headers sent. This is likely to be a server-side issue with negotiation.", + 407 => 'The link is a proxy, but requires Authentication.', + 408 => 'The request timed out.', + 410 => 'The resource is gone. You should remove this link.', + 415 => 'The media type is not supported.', + 500 => 'This is a server side problem. Check the URI.', + 501 => + 'Could not check this link: method not implemented or scheme not supported.', + 503 => + 'The server cannot service the request, for some unknown reason.', - # List of the broken links - my @urls = keys %{$broken}; - my @dir_redirect_urls = (); - if ($Opts{Redirects}) { - # Add the redirected URI's to the report - for my $l (keys %$redirects) { - next unless (defined($results->{$l}) - && defined($links->{$l}) - && !defined($broken->{$l})); - # Check whether we have a "directory redirect" - # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ - my ($redirect_loop, @redirects) = get_redirects($l, %$redirects); - if ($#redirects == 1) { - push(@dir_redirect_urls, $l); - next; - } - push(@urls, $l); + # Non-HTTP codes: + RC_ROBOTS_TXT() => sprintf( + 'The link was not checked due to %srobots exclusion rules%s. Check the link manually, and see also the link checker %sdocumentation on robots exclusion%s.', + $Opts{HTML} ? ( + '<a href="http://www.robotstxt.org/wc/exclusion.html#robotstxt">', + '</a>', + "<a href=\"$Cfg{Doc_URI}#bot\">", + '</a>' + ) : ('') x 4 + ), + RC_DNS_ERROR() => + 'The hostname could not be resolved. Check the link for typos.', + RC_IP_DISALLOWED() => + sprintf( + 'The link resolved to a %snon-public IP address%s, and this link checker instance has been configured to not access such addresses. This may be a real error or just a quirk of the name resolver configuration on the server where the link checker runs. Check the link manually, in particular its hostname/IP address.', + $Opts{HTML} ? + ('<a href="http://www.ietf.org/rfc/rfc1918.txt">', '</a>') : + ('') x 2), + RC_PROTOCOL_DISALLOWED() => + 'Accessing links with this URI scheme has been disabled in link checker.', + ); + my %priority = ( + 410 => 1, + 404 => 2, + 403 => 5, + 200 => 10, + 300 => 15, + 401 => 20 + ); + + my ($links, $results, $broken, $redirects) = @_; + + # List of the broken links + my @urls = keys %{$broken}; + my @dir_redirect_urls = (); + if ($Opts{Redirects}) { + + # Add the redirected URI's to the report + for my $l (keys %$redirects) { + next + unless (defined($results->{$l}) && + defined($links->{$l}) && + !defined($broken->{$l})); + + # Check whether we have a "directory redirect" + # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ + my ($redirect_loop, @redirects) = get_redirects($l, %$redirects); + if ($#redirects == 1) { + push(@dir_redirect_urls, $l); + next; + } + push(@urls, $l); + } } - } - # Broken links and redirects - if ($#urls < 0) { - if (! $Opts{Quiet}) { - print_doc_header(); - if ($Opts{HTML}) { - print "<h3>Links</h3>\n<p>Valid links!</p>\n"; - } else { - print "\nValid links.\n"; - } + # Broken links and redirects + if ($#urls < 0) { + if (!$Opts{Quiet}) { + print_doc_header(); + if ($Opts{HTML}) { + print "<h3>Links</h3>\n<p>Valid links!</p>\n"; + } + else { + print "\nValid links.\n"; + } + } } - } else { - print_doc_header(); - print('<h3>') if $Opts{HTML}; - print("\nList of broken links and other issues"); - #print(' and redirects') if $Opts{Redirects}; + else { + print_doc_header(); + print('<h3>') if $Opts{HTML}; + print("\nList of broken links and other issues"); - # Sort the URI's by HTTP Code - my %code_summary; - my @idx; - foreach my $u (@urls) { - if (defined($results->{$u}{location}{record})) { - my $c = &code_shown($u, $results); - $code_summary{$c}++; - push(@idx, $c); - } - } - my @sorted = @urls[ - sort { - defined($priority{$idx[$a]}) ? - defined($priority{$idx[$b]}) ? - $priority{$idx[$a]} - <=> $priority{$idx[$b]} : - -1 : - defined($priority{$idx[$b]}) ? - 1 : - $idx[$a] <=> $idx[$b] - } 0 .. $#idx - ]; - @urls = @sorted; - undef(@sorted); undef(@idx); + #print(' and redirects') if $Opts{Redirects}; - if ($Opts{HTML}) { - # Print a summary - print <<'EOF'; + # Sort the URI's by HTTP Code + my %code_summary; + my @idx; + foreach my $u (@urls) { + if (defined($results->{$u}{location}{record})) { + my $c = &code_shown($u, $results); + $code_summary{$c}++; + push(@idx, $c); + } + } + my @sorted = @urls[ + sort { + defined($priority{$idx[$a]}) ? + defined($priority{$idx[$b]}) ? + $priority{$idx[$a]} <=> $priority{$idx[$b]} : + -1 : + defined($priority{$idx[$b]}) ? 1 : + $idx[$a] <=> $idx[$b] + } 0 .. $#idx + ]; + @urls = @sorted; + undef(@sorted); + undef(@idx); + + if ($Opts{HTML}) { + + # Print a summary + print <<'EOF'; </h3> <p><em>There are issues with the URLs listed below. The table summarizes the issues and suggested actions by HTTP response status code.</em></p> @@ -2494,33 +2776,36 @@ </thead> <tbody> EOF - foreach my $code (sort(keys(%code_summary))) { - printf('<tr%s>', &bgcolor($code)); - printf('<td><a href="#d%scode_%s">%s</a></td>', - $doc_count, $code, http_rc($code)); - printf('<td>%s</td>', $code_summary{$code}); - printf('<td>%s</td>', $todo{$code}); - print "</tr>\n"; - } - print "</tbody>\n</table>\n"; - } else { - print(':'); + foreach my $code (sort(keys(%code_summary))) { + printf('<tr%s>', &bgcolor($code)); + printf('<td><a href="#d%scode_%s">%s</a></td>', + $doc_count, $code, http_rc($code)); + printf('<td>%s</td>', $code_summary{$code}); + printf('<td>%s</td>', $todo{$code}); + print "</tr>\n"; + } + print "</tbody>\n</table>\n"; + } + else { + print(':'); + } + &show_link_report($links, $results, $broken, $redirects, \@urls, 1, + \%todo); } - &show_link_report($links, $results, $broken, $redirects, - \@urls, 1, \%todo); - } - # Show directory redirects - if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) { - print_doc_header(); - print('<h3>') if $Opts{HTML}; - print("\nList of redirects"); - print("</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>") if $Opts{HTML}; - &show_link_report($links, $results, $broken, $redirects, - \@dir_redirect_urls); - } + # Show directory redirects + if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) { + print_doc_header(); + print('<h3>') if $Opts{HTML}; + print("\nList of redirects"); + print( + "</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>" + ) if $Opts{HTML}; + &show_link_report($links, $results, $broken, $redirects, + \@dir_redirect_urls); + } - return; + return; } ############################################################################### @@ -2531,13 +2816,16 @@ sub global_stats () { - my $stop = &get_timestamp(); - my $n_docs = - ($doc_count <= $Opts{Max_Documents}) ? $doc_count : $Opts{Max_Documents}; - return sprintf('Checked %d document%s in %s seconds.', - $n_docs, - ($n_docs == 1) ? '' : 's', - &time_diff($timestamp, $stop)); + my $stop = &get_timestamp(); + my $n_docs = + ($doc_count <= $Opts{Max_Documents}) ? $doc_count : + $Opts{Max_Documents}; + return sprintf( + 'Checked %d document%s in %s seconds.', + $n_docs, + ($n_docs == 1) ? '' : 's', + &time_diff($timestamp, $stop) + ); } ################## @@ -2546,25 +2834,25 @@ sub html_header ($;$$) { - my ($uri, $doform, $cookie) = @_; + my ($uri, $doform, $cookie) = @_; - my $title = defined($uri) ? $uri : ''; - $title = ': ' . $title if ($title =~ /\S/); + my $title = defined($uri) ? $uri : ''; + $title = ': ' . $title if ($title =~ /\S/); - my $headers = ''; - if (! $Opts{Command_Line}) { - $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $doform; - $headers .= "Content-Type: text/html; charset=utf-8\n"; - $headers .= "Set-Cookie: $cookie\n" if $cookie; + my $headers = ''; + if (!$Opts{Command_Line}) { + $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $doform; + $headers .= "Content-Type: text/html; charset=utf-8\n"; + $headers .= "Set-Cookie: $cookie\n" if $cookie; - # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same - # print() statement as the last header - $headers .= "Content-Language: en\n\n"; - } + # mod_perl 1.99_05 doesn't seem to like it if the "\n\n" isn't in the same + # print() statement as the last header + $headers .= "Content-Language: en\n\n"; + } - my $script = my $onload = ''; - if ($doform) { - $script = <<'EOF'; + my $script = my $onload = ''; + if ($doform) { + $script = <<'EOF'; <script type="text/javascript"> function uriOk(num) { @@ -2587,106 +2875,121 @@ } </script> EOF - $onload = ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"'; - } + $onload = + ' onload="if(document.getElementById){document.getElementById(\'uri_1\').focus()}"'; + } - print $headers, $DocType, " + print $headers, $DocType, " <html lang=\"en\" xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\"> <head> <title>W3C Link Checker", &encode($title), "</title> ", $Head, $script, "</head> <body", $onload, '>'; - &banner($title); - return; + &banner($title); + return; } sub banner ($) { - my $tagline ="Check links and anchors in Web pages or full Web sites"; + my $tagline = "Check links and anchors in Web pages or full Web sites"; - printf(<<'EOF', URI->new_abs("../images/no_w3c.png", $Cfg{Doc_URI}), $tagline); + printf( + <<'EOF', URI->new_abs("../images/no_w3c.png", $Cfg{Doc_URI}), $tagline); <div id="banner"><h1 id="title"><a href="http://www.w3.org/" title="W3C"><img alt="W3C" id="logo" src="%s" /></a> <a href="checklink"><span>Link Checker</span></a></h1> <p id="tagline">%s</p></div> <div id="main"> EOF - return; + return; } sub status_icon($) { - my ($code) = @_; - my $icon_type; - my $r = HTTP::Response->new($code); - if ($r->is_success()) { - $icon_type = 'error'; # if is success but reported, it's because of broken frags => error - } elsif (&informational($code)) { - $icon_type = 'info'; - } elsif ($code == 300) { - $icon_type = 'info'; - } elsif ($code == 401) { - $icon_type = 'error'; - } elsif ($r->is_redirect()) { - $icon_type = 'warning'; - } elsif ($r->is_error()) { - $icon_type = 'error'; - } else { - $icon_type = 'error'; - } - return sprintf('<span class="err_type"><img src="%s" alt="%s" /></span>', - URI->new_abs("../images/info_icons/$icon_type.png", - $Cfg{Doc_URI}), - $icon_type); + my ($code) = @_; + my $icon_type; + my $r = HTTP::Response->new($code); + if ($r->is_success()) { + $icon_type = 'error' + ; # if is success but reported, it's because of broken frags => error + } + elsif (&informational($code)) { + $icon_type = 'info'; + } + elsif ($code == 300) { + $icon_type = 'info'; + } + elsif ($code == 401) { + $icon_type = 'error'; + } + elsif ($r->is_redirect()) { + $icon_type = 'warning'; + } + elsif ($r->is_error()) { + $icon_type = 'error'; + } + else { + $icon_type = 'error'; + } + return sprintf('<span class="err_type"><img src="%s" alt="%s" /></span>', + URI->new_abs("../images/info_icons/$icon_type.png", $Cfg{Doc_URI}), + $icon_type); } sub bgcolor ($) { - my ($code) = @_; - my $class; - my $r = HTTP::Response->new($code); - if ($r->is_success()) { - return ''; - } elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) { - $class = 'dubious'; - } elsif ($code == 300) { - $class = 'multiple'; - } elsif ($code == 401) { - $class = 'unauthorized'; - } elsif ($r->is_redirect()) { - $class = 'redirect'; - } elsif ($r->is_error()) { - $class = 'broken'; - } else { - $class = 'broken'; - } - return(' class="'.$class.'"'); + my ($code) = @_; + my $class; + my $r = HTTP::Response->new($code); + if ($r->is_success()) { + return ''; + } + elsif ($code == RC_ROBOTS_TXT() || $code == RC_IP_DISALLOWED()) { + $class = 'dubious'; + } + elsif ($code == 300) { + $class = 'multiple'; + } + elsif ($code == 401) { + $class = 'unauthorized'; + } + elsif ($r->is_redirect()) { + $class = 'redirect'; + } + elsif ($r->is_error()) { + $class = 'broken'; + } + else { + $class = 'broken'; + } + return (' class="' . $class . '"'); } sub show_url ($;$) { - my ($url, $fragment) = @_; - if (defined($fragment)) { - my $u = URI->new($url); - $u->fragment($fragment); - $url = $u->as_string(); - } - $url = &encode($url); - return sprintf('<a href="%s">%s</a>', - $url, defined($fragment) ? &encode($fragment) : $url); + my ($url, $fragment) = @_; + if (defined($fragment)) { + my $u = URI->new($url); + $u->fragment($fragment); + $url = $u->as_string(); + } + $url = &encode($url); + return sprintf('<a href="%s">%s</a>', + $url, defined($fragment) ? &encode($fragment) : $url); } sub html_footer () { - printf("<p>%s</p>\n", &global_stats()) if ($doc_count > 0 && !$Opts{Quiet}); - if (! $doc_count) { - print <<'EOF'; + printf("<p>%s</p>\n", &global_stats()) + if ($doc_count > 0 && !$Opts{Quiet}); + if (!$doc_count) { + print <<'EOF'; <div class="intro"> <p>This Link Checker looks for issues in links, anchors and referenced objects in a Web page, or recursively on a whole Web site. For best results, it is recommended to first ensure that the documents checked use <a href="http://validator.w3.org/">Valid (X)HTML Markup</a>. The Link Checker is part of the W3C's <a href="http://www.w3.org/QA/Tools/">validators and Quality Web tools</a>.</p> </div> EOF - } - printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION); + } + printf(<<'EOF', $Cfg{Doc_URI}, $Cfg{Doc_URI}, $PACKAGE, $REVISION); </div><!-- main --> <ul class="navbar" id="menu"> <li><a href="%s" accesskey="3" title="Documentation for this Link Checker Service">Docs</a></li> @@ -2702,77 +3005,94 @@ </body> </html> EOF - return; + return; } sub print_form (\%$$) { - my ($params, $cookie, $check_num) = @_; + my ($params, $cookie, $check_num) = @_; - # Split params on \0, see CGI's docs on Vars() - while (my ($key, $value) = each(%$params)) { - if ($value) { - my @vals = split(/\0/, $value, 2); - $params->{$key} = $vals[0]; + # Split params on \0, see CGI's docs on Vars() + while (my ($key, $value) = each(%$params)) { + if ($value) { + my @vals = split(/\0/, $value, 2); + $params->{$key} = $vals[0]; + } } - } - # Override undefined values from the cookie, if we got one. - my $valid_cookie = 0; - if ($cookie) { - my %cookie_values = $cookie->value(); - if (!$cookie_values{clear}) { # XXX no easy way to check if cookie expired? - $valid_cookie = 1; - while (my ($key, $value) = each(%cookie_values)) { - $params->{$key} = $value unless defined($params->{$key}); - } + # Override undefined values from the cookie, if we got one. + my $valid_cookie = 0; + if ($cookie) { + my %cookie_values = $cookie->value(); + if (!$cookie_values{clear}) + { # XXX no easy way to check if cookie expired? + $valid_cookie = 1; + while (my ($key, $value) = each(%cookie_values)) { + $params->{$key} = $value unless defined($params->{$key}); + } + } } - } - my $chk = ' checked="checked"'; - $params->{hide_type} = 'all' unless $params->{hide_type}; + my $chk = ' checked="checked"'; + $params->{hide_type} = 'all' unless $params->{hide_type}; - my $requested_uri = &encode($params->{uri} || ''); - my $sum = $params->{summary} ? $chk : ''; - my $red = $params->{hide_redirects} ? $chk : ''; - my $all = ($params->{hide_type} ne 'dir') ? $chk : ''; - my $dir = $all ? '' : $chk; - my $acc = $params->{no_accept_language} ? $chk : ''; - my $ref = $params->{no_referer} ? $chk : ''; - my $rec = $params->{recursive} ? $chk : ''; - my $dep = &encode($params->{depth} || ''); + my $requested_uri = &encode($params->{uri} || ''); + my $sum = $params->{summary} ? $chk : ''; + my $red = $params->{hide_redirects} ? $chk : ''; + my $all = ($params->{hide_type} ne 'dir') ? $chk : ''; + my $dir = $all ? '' : $chk; + my $acc = $params->{no_accept_language} ? $chk : ''; + my $ref = $params->{no_referer} ? $chk : ''; + my $rec = $params->{recursive} ? $chk : ''; + my $dep = &encode($params->{depth} || ''); - my $cookie_options = ''; - if ($valid_cookie) { - $cookie_options = " + my $cookie_options = ''; + if ($valid_cookie) { + $cookie_options = " <label for=\"cookie1_$check_num\"><input type=\"radio\" id=\"cookie1_$check_num\" name=\"cookie\" value=\"nochanges\" checked=\"checked\" /> Don't modify saved options</label> <label for=\"cookie2_$check_num\"><input type=\"radio\" id=\"cookie2_$check_num\" name=\"cookie\" value=\"set\" /> Save these options</label> <label for=\"cookie3_$check_num\"><input type=\"radio\" id=\"cookie3_$check_num\" name=\"cookie\" value=\"clear\" /> Clear saved options</label>"; - } else { - $cookie_options = " + } + else { + $cookie_options = " <label for=\"cookie_$check_num\"><input type=\"checkbox\" id=\"cookie_$check_num\" name=\"cookie\" value=\"set\" /> Save options in a <a href=\"http://www.w3.org/Protocols/rfc2109/rfc2109\">cookie</a></label>"; - } + } - print "<form action=\"", $Opts{_Self_URI}, "\" method=\"get\" onsubmit=\"return uriOk($check_num)\"> + print "<form action=\"", $Opts{_Self_URI}, + "\" method=\"get\" onsubmit=\"return uriOk($check_num)\"> <p><label for=\"uri_$check_num\">Enter the address (<a href=\"http://www.w3.org/Addressing/\">URL</a>) of a document that you would like to check:</label></p> -<p><input type=\"text\" size=\"50\" id=\"uri_$check_num\" name=\"uri\" value=\"",$requested_uri,"\" /></p> +<p><input type=\"text\" size=\"50\" id=\"uri_$check_num\" name=\"uri\" value=\"", + $requested_uri, "\" /></p> <fieldset id=\"extra_opt_uri_$check_num\" class=\"moreoptions\"> <legend class=\"toggletext\">More Options</legend> <div class=\"options\"> <p> - <label for=\"summary_$check_num\"><input type=\"checkbox\" id=\"summary_$check_num\" name=\"summary\" value=\"on\"", $sum, " /> Summary only</label> + <label for=\"summary_$check_num\"><input type=\"checkbox\" id=\"summary_$check_num\" name=\"summary\" value=\"on\"", + $sum, " /> Summary only</label> <br /> - <label for=\"hide_redirects_$check_num\"><input type=\"checkbox\" id=\"hide_redirects_$check_num\" name=\"hide_redirects\" value=\"on\"", $red, " /> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label> - <label for=\"hide_type_all_$check_num\"><input type=\"radio\" id=\"hide_type_all_$check_num\" name=\"hide_type\" value=\"all\"", $all, " /> all</label> - <label for=\"hide_type_dir_$check_num\"><input type=\"radio\" id=\"hide_type_dir_$check_num\" name=\"hide_type\" value=\"dir\"", $dir, " /> for directories only</label> + <label for=\"hide_redirects_$check_num\"><input type=\"checkbox\" id=\"hide_redirects_$check_num\" name=\"hide_redirects\" value=\"on\"", + $red, + " /> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label> + <label for=\"hide_type_all_$check_num\"><input type=\"radio\" id=\"hide_type_all_$check_num\" name=\"hide_type\" value=\"all\"", + $all, " /> all</label> + <label for=\"hide_type_dir_$check_num\"><input type=\"radio\" id=\"hide_type_dir_$check_num\" name=\"hide_type\" value=\"dir\"", + $dir, " /> for directories only</label> <br /> - <label for=\"no_accept_language_$check_num\"><input type=\"checkbox\" id=\"no_accept_language_$check_num\" name=\"no_accept_language\" value=\"on\"", $acc, " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label> + <label for=\"no_accept_language_$check_num\"><input type=\"checkbox\" id=\"no_accept_language_$check_num\" name=\"no_accept_language\" value=\"on\"", + $acc, + " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> header</label> <br /> - <label for=\"no_referer_$check_num\"><input type=\"checkbox\" id=\"no_referer_$check_num\" name=\"no_referer\" value=\"on\"", $ref, " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label> + <label for=\"no_referer_$check_num\"><input type=\"checkbox\" id=\"no_referer_$check_num\" name=\"no_referer\" value=\"on\"", + $ref, + " /> Don't send the <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.36\">Referer</a></tt> header</label> <br /> - <label title=\"Check linked documents recursively (maximum: ", $Opts{Max_Documents}, " documents)\" for=\"recursive_$check_num\"><input type=\"checkbox\" id=\"recursive_$check_num\" name=\"recursive\" value=\"on\"", $rec, " /> Check linked documents recursively</label>, - <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth_$check_num\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth_$check_num\" name=\"depth\" value=\"", $dep, "\" /></label> + <label title=\"Check linked documents recursively (maximum: ", + $Opts{Max_Documents}, + " documents)\" for=\"recursive_$check_num\"><input type=\"checkbox\" id=\"recursive_$check_num\" name=\"recursive\" value=\"on\"", + $rec, " /> Check linked documents recursively</label>, + <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth_$check_num\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth_$check_num\" name=\"depth\" value=\"", + $dep, "\" /></label> <br /><br />", $cookie_options, " </p> </div> @@ -2782,23 +3102,24 @@ <div class=\"intro\" id=\"don_program\"></div> <script type=\"text/javascript\" src=\"http://www.w3.org/QA/Tools/don_prog.js\"></script> "; - return; + return; } sub encode (@) { - return $Opts{HTML} ? HTML::Entities::encode(@_) : @_; + return $Opts{HTML} ? HTML::Entities::encode(@_) : @_; } sub hprintf (@) { - print_doc_header(); - if (! $Opts{HTML}) { - printf(@_); - } else { - print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1])); - } - return; + print_doc_header(); + if (!$Opts{HTML}) { + printf(@_); + } + else { + print HTML::Entities::encode(sprintf($_[0], @_[1 .. @_ - 1])); + } + return; } # Print the document header, if it hasn't been printed already. @@ -2807,17 +3128,18 @@ # "Processing..." messages when nothing else will be reported. sub print_doc_header () { - if (defined($doc_header)) { - print $doc_header; - undef($doc_header); - } + if (defined($doc_header)) { + print $doc_header; + undef($doc_header); + } } - # Local Variables: # mode: perl # indent-tabs-mode: nil -# tab-width: 2 -# perl-indent-level: 2 +# cperl-indent-level: 4 +# cperl-continued-statement-offset: 4 +# cperl-brace-offset: -4 +# perl-indent-level: 4 # End: -# ex: ts=2 sw=2 et +# ex: ts=4 sw=4 et
Received on Thursday, 5 August 2010 14:47:41 UTC