- From: Mercurial notifier <nobody@w3.org>
- Date: Thu, 05 Aug 2010 14:46:51 +0000
- To: link-checker updates <www-validator-cvs@w3.org>
changeset: 1:55210bc792ab
user: ville
date: Thu Mar 18 21:39:55 2004 +0000
files: bin/checklink
description:
Copy here from the validator dir.
diff -r 3b1a5c7ab987 -r 55210bc792ab bin/checklink
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/checklink Thu Mar 18 21:39:55 2004 +0000
@@ -0,0 +1,2081 @@
+#!/usr/bin/perl -wT
+#
+# W3C Link Checker
+# by Hugo Haas <hugo@w3.org>
+# (c) 1999-2004 World Wide Web Consortium
+# based on Renaud Bruyeron's checklink.pl
+#
+# $Id: checklink,v 3.7 2004-03-18 21:39:55 ville Exp $
+#
+# This program is licensed under the W3C(r) Software License:
+# http://www.w3.org/Consortium/Legal/copyright-software
+#
+# The documentation is at:
+# http://www.w3.org/2000/07/checklink
+#
+# See the CVSweb interface at:
+# http://dev.w3.org/cvsweb/validator/httpd/cgi-bin/checklink.pl
+#
+# An online version is available at:
+# http://validator.w3.org/checklink
+#
+# Comments and suggestions should be sent to the www-validator mailing list:
+# www-validator@w3.org (with 'checklink' in the subject)
+# http://lists.w3.org/Archives/Public/www-validator/ (archives)
+
+use strict;
+
+# -----------------------------------------------------------------------------
+
+package W3C::UserAgent;
+
+use LWP::UserAgent qw();
+# @@@ Needs also W3C::CheckLink but can't use() it here.
+
+@W3C::UserAgent::ISA = qw(LWP::UserAgent);
+
+sub simple_request
+{
+ my $self = shift;
+ my $response = $self->W3C::UserAgent::SUPER::simple_request(@_);
+ if (! defined($self->{FirstResponse})) {
+ $self->{FirstResponse} = $response->code();
+ $self->{FirstMessage} = $response->message();
+ }
+ return $response;
+}
+
+sub redirect_ok
+{
+ my ($self, $request) = @_;
+
+ if ($self->{Checklink_verbose_progress}) {
+ &W3C::CheckLink::hprintf("\n%s %s ", $request->method(), $request->uri());
+ }
+
+ # Build a map of redirects
+ $self->{Redirects}{$self->{fetching}} = $request->uri();
+ $self->{fetching} = $request->uri();
+
+ return ($request->method() eq 'POST') ? 0 : 1;
+}
+
+# -----------------------------------------------------------------------------
+
+package W3C::CheckLink;
+
+use vars qw($PROGRAM $AGENT $VERSION $CVS_VERSION $REVISION
+ $DocType $Accept $ContentTypes %Cfg);
+
+use HTML::Entities qw();
+use HTML::Parser 3.00 qw();
+use HTTP::Request qw();
+use HTTP::Response qw();
+use Net::hostent qw(gethostbyname);
+use Net::IP qw();
+use Socket qw(inet_ntoa);
+use Time::HiRes qw();
+use URI qw();
+use URI::Escape qw();
+use URI::file qw();
+# @@@ Needs also W3C::UserAgent but can't use() it here.
+
+@W3C::CheckLink::ISA = qw(HTML::Parser);
+
+BEGIN
+{
+ # Version info
+ $PROGRAM = 'W3C checklink';
+ ($AGENT = $PROGRAM) =~ s/\s+/-/g;
+ $VERSION = '3.9.1';
+ ($CVS_VERSION) = q$Revision: 3.7 $ =~ /(\d+[\d\.]*\.\d+)/;
+ $REVISION = sprintf('version %s [%s] (c) 1999-2004 W3C',
+ $VERSION, $CVS_VERSION);
+
+ # Pull in mod_perl modules if applicable.
+ if ($ENV{MOD_PERL}) {
+ eval "require Apache::compat"; # For mod_perl 2
+ require Apache;
+ }
+
+ $DocType = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">';
+
+ my @content_types = qw(application/xhtml+xml text/html);
+ $Accept = join(', ', @content_types) . ', */*;q=0.5';
+ my $re = join('|', map { s/\+/\\+/g; $_ } @content_types);
+ $ContentTypes = qr{\b(?:$re)\b}io;
+
+ #
+ # Read configuration. If the W3C_CHECKLINK_CFG environment variable has
+ # been set or the default contains a non-empty file, read it. Otherwise,
+ # skip silently.
+ #
+ my $defaultconfig = '/etc/w3c/checklink.conf';
+ if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) {
+
+ require Config::General;
+ Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy
+
+ my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig;
+ eval {
+ my %config_opts =
+ ( -ConfigFile => $conffile,
+ -SplitPolicy => 'equalsign',
+ );
+ %Cfg = Config::General->new(%config_opts)->getall();
+ };
+ if ($@) {
+ die <<".EOF.";
+Failed to read configuration from '$conffile':
+$@
+.EOF.
+ }
+ }
+
+ # Trusted environment variables that need laundering in taint mode.
+ foreach (qw(NNTPSERVER NEWSHOST)) {
+ ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_};
+ }
+
+ # Use passive FTP by default, see Net::FTP(3).
+ $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
+}
+
+# Autoflush
+$| = 1;
+
+# Different options specified by the user
+my $cmdline = ! ($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/);
+my %Opts =
+ ( Command_Line => $cmdline,
+ Quiet => 0,
+ Summary_Only => 0,
+ Verbose => 0,
+ Progress => 0,
+ HTML => 0,
+ Timeout => 60,
+ Redirects => 1,
+ Dir_Redirects => 1,
+ Accept_Language => 1,
+ Languages => $ENV{HTTP_ACCEPT_LANGUAGE} || '*',
+ HTTP_Proxy => undef,
+ Hide_Same_Realm => 0,
+ Depth => 0, # -1 means unlimited recursion.
+ Sleep_Time => 3, # For the online version.
+ Max_Documents => 150, # Ditto.
+ User => undef,
+ Password => undef,
+ Base_Location => '.',
+ Masquerade => 0,
+ Masquerade_From => '',
+ Masquerade_To => '',
+ Trusted => $Cfg{Trusted},
+ Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ?
+ $Cfg{Allow_Private_IPs} : $cmdline,
+ );
+undef $cmdline;
+
+# Global variables
+# What is our query?
+my $query;
+# What URI's did we process? (used for recursive mode)
+my %processed;
+# Result of the HTTP query
+my %results;
+# List of redirects
+my %redirects;
+# Count of the number of documents checked
+my $doc_count = 0;
+# Time stamp
+my $timestamp = &get_timestamp();
+
+if ($Opts{Command_Line}) {
+
+ require Text::Wrap;
+ Text::Wrap->import('wrap');
+
+ # Parse command line
+ &parse_arguments();
+ &ask_password() if ($Opts{User} && !$Opts{Password});
+
+ my $first = 1;
+ foreach my $uri (@ARGV) {
+ if (!$Opts{Summary_Only}) {
+ printf("%s %s\n", $PROGRAM, $REVISION) unless $Opts{HTML};
+ } else {
+ $Opts{Verbose} = 0;
+ $Opts{Progress} = 0;
+ }
+ # Transform the parameter into a URI
+ $uri = &urize($uri);
+ &check_uri($uri, $first, $Opts{Depth});
+ $first &&= 0;
+ }
+ undef $first;
+
+ if ($Opts{HTML}) {
+ &html_footer();
+ } elsif (($doc_count > 0) && !$Opts{Summary_Only}) {
+ printf("\n%s\n", &global_stats());
+ }
+
+} else {
+
+ require CGI;
+ require CGI::Carp;
+ CGI::Carp->import(qw(fatalsToBrowser));
+ $query = new CGI;
+ # Set a few parameters in CGI mode
+ $Opts{Verbose} = 0;
+ $Opts{Progress} = 0;
+ $Opts{HTML} = 1;
+
+ # Backwards compatibility
+ if ($query->param('hide_dir_redirects')) {
+ $query->param('hide_redirects', 'on');
+ $query->param('hide_type', 'dir');
+ $query->delete('hide_dir_redirects');
+ }
+ if (my $uri = $query->param('url')) {
+ $query->param('uri', $uri) unless $query->param('uri');
+ $query->delete('url');
+ }
+
+ # Override undefined values from the cookie, if we got one.
+ if (my %cookie = $query->cookie($AGENT)) {
+ while (my ($key, $value) = each %cookie) {
+ $query->param($key, $value) unless defined($query->param($key));
+ }
+ }
+
+ $Opts{Summary_Only} = 1 if $query->param('summary');
+
+ if ($query->param('hide_redirects')) {
+ $Opts{Dir_Redirects} = 0;
+ if (my $type = $query->param('hide_type')) {
+ $Opts{Redirects} = 0 if ($type ne 'dir');
+ } else {
+ $Opts{Redirects} = 0;
+ }
+ }
+
+ $Opts{Accept_Language} = 0 if ($query->param('no_accept_language'));
+
+ $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0);
+ if ($query->param('depth') && ($query->param('depth') != 0)) {
+ $Opts{Depth} = $query->param('depth');
+ }
+
+ # Save, clear or leave cookie as is.
+ my $cookie = '';
+ if (my $action = $query->param('cookie')) {
+ my %cookie = (-name => $AGENT);
+ if ($action eq 'clear') {
+ # Clear the cookie.
+ $cookie{-value} = '';
+ $cookie{-expires} = '-1M';
+ } else {
+ # Always refresh the expiration time.
+ $cookie{-expires} = '+1M';
+ if ($action eq 'set') {
+ # Set the options.
+ my %options = $query->Vars();
+ delete($options{$_}) for qw(url uri check cookie); # Non-persistent.
+ $cookie{-value} = \%options;
+ } else {
+ # Use the old values.
+ $cookie{-value} = { $query->cookie($AGENT) };
+ }
+ }
+ $cookie = $query->cookie(%cookie);
+ }
+
+ my $uri = $query->param('uri');
+
+ if (! $uri) {
+ &html_header('', 1); # Set cookie only from results page.
+ &print_form($query);
+ &html_footer();
+ exit;
+ }
+
+ undef $query; # Not needed any more.
+
+ # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
+ # If we're under mod_perl, there is a way around it...
+ if ($ENV{MOD_PERL}) {
+ my $auth = Apache->request()->header_in('Authorization');
+ $ENV{HTTP_AUTHORIZATION} ||= $auth if $auth;
+ }
+
+ $uri =~ s/^\s+//g;
+ if ($uri =~ m/^file:/) {
+ # Only the http scheme is allowed
+ &file_uri($uri);
+ } elsif ($uri !~ m/:/) {
+ if ($uri =~ m|^//|) {
+ $uri = 'http:'.$uri;
+ } else {
+ $uri = 'http://'.$uri;
+ }
+ }
+
+ &check_uri($uri, 1, $Opts{Depth}, $cookie);
+ &html_footer();
+}
+
+###############################################################################
+
+################################
+# Command line and usage stuff #
+################################
+
+sub parse_arguments ()
+{
+
+ require Getopt::Long;
+ Getopt::Long->require_version(2.17);
+ Getopt::Long->import('GetOptions');
+ Getopt::Long::Configure('bundling', 'no_ignore_case');
+ my $masq = '';
+
+ GetOptions('help|?' => sub { usage(0) },
+ 'q|quiet' => sub { $Opts{Quiet} = 1;
+ $Opts{Summary_Only} = 1;
+ },
+ 's|summary' => \$Opts{Summary_Only},
+ 'b|broken' => sub { $Opts{Redirects} = 0;
+ $Opts{Dir_Redirects} = 0;
+ },
+ 'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; },
+ 'v|verbose' => \$Opts{Verbose},
+ 'i|indicator' => \$Opts{Progress},
+ 'h|html' => \$Opts{HTML},
+ 'n|noacclanguage' => sub { $Opts{Accept_Language} = 0; },
+ 'r|recursive' => sub { $Opts{Depth} = -1
+ if $Opts{Depth} == 0; },
+ 'l|location=s' => \$Opts{Base_Location},
+ 'u|user=s' => \$Opts{User},
+ 'p|password=s' => \$Opts{Password},
+ 't|timeout=i' => \$Opts{Timeout},
+ 'L|languages=s' => \$Opts{Languages},
+ 'D|depth=i' => sub { $Opts{Depth} = $_[1]
+ unless $_[1] == 0; },
+ 'd|domain=s' => \$Opts{Trusted},
+ 'y|proxy=s' => \$Opts{HTTP_Proxy},
+ 'masquerade=s' => \$masq,
+ 'hide-same-realm' => \$Opts{Hide_Same_Realm},
+ 'V|version' => \&version,
+ )
+ || usage(1);
+
+ if ($masq) {
+ $Opts{Masquerade} = 1;
+ my @masq = split(/\s+/, $masq);
+ if (scalar(@masq) != 2 ||
+ !defined($masq[0]) || $masq[0] !~ /\S/ ||
+ !defined($masq[1]) || $masq[1] !~ /\S/) {
+ usage(1, "Error: --masquerade takes two whitespace separated URIs.");
+ } else {
+ $Opts{Masquerade_From} = $masq[0];
+ $Opts{Masquerade_To} = $masq[1];
+ }
+ }
+}
+
+sub version ()
+{
+ print STDERR "$PROGRAM $REVISION\n";
+ exit(0);
+}
+
+sub usage ()
+{
+ my ($exitval, $msg) = @_;
+ $exitval = 0 unless defined($exitval);
+ $msg = $msg ? "$msg\n\n" : '';
+
+ my $langs = defined($Opts{Languages}) ? " (default: $Opts{Languages})" : '';
+ my $trust = defined($Opts{Trusted}) ? " (default: $Opts{Trusted})" : '';
+
+ print STDERR "$msg$PROGRAM $REVISION
+
+Usage: checklink <options> <uris>
+Options:
+ -s/--summary Result summary only.
+ -b/--broken Show only the broken links, not the redirects.
+ -e/--directory Hide directory redirects, for example
+ http://www.w3.org/TR -> http://www.w3.org/TR/
+ -r/--recursive Check the documents linked from the first one.
+ -D/--depth n Check the documents linked from the first one
+ to depth n (implies --recursive).
+ -l/--location uri Scope of the documents checked in recursive mode.
+ By default, for example for
+ http://www.w3.org/TR/html4/Overview.html
+ it would be http://www.w3.org/TR/html4/
+ -n/--noacclanguage Do not send an Accept-Language header.
+ -L/--languages Languages accepted$langs.
+ -q/--quiet No output if no errors are found. Implies -s.
+ -v/--verbose Verbose mode.
+ -i/--indicator Show progress while parsing.
+ -u/--user username Specify a username for authentication.
+ -p/--password password Specify a password.
+ --hide-same-realm Hide 401's that are in the same realm as the
+ document checked.
+ -t/--timeout value Timeout for HTTP requests.
+ -d/--domain domain Regular expression describing the domain to
+ which the authentication information will be
+ sent$trust.
+ --masquerade \"base1 base2\" Masquerade base URI base1 as base2. See manual
+ page for more information.
+ -y/--proxy proxy Specify an HTTP proxy server.
+ -h/--html HTML output.
+ -?/--help Show this message.
+ -V/--version Output version information.
+
+See \"perldoc Net::FTP\" for information about various environment variables
+affecting FTP connections and \"perldoc Net::NNTP\" for setting a default
+NNTP server for news: URIs.
+
+The W3C_CHECKLINK_CFG environment variable can be used to set the
+configuration file to use. See details in the full manual page, it can
+be displayed with:
+ perldoc $0
+
+More documentation at: http://www.w3.org/2000/07/checklink
+Please send bug reports and comments to the www-validator mailing list:
+ www-validator\@w3.org (with 'checklink' in the subject)
+ Archives are at: http://lists.w3.org/Archives/Public/www-validator/
+";
+ exit $exitval;
+}
+
+sub ask_password ()
+{
+ eval {
+ local $SIG{__DIE__};
+ require Term::ReadKey;
+ Term::ReadKey->require_version(2.00);
+ Term::ReadKey->import(qw(ReadMode));
+ };
+ if ($@) {
+ warn('Warning: Term::ReadKey 2.00 or newer not available, ' .
+ "password input disabled.\n");
+ return;
+ }
+ printf(STDERR 'Enter the password for user %s: ', $Opts{User});
+ ReadMode('noecho', *STDIN);
+ chomp($Opts{Password} = <STDIN>);
+ ReadMode('restore', *STDIN);
+ print(STDERR "ok.\n");
+}
+
+###############################################################################
+
+###########################################
+# Transform foo into file://localhost/foo #
+###########################################
+
+sub urize ($)
+{
+ my $u = URI->new_abs(URI::Escape::uri_unescape($_[0]), URI::file->cwd());
+ return $u->as_string();
+}
+
+########################################
+# Check for broken links in a resource #
+########################################
+
+sub check_uri ($$$;$)
+{
+
+ my ($uri, $first, $depth, $cookie) = @_;
+
+ my $start = &get_timestamp() unless $Opts{Quiet};
+
+ # Get and parse the document
+ my $response = &get_document('GET', $uri, $doc_count, \%redirects);
+
+ # Can we check the resource? If not, we exit here...
+ return -1 if defined($response->{Stop});
+
+ if ($first) {
+ # Use the first URI as the recursion base unless specified otherwise.
+ $Opts{Base_Location} = ($Opts{Base_Location} eq '.')
+ ? $response->{absolute_uri}->canonical() :
+ URI->new($Opts{Base_Location})->canonical();
+ } else {
+ # Before fetching the document, we don't know if we'll be within the
+ # recursion scope or not (think redirects).
+ return -1 unless &in_recursion_scope($response->{absolute_uri});
+
+ print $Opts{HTML} ? '<hr>' : '-' x 40, "\n";
+ }
+
+ # We are checking a new document
+ $doc_count++;
+
+ if ($Opts{HTML}) {
+ &html_header($uri, 0, $cookie) if $first;
+ print('<h2>');
+ }
+
+ my $absolute_uri = $response->{absolute_uri}->as_string();
+
+ my $result_anchor = 'results'.$doc_count;
+
+ printf("\nProcessing\t%s\n\n",
+ $Opts{HTML} ? &show_url($absolute_uri) : $absolute_uri);
+
+ if ($Opts{HTML}) {
+ print("</h2>\n");
+ if (! $Opts{Summary_Only}) {
+ printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
+ $result_anchor);
+ printf("<p>For reliable link checking results, check
+<a href=\"check?uri=%s\">HTML Validity</a> first. See also
+<a href=\"http://jigsaw.w3.org/css-validator/validator?uri=%s\">CSS Validity</a>.</p>
+<p>Back to the <a href=\"checklink\">link checker</a>.</p>\n",
+ map{&encode(URI::Escape::uri_escape($absolute_uri,
+ "^A-Za-z0-9."))}(1..2));
+ print("<pre>\n");
+ }
+ }
+
+ # Record that we have processed this resource
+ $processed{$absolute_uri} = 1;
+ # Parse the document
+ my $p = &parse_document($uri, $absolute_uri,
+ $response->content(), 1,
+ $depth != 0);
+ my $base = URI->new($p->{base});
+
+ # Check anchors
+ ###############
+
+ print "Checking anchors...\n" unless $Opts{Summary_Only};
+
+ my %errors;
+ foreach my $anchor (keys %{$p->{Anchors}}) {
+ my $times = 0;
+ foreach my $l (keys %{$p->{Anchors}{$anchor}}) {
+ $times += $p->{Anchors}{$anchor}{$l};
+ }
+ # They should appear only once
+ $errors{$anchor} = 1 if ($times > 1);
+ # Empty IDREF's are not allowed
+ $errors{$anchor} = 1 if ($anchor eq '');
+ }
+ print " done.\n" unless $Opts{Summary_Only};
+
+ # Check links
+ #############
+
+ my %links;
+ # Record all the links found
+ foreach my $link (keys %{$p->{Links}}) {
+ my $link_uri = URI->new($link);
+ my $abs_link_uri = URI->new_abs($link_uri, $base);
+ if ($Opts{Masquerade}) {
+ if ($abs_link_uri =~ m|^$Opts{Masquerade_From}|) {
+ printf("processing %s in base %s\n",
+ $abs_link_uri, $Opts{Masquerade_To});
+ my $nlink = $abs_link_uri;
+ $nlink =~
+ s|^$Opts{Masquerade_From}|$Opts{Masquerade_To}|;
+ $abs_link_uri = URI->new($nlink);
+ };
+ }
+ foreach my $lines (keys %{$p->{Links}{$link}}) {
+ my $canonical = URI->new($abs_link_uri->canonical());
+ my $url = $canonical->scheme().':'.$canonical->opaque();
+ my $fragment = $canonical->fragment();
+ if (! $fragment) {
+ # Document without fragment
+ $links{$url}{location}{$lines} = 1;
+ } else {
+ # Resource with a fragment
+ $links{$url}{fragments}{$fragment}{$lines} = 1;
+ }
+ }
+ }
+
+ # Build the list of broken URI's
+ my %broken;
+ foreach my $u (keys %links) {
+
+ # Don't check mailto: URI's
+ next if ($u =~ m/^mailto:/);
+
+ &hprintf("Checking link %s\n", $u) unless $Opts{Summary_Only};
+
+ # Check that a link is valid
+ &check_validity($uri, $u, \%links, \%redirects);
+ &hprintf("\tReturn code: %s\n", $results{$u}{location}{code})
+ if ($Opts{Verbose});
+ if ($results{$u}{location}{success}) {
+
+ # Even though it was not broken, we might want to display it
+ # on the results page (e.g. because it required authentication)
+ $broken{$u}{location} = 1 if ($results{$u}{location}{display} >= 400);
+
+ # List the broken fragments
+ foreach my $fragment (keys %{$links{$u}{fragments}}) {
+ if ($Opts{Verbose}) {
+ my @frags = sort keys %{$links{$u}{fragments}{$fragment}};
+ &hprintf("\t\t%s %s - Line%s: %s\n",
+ $fragment,
+ ($results{$u}{fragments}{$fragment}) ? 'OK' : 'Not found',
+ (scalar(@frags) > 1) ? 's' : '',
+ join(', ', @frags)
+ );
+ }
+ # A broken fragment?
+ if ($results{$u}{fragments}{$fragment} == 0) {
+ $broken{$u}{fragments}{$fragment} += 2;
+ }
+ }
+ } else {
+ # Couldn't find the document
+ $broken{$u}{location} = 1;
+ # All the fragments associated are hence broken
+ foreach my $fragment (keys %{$links{$u}{fragments}}) {
+ $broken{$u}{fragments}{$fragment}++;
+ }
+ }
+ }
+ &hprintf("Processed in %ss.\n", &time_diff($start, &get_timestamp()))
+ unless $Opts{Summary_Only};
+
+ # Display results
+ if ($Opts{HTML} && !$Opts{Summary_Only}) {
+ print("</pre>\n");
+ printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor);
+ }
+ print "\n" unless $Opts{Quiet};
+
+ &anchors_summary($p->{Anchors}, \%errors);
+ &links_summary(\%links, \%results, \%broken, \%redirects);
+
+ # Do we want to process other documents?
+ if ($depth != 0) {
+
+ foreach my $u (keys %links) {
+
+ next unless $results{$u}{location}{success}; # Broken link?
+
+ next unless &in_recursion_scope($u);
+
+ # Do we understand its content type?
+ next unless ($results{$u}{location}{type} =~ $ContentTypes);
+
+ # Have we already processed this URI?
+ next if &already_processed($u);
+
+ # Do the job
+ print "\n";
+ if ($Opts{HTML}) {
+ # For the online version, wait for a while to avoid abuses
+ if (!$Opts{Command_Line}) {
+ if ($doc_count == $Opts{Max_Documents}) {
+ print("<hr>\n<p><strong>Maximum number of documents reached!</strong></p>\n");
+ }
+ if ($doc_count >= $Opts{Max_Documents}) {
+ $doc_count++;
+ print("<p>Not checking <strong>$u</strong></p>\n");
+ $processed{$u} = 1;
+ next;
+ }
+ }
+ sleep($Opts{Sleep_Time});
+ }
+ if ($depth < 0) {
+ &check_uri($u, 0, -1);
+ } else {
+ &check_uri($u, 0, $depth-1);
+ }
+ }
+ }
+}
+
+#######################################
+# Get and parse a resource to process #
+#######################################
+
+sub get_document ($$$;\%)
+{
+ my ($method, $uri, $in_recursion, $redirects) = @_;
+ # $method contains the HTTP method the use (GET or HEAD)
+ # $uri contains the identifier of the resource
+ # $in_recursion equals 1 if we are in recursion mode (i.e. it is at least
+ # the second resource checked)
+ # $redirects is a pointer to the hash containing the map of the redirects
+
+ # Get the resource
+ my $response;
+ if (defined($results{$uri}{response})
+ && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
+ $response = $results{$uri}{response};
+ } else {
+ $response = &get_uri($method, $uri);
+ &record_results($uri, $method, $response);
+ &record_redirects($redirects, $response->{Redirects});
+ }
+ if (! $response->is_success()) {
+ if (! $in_recursion) {
+ # Is it too late to request authentication?
+ if ($response->code() == 401) {
+ &authentication($response);
+ } else {
+ &html_header($uri) if $Opts{HTML};
+ &hprintf("\nError: %d %s\n",
+ $response->code(), $response->message());
+ }
+ }
+ $response->{Stop} = 1;
+ return($response);
+ }
+
+ # What is the URI of the resource that we are processing by the way?
+ my $base_uri = URI->new($response->base());
+ my $request_uri = URI->new($response->request->url);
+ $response->{absolute_uri} = $request_uri->abs($base_uri);
+
+ # Can we parse the document?
+ my $failed_reason;
+ if ((my $ct = $response->header('Content-Type')) !~ $ContentTypes) {
+ $failed_reason = "Content-Type for '$request_uri' is '$ct'";
+ } elsif ($response->header('Content-Encoding') &&
+ ((my $ce = $response->header('Content-Encoding')) ne 'identity')) {
+ # @@@ We could maybe handle gzip...
+ $failed_reason = "Content-Encoding for '$request_uri' is '$ce'";
+ }
+ if ($failed_reason) {
+ # No, there is a problem...
+ if (! $in_recursion) {
+ &html_header($uri) if $Opts{HTML};
+ &hprintf("Can't check links: %s.\n", $failed_reason);
+ }
+ $response->{Stop} = 1;
+ }
+
+ # Ok, return the information
+ return($response);
+}
+
+#########################################################
+# Check whether a URI is within the scope of recursion. #
+#########################################################
+
+sub in_recursion_scope ($)
+{
+ my ($uri) = @_;
+ return undef unless $uri;
+
+ my $current = URI->new($uri)->canonical();
+ my $rel = $current->rel($Opts{Base_Location}); # base -> current!
+
+ return undef if ($current eq $rel); # Relative path not possible?
+ return undef if ($rel =~ m|^(\.\.)?/|); # Relative path starts with ../ or /?
+ return 1;
+}
+
+##################################################
+# Check whether a URI has already been processed #
+##################################################
+
+sub already_processed ($)
+{
+ my ($uri) = @_;
+ # Don't be verbose for that part...
+ my $summary_value = $Opts{Summary_Only};
+ $Opts{Summary_Only} = 1;
+ # Do a GET: if it fails, we stop, if not, the results are cached
+ my $response = &get_document('GET', $uri, 1);
+ # ... but just for that part
+ $Opts{Summary_Only} = $summary_value;
+ # Can we process the resource?
+ return -1 if defined($response->{Stop});
+ # Have we already processed it?
+ return 1 if defined($processed{$response->{absolute_uri}->as_string()});
+ # It's not processed yet and it is processable: return 0
+ return 0;
+}
+
+############################
+# Get the content of a URI #
+############################
+
+sub get_uri ($$;$\%$$$$)
+{
+ # Here we have a lot of extra parameters in order not to lose information
+ # if the function is called several times (401's)
+ my ($method, $uri, $start, $redirects, $code, $realm, $message, $auth) = @_;
+
+ # $method contains the method used
+ # $uri contains the target of the request
+ # $start is a timestamp (not defined the first time the function is
+ # called)
+ # $redirects is a map of redirects
+ # $code is the first HTTP return code
+ # $realm is the realm of the request
+ # $message is the HTTP message received
+ # $auth equals 1 if we want to send out authentication information
+
+ # For timing purposes
+ $start = &get_timestamp() unless defined($start);
+
+ # Prepare the query
+ my %lwpargs = ($LWP::VERSION >= 5.6) ? (keep_alive => 1) : ();
+ my $ua = W3C::UserAgent->new(%lwpargs);
+ $ua->timeout($Opts{Timeout});
+ $ua->agent(sprintf('%s/%s [%s] %s',
+ $AGENT, $VERSION, $CVS_VERSION, $ua->agent()));
+ $ua->env_proxy();
+ $ua->proxy('http', 'http://' . $Opts{HTTP_Proxy}) if $Opts{HTTP_Proxy};
+
+ # $ua->{fetching} contains the URI we originally wanted
+ # $ua->{uri} is modified in the case of a redirect; this is used to
+ # build $ua->{Redirects}
+ $ua->{uri} = $ua->{fetching} = $uri;
+ $ua->{Redirects} = $redirects if defined($redirects);
+
+ # Do we want printouts of progress?
+ my $verbose_progress =
+ ! ($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}));
+
+ &hprintf("%s %s ", $method, $uri) if $verbose_progress;
+
+ my $request = new HTTP::Request($method, $uri);
+ if ($Opts{Accept_Language} && $Opts{Languages}) {
+ $request->header('Accept-Language' => $Opts{Languages});
+ }
+ $request->header('Accept', $Accept);
+ # Are we providing authentication info?
+ if ($auth && $request->url()->host() =~ /$Opts{Trusted}/i) {
+ if (defined($ENV{HTTP_AUTHORIZATION})) {
+ $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
+ } elsif (defined($Opts{User}) && defined($Opts{Password})) {
+ $request->authorization_basic($Opts{User}, $Opts{Password});
+ }
+ }
+
+ # Tell the user agent if we want progress reports (in redirects) or not.
+ $ua->{Checklink_verbose_progress} = $verbose_progress;
+
+ # Check if the IP address is allowed.
+ my $response = &ip_allowed($request->uri());
+ return $response if $response;
+
+ # Do the query
+ $response = $ua->request($request);
+
+ # Get the results
+ # Record the very first response
+ if (! defined($code)) {
+ $code = $ua->{FirstResponse};
+ $message = $ua->{FirstMessage};
+ }
+ # Authentication requested?
+ if ($response->code() == 401 &&
+ !defined($auth) &&
+ (defined($ENV{HTTP_AUTHORIZATION})
+ || (defined($Opts{User}) && defined($Opts{Password})))) {
+
+ # Set host as trusted domain unless we already have one.
+ $Opts{Trusted} ||= sprintf('^%s$', quotemeta($response->base()->host()));
+
+ # Deal with authentication and avoid loops
+ if (! defined($realm)) {
+ $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
+ $realm = $1;
+ }
+ print "\n" if $verbose_progress;
+ return &get_uri($method, $response->request->url,
+ $start, $ua->{Redirects},
+ $code, $realm, $message, 1);
+ }
+ # Record the redirects
+ $response->{Redirects} = $ua->{Redirects};
+ &hprintf(" fetched in %ss\n",
+ &time_diff($start, &get_timestamp())) if $verbose_progress;
+
+ $response->{OriginalCode} = $code;
+ $response->{OriginalMessage} = $message;
+ $response->{Realm} = $realm if defined($realm);
+
+ return $response;
+}
+
+#########################################
+# Record the results of an HTTP request #
+#########################################
+
+sub record_results ($$$)
+{
+ my ($uri, $method, $response) = @_;
+ $results{$uri}{response} = $response;
+ $results{$uri}{method} = $method;
+ $results{$uri}{location}{code} = $response->code();
+ $results{$uri}{location}{type} = $response->header('Content-type');
+ $results{$uri}{location}{display} = $results{$uri}{location}{code};
+ $results{$uri}{location}{orig} = $response->{OriginalCode};
+ # Did we get a redirect?
+ if ($response->{OriginalCode} != $response->code()) {
+ $results{$uri}{location}{orig_message} = $response->{OriginalMessage};
+ $results{$uri}{location}{redirected} = 1;
+ }
+ $results{$uri}{location}{success} = $response->is_success();
+ # Stores the authentication information
+ if (defined($response->{Realm})) {
+ $results{$uri}{location}{realm} = $response->{Realm};
+ $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm};
+ }
+ # What type of broken link is it? (stored in {record} - the {display}
+ # information is just for visual use only)
+ if (($results{$uri}{location}{display} == 401)
+ && ($results{$uri}{location}{code} == 404)) {
+ $results{$uri}{location}{record} = 404;
+ } else {
+ $results{$uri}{location}{record} = $results{$uri}{location}{display};
+ }
+ # Did it fail?
+ $results{$uri}{location}{message} = $response->message();
+ if (! $results{$uri}{location}{success}) {
+ &hprintf("Error: %d %s\n",
+ $results{$uri}{location}{code},
+ $results{$uri}{location}{message})
+ if ($Opts{Verbose});
+ return;
+ }
+}
+
+####################
+# Parse a document #
+####################
+
+sub parse_document ($$$$;$)
+{
+ my ($uri, $location, $document, $links, $rec_needs_links) = @_;
+
+ my $p;
+
+ if (defined($results{$uri}{parsing})) {
+ # We have already done the job. Woohoo!
+ $p->{base} = $results{$uri}{parsing}{base};
+ $p->{Anchors} = $results{$uri}{parsing}{Anchors};
+ $p->{Links} = $results{$uri}{parsing}{Links};
+ return $p;
+ }
+
+ my $start;
+ $p = W3C::CheckLink->new();
+ $p->{base} = $location;
+ if (! $Opts{Summary_Only}) {
+ $start = &get_timestamp();
+ print("Parsing...\n");
+ }
+ if (!$Opts{Summary_Only} || $Opts{Progress}) {
+ $p->{Total} = ($document =~ tr/\n//);
+ }
+ # We only look for anchors if we are not interested in the links
+ # obviously, or if we are running a recursive checking because we
+ # might need this information later
+ $p->{only_anchors} = !($links || $rec_needs_links);
+
+ # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
+ # Processing instructions are not parsed by process, but in this case
+ # it should be. It's expensive, it's horrible, but it's the easiest way
+ # for right now.
+ $document =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors};
+
+ $p->parse($document);
+
+ if (! $Opts{Summary_Only}) {
+ my $stop = &get_timestamp();
+ print "\r" if $Opts{Progress};
+ &hprintf(" done (%d lines in %ss).\n",
+ $p->{Total}, &time_diff($start, $stop));
+ }
+
+ # Save the results before exiting
+ $results{$uri}{parsing}{base} = $p->{base};
+ $results{$uri}{parsing}{Anchors} = $p->{Anchors};
+ $results{$uri}{parsing}{Links} = $p->{Links};
+
+ return $p;
+}
+
+###################################
+# Constructor for W3C::CheckLink #
+###################################
+
+sub new
+{
+ my $p = HTML::Parser::new(@_, api_version => 3);
+
+ # Start tags
+ $p->handler(start => 'start', 'self, tagname, attr, text, event, tokens');
+ # Declarations
+ $p->handler(declaration =>
+ sub {
+ my $self = shift;
+ $self->declaration(substr($_[0], 2, -1));
+ }, 'self, text');
+ # Other stuff
+ $p->handler(default => 'text', 'self, text');
+ # Line count
+ $p->{Line} = 1;
+ # Check <a [..] name="...">?
+ $p->{check_name} = 1;
+ # Check <[..] id="..">?
+ $p->{check_id} = 1;
+ # Don't interpret comment loosely
+ $p->strict_comment(1);
+
+ return $p;
+}
+
+#################################################
+# Record or return the doctype of the document #
+#################################################
+
+sub doctype
+{
+ my ($self, $dc) = @_;
+ return $self->{doctype} unless $dc;
+ $_ = $self->{doctype} = $dc;
+
+ # What to look for depending on the doctype
+ $self->{check_name} = 0 if ($_ eq '-//W3C//DTD XHTML Basic 1.0//EN');
+
+ # Check for the id tag
+ if (
+ # HTML 2.0 & 3.0
+ m%^-//IETF//DTD HTML [23]\.0//% ||
+ # HTML 3.2
+ m%^-//W3C//DTD HTML 3\.2//%) {
+ $self->{check_id} = 0;
+ }
+ # Enable XML extensions
+ $self->xml_mode(1) if (m%^-//W3C//DTD XHTML %);
+}
+
+#######################################
+# Count the number of lines in a file #
+#######################################
+
+sub new_line
+{
+ my ($self, $string) = @_;
+ my $count = ($string =~ tr/\n//);
+ $self->{Line} = $self->{Line} + $count;
+ printf("\r%4d%%", int($self->{Line}/$self->{Total}*100)) if $Opts{Progress};
+}
+
+#############################
+# Extraction of the anchors #
+#############################
+
+sub get_anchor
+{
+ my ($self, $tag, $attr) = @_;
+
+ my $anchor = $attr->{id} if $self->{check_id};
+ if ($self->{check_name} && ($tag eq 'a')) {
+ # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
+ # Force an error if it's not the case (or if id's and name's values
+ # are different)
+ # If id is defined, name if defined must have the same value
+ $anchor ||= $attr->{name};
+ }
+
+ return $anchor;
+}
+
+###########################
+# W3C::CheckLink handlers #
+###########################
+
+sub add_link
+{
+ my ($self, $uri) = @_;
+ $self->{Links}{$uri}{$self->{Line}}++ if defined($uri);
+}
+
+sub start
+{
+ my ($self, $tag, $attr, $text) = @_;
+
+ # Anchors
+ my $anchor = $self->get_anchor($tag, $attr);
+ $self->{Anchors}{$anchor}{$self->{Line}}++ if defined($anchor);
+
+ # Links
+ if (!$self->{only_anchors}) {
+ # Here, we are checking too many things
+ # The right thing to do is to parse the DTD...
+ if ($tag eq 'base') {
+ # Treat <base> (without href) or <base href=""> as if it didn't exist.
+ if (defined($attr->{href}) && $attr->{href} ne '') {
+ $self->{base} = $attr->{href};
+ }
+ } else {
+ $self->add_link($attr->{href});
+ }
+ $self->add_link($attr->{src});
+ $self->add_link($attr->{data}) if ($tag eq 'object');
+ $self->add_link($attr->{cite}) if ($tag eq 'blockquote');
+ }
+
+ # Line counting
+ $self->new_line($text) if ($text =~ m/\n/);
+}
+
+sub text
+{
+ my ($self, $text) = @_;
+ if (!$Opts{Progress}) {
+ # If we are just extracting information about anchors,
+ # parsing this part is only cosmetic (progress indicator)
+ return unless !$self->{only_anchors};
+ }
+ $self->new_line($text) if ($text =~ /\n/);
+}
+
+sub declaration
+{
+ my ($self, $text) = @_;
+ # Extract the doctype
+ my @declaration = split(/\s+/, $text, 4);
+ if (($#declaration >= 3) &&
+ ($declaration[0] eq 'DOCTYPE') &&
+ (lc($declaration[1]) eq 'html')) {
+ # Parse the doctype declaration
+ $text =~ m/^DOCTYPE\s+html\s+PUBLIC\s+\"([^\"]*)\"(\s+\"([^\"]*)\")?\s*$/i;
+ # Store the doctype
+ $self->doctype($1) if $1;
+ # If there is a link to the DTD, record it
+ $self->{Links}{$3}{$self->{Line}}++ if (!$self->{only_anchors} && $3);
+ }
+ return unless !$self->{only_anchors};
+ $self->text($text);
+}
+
+################################
+# Check the validity of a link #
+################################
+
+sub check_validity ($$\%\%)
+{
+ my ($testing, $uri, $links, $redirects) = @_;
+ # $testing is the URI of the document checked
+ # $uri is the URI of the target that we are verifying
+ # $links is a hash of the links in the documents checked
+ # $redirects is a map of the redirects encountered
+
+ # Checking file: URI's is not allowed with a CGI
+ if ($testing ne $uri) {
+ if (!$Opts{Command_Line} && $testing !~ m/^file:/ && $uri =~ m/^file:/) {
+ my $msg = 'Error: \'file:\' URI not allowed';
+ # Can't test? Return 400 Bad request.
+ $results{$uri}{location}{code} = 400;
+ $results{$uri}{location}{record} = 400;
+ $results{$uri}{location}{orig} = 400;
+ $results{$uri}{location}{success} = 0;
+ $results{$uri}{location}{message} = $msg;
+ &hprintf("Error: %d %s\n", 400, $msg) if $Opts{Verbose};
+ return;
+ }
+ }
+
+ # Get the document with the appropriate method
+ # Only use GET if there are fragments. HEAD is enough if it's not the
+ # case.
+ my @fragments = keys %{$links->{$uri}{fragments}};
+ my $method = scalar(@fragments) ? 'GET' : 'HEAD';
+
+ my $response;
+ my $being_processed = 0;
+ if ((! defined($results{$uri}))
+ || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
+ $being_processed = 1;
+ $response = &get_uri($method, $uri);
+ # Get the information back from get_uri()
+ &record_results($uri, $method, $response);
+ # Record the redirects
+ &record_redirects($redirects, $response->{Redirects});
+ }
+
+ # We got the response of the HTTP request. Stop here if it was a HEAD.
+ return if ($method eq 'HEAD');
+
+ # There are fragments. Parse the document.
+ my $p;
+ if ($being_processed) {
+ # Can we really parse the document?
+ return unless defined($results{$uri}{location}{type});
+ if ($results{$uri}{location}{type} !~ $ContentTypes) {
+ &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
+ $uri, $results{$uri}{location}{type})
+ if ($Opts{Verbose});
+ return;
+ }
+ # Do it then
+ $p = &parse_document($uri, $response->base(),
+ $response->as_string(), 0);
+ } else {
+ # We already had the information
+ $p->{Anchors} = $results{$uri}{parsing}{Anchors};
+ }
+ # Check that the fragments exist
+ foreach my $fragment (keys %{$links->{$uri}{fragments}}) {
+ if (defined($p->{Anchors}{$fragment})
+ || &escape_match($fragment, $p->{Anchors})) {
+ $results{$uri}{fragments}{$fragment} = 1;
+ } else {
+ $results{$uri}{fragments}{$fragment} = 0;
+ }
+ }
+}
+
+sub escape_match ($\%)
+{
+ my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
+ foreach my $b (keys %$hash) {
+ return 1 if ($a eq URI::Escape::uri_unescape($b));
+ }
+ return 0;
+}
+
+##########################
+# Ask for authentication #
+##########################
+
+sub authentication ($)
+{
+ my $r = $_[0];
+ $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
+ my $realm = $1;
+
+ if ($Opts{Command_Line}) {
+ printf STDERR <<EOF, $r->request()->url(), $realm;
+
+Authentication is required for %s.
+The realm is %s.
+Use the -u and -p options to specify a username and password and the -d option
+to specify trusted domains.
+EOF
+ } else {
+
+ printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\nConnection: close\nContent-Language: en\nContent-Type: text/html; charset=iso-8859-1\n\n", $r->headers->www_authenticate);
+
+ printf("%s
+<html lang=\"en\">
+<head>
+<title>401 Authorization Required</title>
+</head>
+<body>
+<h1>Authorization Required</h1>
+<p>
+ You need %s access to %s to perform Link Checking.<br>
+", $DocType, &encode($realm), $r->request->url);
+
+ if ($Opts{Trusted}) {
+ printf <<EOF, &encode($Opts{Trusted});
+ This service has been configured to send authentication only to hostnames
+ matching the regular expression <code>%s</code>
+EOF
+ }
+
+ print "</p>\n";
+ }
+}
+
+##################
+# Get statistics #
+##################
+
+sub get_timestamp ()
+{
+ return pack('LL', Time::HiRes::gettimeofday());
+}
+
+sub time_diff ($$)
+{
+ my @start = unpack('LL', $_[0]);
+ my @stop = unpack('LL', $_[1]);
+ for ($start[1], $stop[1]) {
+ $_ /= 1_000_000;
+ }
+ return(sprintf("%.1f", ($stop[0]+$stop[1])-($start[0]+$start[1])));
+}
+
+########################
+# Handle the redirects #
+########################
+
+# Record the redirects in a hash
+sub record_redirects (\%\%)
+{
+ my ($redirects, $sub) = @_;
+ foreach my $r (keys %$sub) {
+ $redirects->{$r} = $sub->{$r};
+ }
+}
+
+# Determine if a request is redirected
+sub is_redirected ($%)
+{
+ my ($uri, %redirects) = @_;
+ return(defined($redirects{$uri}));
+}
+
+# Get a list of redirects for a URI
+sub get_redirects ($%)
+{
+ my ($uri, %redirects) = @_;
+ my @history = ($uri);
+ my %seen = ($uri => 1); # for tracking redirect loops
+ my $loop = 0;
+ while ($redirects{$uri}) {
+ $uri = $redirects{$uri};
+ push(@history, $uri);
+ if ($seen{$uri}) {
+ $loop = 1;
+ last;
+ } else {
+ $seen{$uri}++;
+ }
+ }
+ return ($loop, @history);
+}
+
+####################################################
+# Tool for sorting the unique elements of an array #
+####################################################
+
+sub sort_unique (@)
+{
+ my %saw;
+ @saw{@_} = ();
+ return (sort { $a <=> $b } keys %saw);
+}
+
+#####################
+# Print the results #
+#####################
+
+sub anchors_summary (\%\%)
+{
+ my ($anchors, $errors) = @_;
+
+ # Number of anchors found.
+ my $n = scalar(keys(%$anchors));
+ if (! $Opts{Quiet}) {
+ if ($Opts{HTML}) {
+ print("<h3>Anchors</h3>\n<p>");
+ } else {
+ print("Anchors\n\n");
+ }
+ &hprintf("Found %d anchor%s.", $n, ($n == 1) ? '' : 's');
+ print('</p>') if $Opts{HTML};
+ print("\n");
+ }
+ # List of the duplicates, if any.
+ my @errors = keys %{$errors};
+ if (! scalar(@errors)) {
+ print("<p>Valid anchors!</p>\n") if (! $Opts{Quiet} && $Opts{HTML} && $n);
+ return;
+ }
+ undef $n;
+
+ print('<p>') if $Opts{HTML};
+ print('List of duplicate and empty anchors');
+ print <<EOF if $Opts{HTML};
+</p>
+<table border="1" summary="List of duplicate and empty anchors.">
+<thead>
+<tr>
+<th>Anchors</th>
+<th>Lines</th>
+</tr>
+</thead>
+<tbody>
+EOF
+ print("\n");
+
+ foreach my $anchor (@errors) {
+ my $format;
+ my @unique = &sort_unique(keys %{$anchors->{$anchor}});
+ if ($Opts{HTML}) {
+ $format = "<tr class=\"broken\"><td>%s</td><td>%s</td></tr>\n";
+ } else {
+ my $s = (scalar(@unique) > 1) ? 's' : '';
+ $format = "\t%s\tLine$s: %s\n";
+ }
+ printf($format,
+ &encode($anchor eq '' ? 'Empty anchor' : $anchor),
+ join(', ', @unique));
+ }
+
+ print("</tbody>\n</table>\n") if $Opts{HTML};
+}
+
+sub show_link_report (\%\%\%\%\@;$\%)
+{
+ my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
+
+ print("\n<dl class=\"report\">") if $Opts{HTML};
+ print("\n");
+
+ # Process each URL
+ my ($c, $previous_c);
+ foreach my $u (@$urls) {
+ my @fragments = keys %{$broken->{$u}{fragments}};
+ # Did we get a redirect?
+ my $redirected = &is_redirected($u, %$redirects);
+ # List of lines
+ my @total_lines;
+ foreach my $l (keys %{$links->{$u}{location}}) {
+ push (@total_lines, $l);
+ }
+ foreach my $f (keys %{$links->{$u}{fragments}}) {
+ next if ($f eq $u && defined($links->{$u}{$u}{-1}));
+ foreach my $l (keys %{$links->{$u}{fragments}{$f}}) {
+ push (@total_lines, $l);
+ }
+ }
+
+ my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects);
+
+ # Error type
+ $c = &code_shown($u, $results);
+ # What to do
+ my $whattodo;
+ my $redirect_too;
+ if ($todo) {
+ if ($u =~ m/^javascript:/) {
+ if ($Opts{HTML}) {
+ $whattodo =
+'You must change this link: people using a browser without Javascript support
+will <em>not</em> be able to follow this link. See the
+<a href="http://www.w3.org/TR/1999/WAI-WEBCONTENT-19990505/#tech-scripts">Web
+Content Accessibility Guidelines on the use of scripting on the Web</a> and
+the
+<a href="http://www.w3.org/TR/WCAG10-HTML-TECHS/#directly-accessible-scripts">techniques
+on how to solve this</a>.';
+ } else {
+ $whattodo = 'Change this link: people using a browser without Javascript support will not be able to follow this link.';
+ }
+ } elsif ($c == 500) {
+ # 500's could be a real 500 or a DNS lookup problem
+ if ($results->{$u}{location}{message} =~
+ m/Bad hostname '[^\']*'/) {
+ $whattodo = 'The hostname could not be resolved. This link needs to be fixed.';
+ } else {
+ $whattodo = 'This is a server-side problem. Check the URI.';
+ }
+ } elsif ($redirect_loop) {
+ $whattodo =
+ 'Retrieving the URI results in a redirect loop, that should be ' .
+ 'fixed. Examine the redirect sequence to see where the loop ' .
+ 'occurs.';
+ } else {
+ $whattodo = $todo->{$c};
+ }
+ # @@@ 303 and 307 ???
+ if (defined($redirects{$u}) && ($c != 301) && ($c != 302)) {
+ $redirect_too = 'The original request has been redirected.';
+ $whattodo .= ' '.$redirect_too unless $Opts{HTML};
+ }
+ } else {
+ # Directory redirects
+ $whattodo = 'Add a trailing slash to the URL.';
+ }
+
+ my @unique = &sort_unique(@total_lines);
+ my $lines_list = join(', ', @unique);
+ my $s = (scalar(@unique) > 1) ? 's' : '';
+ undef @unique;
+
+ if ($Opts{HTML}) {
+ # Style stuff
+ my $idref = '';
+ if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
+ $idref = ' id="d'.$doc_count.'code_'.$c.'"';
+ $previous_c = $c;
+ }
+ # Main info
+ for (@redirects_urls) {
+ $_ = &show_url($_);
+ }
+ # HTTP message
+ my $http_message;
+ if ($results->{$u}{location}{message}) {
+ $http_message = &encode($results->{$u}{location}{message});
+ if ($c == 404 || $c == 500) {
+ $http_message = '<span class="broken">'.
+ $http_message.'</span>';
+ }
+ }
+ my $redirmsg =
+ $redirect_loop ? ' <em>redirect loop detected</em>' : '';
+ printf("
+<dt%s>%s</dt>
+<dd>What to do: <strong%s>%s</strong>%s<br></dd>
+<dd>HTTP Code returned: %d%s<br>
+HTTP Message: %s%s%s</dd>
+<dd>Line%s: %s</dd>\n",
+ # Anchor for return codes
+ $idref,
+ # List of redirects
+ $redirected ?
+ join(' redirected to<br>', @redirects_urls) . $redirmsg :
+ &show_url($u),
+ # Color
+ &bgcolor($c),
+ # What to do
+ $whattodo,
+ # Redirect too?
+ $redirect_too ?
+ sprintf(' <span %s>%s</span>', &bgcolor(301), $redirect_too) : '',
+ # Original HTTP reply
+ $results->{$u}{location}{orig},
+ # Final HTTP reply
+ ($results->{$u}{location}{code} !=
+ $results->{$u}{location}{orig})
+ ? ' <span title="redirected to">-></span> '.
+ &encode($results->{$u}{location}{code})
+ : '',
+ # Realm
+ (defined($results->{$u}{location}{realm})
+ ? 'Realm: '.&encode($results->{$u}{location}{realm}).'<br>'
+ : ''),
+ # HTTP original message
+ defined($results->{$u}{location}{orig_message})
+ ? &encode($results->{$u}{location}{orig_message}).
+ ' <span title="redirected to">-></span> '
+ : '',
+ # HTTP final message
+ $http_message,
+ $s,
+ # List of lines
+ $lines_list);
+ if ($#fragments >= 0) {
+ my $fragment_direction = '';
+ if ($results->{$u}{location}{code} == 200) {
+ $fragment_direction =
+ ' <strong class="broken">They need to be fixed!</strong>';
+ }
+ printf("<dd><dl><dt>Broken fragments and their line numbers: %s</dt>\n",
+ $fragment_direction);
+ }
+ } else {
+ my $redirmsg = $redirect_loop ? ' redirect loop detected' : '';
+ printf("\n%s\t%s\n Code: %d%s %s\n%s\n",
+ # List of redirects
+ $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u,
+ # List of lines
+ $lines_list ? "Line$s: $lines_list" : '',
+ # Original HTTP reply
+ $results->{$u}{location}{orig},
+ # Final HTTP reply
+ ($results->{$u}{location}{code} != $results->{$u}{location}{orig})
+ ? ' -> '.$results->{$u}{location}{code}
+ : '',
+ # HTTP message
+ $results->{$u}{location}{message} ?
+ $results->{$u}{location}{message} : '',
+ # What to do
+ wrap(' To do: ', ' ', $whattodo));
+ if ($#fragments >= 0) {
+ if ($results->{$u}{location}{code} == 200) {
+ print("The following fragments need to be fixed:\n");
+ } else {
+ print("Fragments:\n");
+ }
+ }
+ }
+ # Fragments
+ foreach my $f (@fragments) {
+ if ($Opts{HTML}) {
+ printf("<dd>%s: %s</dd>\n",
+ # Broken fragment
+ &show_url($u, $f),
+ # List of lines
+ join(', ', &sort_unique(keys %{$links->{$u}{fragments}{$f}})));
+ } else {
+ my @unq = &sort_unique(keys %{$links->{$u}{fragments}{$f}});
+ printf("\t%-30s\tLine%s: %s\n",
+ # Fragment
+ $f,
+ # Multiple?
+ (scalar(@unq) > 1) ? 's' : '',
+ # List of lines
+ join(', ', @unq));
+ }
+ }
+
+ print("</dl></dd>\n") if ($Opts{HTML} && scalar(@fragments));
+ }
+
+ # End of the table
+ print("</dl>\n") if $Opts{HTML};
+}
+
+sub code_shown ($$)
+{
+ my ($u, $results) = @_;
+
+ if ($results->{$u}{location}{record} == 200) {
+ return $results->{$u}{location}{orig};
+ } else {
+ return $results->{$u}{location}{record};
+ }
+}
+
+#
+# Checks whether we're allowed to retrieve the document based on it's IP
+# address. Takes an URI object and returns a HTTP::Response containing the
+# appropriate status and error message if the IP was disallowed, undef
+# otherwise. URIs without hostname or IP address are always allowed,
+# including schemes where those make no sense (eg. data:, often javascript:).
+#
+sub ip_allowed ($)
+{
+ my ($uri) = @_;
+ my $hostname = undef;
+ eval { $hostname = $uri->host() }; # Not all URIs implement host()...
+ return undef unless $hostname;
+
+ my $addr = my $iptype = my $resp = undef;
+ if (my $host = gethostbyname($hostname)) {
+ $addr = inet_ntoa($host->addr()) if $host->addr();
+ if ($addr && (my $ip = Net::IP->new($addr))) {
+ $iptype = $ip->iptype();
+ }
+ }
+ $iptype = 'PUBLIC'
+ if ($iptype && $iptype eq 'PRIVATE' && $Opts{Allow_Private_IPs});
+ if ($iptype && $iptype ne 'PUBLIC') {
+ my $code = 403;
+ my $msg =
+ 'Checking non-public IP address disallowed by service configuration';
+ $resp = HTTP::Response->new($code, $msg);
+ $resp->{OriginalCode} = $code;
+ $resp->{OriginalMessage} = $msg;
+ }
+ return $resp;
+}
+
+sub links_summary (\%\%\%\%)
+{
+ # Advices to fix the problems
+
+ my %todo = ( 200 => 'There are broken fragments which must be fixed.',
+ 300 => 'It usually means that there is a typo in a link that triggers mod_speling action - this must be fixed!',
+ 301 => 'You should update the link.',
+ 302 => 'Usually nothing.',
+ 303 => 'Usually nothing.',
+ 307 => 'Usually nothing.',
+ 400 => 'Usually the sign of a malformed URL that cannot be parsed by the server.',
+ 401 => "The link is not public. You'd better specify it.",
+ 403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
+ 404 => 'The link is broken. Fix it NOW!',
+ 405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why. Check the link manually.',
+ 406 => "The server isn't capable of responding according to the Accept* headers sent. Check it out.",
+ 407 => 'The link is a proxy, but requires Authentication.',
+ 408 => 'The request timed out.',
+ 410 => 'The resource is gone. You should remove this link.',
+ 415 => 'The media type is not supported.',
+ 500 => 'Either the hostname is incorrect or it is a server side problem. Check the detailed list.',
+ 501 => 'Could not check this link: method not implemented or scheme not supported.',
+ 503 => 'The server cannot service the request, for some unknown reason.');
+ my %priority = ( 410 => 1,
+ 404 => 2,
+ 403 => 5,
+ 200 => 10,
+ 300 => 15,
+ 401 => 20
+ );
+
+ my ($links, $results, $broken, $redirects) = @_;
+
+ # List of the broken links
+ my @urls = keys %{$broken};
+ my @dir_redirect_urls = ();
+ if ($Opts{Redirects}) {
+ # Add the redirected URI's to the report
+ for my $l (keys %$redirects) {
+ next unless (defined($results->{$l})
+ && defined($links->{$l})
+ && !defined($broken->{$l}));
+ # Check whether we have a "directory redirect"
+ # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/
+ my ($redirect_loop, @redirects) = get_redirects($l, %$redirects);
+ if (($#redirects == 1)
+ && (($redirects[0].'/') eq $redirects[1])) {
+ push(@dir_redirect_urls, $l);
+ next;
+ }
+ push(@urls, $l);
+ }
+ }
+
+ # Broken links and redirects
+ if ($#urls < 0) {
+ if (! $Opts{Quiet}) {
+ if ($Opts{HTML}) {
+ print "<h3>Links</h3>\n<p>Valid links!</p>";
+ } else {
+ print "\nValid links.";
+ }
+ print "\n";
+ }
+ } else {
+ print('<h3>') if $Opts{HTML};
+ print("\nList of broken links");
+ print(' and redirects') if $Opts{Redirects};
+
+ # Sort the URI's by HTTP Code
+ my %code_summary;
+ my @idx;
+ foreach my $u (@urls) {
+ if (defined($results->{$u}{location}{record})) {
+ my $c = &code_shown($u, $results);
+ $code_summary{$c}++;
+ push(@idx, $c);
+ }
+ }
+ my @sorted = @urls[
+ sort {
+ defined($priority{$idx[$a]}) ?
+ defined($priority{$idx[$b]}) ?
+ $priority{$idx[$a]}
+ <=> $priority{$idx[$b]} :
+ -1 :
+ defined($priority{$idx[$b]}) ?
+ 1 :
+ $idx[$a] <=> $idx[$b]
+ } 0 .. $#idx
+ ];
+ @urls = @sorted;
+ undef(@sorted); undef(@idx);
+
+ if ($Opts{HTML}) {
+ # Print a summary
+ print <<EOF;
+</h3>
+<p><em>Fragments listed are broken. See the table below to know what action
+to take.</em></p>
+<table border="1" summary="List of broken fragments and suggested actions.">
+<thead>
+<tr>
+<th>Code</th>
+<th>Occurrences</th>
+<th>What to do</th>
+</tr>
+</thead>
+<tbody>
+EOF
+ foreach my $code (sort(keys(%code_summary))) {
+ printf('<tr%s>', &bgcolor($code));
+ printf('<td><a href="#d%scode_%s">%s</a></td>',
+ $doc_count, $code, $code);
+ printf('<td>%s</td>', $code_summary{$code});
+ printf('<td>%s</td>', $todo{$code});
+ print "</tr>\n";
+ }
+ print "</tbody>\n</table>\n";
+ } else {
+ print(':');
+ }
+ &show_link_report($links, $results, $broken, $redirects,
+ \@urls, 1, \%todo);
+ }
+
+ # Show directory redirects
+ if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) {
+ print('<h3>') if $Opts{HTML};
+ print("\nList of directory redirects");
+ print("</h3>\n<p>The links below are not broken, but the document does not use the exact URL.</p>") if $Opts{HTML};
+ &show_link_report($links, $results, $broken, $redirects,
+ \@dir_redirect_urls);
+ }
+}
+
+###############################################################################
+
+################
+# Global stats #
+################
+
+sub global_stats ()
+{
+ my $stop = &get_timestamp();
+ my $n_docs =
+ ($doc_count <= $Opts{Max_Documents}) ? $doc_count : $Opts{Max_Documents};
+ return sprintf('Checked %d document%s in %s seconds.',
+ $n_docs,
+ ($n_docs == 1) ? '' : 's',
+ &time_diff($timestamp, $stop));
+}
+
+##################
+# HTML interface #
+##################
+
+sub html_header ($;$$)
+{
+ my ($uri, $doform, $cookie) = @_;
+
+ $uri = &encode($uri);
+ my $title = ' Link Checker' . ($uri eq '' ? '' : ': ' . $uri);
+
+ # mod_perl 1.99_05 doesn't seem to like if the "\n\n" isn't in the same
+ # print() statement as the last header...
+
+ my $headers = '';
+ if (! $Opts{Command_Line}) {
+ $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $doform;
+ $headers .= "Content-Type: text/html; charset=iso-8859-1\n";
+ $headers .= "Content-Script-Type: application/x-javascript\n";
+ $headers .= "Set-Cookie: $cookie\n" if $cookie;
+ $headers .= "Content-Language: en\n\n";
+ }
+
+ my $script = my $onload = '';
+ if ($doform) {
+ $script = "
+<script type=\"application/x-javascript\">
+function uriOk()
+{
+ var v = document.forms[0].uri.value;
+ if (v.length > 0) {
+ if (v.search) return (v.search(/\\S/) != -1);
+ return true;
+ }
+ return false;
+}
+</script>";
+ $onload = ' onload="document.forms[0].uri.focus()"';
+ }
+
+ print $headers, $DocType, "
+<html lang=\"en\">
+<head>
+<title>W3C", $title, "</title>
+<style type=\"text/css\">
+body, address {
+ font-family: sans-serif;
+ color: black;
+ background: white;
+}
+pre, code, tt {
+ font-family: monospace;
+}
+img {
+ color: white;
+ border: none;
+ vertical-align: middle;
+}
+fieldset {
+ padding-left: 1em;
+ background-color: #eee;
+}
+th {
+ text-align: left;
+}
+h1 a {
+ color: black;
+}
+h1 {
+ color: #053188;
+}
+h1#title {
+ background-color: #eee;
+ border-bottom: 1px solid black;
+ padding: .25em;
+}
+address {
+ padding: 1ex;
+ border-top: 1px solid black;
+ background-color: #eee;
+ clear: right;
+}
+address img {
+ float: right;
+ width: 88px;
+}
+a:hover {
+ background-color: #eee#;
+}
+a:visited {
+ color: purple;
+}
+.report {
+ width: 100%;
+}
+dt.report {
+ font-weight: bold;
+}
+.unauthorized {
+ background-color: aqua;
+}
+.redirect {
+ background-color: yellow;
+}
+.broken {
+ background-color: red;
+}
+.multiple {
+ background-color: fuchsia;
+}
+</style>", $script, "
+</head>
+<body", $onload, ">
+<h1 id=\"title\"><a href=\"http://www.w3.org/\" title=\"W3C\"><img alt=\"W3C\" id=\"logo\" src=\"http://www.w3.org/Icons/w3c_home\" height=\"48\" width=\"72\"></a> ", $title, "</h1>\n\n";
+}
+
+sub bgcolor ($)
+{
+ my ($code) = @_;
+ my $class;
+ my $r = HTTP::Response->new($code);
+ if ($r->is_success()) {
+ return '';
+ } elsif ($code == 300) {
+ $class = 'multiple';
+ } elsif ($code == 401) {
+ $class = 'unauthorized';
+ } elsif ($r->is_redirect()) {
+ $class = 'redirect';
+ } elsif ($r->is_error()) {
+ $class = 'broken';
+ } else {
+ $class = 'broken';
+ }
+ return(' class="'.$class.'"');
+}
+
+sub show_url ($;$)
+{
+ my ($url, $fragment) = @_;
+ if (defined($fragment)) {
+ my $u = URI->new($url);
+ $u->fragment($fragment);
+ $url = $u->as_string();
+ }
+ $url = &encode($url);
+ return sprintf('<a href="%s">%s</a>',
+ $url, defined($fragment) ? &encode($fragment) : $url);
+}
+
+sub html_footer ()
+{
+ printf("<p>%s</p>\n", &global_stats()) if ($doc_count > 0 && !$Opts{Quiet});
+ print <<EOF;
+<div>
+<address>
+$PROGRAM $REVISION,
+by <a href="http://www.w3.org/People/Hugo/">Hugo Haas</a> and others.<br>
+Please send bug reports, suggestions and comments to the
+<a href="mailto:www-validator\@w3.org?subject=checklink%3A%20">www-validator
+mailing list</a>
+(<a href="http://lists.w3.org/Archives/Public/www-validator/">archives</a>).
+<br>
+Check out the <a href="docs/checklink.html">documentation</a>.
+Download the
+<a href="http://dev.w3.org/cvsweb/~checkout~/validator/httpd/cgi-bin/checklink.pl?rev=$CVS_VERSION&content-type=text/plain">source code</a> from
+<a href="http://dev.w3.org/cvsweb/validator/httpd/cgi-bin/checklink.pl">CVS</a>.
+</address>
+</div>
+</body>
+</html>
+EOF
+}
+
+sub file_uri ($)
+{
+ my ($uri) = @_;
+ &html_header($uri);
+ print "<h2>Forbidden</h2>
+<p>You cannot check such a URI (<code>$uri</code>).</p>
+";
+ &html_footer();
+ exit;
+}
+
+sub print_form ($)
+{
+ my ($q) = @_;
+
+ my $chk = ' checked="checked"';
+ $q->param('hide_type', 'all') unless $q->param('hide_type');
+
+ my $sum = $q->param('summary') ? $chk : '';
+ my $red = $q->param('hide_redirects') ? $chk : '';
+ my $all = ($q->param('hide_type') ne 'dir') ? $chk : '';
+ my $dir = $all ? '' : $chk;
+ my $acc = $q->param('no_accept_language') ? $chk : '';
+ my $rec = $q->param('recursive') ? $chk : '';
+ my $dep = &encode($q->param('depth') || '');
+
+ my $cookie_options = '';
+ if ($q->cookie()) {
+ $cookie_options = "
+ <label for=\"cookie1\"><input type=\"radio\" id=\"cookie1\" name=\"cookie\" value=\"nochanges\" checked=\"checked\"> Don't modify saved options</label>
+ <label for=\"cookie2\"><input type=\"radio\" id=\"cookie2\" name=\"cookie\" value=\"set\"> Save these options</label>
+ <label for=\"cookie3\"><input type=\"radio\" id=\"cookie3\" name=\"cookie\" value=\"clear\"> Clear saved options</label>";
+ } else {
+ $cookie_options = "
+ <label for=\"cookie\"><input type=\"checkbox\" id=\"cookie\" name=\"cookie\" value=\"set\"> Save options in a <a href=\"http://www.w3.org/Protocols/rfc2109/rfc2109\">cookie</a></label>";
+ }
+
+ print "<form action=\"", $q->self_url(), "\" method=\"get\" onsubmit=\"return uriOk()\">
+<p><label for=\"uri\">Enter the address (<a href=\"http://www.w3.org/Addressing/#terms\">URL</a>)
+of a document that you would like to check:</label></p>
+<p><input type=\"text\" size=\"50\" id=\"uri\" name=\"uri\" value=\"\"></p>
+<fieldset>
+ <legend>Options</legend>
+ <p>
+ <label for=\"summary\"><input type=\"checkbox\" id=\"summary\" name=\"summary\" value=\"on\"", $sum, "> Summary only</label>
+ <br>
+ <label for=\"hide_redirects\"><input type=\"checkbox\" id=\"hide_redirects\" name=\"hide_redirects\" value=\"on\"", $red, "> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label>
+ <label><input type=\"radio\" name=\"hide_type\" value=\"all\"", $all, "> all</label>
+ <label><input type=\"radio\" name=\"hide_type\" value=\"dir\"", $dir, "> for directories only</label>
+ <br>
+ <label for=\"no_accept_language\"><input type=\"checkbox\" id=\"no_accept_language\" name=\"no_accept_language\" value=\"on\"", $acc, "> Don't send <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> headers</label>
+ <br>
+ <label title=\"Check linked documents recursively (maximum: ", $Opts{Max_Documents}, " documents; sleeping ", $Opts{Sleep_Time}, " seconds between each document)\" for=\"recursive\"><input type=\"checkbox\" id=\"recursive\" name=\"recursive\" value=\"on\"", $rec, "> Check linked documents recursively</label>,
+ <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth\" name=\"depth\" value=\"", $dep, "\"></label>
+ <br><br>", $cookie_options, "
+ </p>
+</fieldset>
+<p><input type=\"submit\" name=\"check\" value=\"Check\"></p>
+</form>
+";
+}
+
+sub encode (@)
+{
+ return $Opts{HTML} ? HTML::Entities::encode(@_) : @_;
+}
+
+sub hprintf (@)
+{
+ if (! $Opts{HTML}) {
+ printf(@_);
+ } else {
+ print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1]));
+ }
+}
+
+# Local Variables:
+# mode: perl
+# indent-tabs-mode: nil
+# tab-width: 2
+# perl-indent-level: 2
+# End:
+# ex: ts=2 sw=2 et
Received on Thursday, 5 August 2010 14:46:54 UTC