link-checker commit: Copy here from the validator dir. from Mercurial notifier on 2010-08-05 (www-validator-cvs@w3.org from August 2010)

From: Mercurial notifier <nobody@w3.org>
Date: Thu, 05 Aug 2010 14:46:51 +0000
To: link-checker updates <www-validator-cvs@w3.org>
Message-Id: <E1Oh1id-0005S5-Uu@blinky.w3.org>
changeset:   1:55210bc792ab
user:        ville
date:        Thu Mar 18 21:39:55 2004 +0000
files:       bin/checklink
description:
Copy here from the validator dir.


diff -r 3b1a5c7ab987 -r 55210bc792ab bin/checklink
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/checklink	Thu Mar 18 21:39:55 2004 +0000
@@ -0,0 +1,2081 @@
+#!/usr/bin/perl -wT
+#
+# W3C Link Checker
+# by Hugo Haas <hugo@w3.org>
+# (c) 1999-2004 World Wide Web Consortium
+# based on Renaud Bruyeron's checklink.pl
+#
+# $Id: checklink,v 3.7 2004-03-18 21:39:55 ville Exp $
+#
+# This program is licensed under the W3C(r) Software License:
+#       http://www.w3.org/Consortium/Legal/copyright-software
+#
+# The documentation is at:
+#       http://www.w3.org/2000/07/checklink
+#
+# See the CVSweb interface at:
+#       http://dev.w3.org/cvsweb/validator/httpd/cgi-bin/checklink.pl
+#
+# An online version is available at:
+#       http://validator.w3.org/checklink
+#
+# Comments and suggestions should be sent to the www-validator mailing list:
+#       www-validator@w3.org (with 'checklink' in the subject)
+#       http://lists.w3.org/Archives/Public/www-validator/ (archives)
+
+use strict;
+
+# -----------------------------------------------------------------------------
+
+package W3C::UserAgent;
+
+use LWP::UserAgent      qw();
+# @@@ Needs also W3C::CheckLink but can't use() it here.
+
+@W3C::UserAgent::ISA =  qw(LWP::UserAgent);
+
+sub simple_request
+{
+  my $self = shift;
+  my $response = $self->W3C::UserAgent::SUPER::simple_request(@_);
+  if (! defined($self->{FirstResponse})) {
+    $self->{FirstResponse} = $response->code();
+    $self->{FirstMessage} = $response->message();
+  }
+  return $response;
+}
+
+sub redirect_ok
+{
+  my ($self, $request) = @_;
+
+  if ($self->{Checklink_verbose_progress}) {
+    &W3C::CheckLink::hprintf("\n%s %s ", $request->method(), $request->uri());
+  }
+
+  # Build a map of redirects
+  $self->{Redirects}{$self->{fetching}} = $request->uri();
+  $self->{fetching} = $request->uri();
+
+  return ($request->method() eq 'POST') ? 0 : 1;
+}
+
+# -----------------------------------------------------------------------------
+
+package W3C::CheckLink;
+
+use vars qw($PROGRAM $AGENT $VERSION $CVS_VERSION $REVISION
+            $DocType $Accept $ContentTypes %Cfg);
+
+use HTML::Entities       qw();
+use HTML::Parser    3.00 qw();
+use HTTP::Request        qw();
+use HTTP::Response       qw();
+use Net::hostent         qw(gethostbyname);
+use Net::IP              qw();
+use Socket               qw(inet_ntoa);
+use Time::HiRes          qw();
+use URI                  qw();
+use URI::Escape          qw();
+use URI::file            qw();
+# @@@ Needs also W3C::UserAgent but can't use() it here.
+
+@W3C::CheckLink::ISA =  qw(HTML::Parser);
+
+BEGIN
+{
+  # Version info
+  $PROGRAM       = 'W3C checklink';
+  ($AGENT        = $PROGRAM) =~ s/\s+/-/g;
+  $VERSION       = '3.9.1';
+  ($CVS_VERSION) = q$Revision: 3.7 $ =~ /(\d+[\d\.]*\.\d+)/;
+  $REVISION      = sprintf('version %s [%s] (c) 1999-2004 W3C',
+                           $VERSION, $CVS_VERSION);
+
+  # Pull in mod_perl modules if applicable.
+  if ($ENV{MOD_PERL}) {
+    eval "require Apache::compat"; # For mod_perl 2
+    require Apache;
+  }
+
+  $DocType = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">';
+
+  my @content_types = qw(application/xhtml+xml text/html);
+  $Accept = join(', ', @content_types) . ', */*;q=0.5';
+  my $re = join('|', map { s/\+/\\+/g; $_ } @content_types);
+  $ContentTypes = qr{\b(?:$re)\b}io;
+
+  #
+  # Read configuration.  If the W3C_CHECKLINK_CFG environment variable has
+  # been set or the default contains a non-empty file, read it.  Otherwise,
+  # skip silently.
+  #
+  my $defaultconfig = '/etc/w3c/checklink.conf';
+  if ($ENV{W3C_CHECKLINK_CFG} || -s $defaultconfig) {
+
+    require Config::General;
+    Config::General->require_version(2.06); # Need 2.06 for -SplitPolicy
+
+    my $conffile = $ENV{W3C_CHECKLINK_CFG} || $defaultconfig;
+    eval {
+      my %config_opts =
+        ( -ConfigFile  => $conffile,
+          -SplitPolicy => 'equalsign',
+        );
+      %Cfg = Config::General->new(%config_opts)->getall();
+    };
+    if ($@) {
+      die <<".EOF.";
+Failed to read configuration from '$conffile':
+$@
+.EOF.
+    }
+  }
+
+  # Trusted environment variables that need laundering in taint mode.
+  foreach (qw(NNTPSERVER NEWSHOST)) {
+    ($ENV{$_}) = ($ENV{$_} =~ /^(.*)$/) if $ENV{$_};
+  }
+
+  # Use passive FTP by default, see Net::FTP(3).
+  $ENV{FTP_PASSIVE} = 1 unless exists($ENV{FTP_PASSIVE});
+}
+
+# Autoflush
+$| = 1;
+
+# Different options specified by the user
+my $cmdline = ! ($ENV{GATEWAY_INTERFACE} && $ENV{GATEWAY_INTERFACE} =~ /^CGI/);
+my %Opts =
+  ( Command_Line      => $cmdline,
+    Quiet             => 0,
+    Summary_Only      => 0,
+    Verbose           => 0,
+    Progress          => 0,
+    HTML              => 0,
+    Timeout           => 60,
+    Redirects         => 1,
+    Dir_Redirects     => 1,
+    Accept_Language   => 1,
+    Languages         => $ENV{HTTP_ACCEPT_LANGUAGE} || '*',
+    HTTP_Proxy        => undef,
+    Hide_Same_Realm   => 0,
+    Depth             => 0,    # -1 means unlimited recursion.
+    Sleep_Time        => 3,    # For the online version.
+    Max_Documents     => 150,  # Ditto.
+    User              => undef,
+    Password          => undef,
+    Base_Location     => '.',
+    Masquerade        => 0,
+    Masquerade_From   => '',
+    Masquerade_To     => '',
+    Trusted           => $Cfg{Trusted},
+    Allow_Private_IPs => defined($Cfg{Allow_Private_IPs}) ?
+                         $Cfg{Allow_Private_IPs} : $cmdline,
+  );
+undef $cmdline;
+
+# Global variables
+# What is our query?
+my $query;
+# What URI's did we process? (used for recursive mode)
+my %processed;
+# Result of the HTTP query
+my %results;
+# List of redirects
+my %redirects;
+# Count of the number of documents checked
+my $doc_count = 0;
+# Time stamp
+my $timestamp = &get_timestamp();
+
+if ($Opts{Command_Line}) {
+
+  require Text::Wrap;
+  Text::Wrap->import('wrap');
+
+  # Parse command line
+  &parse_arguments();
+  &ask_password() if ($Opts{User} && !$Opts{Password});
+
+  my $first = 1;
+  foreach my $uri (@ARGV) {
+    if (!$Opts{Summary_Only}) {
+      printf("%s %s\n", $PROGRAM, $REVISION) unless $Opts{HTML};
+    } else {
+      $Opts{Verbose} = 0;
+      $Opts{Progress} = 0;
+    }
+    # Transform the parameter into a URI
+    $uri = &urize($uri);
+    &check_uri($uri, $first, $Opts{Depth});
+    $first &&= 0;
+  }
+  undef $first;
+
+  if ($Opts{HTML}) {
+    &html_footer();
+  } elsif (($doc_count > 0) && !$Opts{Summary_Only}) {
+    printf("\n%s\n", &global_stats());
+  }
+
+} else {
+
+  require CGI;
+  require CGI::Carp;
+  CGI::Carp->import(qw(fatalsToBrowser));
+  $query = new CGI;
+  # Set a few parameters in CGI mode
+  $Opts{Verbose}   = 0;
+  $Opts{Progress}  = 0;
+  $Opts{HTML}      = 1;
+
+  # Backwards compatibility
+  if ($query->param('hide_dir_redirects')) {
+    $query->param('hide_redirects', 'on');
+    $query->param('hide_type', 'dir');
+    $query->delete('hide_dir_redirects');
+  }
+  if (my $uri = $query->param('url')) {
+    $query->param('uri', $uri) unless $query->param('uri');
+    $query->delete('url');
+  }
+
+  # Override undefined values from the cookie, if we got one.
+  if (my %cookie = $query->cookie($AGENT)) {
+    while (my ($key, $value) = each %cookie) {
+      $query->param($key, $value) unless defined($query->param($key));
+    }
+  }
+
+  $Opts{Summary_Only} = 1 if $query->param('summary');
+
+  if ($query->param('hide_redirects')) {
+    $Opts{Dir_Redirects} = 0;
+    if (my $type = $query->param('hide_type')) {
+      $Opts{Redirects} = 0 if ($type ne 'dir');
+    } else {
+      $Opts{Redirects} = 0;
+    }
+  }
+
+  $Opts{Accept_Language} = 0 if ($query->param('no_accept_language'));
+
+  $Opts{Depth} = -1 if ($query->param('recursive') && $Opts{Depth} == 0);
+  if ($query->param('depth') && ($query->param('depth') != 0)) {
+    $Opts{Depth} = $query->param('depth');
+  }
+
+  # Save, clear or leave cookie as is.
+  my $cookie = '';
+  if (my $action = $query->param('cookie')) {
+    my %cookie = (-name => $AGENT);
+    if ($action eq 'clear') {
+      # Clear the cookie.
+      $cookie{-value}   = '';
+      $cookie{-expires} = '-1M';
+    } else {
+      # Always refresh the expiration time.
+      $cookie{-expires} = '+1M';
+      if ($action eq 'set') {
+        # Set the options.
+        my %options = $query->Vars();
+        delete($options{$_}) for qw(url uri check cookie); # Non-persistent.
+        $cookie{-value}   = \%options;
+      } else {
+        # Use the old values.
+        $cookie{-value} = { $query->cookie($AGENT) };
+      }
+    }
+    $cookie = $query->cookie(%cookie);
+  }
+
+  my $uri = $query->param('uri');
+
+  if (! $uri) {
+    &html_header('', 1); # Set cookie only from results page.
+    &print_form($query);
+    &html_footer();
+    exit;
+  }
+
+  undef $query; # Not needed any more.
+
+  # All Apache configurations don't set HTTP_AUTHORIZATION for CGI scripts.
+  # If we're under mod_perl, there is a way around it...
+  if ($ENV{MOD_PERL}) {
+    my $auth = Apache->request()->header_in('Authorization');
+    $ENV{HTTP_AUTHORIZATION} ||= $auth if $auth;
+  }
+
+  $uri =~ s/^\s+//g;
+  if ($uri =~ m/^file:/) {
+    # Only the http scheme is allowed
+    &file_uri($uri);
+  } elsif ($uri !~ m/:/) {
+    if ($uri =~ m|^//|) {
+      $uri = 'http:'.$uri;
+    } else {
+      $uri = 'http://'.$uri;
+    }
+  }
+
+  &check_uri($uri, 1, $Opts{Depth}, $cookie);
+  &html_footer();
+}
+
+###############################################################################
+
+################################
+# Command line and usage stuff #
+################################
+
+sub parse_arguments ()
+{
+
+  require Getopt::Long;
+  Getopt::Long->require_version(2.17);
+  Getopt::Long->import('GetOptions');
+  Getopt::Long::Configure('bundling', 'no_ignore_case');
+  my $masq = '';
+
+  GetOptions('help|?'          => sub { usage(0) },
+             'q|quiet'         => sub { $Opts{Quiet} = 1;
+                                        $Opts{Summary_Only} = 1;
+                                      },
+             's|summary'       => \$Opts{Summary_Only},
+             'b|broken'        => sub { $Opts{Redirects} = 0;
+                                        $Opts{Dir_Redirects} = 0;
+                                      },
+             'e|dir-redirects' => sub { $Opts{Dir_Redirects} = 0; },
+             'v|verbose'       => \$Opts{Verbose},
+             'i|indicator'     => \$Opts{Progress},
+             'h|html'          => \$Opts{HTML},
+             'n|noacclanguage' => sub { $Opts{Accept_Language} = 0; },
+             'r|recursive'     => sub { $Opts{Depth} = -1
+                                          if $Opts{Depth} == 0; },
+             'l|location=s'    => \$Opts{Base_Location},
+             'u|user=s'        => \$Opts{User},
+             'p|password=s'    => \$Opts{Password},
+             't|timeout=i'     => \$Opts{Timeout},
+             'L|languages=s'   => \$Opts{Languages},
+             'D|depth=i'       => sub { $Opts{Depth} = $_[1]
+                                          unless $_[1] == 0; },
+             'd|domain=s'      => \$Opts{Trusted},
+             'y|proxy=s'       => \$Opts{HTTP_Proxy},
+             'masquerade=s'    => \$masq,
+             'hide-same-realm' => \$Opts{Hide_Same_Realm},
+             'V|version'       => \&version,
+            )
+    || usage(1);
+
+  if ($masq) {
+    $Opts{Masquerade} = 1;
+    my @masq = split(/\s+/, $masq);
+    if (scalar(@masq) != 2 ||
+        !defined($masq[0]) || $masq[0] !~ /\S/ ||
+        !defined($masq[1]) || $masq[1] !~ /\S/) {
+      usage(1, "Error: --masquerade takes two whitespace separated URIs.");
+    } else {
+      $Opts{Masquerade_From} = $masq[0];
+      $Opts{Masquerade_To}   = $masq[1];
+    }
+  }
+}
+
+sub version ()
+{
+  print STDERR "$PROGRAM $REVISION\n";
+  exit(0);
+}
+
+sub usage ()
+{
+  my ($exitval, $msg) = @_;
+  $exitval = 0 unless defined($exitval);
+  $msg = $msg ? "$msg\n\n" : '';
+
+  my $langs = defined($Opts{Languages}) ? " (default: $Opts{Languages})" : '';
+  my $trust = defined($Opts{Trusted})   ? " (default: $Opts{Trusted})"   : '';
+
+  print STDERR "$msg$PROGRAM $REVISION
+
+Usage: checklink <options> <uris>
+Options:
+  -s/--summary                Result summary only.
+  -b/--broken                 Show only the broken links, not the redirects.
+  -e/--directory              Hide directory redirects, for example
+                              http://www.w3.org/TR -> http://www.w3.org/TR/
+  -r/--recursive              Check the documents linked from the first one.
+  -D/--depth n                Check the documents linked from the first one
+                              to depth n (implies --recursive).
+  -l/--location uri           Scope of the documents checked in recursive mode.
+                              By default, for example for
+                              http://www.w3.org/TR/html4/Overview.html
+                              it would be http://www.w3.org/TR/html4/
+  -n/--noacclanguage          Do not send an Accept-Language header.
+  -L/--languages              Languages accepted$langs.
+  -q/--quiet                  No output if no errors are found.  Implies -s.
+  -v/--verbose                Verbose mode.
+  -i/--indicator              Show progress while parsing.
+  -u/--user username          Specify a username for authentication.
+  -p/--password password      Specify a password.
+  --hide-same-realm           Hide 401's that are in the same realm as the
+                              document checked.
+  -t/--timeout value          Timeout for HTTP requests.
+  -d/--domain domain          Regular expression describing the domain to
+                              which the authentication information will be
+                              sent$trust.
+  --masquerade \"base1 base2\"  Masquerade base URI base1 as base2.  See manual
+                              page for more information.
+  -y/--proxy proxy            Specify an HTTP proxy server.
+  -h/--html                   HTML output.
+  -?/--help                   Show this message.
+  -V/--version                Output version information.
+
+See \"perldoc Net::FTP\" for information about various environment variables
+affecting FTP connections and \"perldoc Net::NNTP\" for setting a default
+NNTP server for news: URIs.
+
+The W3C_CHECKLINK_CFG environment variable can be used to set the
+configuration file to use.  See details in the full manual page, it can
+be displayed with:
+  perldoc $0
+
+More documentation at: http://www.w3.org/2000/07/checklink
+Please send bug reports and comments to the www-validator mailing list:
+  www-validator\@w3.org (with 'checklink' in the subject)
+  Archives are at: http://lists.w3.org/Archives/Public/www-validator/
+";
+  exit $exitval;
+}
+
+sub ask_password ()
+{
+  eval {
+    local $SIG{__DIE__};
+    require Term::ReadKey;
+    Term::ReadKey->require_version(2.00);
+    Term::ReadKey->import(qw(ReadMode));
+  };
+  if ($@) {
+    warn('Warning: Term::ReadKey 2.00 or newer not available, ' .
+         "password input disabled.\n");
+    return;
+  }
+  printf(STDERR 'Enter the password for user %s: ', $Opts{User});
+  ReadMode('noecho',  *STDIN);
+  chomp($Opts{Password} = <STDIN>);
+  ReadMode('restore', *STDIN);
+  print(STDERR "ok.\n");
+}
+
+###############################################################################
+
+###########################################
+# Transform foo into file://localhost/foo #
+###########################################
+
+sub urize ($)
+{
+  my $u = URI->new_abs(URI::Escape::uri_unescape($_[0]), URI::file->cwd());
+  return $u->as_string();
+}
+
+########################################
+# Check for broken links in a resource #
+########################################
+
+sub check_uri ($$$;$)
+{
+
+  my ($uri, $first, $depth, $cookie) = @_;
+
+  my $start = &get_timestamp() unless $Opts{Quiet};
+
+  # Get and parse the document
+  my $response = &get_document('GET', $uri, $doc_count, \%redirects);
+
+  # Can we check the resource? If not, we exit here...
+  return -1 if defined($response->{Stop});
+
+  if ($first) {
+    # Use the first URI as the recursion base unless specified otherwise.
+    $Opts{Base_Location} = ($Opts{Base_Location} eq '.')
+      ? $response->{absolute_uri}->canonical() :
+        URI->new($Opts{Base_Location})->canonical();
+  } else {
+    # Before fetching the document, we don't know if we'll be within the
+    # recursion scope or not (think redirects).
+    return -1 unless &in_recursion_scope($response->{absolute_uri});
+
+    print $Opts{HTML} ? '<hr>' : '-' x 40, "\n";
+  }
+
+  # We are checking a new document
+  $doc_count++;
+
+  if ($Opts{HTML}) {
+    &html_header($uri, 0, $cookie) if $first;
+    print('<h2>');
+  }
+
+  my $absolute_uri = $response->{absolute_uri}->as_string();
+
+  my $result_anchor = 'results'.$doc_count;
+
+  printf("\nProcessing\t%s\n\n",
+         $Opts{HTML} ? &show_url($absolute_uri) : $absolute_uri);
+
+  if ($Opts{HTML}) {
+    print("</h2>\n");
+    if (! $Opts{Summary_Only}) {
+      printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
+             $result_anchor);
+      printf("<p>For reliable link checking results, check
+<a href=\"check?uri=%s\">HTML Validity</a> first.  See also
+<a href=\"http://jigsaw.w3.org/css-validator/validator?uri=%s\">CSS Validity</a>.</p>
+<p>Back to the <a href=\"checklink\">link checker</a>.</p>\n",
+             map{&encode(URI::Escape::uri_escape($absolute_uri,
+                                                 "^A-Za-z0-9."))}(1..2));
+      print("<pre>\n");
+    }
+  }
+
+  # Record that we have processed this resource
+  $processed{$absolute_uri} = 1;
+  # Parse the document
+  my $p = &parse_document($uri, $absolute_uri,
+                          $response->content(), 1,
+                          $depth != 0);
+  my $base = URI->new($p->{base});
+
+  # Check anchors
+  ###############
+
+  print "Checking anchors...\n" unless $Opts{Summary_Only};
+
+  my %errors;
+  foreach my $anchor (keys %{$p->{Anchors}}) {
+    my $times = 0;
+    foreach my $l (keys %{$p->{Anchors}{$anchor}}) {
+      $times += $p->{Anchors}{$anchor}{$l};
+    }
+    # They should appear only once
+    $errors{$anchor} = 1 if ($times > 1);
+    # Empty IDREF's are not allowed
+    $errors{$anchor} = 1 if ($anchor eq '');
+  }
+  print " done.\n" unless $Opts{Summary_Only};
+
+  # Check links
+  #############
+
+  my %links;
+  # Record all the links found
+  foreach my $link (keys %{$p->{Links}}) {
+    my $link_uri = URI->new($link);
+    my $abs_link_uri = URI->new_abs($link_uri, $base);
+    if ($Opts{Masquerade}) {
+      if ($abs_link_uri =~ m|^$Opts{Masquerade_From}|) {
+        printf("processing %s in base %s\n",
+               $abs_link_uri, $Opts{Masquerade_To});
+        my $nlink = $abs_link_uri;
+        $nlink =~
+          s|^$Opts{Masquerade_From}|$Opts{Masquerade_To}|;
+        $abs_link_uri = URI->new($nlink);
+      };
+    }
+    foreach my $lines (keys %{$p->{Links}{$link}}) {
+      my $canonical = URI->new($abs_link_uri->canonical());
+      my $url = $canonical->scheme().':'.$canonical->opaque();
+      my $fragment = $canonical->fragment();
+      if (! $fragment) {
+        # Document without fragment
+        $links{$url}{location}{$lines} = 1;
+      } else {
+        # Resource with a fragment
+        $links{$url}{fragments}{$fragment}{$lines} = 1;
+      }
+    }
+  }
+
+  # Build the list of broken URI's
+  my %broken;
+  foreach my $u (keys %links) {
+
+    # Don't check mailto: URI's
+    next if ($u =~ m/^mailto:/);
+
+    &hprintf("Checking link %s\n", $u) unless $Opts{Summary_Only};
+
+    # Check that a link is valid
+    &check_validity($uri, $u, \%links, \%redirects);
+    &hprintf("\tReturn code: %s\n", $results{$u}{location}{code})
+      if ($Opts{Verbose});
+    if ($results{$u}{location}{success}) {
+
+      # Even though it was not broken, we might want to display it
+      # on the results page (e.g. because it required authentication)
+      $broken{$u}{location} = 1 if ($results{$u}{location}{display} >= 400);
+
+      # List the broken fragments
+      foreach my $fragment (keys %{$links{$u}{fragments}}) {
+        if ($Opts{Verbose}) {
+          my @frags = sort keys %{$links{$u}{fragments}{$fragment}};
+          &hprintf("\t\t%s %s - Line%s: %s\n",
+                   $fragment,
+                   ($results{$u}{fragments}{$fragment}) ? 'OK' : 'Not found',
+                   (scalar(@frags) > 1) ? 's' : '',
+                   join(', ', @frags)
+                  );
+        }
+        # A broken fragment?
+        if ($results{$u}{fragments}{$fragment} == 0) {
+          $broken{$u}{fragments}{$fragment} += 2;
+        }
+      }
+    } else {
+      # Couldn't find the document
+      $broken{$u}{location} = 1;
+      # All the fragments associated are hence broken
+      foreach my $fragment (keys %{$links{$u}{fragments}}) {
+        $broken{$u}{fragments}{$fragment}++;
+      }
+    }
+  }
+  &hprintf("Processed in %ss.\n", &time_diff($start, &get_timestamp()))
+    unless $Opts{Summary_Only};
+
+  # Display results
+  if ($Opts{HTML} && !$Opts{Summary_Only}) {
+    print("</pre>\n");
+    printf("<h2><a name=\"%s\">Results</a></h2>\n", $result_anchor);
+  }
+  print "\n" unless $Opts{Quiet};
+
+  &anchors_summary($p->{Anchors}, \%errors);
+  &links_summary(\%links, \%results, \%broken, \%redirects);
+
+  # Do we want to process other documents?
+  if ($depth != 0) {
+
+    foreach my $u (keys %links) {
+
+      next unless $results{$u}{location}{success};  # Broken link?
+
+      next unless &in_recursion_scope($u);
+
+      # Do we understand its content type?
+      next unless ($results{$u}{location}{type} =~ $ContentTypes);
+
+      # Have we already processed this URI?
+      next if &already_processed($u);
+
+      # Do the job
+      print "\n";
+      if ($Opts{HTML}) {
+        # For the online version, wait for a while to avoid abuses
+        if (!$Opts{Command_Line}) {
+          if ($doc_count == $Opts{Max_Documents}) {
+            print("<hr>\n<p><strong>Maximum number of documents reached!</strong></p>\n");
+          }
+          if ($doc_count >= $Opts{Max_Documents}) {
+            $doc_count++;
+            print("<p>Not checking <strong>$u</strong></p>\n");
+            $processed{$u} = 1;
+            next;
+          }
+        }
+        sleep($Opts{Sleep_Time});
+      }
+      if ($depth < 0) {
+        &check_uri($u, 0, -1);
+      } else {
+        &check_uri($u, 0, $depth-1);
+      }
+    }
+  }
+}
+
+#######################################
+# Get and parse a resource to process #
+#######################################
+
+sub get_document ($$$;\%)
+{
+  my ($method, $uri, $in_recursion, $redirects) = @_;
+  # $method contains the HTTP method the use (GET or HEAD)
+  # $uri contains the identifier of the resource
+  # $in_recursion equals 1 if we are in recursion mode (i.e. it is at least
+  #                        the second resource checked)
+  # $redirects is a pointer to the hash containing the map of the redirects
+
+  # Get the resource
+  my $response;
+  if (defined($results{$uri}{response})
+      && !(($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
+    $response = $results{$uri}{response};
+  } else {
+    $response = &get_uri($method, $uri);
+    &record_results($uri, $method, $response);
+    &record_redirects($redirects, $response->{Redirects});
+  }
+  if (! $response->is_success()) {
+    if (! $in_recursion) {
+      # Is it too late to request authentication?
+      if ($response->code() == 401) {
+        &authentication($response);
+      } else {
+        &html_header($uri) if $Opts{HTML};
+        &hprintf("\nError: %d %s\n",
+                 $response->code(), $response->message());
+      }
+    }
+    $response->{Stop} = 1;
+    return($response);
+  }
+
+  # What is the URI of the resource that we are processing by the way?
+  my $base_uri = URI->new($response->base());
+  my $request_uri = URI->new($response->request->url);
+  $response->{absolute_uri} = $request_uri->abs($base_uri);
+
+  # Can we parse the document?
+  my $failed_reason;
+  if ((my $ct = $response->header('Content-Type')) !~ $ContentTypes) {
+    $failed_reason = "Content-Type for '$request_uri' is '$ct'";
+  } elsif ($response->header('Content-Encoding') &&
+           ((my $ce = $response->header('Content-Encoding')) ne 'identity')) {
+    # @@@ We could maybe handle gzip...
+    $failed_reason = "Content-Encoding for '$request_uri' is '$ce'";
+  }
+  if ($failed_reason) {
+    # No, there is a problem...
+    if (! $in_recursion) {
+      &html_header($uri) if $Opts{HTML};
+      &hprintf("Can't check links: %s.\n", $failed_reason);
+    }
+    $response->{Stop} = 1;
+  }
+
+  # Ok, return the information
+  return($response);
+}
+
+#########################################################
+# Check whether a URI is within the scope of recursion. #
+#########################################################
+
+sub in_recursion_scope ($)
+{
+  my ($uri) = @_;
+  return undef unless $uri;
+
+  my $current = URI->new($uri)->canonical();
+  my $rel = $current->rel($Opts{Base_Location}); # base -> current!
+
+  return undef if ($current eq $rel);     # Relative path not possible?
+  return undef if ($rel =~ m|^(\.\.)?/|); # Relative path starts with ../ or /?
+  return 1;
+}
+
+##################################################
+# Check whether a URI has already been processed #
+##################################################
+
+sub already_processed ($)
+{
+  my ($uri) = @_;
+  # Don't be verbose for that part...
+  my $summary_value = $Opts{Summary_Only};
+  $Opts{Summary_Only} = 1;
+  # Do a GET: if it fails, we stop, if not, the results are cached
+  my $response = &get_document('GET', $uri, 1);
+  # ... but just for that part
+  $Opts{Summary_Only} = $summary_value;
+  # Can we process the resource?
+  return -1 if defined($response->{Stop});
+  # Have we already processed it?
+  return  1 if defined($processed{$response->{absolute_uri}->as_string()});
+  # It's not processed yet and it is processable: return 0
+  return  0;
+}
+
+############################
+# Get the content of a URI #
+############################
+
+sub get_uri ($$;$\%$$$$)
+{
+  # Here we have a lot of extra parameters in order not to lose information
+  # if the function is called several times (401's)
+  my ($method, $uri, $start, $redirects, $code, $realm, $message, $auth) = @_;
+
+  # $method contains the method used
+  # $uri contains the target of the request
+  # $start is a timestamp (not defined the first time the function is
+  #                        called)
+  # $redirects is a map of redirects
+  # $code is the first HTTP return code
+  # $realm is the realm of the request
+  # $message is the HTTP message received
+  # $auth equals 1 if we want to send out authentication information
+
+  # For timing purposes
+  $start = &get_timestamp() unless defined($start);
+
+  # Prepare the query
+  my %lwpargs = ($LWP::VERSION >= 5.6) ? (keep_alive => 1) : ();
+  my $ua = W3C::UserAgent->new(%lwpargs);
+  $ua->timeout($Opts{Timeout});
+  $ua->agent(sprintf('%s/%s [%s] %s',
+                     $AGENT, $VERSION, $CVS_VERSION, $ua->agent()));
+  $ua->env_proxy();
+  $ua->proxy('http', 'http://' . $Opts{HTTP_Proxy}) if $Opts{HTTP_Proxy};
+
+  # $ua->{fetching} contains the URI we originally wanted
+  # $ua->{uri} is modified in the case of a redirect; this is used to
+  # build $ua->{Redirects}
+  $ua->{uri} = $ua->{fetching} = $uri;
+  $ua->{Redirects} = $redirects if defined($redirects);
+
+  # Do we want printouts of progress?
+  my $verbose_progress =
+    ! ($Opts{Summary_Only} || (!$doc_count && $Opts{HTML}));
+
+  &hprintf("%s %s ", $method, $uri) if $verbose_progress;
+
+  my $request = new HTTP::Request($method, $uri);
+  if ($Opts{Accept_Language} && $Opts{Languages}) {
+    $request->header('Accept-Language' => $Opts{Languages});
+  }
+  $request->header('Accept', $Accept);
+  # Are we providing authentication info?
+  if ($auth && $request->url()->host() =~ /$Opts{Trusted}/i) {
+    if (defined($ENV{HTTP_AUTHORIZATION})) {
+      $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
+    } elsif (defined($Opts{User}) && defined($Opts{Password})) {
+      $request->authorization_basic($Opts{User}, $Opts{Password});
+    }
+  }
+
+  # Tell the user agent if we want progress reports (in redirects) or not.
+  $ua->{Checklink_verbose_progress} = $verbose_progress;
+
+  # Check if the IP address is allowed.
+  my $response = &ip_allowed($request->uri());
+  return $response if $response;
+
+  # Do the query
+  $response = $ua->request($request);
+
+  # Get the results
+  # Record the very first response
+  if (! defined($code)) {
+    $code = $ua->{FirstResponse};
+    $message = $ua->{FirstMessage};
+  }
+  # Authentication requested?
+  if ($response->code() == 401 &&
+      !defined($auth) &&
+      (defined($ENV{HTTP_AUTHORIZATION})
+       || (defined($Opts{User}) && defined($Opts{Password})))) {
+
+    # Set host as trusted domain unless we already have one.
+    $Opts{Trusted} ||= sprintf('^%s$', quotemeta($response->base()->host()));
+
+    # Deal with authentication and avoid loops
+    if (! defined($realm)) {
+      $response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
+      $realm = $1;
+    }
+    print "\n" if $verbose_progress;
+    return &get_uri($method, $response->request->url,
+                    $start, $ua->{Redirects},
+                    $code, $realm, $message, 1);
+  }
+  # Record the redirects
+  $response->{Redirects} = $ua->{Redirects};
+  &hprintf(" fetched in %ss\n",
+           &time_diff($start, &get_timestamp())) if $verbose_progress;
+
+  $response->{OriginalCode}    = $code;
+  $response->{OriginalMessage} = $message;
+  $response->{Realm}           = $realm if defined($realm);
+
+  return $response;
+}
+
+#########################################
+# Record the results of an HTTP request #
+#########################################
+
+sub record_results ($$$)
+{
+  my ($uri, $method, $response) = @_;
+  $results{$uri}{response} = $response;
+  $results{$uri}{method} = $method;
+  $results{$uri}{location}{code} = $response->code();
+  $results{$uri}{location}{type} = $response->header('Content-type');
+  $results{$uri}{location}{display} = $results{$uri}{location}{code};
+  $results{$uri}{location}{orig} = $response->{OriginalCode};
+  # Did we get a redirect?
+  if ($response->{OriginalCode} != $response->code()) {
+    $results{$uri}{location}{orig_message} =  $response->{OriginalMessage};
+    $results{$uri}{location}{redirected} = 1;
+  }
+  $results{$uri}{location}{success} = $response->is_success();
+  # Stores the authentication information
+  if (defined($response->{Realm})) {
+    $results{$uri}{location}{realm} = $response->{Realm};
+    $results{$uri}{location}{display} = 401 unless $Opts{Hide_Same_Realm};
+  }
+  # What type of broken link is it? (stored in {record} - the {display}
+  #              information is just for visual use only)
+  if (($results{$uri}{location}{display} == 401)
+      && ($results{$uri}{location}{code} == 404)) {
+    $results{$uri}{location}{record} = 404;
+  } else {
+    $results{$uri}{location}{record} = $results{$uri}{location}{display};
+  }
+  # Did it fail?
+  $results{$uri}{location}{message} = $response->message();
+  if (! $results{$uri}{location}{success}) {
+    &hprintf("Error: %d %s\n",
+             $results{$uri}{location}{code},
+             $results{$uri}{location}{message})
+      if ($Opts{Verbose});
+    return;
+  }
+}
+
+####################
+# Parse a document #
+####################
+
+sub parse_document ($$$$;$)
+{
+  my ($uri, $location, $document, $links, $rec_needs_links) = @_;
+
+  my $p;
+
+  if (defined($results{$uri}{parsing})) {
+    # We have already done the job. Woohoo!
+    $p->{base} = $results{$uri}{parsing}{base};
+    $p->{Anchors} = $results{$uri}{parsing}{Anchors};
+    $p->{Links} = $results{$uri}{parsing}{Links};
+    return $p;
+  }
+
+  my $start;
+  $p = W3C::CheckLink->new();
+  $p->{base} = $location;
+  if (! $Opts{Summary_Only}) {
+    $start = &get_timestamp();
+    print("Parsing...\n");
+  }
+  if (!$Opts{Summary_Only} || $Opts{Progress}) {
+    $p->{Total} = ($document =~ tr/\n//);
+  }
+  # We only look for anchors if we are not interested in the links
+  # obviously, or if we are running a recursive checking because we
+  # might need this information later
+  $p->{only_anchors} = !($links || $rec_needs_links);
+
+  # Transform <?xml:stylesheet ...?> into <xml:stylesheet ...> for parsing
+  # Processing instructions are not parsed by process, but in this case
+  # it should be. It's expensive, it's horrible, but it's the easiest way
+  # for right now.
+  $document =~ s/\<\?(xml:stylesheet.*?)\?\>/\<$1\>/ unless $p->{only_anchors};
+
+  $p->parse($document);
+
+  if (! $Opts{Summary_Only}) {
+    my $stop = &get_timestamp();
+    print "\r" if $Opts{Progress};
+    &hprintf(" done (%d lines in %ss).\n",
+             $p->{Total}, &time_diff($start, $stop));
+  }
+
+  # Save the results before exiting
+  $results{$uri}{parsing}{base} = $p->{base};
+  $results{$uri}{parsing}{Anchors} = $p->{Anchors};
+  $results{$uri}{parsing}{Links} = $p->{Links};
+
+  return $p;
+}
+
+###################################
+# Constructor for W3C::CheckLink #
+###################################
+
+sub new
+{
+  my $p = HTML::Parser::new(@_, api_version => 3);
+
+  # Start tags
+  $p->handler(start => 'start', 'self, tagname, attr, text, event, tokens');
+  # Declarations
+  $p->handler(declaration =>
+              sub {
+                my $self = shift;
+                $self->declaration(substr($_[0], 2, -1));
+              }, 'self, text');
+  # Other stuff
+  $p->handler(default => 'text', 'self, text');
+  # Line count
+  $p->{Line} = 1;
+  # Check <a [..] name="...">?
+  $p->{check_name} = 1;
+  # Check <[..] id="..">?
+  $p->{check_id} = 1;
+  # Don't interpret comment loosely
+  $p->strict_comment(1);
+
+  return $p;
+}
+
+#################################################
+# Record or return  the doctype of the document #
+#################################################
+
+sub doctype
+{
+  my ($self, $dc) = @_;
+  return $self->{doctype} unless $dc;
+  $_ = $self->{doctype} = $dc;
+
+  # What to look for depending on the doctype
+  $self->{check_name} = 0 if ($_ eq '-//W3C//DTD XHTML Basic 1.0//EN');
+
+  # Check for the id tag
+  if (
+      # HTML 2.0 & 3.0
+      m%^-//IETF//DTD HTML [23]\.0//% ||
+      # HTML 3.2
+      m%^-//W3C//DTD HTML 3\.2//%) {
+    $self->{check_id} = 0;
+  }
+  # Enable XML extensions
+  $self->xml_mode(1) if (m%^-//W3C//DTD XHTML %);
+}
+
+#######################################
+# Count the number of lines in a file #
+#######################################
+
+sub new_line
+{
+  my ($self, $string) = @_;
+  my $count = ($string =~ tr/\n//);
+  $self->{Line} = $self->{Line} + $count;
+  printf("\r%4d%%", int($self->{Line}/$self->{Total}*100)) if $Opts{Progress};
+}
+
+#############################
+# Extraction of the anchors #
+#############################
+
+sub get_anchor
+{
+  my ($self, $tag, $attr) = @_;
+
+  my $anchor = $attr->{id} if $self->{check_id};
+  if ($self->{check_name} && ($tag eq 'a')) {
+    # @@@@ In XHTML, <a name="foo" id="foo"> is mandatory
+    # Force an error if it's not the case (or if id's and name's values
+    #                                      are different)
+    # If id is defined, name if defined must have the same value
+    $anchor ||= $attr->{name};
+  }
+
+  return $anchor;
+}
+
+###########################
+# W3C::CheckLink handlers #
+###########################
+
+sub add_link
+{
+  my ($self, $uri) = @_;
+  $self->{Links}{$uri}{$self->{Line}}++ if defined($uri);
+}
+
+sub start
+{
+  my ($self, $tag, $attr, $text) = @_;
+
+  # Anchors
+  my $anchor = $self->get_anchor($tag, $attr);
+  $self->{Anchors}{$anchor}{$self->{Line}}++ if defined($anchor);
+
+  # Links
+  if (!$self->{only_anchors}) {
+    # Here, we are checking too many things
+    # The right thing to do is to parse the DTD...
+    if ($tag eq 'base') {
+      # Treat <base> (without href) or <base href=""> as if it didn't exist.
+      if (defined($attr->{href}) && $attr->{href} ne '') {
+        $self->{base} = $attr->{href};
+      }
+    } else {
+      $self->add_link($attr->{href});
+    }
+    $self->add_link($attr->{src});
+    $self->add_link($attr->{data}) if ($tag eq 'object');
+    $self->add_link($attr->{cite}) if ($tag eq 'blockquote');
+  }
+
+  # Line counting
+  $self->new_line($text) if ($text =~ m/\n/);
+}
+
+sub text
+{
+  my ($self, $text) = @_;
+  if (!$Opts{Progress}) {
+    # If we are just extracting information about anchors,
+    # parsing this part is only cosmetic (progress indicator)
+    return unless !$self->{only_anchors};
+  }
+  $self->new_line($text) if ($text =~ /\n/);
+}
+
+sub declaration
+{
+  my ($self, $text) = @_;
+  # Extract the doctype
+  my @declaration = split(/\s+/, $text, 4);
+  if (($#declaration >= 3) &&
+      ($declaration[0] eq 'DOCTYPE') &&
+      (lc($declaration[1]) eq 'html')) {
+    # Parse the doctype declaration
+    $text =~ m/^DOCTYPE\s+html\s+PUBLIC\s+\"([^\"]*)\"(\s+\"([^\"]*)\")?\s*$/i;
+    # Store the doctype
+    $self->doctype($1) if $1;
+    # If there is a link to the DTD, record it
+    $self->{Links}{$3}{$self->{Line}}++ if (!$self->{only_anchors} && $3);
+  }
+  return unless !$self->{only_anchors};
+  $self->text($text);
+}
+
+################################
+# Check the validity of a link #
+################################
+
+sub check_validity ($$\%\%)
+{
+  my ($testing, $uri, $links, $redirects) = @_;
+  # $testing is the URI of the document checked
+  # $uri is the URI of the target that we are verifying
+  # $links is a hash of the links in the documents checked
+  # $redirects is a map of the redirects encountered
+
+  # Checking file: URI's is not allowed with a CGI
+  if ($testing ne $uri) {
+    if (!$Opts{Command_Line} && $testing !~ m/^file:/ && $uri =~ m/^file:/) {
+      my $msg = 'Error: \'file:\' URI not allowed';
+      # Can't test? Return 400 Bad request.
+      $results{$uri}{location}{code}    = 400;
+      $results{$uri}{location}{record}  = 400;
+      $results{$uri}{location}{orig}    = 400;
+      $results{$uri}{location}{success} = 0;
+      $results{$uri}{location}{message} = $msg;
+      &hprintf("Error: %d %s\n", 400, $msg) if $Opts{Verbose};
+      return;
+    }
+  }
+
+  # Get the document with the appropriate method
+  # Only use GET if there are fragments. HEAD is enough if it's not the
+  # case.
+  my @fragments = keys %{$links->{$uri}{fragments}};
+  my $method = scalar(@fragments) ? 'GET' : 'HEAD';
+
+  my $response;
+  my $being_processed = 0;
+  if ((! defined($results{$uri}))
+      || (($method eq 'GET') && ($results{$uri}{method} eq 'HEAD'))) {
+    $being_processed = 1;
+    $response = &get_uri($method, $uri);
+    # Get the information back from get_uri()
+    &record_results($uri, $method, $response);
+    # Record the redirects
+    &record_redirects($redirects, $response->{Redirects});
+  }
+
+  # We got the response of the HTTP request. Stop here if it was a HEAD.
+  return if ($method eq 'HEAD');
+
+  # There are fragments. Parse the document.
+  my $p;
+  if ($being_processed) {
+    # Can we really parse the document?
+    return unless defined($results{$uri}{location}{type});
+    if ($results{$uri}{location}{type} !~ $ContentTypes) {
+      &hprintf("Can't check content: Content-Type for '%s' is '%s'.\n",
+               $uri, $results{$uri}{location}{type})
+        if ($Opts{Verbose});
+      return;
+    }
+    # Do it then
+    $p = &parse_document($uri, $response->base(),
+                         $response->as_string(), 0);
+  } else {
+    # We already had the information
+    $p->{Anchors} = $results{$uri}{parsing}{Anchors};
+  }
+  # Check that the fragments exist
+  foreach my $fragment (keys %{$links->{$uri}{fragments}}) {
+    if (defined($p->{Anchors}{$fragment})
+        || &escape_match($fragment, $p->{Anchors})) {
+      $results{$uri}{fragments}{$fragment} = 1;
+    } else {
+      $results{$uri}{fragments}{$fragment} = 0;
+    }
+  }
+}
+
+sub escape_match ($\%)
+{
+  my ($a, $hash) = (URI::Escape::uri_unescape($_[0]), $_[1]);
+  foreach my $b (keys %$hash) {
+    return 1 if ($a eq URI::Escape::uri_unescape($b));
+  }
+  return 0;
+}
+
+##########################
+# Ask for authentication #
+##########################
+
+sub authentication ($)
+{
+  my $r = $_[0];
+  $r->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
+  my $realm = $1;
+
+  if ($Opts{Command_Line}) {
+    printf STDERR <<EOF, $r->request()->url(), $realm;
+
+Authentication is required for %s.
+The realm is %s.
+Use the -u and -p options to specify a username and password and the -d option
+to specify trusted domains.
+EOF
+  } else {
+
+    printf("Status: 401 Authorization Required\nWWW-Authenticate: %s\nConnection: close\nContent-Language: en\nContent-Type: text/html; charset=iso-8859-1\n\n", $r->headers->www_authenticate);
+
+    printf("%s
+<html lang=\"en\">
+<head>
+<title>401 Authorization Required</title>
+</head>
+<body>
+<h1>Authorization Required</h1>
+<p>
+  You need %s access to %s to perform Link Checking.<br>
+", $DocType, &encode($realm), $r->request->url);
+
+    if ($Opts{Trusted}) {
+      printf <<EOF, &encode($Opts{Trusted});
+  This service has been configured to send authentication only to hostnames
+  matching the regular expression <code>%s</code>
+EOF
+    }
+
+    print "</p>\n";
+  }
+}
+
+##################
+# Get statistics #
+##################
+
+sub get_timestamp ()
+{
+  return pack('LL', Time::HiRes::gettimeofday());
+}
+
+sub time_diff ($$)
+{
+  my @start = unpack('LL', $_[0]);
+  my @stop = unpack('LL', $_[1]);
+  for ($start[1], $stop[1]) {
+    $_ /= 1_000_000;
+  }
+  return(sprintf("%.1f", ($stop[0]+$stop[1])-($start[0]+$start[1])));
+}
+
+########################
+# Handle the redirects #
+########################
+
+# Record the redirects in a hash
+sub record_redirects (\%\%)
+{
+  my ($redirects, $sub) = @_;
+  foreach my $r (keys %$sub) {
+    $redirects->{$r} = $sub->{$r};
+  }
+}
+
+# Determine if a request is redirected
+sub is_redirected ($%)
+{
+  my ($uri, %redirects) = @_;
+  return(defined($redirects{$uri}));
+}
+
+# Get a list of redirects for a URI
+sub get_redirects ($%)
+{
+  my ($uri, %redirects) = @_;
+  my @history = ($uri);
+  my %seen = ($uri => 1); # for tracking redirect loops
+  my $loop = 0;
+  while ($redirects{$uri}) {
+    $uri = $redirects{$uri};
+    push(@history, $uri);
+    if ($seen{$uri}) {
+      $loop = 1;
+      last;
+    } else {
+      $seen{$uri}++;
+    }
+  }
+  return ($loop, @history);
+}
+
+####################################################
+# Tool for sorting the unique elements of an array #
+####################################################
+
+sub sort_unique (@)
+{
+  my %saw;
+  @saw{@_} = ();
+  return (sort { $a <=> $b } keys %saw);
+}
+
+#####################
+# Print the results #
+#####################
+
+sub anchors_summary (\%\%)
+{
+  my ($anchors, $errors) = @_;
+
+  # Number of anchors found.
+  my $n = scalar(keys(%$anchors));
+  if (! $Opts{Quiet}) {
+    if ($Opts{HTML}) {
+      print("<h3>Anchors</h3>\n<p>");
+    } else {
+      print("Anchors\n\n");
+    }
+    &hprintf("Found %d anchor%s.", $n, ($n == 1) ? '' : 's');
+    print('</p>') if $Opts{HTML};
+    print("\n");
+  }
+  # List of the duplicates, if any.
+  my @errors = keys %{$errors};
+  if (! scalar(@errors)) {
+    print("<p>Valid anchors!</p>\n") if (! $Opts{Quiet} && $Opts{HTML} && $n);
+    return;
+  }
+  undef $n;
+
+  print('<p>') if $Opts{HTML};
+  print('List of duplicate and empty anchors');
+  print <<EOF if $Opts{HTML};
+</p>
+<table border="1" summary="List of duplicate and empty anchors.">
+<thead>
+<tr>
+<th>Anchors</th>
+<th>Lines</th>
+</tr>
+</thead>
+<tbody>
+EOF
+  print("\n");
+
+  foreach my $anchor (@errors) {
+    my $format;
+    my @unique = &sort_unique(keys %{$anchors->{$anchor}});
+    if ($Opts{HTML}) {
+      $format = "<tr class=\"broken\"><td>%s</td><td>%s</td></tr>\n";
+    } else {
+      my $s = (scalar(@unique) > 1) ? 's' : '';
+      $format = "\t%s\tLine$s: %s\n";
+    }
+    printf($format,
+           &encode($anchor eq '' ? 'Empty anchor' : $anchor),
+           join(', ', @unique));
+  }
+
+  print("</tbody>\n</table>\n") if $Opts{HTML};
+}
+
+sub show_link_report (\%\%\%\%\@;$\%)
+{
+  my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
+
+  print("\n<dl class=\"report\">") if $Opts{HTML};
+  print("\n");
+
+  # Process each URL
+  my ($c, $previous_c);
+  foreach my $u (@$urls) {
+    my @fragments = keys %{$broken->{$u}{fragments}};
+    # Did we get a redirect?
+    my $redirected = &is_redirected($u, %$redirects);
+    # List of lines
+    my @total_lines;
+    foreach my $l (keys %{$links->{$u}{location}}) {
+      push (@total_lines, $l);
+    }
+    foreach my $f (keys %{$links->{$u}{fragments}}) {
+      next if ($f eq $u && defined($links->{$u}{$u}{-1}));
+      foreach my $l (keys %{$links->{$u}{fragments}{$f}}) {
+        push (@total_lines, $l);
+      }
+    }
+
+    my ($redirect_loop, @redirects_urls) = get_redirects($u, %$redirects);
+
+    # Error type
+    $c = &code_shown($u, $results);
+    # What to do
+    my $whattodo;
+    my $redirect_too;
+    if ($todo) {
+      if ($u =~ m/^javascript:/) {
+        if ($Opts{HTML}) {
+          $whattodo =
+'You must change this link: people using a browser without Javascript support
+will <em>not</em> be able to follow this link. See the
+<a href="http://www.w3.org/TR/1999/WAI-WEBCONTENT-19990505/#tech-scripts">Web
+Content Accessibility Guidelines on the use of scripting on the Web</a> and
+the
+<a href="http://www.w3.org/TR/WCAG10-HTML-TECHS/#directly-accessible-scripts">techniques
+on how to solve this</a>.';
+        } else {
+          $whattodo = 'Change this link: people using a browser without Javascript support will not be able to follow this link.';
+        }
+      } elsif ($c == 500) {
+        # 500's could be a real 500 or a DNS lookup problem
+        if ($results->{$u}{location}{message} =~
+            m/Bad hostname '[^\']*'/) {
+          $whattodo = 'The hostname could not be resolved. This link needs to be fixed.';
+        } else {
+          $whattodo = 'This is a server-side problem. Check the URI.';
+        }
+      } elsif ($redirect_loop) {
+        $whattodo =
+          'Retrieving the URI results in a redirect loop, that should be ' .
+          'fixed.  Examine the redirect sequence to see where the loop ' .
+          'occurs.';
+      } else {
+        $whattodo = $todo->{$c};
+      }
+      # @@@ 303 and 307 ???
+      if (defined($redirects{$u}) && ($c != 301) && ($c != 302)) {
+        $redirect_too = 'The original request has been redirected.';
+        $whattodo .= ' '.$redirect_too unless $Opts{HTML};
+      }
+    } else {
+      # Directory redirects
+      $whattodo = 'Add a trailing slash to the URL.';
+    }
+
+    my @unique = &sort_unique(@total_lines);
+    my $lines_list = join(', ', @unique);
+    my $s = (scalar(@unique) > 1) ? 's' : '';
+    undef @unique;
+
+    if ($Opts{HTML}) {
+      # Style stuff
+      my $idref = '';
+      if ($codes && (!defined($previous_c) || ($c != $previous_c))) {
+        $idref = ' id="d'.$doc_count.'code_'.$c.'"';
+        $previous_c = $c;
+      }
+      # Main info
+      for (@redirects_urls) {
+        $_ = &show_url($_);
+      }
+      # HTTP message
+      my $http_message;
+      if ($results->{$u}{location}{message}) {
+        $http_message = &encode($results->{$u}{location}{message});
+        if ($c == 404 || $c == 500) {
+          $http_message = '<span class="broken">'.
+            $http_message.'</span>';
+        }
+      }
+      my $redirmsg =
+        $redirect_loop ? ' <em>redirect loop detected</em>' : '';
+      printf("
+<dt%s>%s</dt>
+<dd>What to do: <strong%s>%s</strong>%s<br></dd>
+<dd>HTTP Code returned: %d%s<br>
+HTTP Message: %s%s%s</dd>
+<dd>Line%s: %s</dd>\n",
+             # Anchor for return codes
+             $idref,
+             # List of redirects
+             $redirected ?
+             join(' redirected to<br>', @redirects_urls) . $redirmsg :
+             &show_url($u),
+             # Color
+             &bgcolor($c),
+             # What to do
+             $whattodo,
+             # Redirect too?
+             $redirect_too ?
+             sprintf(' <span %s>%s</span>', &bgcolor(301), $redirect_too) : '',
+             # Original HTTP reply
+             $results->{$u}{location}{orig},
+             # Final HTTP reply
+             ($results->{$u}{location}{code} !=
+              $results->{$u}{location}{orig})
+             ? ' <span title="redirected to">-&gt;</span> '.
+             &encode($results->{$u}{location}{code})
+             : '',
+             # Realm
+             (defined($results->{$u}{location}{realm})
+              ? 'Realm: '.&encode($results->{$u}{location}{realm}).'<br>'
+              : ''),
+             # HTTP original message
+             defined($results->{$u}{location}{orig_message})
+             ? &encode($results->{$u}{location}{orig_message}).
+             ' <span title="redirected to">-&gt;</span> '
+             : '',
+             # HTTP final message
+             $http_message,
+             $s,
+             # List of lines
+             $lines_list);
+      if ($#fragments >= 0) {
+        my $fragment_direction = '';
+        if ($results->{$u}{location}{code} == 200) {
+          $fragment_direction =
+            ' <strong class="broken">They need to be fixed!</strong>';
+        }
+        printf("<dd><dl><dt>Broken fragments and their line numbers: %s</dt>\n",
+               $fragment_direction);
+      }
+    } else {
+      my $redirmsg = $redirect_loop ? ' redirect loop detected' : '';
+      printf("\n%s\t%s\n  Code: %d%s %s\n%s\n",
+             # List of redirects
+             $redirected ? join("\n-> ", @redirects_urls) . $redirmsg : $u,
+             # List of lines
+             $lines_list ? "Line$s: $lines_list" : '',
+             # Original HTTP reply
+             $results->{$u}{location}{orig},
+             # Final HTTP reply
+             ($results->{$u}{location}{code} != $results->{$u}{location}{orig})
+             ? ' -> '.$results->{$u}{location}{code}
+             : '',
+             # HTTP message
+             $results->{$u}{location}{message} ?
+             $results->{$u}{location}{message} : '',
+             # What to do
+             wrap(' To do: ', '        ', $whattodo));
+      if ($#fragments >= 0) {
+        if ($results->{$u}{location}{code} == 200) {
+          print("The following fragments need to be fixed:\n");
+        } else {
+          print("Fragments:\n");
+        }
+      }
+    }
+    # Fragments
+    foreach my $f (@fragments) {
+      if ($Opts{HTML}) {
+        printf("<dd>%s: %s</dd>\n",
+               # Broken fragment
+               &show_url($u, $f),
+               # List of lines
+               join(', ', &sort_unique(keys %{$links->{$u}{fragments}{$f}})));
+      } else {
+        my @unq = &sort_unique(keys %{$links->{$u}{fragments}{$f}});
+        printf("\t%-30s\tLine%s: %s\n",
+               # Fragment
+               $f,
+               # Multiple?
+               (scalar(@unq) > 1) ? 's' : '',
+               # List of lines
+               join(', ', @unq));
+      }
+    }
+
+    print("</dl></dd>\n") if ($Opts{HTML} && scalar(@fragments));
+  }
+
+  # End of the table
+  print("</dl>\n") if $Opts{HTML};
+}
+
+sub code_shown ($$)
+{
+  my ($u, $results) = @_;
+
+  if ($results->{$u}{location}{record} == 200) {
+    return $results->{$u}{location}{orig};
+  } else {
+    return $results->{$u}{location}{record};
+  }
+}
+
+#
+# Checks whether we're allowed to retrieve the document based on it's IP
+# address.  Takes an URI object and returns a HTTP::Response containing the
+# appropriate status and error message if the IP was disallowed, undef
+# otherwise.  URIs without hostname or IP address are always allowed,
+# including schemes where those make no sense (eg. data:, often javascript:).
+#
+sub ip_allowed ($)
+{
+  my ($uri) = @_;
+  my $hostname = undef;
+  eval { $hostname = $uri->host() }; # Not all URIs implement host()...
+  return undef unless $hostname;
+
+  my $addr = my $iptype = my $resp = undef;
+  if (my $host = gethostbyname($hostname)) {
+    $addr = inet_ntoa($host->addr()) if $host->addr();
+    if ($addr && (my $ip = Net::IP->new($addr))) {
+      $iptype = $ip->iptype();
+    }
+  }
+  $iptype = 'PUBLIC'
+    if ($iptype && $iptype eq 'PRIVATE' && $Opts{Allow_Private_IPs});
+  if ($iptype && $iptype ne 'PUBLIC') {
+    my $code = 403;
+    my $msg =
+      'Checking non-public IP address disallowed by service configuration';
+    $resp = HTTP::Response->new($code, $msg);
+    $resp->{OriginalCode} = $code;
+    $resp->{OriginalMessage} = $msg;
+  }
+  return $resp;
+}
+
+sub links_summary (\%\%\%\%)
+{
+  # Advices to fix the problems
+
+  my %todo = ( 200 => 'There are broken fragments which must be fixed.',
+               300 => 'It usually means that there is a typo in a link that triggers mod_speling action - this must be fixed!',
+               301 => 'You should update the link.',
+               302 => 'Usually nothing.',
+               303 => 'Usually nothing.',
+               307 => 'Usually nothing.',
+               400 => 'Usually the sign of a malformed URL that cannot be parsed by the server.',
+               401 => "The link is not public. You'd better specify it.",
+               403 => 'The link is forbidden! This needs fixing. Usual suspects: a missing index.html or Overview.html, or a missing ACL.',
+               404 => 'The link is broken. Fix it NOW!',
+               405 => 'The server does not allow HEAD requests. Go ask the guys who run this server why. Check the link manually.',
+               406 => "The server isn't capable of responding according to the Accept* headers sent. Check it out.",
+               407 => 'The link is a proxy, but requires Authentication.',
+               408 => 'The request timed out.',
+               410 => 'The resource is gone. You should remove this link.',
+               415 => 'The media type is not supported.',
+               500 => 'Either the hostname is incorrect or it is a server side problem. Check the detailed list.',
+               501 => 'Could not check this link: method not implemented or scheme not supported.',
+               503 => 'The server cannot service the request, for some unknown reason.');
+  my %priority = ( 410 => 1,
+                   404 => 2,
+                   403 => 5,
+                   200 => 10,
+                   300 => 15,
+                   401 => 20
+                 );
+
+  my ($links, $results, $broken, $redirects) = @_;
+
+  # List of the broken links
+  my @urls = keys %{$broken};
+  my @dir_redirect_urls = ();
+  if ($Opts{Redirects}) {
+    # Add the redirected URI's to the report
+    for my $l (keys %$redirects) {
+      next unless (defined($results->{$l})
+                   && defined($links->{$l})
+                   && !defined($broken->{$l}));
+      # Check whether we have a "directory redirect"
+      # e.g. http://www.w3.org/TR -> http://www.w3.org/TR/
+      my ($redirect_loop, @redirects) = get_redirects($l, %$redirects);
+      if (($#redirects == 1)
+          && (($redirects[0].'/') eq $redirects[1])) {
+        push(@dir_redirect_urls, $l);
+        next;
+      }
+      push(@urls, $l);
+    }
+  }
+
+  # Broken links and redirects
+  if ($#urls < 0) {
+    if (! $Opts{Quiet}) {
+      if ($Opts{HTML}) {
+        print "<h3>Links</h3>\n<p>Valid links!</p>";
+      } else {
+        print "\nValid links.";
+      }
+      print "\n";
+    }
+  } else {
+    print('<h3>') if $Opts{HTML};
+    print("\nList of broken links");
+    print(' and redirects') if $Opts{Redirects};
+
+    # Sort the URI's by HTTP Code
+    my %code_summary;
+    my @idx;
+    foreach my $u (@urls) {
+      if (defined($results->{$u}{location}{record}))  {
+        my $c = &code_shown($u, $results);
+        $code_summary{$c}++;
+        push(@idx, $c);
+      }
+    }
+    my @sorted = @urls[
+                       sort {
+                         defined($priority{$idx[$a]}) ?
+                           defined($priority{$idx[$b]}) ?
+                             $priority{$idx[$a]}
+                               <=> $priority{$idx[$b]} :
+                                 -1 :
+                                   defined($priority{$idx[$b]}) ?
+                                     1 :
+                                         $idx[$a] <=> $idx[$b]
+                                       } 0 .. $#idx
+                      ];
+    @urls = @sorted;
+    undef(@sorted); undef(@idx);
+
+    if ($Opts{HTML}) {
+      # Print a summary
+      print <<EOF;
+</h3>
+<p><em>Fragments listed are broken. See the table below to know what action
+to take.</em></p>
+<table border="1" summary="List of broken fragments and suggested actions.">
+<thead>
+<tr>
+<th>Code</th>
+<th>Occurrences</th>
+<th>What to do</th>
+</tr>
+</thead>
+<tbody>
+EOF
+      foreach my $code (sort(keys(%code_summary))) {
+        printf('<tr%s>', &bgcolor($code));
+        printf('<td><a href="#d%scode_%s">%s</a></td>',
+               $doc_count, $code, $code);
+        printf('<td>%s</td>', $code_summary{$code});
+        printf('<td>%s</td>', $todo{$code});
+        print "</tr>\n";
+      }
+      print "</tbody>\n</table>\n";
+    } else {
+      print(':');
+    }
+    &show_link_report($links, $results, $broken, $redirects,
+                      \@urls, 1, \%todo);
+  }
+
+  # Show directory redirects
+  if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) {
+    print('<h3>') if $Opts{HTML};
+    print("\nList of directory redirects");
+    print("</h3>\n<p>The links below are not broken, but the document does not use the exact URL.</p>") if $Opts{HTML};
+    &show_link_report($links, $results, $broken, $redirects,
+                      \@dir_redirect_urls);
+  }
+}
+
+###############################################################################
+
+################
+# Global stats #
+################
+
+sub global_stats ()
+{
+  my $stop = &get_timestamp();
+  my $n_docs =
+    ($doc_count <= $Opts{Max_Documents}) ? $doc_count : $Opts{Max_Documents};
+  return sprintf('Checked %d document%s in %s seconds.',
+                 $n_docs,
+                 ($n_docs == 1) ? '' : 's',
+                 &time_diff($timestamp, $stop));
+}
+
+##################
+# HTML interface #
+##################
+
+sub html_header ($;$$)
+{
+  my ($uri, $doform, $cookie) = @_;
+
+  $uri = &encode($uri);
+  my $title = ' Link Checker' . ($uri eq '' ? '' : ': ' . $uri);
+
+  # mod_perl 1.99_05 doesn't seem to like if the "\n\n" isn't in the same
+  # print() statement as the last header...
+
+  my $headers = '';
+  if (! $Opts{Command_Line}) {
+    $headers .= "Cache-Control: no-cache\nPragma: no-cache\n" if $doform;
+    $headers .= "Content-Type: text/html; charset=iso-8859-1\n";
+    $headers .= "Content-Script-Type: application/x-javascript\n";
+    $headers .= "Set-Cookie: $cookie\n" if $cookie;
+    $headers .= "Content-Language: en\n\n";
+  }
+
+  my $script = my $onload = '';
+  if ($doform) {
+    $script = "
+<script type=\"application/x-javascript\">
+function uriOk()
+{
+  var v = document.forms[0].uri.value;
+  if (v.length > 0) {
+    if (v.search) return (v.search(/\\S/) != -1);
+    return true;
+  }
+  return false;
+}
+</script>";
+   $onload = ' onload="document.forms[0].uri.focus()"';
+  }
+
+  print $headers, $DocType, "
+<html lang=\"en\">
+<head>
+<title>W3C", $title, "</title>
+<style type=\"text/css\">
+body, address {
+  font-family: sans-serif;
+  color: black;
+  background: white;
+}
+pre, code, tt {
+  font-family: monospace;
+}
+img {
+  color: white;
+  border: none;
+  vertical-align: middle;
+}
+fieldset {
+  padding-left: 1em;
+  background-color: #eee;
+}
+th {
+  text-align: left;
+}
+h1 a {
+  color: black;
+}
+h1 {
+  color: #053188;
+}
+h1#title {
+  background-color: #eee;
+  border-bottom: 1px solid black;
+  padding: .25em;
+}
+address {
+  padding: 1ex;
+  border-top: 1px solid black;
+  background-color: #eee;
+  clear: right;
+}
+address img {
+  float: right;
+  width: 88px;
+}
+a:hover {
+  background-color: #eee#;
+}
+a:visited {
+  color: purple;
+}
+.report {
+  width: 100%;
+}
+dt.report {
+  font-weight: bold;
+}
+.unauthorized {
+  background-color: aqua;
+}
+.redirect {
+  background-color: yellow;
+}
+.broken {
+  background-color: red;
+}
+.multiple {
+  background-color: fuchsia;
+}
+</style>", $script, "
+</head>
+<body", $onload, ">
+<h1 id=\"title\"><a href=\"http://www.w3.org/\" title=\"W3C\"><img alt=\"W3C\" id=\"logo\" src=\"http://www.w3.org/Icons/w3c_home\" height=\"48\" width=\"72\"></a> ", $title, "</h1>\n\n";
+}
+
+sub bgcolor ($)
+{
+  my ($code) = @_;
+  my $class;
+  my $r = HTTP::Response->new($code);
+  if ($r->is_success()) {
+    return '';
+  } elsif ($code == 300) {
+    $class = 'multiple';
+  } elsif ($code == 401) {
+    $class = 'unauthorized';
+  } elsif ($r->is_redirect()) {
+    $class = 'redirect';
+  } elsif ($r->is_error()) {
+    $class = 'broken';
+  } else {
+    $class = 'broken';
+  }
+  return(' class="'.$class.'"');
+}
+
+sub show_url ($;$)
+{
+  my ($url, $fragment) = @_;
+  if (defined($fragment)) {
+    my $u = URI->new($url);
+    $u->fragment($fragment);
+    $url = $u->as_string();
+  }
+  $url = &encode($url);
+  return sprintf('<a href="%s">%s</a>',
+                 $url, defined($fragment) ? &encode($fragment) : $url);
+}
+
+sub html_footer ()
+{
+  printf("<p>%s</p>\n", &global_stats()) if ($doc_count > 0 && !$Opts{Quiet});
+  print <<EOF;
+<div>
+<address>
+$PROGRAM $REVISION,
+by <a href="http://www.w3.org/People/Hugo/">Hugo Haas</a> and others.<br>
+Please send bug reports, suggestions and comments to the
+<a href="mailto:www-validator\@w3.org?subject=checklink%3A%20">www-validator
+mailing list</a>
+(<a href="http://lists.w3.org/Archives/Public/www-validator/">archives</a>).
+<br>
+Check out the <a href="docs/checklink.html">documentation</a>.
+Download the
+<a href="http://dev.w3.org/cvsweb/~checkout~/validator/httpd/cgi-bin/checklink.pl?rev=$CVS_VERSION&amp;content-type=text/plain">source code</a> from
+<a href="http://dev.w3.org/cvsweb/validator/httpd/cgi-bin/checklink.pl">CVS</a>.
+</address>
+</div>
+</body>
+</html>
+EOF
+}
+
+sub file_uri ($)
+{
+  my ($uri) = @_;
+  &html_header($uri);
+  print "<h2>Forbidden</h2>
+<p>You cannot check such a URI (<code>$uri</code>).</p>
+";
+  &html_footer();
+  exit;
+}
+
+sub print_form ($)
+{
+  my ($q) = @_;
+
+  my $chk = ' checked="checked"';
+  $q->param('hide_type', 'all') unless $q->param('hide_type');
+
+  my $sum = $q->param('summary')              ? $chk : '';
+  my $red = $q->param('hide_redirects')       ? $chk : '';
+  my $all = ($q->param('hide_type') ne 'dir') ? $chk : '';
+  my $dir = $all                              ? ''   : $chk;
+  my $acc = $q->param('no_accept_language')   ? $chk : '';
+  my $rec = $q->param('recursive')            ? $chk : '';
+  my $dep = &encode($q->param('depth')              || '');
+
+  my $cookie_options = '';
+  if ($q->cookie()) {
+    $cookie_options = "
+    <label for=\"cookie1\"><input type=\"radio\" id=\"cookie1\" name=\"cookie\" value=\"nochanges\" checked=\"checked\"> Don't modify saved options</label>
+    <label for=\"cookie2\"><input type=\"radio\" id=\"cookie2\" name=\"cookie\" value=\"set\"> Save these options</label>
+    <label for=\"cookie3\"><input type=\"radio\" id=\"cookie3\" name=\"cookie\" value=\"clear\"> Clear saved options</label>";
+  } else {
+    $cookie_options = "
+    <label for=\"cookie\"><input type=\"checkbox\" id=\"cookie\" name=\"cookie\" value=\"set\"> Save options in a <a href=\"http://www.w3.org/Protocols/rfc2109/rfc2109\">cookie</a></label>";
+  }
+
+  print "<form action=\"", $q->self_url(), "\" method=\"get\" onsubmit=\"return uriOk()\">
+<p><label for=\"uri\">Enter the address (<a href=\"http://www.w3.org/Addressing/#terms\">URL</a>)
+of a document that you would like to check:</label></p>
+<p><input type=\"text\" size=\"50\" id=\"uri\" name=\"uri\" value=\"\"></p>
+<fieldset>
+  <legend>Options</legend>
+  <p>
+    <label for=\"summary\"><input type=\"checkbox\" id=\"summary\" name=\"summary\" value=\"on\"", $sum, "> Summary only</label>
+    <br>
+    <label for=\"hide_redirects\"><input type=\"checkbox\" id=\"hide_redirects\" name=\"hide_redirects\" value=\"on\"", $red, "> Hide <a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.3\">redirects</a>:</label>
+    <label><input type=\"radio\" name=\"hide_type\" value=\"all\"", $all, "> all</label>
+    <label><input type=\"radio\" name=\"hide_type\" value=\"dir\"", $dir, "> for directories only</label>
+    <br>
+    <label for=\"no_accept_language\"><input type=\"checkbox\" id=\"no_accept_language\" name=\"no_accept_language\" value=\"on\"", $acc, "> Don't send <tt><a href=\"http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4\">Accept-Language</a></tt> headers</label>
+    <br>
+    <label title=\"Check linked documents recursively (maximum: ", $Opts{Max_Documents}, " documents; sleeping ", $Opts{Sleep_Time}, " seconds between each document)\" for=\"recursive\"><input type=\"checkbox\" id=\"recursive\" name=\"recursive\" value=\"on\"", $rec, "> Check linked documents recursively</label>,
+    <label title=\"Depth of the recursion (-1 is the default and means unlimited)\" for=\"depth\">recursion depth: <input type=\"text\" size=\"3\" maxlength=\"3\" id=\"depth\" name=\"depth\" value=\"", $dep, "\"></label>
+    <br><br>", $cookie_options, "
+  </p>
+</fieldset>
+<p><input type=\"submit\" name=\"check\" value=\"Check\"></p>
+</form>
+";
+}
+
+sub encode (@)
+{
+  return $Opts{HTML} ? HTML::Entities::encode(@_) : @_;
+}
+
+sub hprintf (@)
+{
+  if (! $Opts{HTML}) {
+    printf(@_);
+  } else {
+    print HTML::Entities::encode(sprintf($_[0], @_[1..@_-1]));
+  }
+}
+
+# Local Variables:
+# mode: perl
+# indent-tabs-mode: nil
+# tab-width: 2
+# perl-indent-level: 2
+# End:
+# ex: ts=2 sw=2 et
Received on Thursday, 5 August 2010 14:46:54 UTC