- From: Mercurial notifier <nobody@w3.org>
- Date: Sun, 02 Jan 2011 21:16:48 +0000
- To: link-checker updates <www-validator-cvs@w3.org>
changeset: 379:620e6c7f57b3
tag: tip
user: Ville Skyttä <ville.skytta@iki.fi>
date: Sun Jan 02 23:16:41 2011 +0200
files: bin/checklink
description:
Keep links as URI objects internally so IRIs have a better chance of working.
diff -r e3fb02214532 -r 620e6c7f57b3 bin/checklink
--- a/bin/checklink Sun Jan 02 23:08:25 2011 +0200
+++ b/bin/checklink Sun Jan 02 23:16:41 2011 +0200
@@ -678,7 +678,8 @@
}
}
}
- $uri = $uri->canonical()->as_string();
+ $uri = $uri->canonical();
+ $query->param("uri", $uri);
&check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie);
undef $query; # Not needed any more.
@@ -1030,14 +1031,14 @@
$uri = URI::file->new_abs($uri) unless $uri->scheme();
}
}
- return $uri->canonical()->as_string();
+ return $uri->canonical();
}
########################################
# Check for broken links in a resource #
########################################
-sub check_uri (\%$$$$;$$)
+sub check_uri (\%\$$$$;\$$)
{
my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_;
$is_start ||= ($check_num == 1);
@@ -1088,14 +1089,13 @@
}
}
- my $absolute_uri = $response->{absolute_uri}->as_string();
-
if ($Opts{HTML}) {
$doc_header .=
- ("<h2>\nProcessing\t" . &show_url($absolute_uri) . "\n</h2>\n\n");
+ ("<h2>\nProcessing\t" . &show_url($response->{absolute_uri}) .
+ "\n</h2>\n\n");
}
else {
- $doc_header .= "\nProcessing\t$absolute_uri\n\n";
+ $doc_header .= "\nProcessing\t$response->{absolute_uri}\n\n";
}
if (!$Opts{Quiet}) {
@@ -1158,8 +1158,8 @@
EOF
printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
$result_anchor);
- my $esc_uri =
- URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9.");
+ my $esc_uri = URI::Escape::uri_escape($response->{absolute_uri},
+ "^A-Za-z0-9.");
print "<p>For reliable link checking results, check ";
if (!$response->{IsCss}) {
@@ -1191,10 +1191,11 @@
}
# Record that we have processed this resource
- $processed{$absolute_uri} = 1;
+ $processed{$response->{absolute_uri}} = 1;
# Parse the document
- my $p = &parse_document($uri, $absolute_uri, $response, 1, ($depth != 0));
+ my $p = &parse_document($uri, $response->{absolute_uri},
+ $response, 1, ($depth != 0));
my $base = URI->new($p->{base});
# Check anchors
@@ -1271,6 +1272,7 @@
my %broken;
my $link_num = 0;
while (my ($u, $ulinks) = each(%links)) {
+ $u = URI->new($u);
if ($Opts{Summary_Only}) {
@@ -1355,7 +1357,7 @@
# Do we want to process other documents?
if ($depth != 0) {
- foreach my $u (keys %links) {
+ foreach my $u (map { URI->new($_) } keys %links) {
next unless $results{$u}{location}{success}; # Broken link?
@@ -1437,18 +1439,18 @@
# Get and parse a resource to process #
#######################################
-sub get_document ($$$;\%$$$$$)
+sub get_document ($\$$;\%\$$$$$)
{
my ($method, $uri, $in_recursion, $redirects, $referer,
$cookie, $params, $check_num, $is_start
) = @_;
# $method contains the HTTP method to use (GET or HEAD)
- # $uri contains the identifier of the resource
+ # $uri object contains the identifier of the resource
# $in_recursion is > 0 if we are in recursion mode (i.e. it is at least
# the second resource checked)
# $redirects is a pointer to the hash containing the map of the redirects
- # $referer is the URI of the referring document
+ # $referer is the URI object of the referring document
# $cookie, $params, $check_num, and $is_start are for printing HTTP headers
# and the form if $in_recursion == 0 and not authenticating
@@ -1528,12 +1530,12 @@
# Check whether a URI is within the scope of recursion. #
#########################################################
-sub in_recursion_scope ($)
+sub in_recursion_scope (\$)
{
my ($uri) = @_;
return 0 unless $uri;
- my $candidate = URI->new($uri)->canonical();
+ my $candidate = $uri->canonical();
return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude});
@@ -1567,7 +1569,7 @@
# Check whether a URI has already been processed #
##################################################
-sub already_processed ($$)
+sub already_processed (\$\$)
{
my ($uri, $referer) = @_;
@@ -1595,7 +1597,7 @@
# Get the content of a URI #
############################
-sub get_uri ($$;$$\%$$$$)
+sub get_uri ($\$;\$$\%$$$$)
{
# Here we have a lot of extra parameters in order not to lose information
@@ -1605,8 +1607,8 @@
) = @_;
# $method contains the method used
- # $uri contains the target of the request
- # $referer is the URI of the referring document
+ # $uri object contains the target of the request
+ # $referer is the URI object of the referring document
# $start is a timestamp (not defined the first time the function is called)
# $redirects is a map of redirects
# $code is the first HTTP return code
@@ -1702,7 +1704,7 @@
# Record the results of an HTTP request #
#########################################
-sub record_results ($$$)
+sub record_results (\$$$)
{
my ($uri, $method, $response) = @_;
$results{$uri}{response} = $response;
@@ -1776,7 +1778,7 @@
# Parse a document #
####################
-sub parse_document ($$$$$)
+sub parse_document (\$\$$$$)
{
my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_;
@@ -1976,6 +1978,7 @@
# Remove repeated slashes after the . or .. in relative links, to avoid
# duplicated checking or infinite recursion.
$uri =~ s|^(\.\.?/)/+|$1|o;
+ $uri = Encode::decode_utf8($uri);
$uri = URI->new_abs($uri, $base) if defined($base);
$self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++;
}
@@ -2151,12 +2154,12 @@
# Check the validity of a link #
################################
-sub check_validity ($$$\%\%)
+sub check_validity (\$\$$\%\%)
{
my ($referer, $uri, $want_links, $links, $redirects) = @_;
- # $referer is the URI of the document checked
- # $uri is the URI of the target that we are verifying
+ # $referer is the URI object of the document checked
+ # $uri is the URI object of the target that we are verifying
# $want_links is true if we're interested in links in the target doc
# $links is a hash of the links in the documents checked
# $redirects is a map of the redirects encountered
Received on Sunday, 2 January 2011 21:16:50 UTC