link-checker commit: Keep links as URI objects internally so IRI have a better chance of working.

changeset:   379:620e6c7f57b3
tag:         tip
user:        Ville Skyttä <ville.skytta@iki.fi>
date:        Sun Jan 02 23:16:41 2011 +0200
files:       bin/checklink
description:
Keep links as URI objects internally so IRI have a better chance of working.


diff -r e3fb02214532 -r 620e6c7f57b3 bin/checklink
--- a/bin/checklink	Sun Jan 02 23:08:25 2011 +0200
+++ b/bin/checklink	Sun Jan 02 23:16:41 2011 +0200
@@ -678,7 +678,8 @@
             }
         }
     }
-    $uri = $uri->canonical()->as_string();
+    $uri = $uri->canonical();
+    $query->param("uri", $uri);
 
     &check_uri(scalar($query->Vars()), $uri, 1, $Opts{Depth}, $cookie);
     undef $query;    # Not needed any more.
@@ -1030,14 +1031,14 @@
             $uri = URI::file->new_abs($uri) unless $uri->scheme();
         }
     }
-    return $uri->canonical()->as_string();
+    return $uri->canonical();
 }
 
 ########################################
 # Check for broken links in a resource #
 ########################################
 
-sub check_uri (\%$$$$;$$)
+sub check_uri (\%\$$$$;\$$)
 {
     my ($params, $uri, $check_num, $depth, $cookie, $referer, $is_start) = @_;
     $is_start ||= ($check_num == 1);
@@ -1088,14 +1089,13 @@
         }
     }
 
-    my $absolute_uri = $response->{absolute_uri}->as_string();
-
     if ($Opts{HTML}) {
         $doc_header .=
-            ("<h2>\nProcessing\t" . &show_url($absolute_uri) . "\n</h2>\n\n");
+            ("<h2>\nProcessing\t" . &show_url($response->{absolute_uri}) .
+                "\n</h2>\n\n");
     }
     else {
-        $doc_header .= "\nProcessing\t$absolute_uri\n\n";
+        $doc_header .= "\nProcessing\t$response->{absolute_uri}\n\n";
     }
 
     if (!$Opts{Quiet}) {
@@ -1158,8 +1158,8 @@
 EOF
             printf("<p>Go to <a href=\"#%s\">the results</a>.</p>\n",
                 $result_anchor);
-            my $esc_uri =
-                URI::Escape::uri_escape($absolute_uri, "^A-Za-z0-9.");
+            my $esc_uri = URI::Escape::uri_escape($response->{absolute_uri},
+                "^A-Za-z0-9.");
             print "<p>For reliable link checking results, check ";
 
             if (!$response->{IsCss}) {
@@ -1191,10 +1191,11 @@
     }
 
     # Record that we have processed this resource
-    $processed{$absolute_uri} = 1;
+    $processed{$response->{absolute_uri}} = 1;
 
     # Parse the document
-    my $p = &parse_document($uri, $absolute_uri, $response, 1, ($depth != 0));
+    my $p = &parse_document($uri, $response->{absolute_uri},
+        $response, 1, ($depth != 0));
     my $base = URI->new($p->{base});
 
     # Check anchors
@@ -1271,6 +1272,7 @@
     my %broken;
     my $link_num = 0;
     while (my ($u, $ulinks) = each(%links)) {
+        $u = URI->new($u);
 
         if ($Opts{Summary_Only}) {
 
@@ -1355,7 +1357,7 @@
     # Do we want to process other documents?
     if ($depth != 0) {
 
-        foreach my $u (keys %links) {
+        foreach my $u (map { URI->new($_) } keys %links) {
 
             next unless $results{$u}{location}{success};    # Broken link?
 
@@ -1437,18 +1439,18 @@
 # Get and parse a resource to process #
 #######################################
 
-sub get_document ($$$;\%$$$$$)
+sub get_document ($\$$;\%\$$$$$)
 {
     my ($method, $uri,    $in_recursion, $redirects, $referer,
         $cookie, $params, $check_num,    $is_start
     ) = @_;
 
     # $method contains the HTTP method the use (GET or HEAD)
-    # $uri contains the identifier of the resource
+    # $uri object contains the identifier of the resource
     # $in_recursion is > 0 if we are in recursion mode (i.e. it is at least
     #                        the second resource checked)
     # $redirects is a pointer to the hash containing the map of the redirects
-    # $referer is the URI of the referring document
+    # $referer is the URI object of the referring document
     # $cookie, $params, $check_num, and $is_start are for printing HTTP headers
     #                  and the form if $in_recursion == 0 and not authenticating
 
@@ -1528,12 +1530,12 @@
 # Check whether a URI is within the scope of recursion. #
 #########################################################
 
-sub in_recursion_scope ($)
+sub in_recursion_scope (\$)
 {
     my ($uri) = @_;
     return 0 unless $uri;
 
-    my $candidate = URI->new($uri)->canonical();
+    my $candidate = $uri->canonical();
 
     return 0 if (defined($Opts{Exclude}) && $candidate =~ $Opts{Exclude});
 
@@ -1567,7 +1569,7 @@
 # Check whether a URI has already been processed #
 ##################################################
 
-sub already_processed ($$)
+sub already_processed (\$\$)
 {
     my ($uri, $referer) = @_;
 
@@ -1595,7 +1597,7 @@
 # Get the content of a URI #
 ############################
 
-sub get_uri ($$;$$\%$$$$)
+sub get_uri ($\$;\$$\%$$$$)
 {
 
     # Here we have a lot of extra parameters in order not to lose information
@@ -1605,8 +1607,8 @@
     ) = @_;
 
     # $method contains the method used
-    # $uri contains the target of the request
-    # $referer is the URI of the referring document
+    # $uri object contains the target of the request
+    # $referer is the URI object of the referring document
     # $start is a timestamp (not defined the first time the function is called)
     # $redirects is a map of redirects
     # $code is the first HTTP return code
@@ -1702,7 +1704,7 @@
 # Record the results of an HTTP request #
 #########################################
 
-sub record_results ($$$)
+sub record_results (\$$$)
 {
     my ($uri, $method, $response) = @_;
     $results{$uri}{response}       = $response;
@@ -1776,7 +1778,7 @@
 # Parse a document #
 ####################
 
-sub parse_document ($$$$$)
+sub parse_document (\$\$$$$)
 {
     my ($uri, $base_uri, $response, $links, $rec_needs_links) = @_;
 
@@ -1976,6 +1978,7 @@
         # Remove repeated slashes after the . or .. in relative links, to avoid
         # duplicated checking or infinite recursion.
         $uri =~ s|^(\.\.?/)/+|$1|o;
+        $uri = Encode::decode_utf8($uri);
         $uri = URI->new_abs($uri, $base) if defined($base);
         $self->{Links}{$uri}{defined($line) ? $line : LINE_UNKNOWN()}++;
     }
@@ -2151,12 +2154,12 @@
 # Check the validity of a link #
 ################################
 
-sub check_validity ($$$\%\%)
+sub check_validity (\$\$$\%\%)
 {
     my ($referer, $uri, $want_links, $links, $redirects) = @_;
 
-    # $referer is the URI of the document checked
-    # $uri is the URI of the target that we are verifying
+    # $referer is the URI object of the document checked
+    # $uri is the URI object of the target that we are verifying
     # $want_links is true if we're interested in links in the target doc
     # $links is a hash of the links in the documents checked
     # $redirects is a map of the redirects encountered

Received on Sunday, 2 January 2011 21:16:50 UTC