W3C home > Mailing lists > Public > www-validator-cvs@w3.org > August 2010

link-checker commit: Improve heuristics when passed non-absolute URIs.

From: Mercurial notifier <nobody@w3.org>
Date: Thu, 05 Aug 2010 14:47:25 +0000
To: link-checker updates <www-validator-cvs@w3.org>
Message-Id: <E1Oh1jB-0006Mo-8l@blinky.w3.org>
changeset:   344:db78fb7c2ce7
user:        ville
date:        Sun Feb 21 17:17:47 2010 +0000
files:       bin/checklink
description:
Improve heuristics when passed non-absolute URIs.


diff -r 4754ddab123b -r db78fb7c2ce7 bin/checklink
--- a/bin/checklink	Sun Feb 21 16:57:43 2010 +0000
+++ b/bin/checklink	Sun Feb 21 17:17:47 2010 +0000
@@ -5,7 +5,7 @@
 # (c) 1999-2010 World Wide Web Consortium
 # based on Renaud Bruyeron's checklink.pl
 #
-# $Id: checklink,v 4.180 2010-02-21 16:57:43 ville Exp $
+# $Id: checklink,v 4.181 2010-02-21 17:17:47 ville Exp $
 #
 # This program is licensed under the W3C(r) Software License:
 #       http://www.w3.org/Consortium/Legal/copyright-software
@@ -219,6 +219,7 @@
 use Time::HiRes          qw();
 use URI             1.31 qw(); # >= 1.31 for sip: abs/rel
 use URI::Escape          qw();
+use URI::Heuristic       qw();
 # @@@ Needs also W3C::UserAgent but can't use() it here.
 
 use constant RC_ROBOTS_TXT          => -1;
@@ -284,7 +285,7 @@
   $PROGRAM     = 'W3C-checklink';
   $VERSION     = '4.5';
   $REVISION    = sprintf('version %s (c) 1999-2010 W3C', $VERSION);
-  my ($cvsver) = q$Revision: 4.180 $ =~ /(\d+[\d\.]*\.\d+)/;
+  my ($cvsver) = q$Revision: 4.181 $ =~ /(\d+[\d\.]*\.\d+)/;
   $AGENT       = sprintf('%s/%s [%s] %s',
                          $PROGRAM, $VERSION, $cvsver,
                          (W3C::UserAgent::USE_ROBOT_UA
@@ -619,7 +620,13 @@
     if ($uri =~ m|^//|) {
       $uri = 'http:'.$uri;
     } else {
-      $uri = 'http://'.$uri;
+      local $ENV{URL_GUESS_PATTERN} = '';
+      my $guess = URI::Heuristic::uf_uri($uri);
+      if ($guess->scheme() && $ua->is_protocol_supported($guess)) {
+        $uri = $guess->as_string();
+      } else {
+        $uri = 'http://'.$uri;
+      }
     }
   }
 
@@ -904,9 +911,9 @@
   return $res;
 }
 
-###########################################
-# Transform foo into file://localhost/foo #
-###########################################
+############################
+# Transform foo into a URI #
+############################
 
 sub urize ($)
 {
@@ -925,8 +932,17 @@
     }
     # return dir itself if an index file was not found
     $uri ||= URI::file->new_abs($uarg);
+  } elsif ($uarg =~ /^[.\/\\]/ || -e $uarg) {
+    $uri = URI::file->new_abs($uarg);
   } else {
-    $uri = URI->new_abs($arg, URI::file->cwd());
+    my $newuri = URI->new($arg);
+    if ($newuri->scheme()) {
+      $uri = $newuri;
+    } else {
+      local $ENV{URL_GUESS_PATTERN} = '';
+      $uri = URI::Heuristic::uf_uri($arg);
+      $uri = URI::file->new_abs($uri) unless $uri->scheme();
+    }
   }
   return $uri->as_string();
 }
Received on Thursday, 5 August 2010 14:47:40 UTC

This archive was generated by hypermail 2.3.1 : Wednesday, 7 January 2015 15:17:44 UTC