link-checker commit: Distribute per-doc link check order on host:port basis to avoid RobotUA delays.

changeset:   395:db4d0fe63d96
tag:         tip
user:        Ville Skyttä <ville.skytta@iki.fi>
date:        Mon Mar 28 23:49:07 2011 +0300
files:       bin/checklink
description:
Distribute per-doc link check order on host:port basis to avoid RobotUA delays.


diff -r e1fc47d7f54a -r db4d0fe63d96 bin/checklink
--- a/bin/checklink	Mon Mar 28 01:00:34 2011 +0300
+++ b/bin/checklink	Mon Mar 28 23:49:07 2011 +0300
@@ -1212,6 +1212,7 @@
         scalar(keys %{$p->{Links}}))
         if ($Opts{Verbose});
     my %links;
+    my %hostlinks;
 
     # Record all the links found
     while (my ($link, $lines) = each(%{$p->{Links}})) {
@@ -1232,6 +1233,9 @@
         my $canon_uri = URI->new($abs_link_uri->canonical());
         my $fragment  = $canon_uri->fragment(undef);
         if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) {
+            my $hostport =
+                $canon_uri->can('host_port') ? $canon_uri->host_port() : '';
+            push(@{$hostlinks{$hostport}}, $canon_uri);
             for my $line_num (keys(%$lines)) {
                 if (!defined($fragment) || !length($fragment)) {
 
@@ -1247,17 +1251,20 @@
         }
     }
 
+    my @order = &distribute_links(\%hostlinks);
+    undef %hostlinks;
+
     # Build the list of broken URI's
 
-    my $nlinks = scalar(keys(%links));
+    my $nlinks = scalar(@order);
 
     &hprintf("Checking %d links to build list of broken URI's\n", $nlinks)
         if ($Opts{Verbose});
 
     my %broken;
     my $link_num = 0;
-    while (my ($u, $ulinks) = each(%links)) {
-        $u = URI->new($u);
+    for my $u (@order) {
+        my $ulinks = $links{$u};
 
         if ($Opts{Summary_Only}) {
 
@@ -1387,6 +1394,42 @@
     return;
 }
 
+###############################################################
+# Distribute links based on host:port to avoid RobotUA delays #
+###############################################################
+
+sub distribute_links(\%)
+{
+    my $hostlinks = shift;
+
+    # Hosts ordered by weight (number of links), descending
+    my @order =
+        sort { scalar(@{$hostlinks->{$b}}) <=> scalar(@{$hostlinks->{$a}}) }
+        keys %$hostlinks;
+
+    # All link list flattened into one, in host weight order
+    my @all;
+    push(@all, @{$hostlinks->{$_}}) for @order;
+
+    return @all if (scalar(@order) == 1);
+
+    # Indexes and chunk size for "zipping" the end result list
+    my $num = scalar(@{$hostlinks->{$order[0]}});
+    my @indexes = map { $_ * $num } (0 .. $num - 1);
+
+    # Distribute them
+    my @result;
+    while (my @chunk = splice(@all, 0, $num)) {
+        @result[@indexes] = @chunk;
+        @indexes = map { $_ + 1 } @indexes;
+    }
+
+    # Weed out undefs
+    @result = grep(defined, @result);
+
+    return @result;
+}
+
 ##########################################
 # Decode Content-Encodings in a response #
 ##########################################

Received on Monday, 28 March 2011 20:49:15 UTC