- From: Mercurial notifier <nobody@w3.org>
- Date: Mon, 28 Mar 2011 20:49:13 +0000
- To: link-checker updates <www-validator-cvs@w3.org>
changeset: 395:db4d0fe63d96 tag: tip user: Ville Skyttä <ville.skytta@iki.fi> date: Mon Mar 28 23:49:07 2011 +0300 files: bin/checklink description: Distribute per-doc link check order on host:port basis to avoid RobotUA delays. diff -r e1fc47d7f54a -r db4d0fe63d96 bin/checklink --- a/bin/checklink Mon Mar 28 01:00:34 2011 +0300 +++ b/bin/checklink Mon Mar 28 23:49:07 2011 +0300 @@ -1212,6 +1212,7 @@ scalar(keys %{$p->{Links}})) if ($Opts{Verbose}); my %links; + my %hostlinks; # Record all the links found while (my ($link, $lines) = each(%{$p->{Links}})) { @@ -1232,6 +1233,9 @@ my $canon_uri = URI->new($abs_link_uri->canonical()); my $fragment = $canon_uri->fragment(undef); if (!defined($Opts{Exclude}) || $canon_uri !~ $Opts{Exclude}) { + my $hostport = + $canon_uri->can('host_port') ? $canon_uri->host_port() : ''; + push(@{$hostlinks{$hostport}}, $canon_uri); for my $line_num (keys(%$lines)) { if (!defined($fragment) || !length($fragment)) { @@ -1247,17 +1251,20 @@ } } + my @order = &distribute_links(\%hostlinks); + undef %hostlinks; + # Build the list of broken URI's - my $nlinks = scalar(keys(%links)); + my $nlinks = scalar(@order); &hprintf("Checking %d links to build list of broken URI's\n", $nlinks) if ($Opts{Verbose}); my %broken; my $link_num = 0; - while (my ($u, $ulinks) = each(%links)) { - $u = URI->new($u); + for my $u (@order) { + my $ulinks = $links{$u}; if ($Opts{Summary_Only}) { @@ -1387,6 +1394,42 @@ return; } +############################################################### +# Distribute links based on host:port to avoid RobotUA delays # +############################################################### + +sub distribute_links(\%) +{ + my $hostlinks = shift; + + # Hosts ordered by weight (number of links), descending + my @order = + sort { scalar(@{$hostlinks->{$b}}) <=> scalar(@{$hostlinks->{$a}}) } + keys %$hostlinks; + + # All link list flattened into one, in host weight order + my @all; + push(@all, @{$hostlinks->{$_}}) for @order; + + return @all if (scalar(@order) == 1); + + # Indexes and chunk size for "zipping" the end result list + my $num = scalar(@{$hostlinks->{$order[0]}}); + my @indexes = map { $_ * $num } (0 .. $num - 1); + + # Distribute them + my @result; + while (my @chunk = splice(@all, 0, $num)) { + @result[@indexes] = @chunk; + @indexes = map { $_ + 1 } @indexes; + } + + # Weed out undefs + @result = grep(defined, @result); + + return @result; +} + ########################################## # Decode Content-Encodings in a response # ##########################################
Received on Monday, 28 March 2011 20:49:15 UTC