- From: Stuart Clark <stuart.clark@jahingo.com>
- Date: Sat, 07 Sep 2002 17:39:54 +0100
- To: www-validator@w3.org
- Message-ID: <3D7A2BDA.3040802@Jahingo.com>
I have attached a patch that lets the user choose between a depth-first and a breadth-first link traversal. -- Stuart Clark mailto:stuart.clark@Jahingo.com http://www.Jahingo.com/
--- checklink.pl Sat Sep 7 17:34:32 2002 +++ checklink.pl.new Sat Sep 7 17:28:13 2002 @@ -62,6 +62,7 @@ my $_local_dir = my $_remote_masqueraded_uri = ''; my $_hide_same_realm = 0; my $_depth = 0; # -1 means unlimited recursion +my $_breadth_first = 0; # Restrictions for the online version my $_sleep_time = 3; @@ -98,7 +99,12 @@ } # Transform the parameter into a URI $uri = &urize($uri); - &check_uri($uri, 0, $_depth); + if ($_depth) { + &check_uri_depth($uri, 0, $_depth, $_breadth_first); + } + else { + &check_uri($uri, 0, 0); + } } if (($doc_count > 0) && !$_summary) { printf("\n%s\n", &global_stats()); @@ -154,6 +160,9 @@ if ($query->param('depth') && ($query->param('depth') != 0)) { $_depth = $query->param('depth'); } + if ($query->param('breadthfirst')) { + $_breadth_first = 1; + } $_html = 1; my $uri; if ($query->param('uri')) { @@ -174,7 +183,12 @@ $uri = 'http://'.$uri; } } - &check_uri($uri, 1, $_depth); + if ($_depth) { + &check_uri_depth($uri, 1, $_depth, $_breadth_first); + } + else { + &check_uri($uri, 1, 0); + } &html_footer(); } @@ -200,6 +214,7 @@ 'h|html' => \$_html, 'n|noacclanguage' => sub { $_accept_language = 0; }, 'r|recursive' => sub { $_depth = -1 if $_depth == 0; }, + 'b|breadthfirst' => sub { $_breadth_first = 1 }, 'l|location=s' => \$_base_location, 'u|user=s' => \$_user, 'p|password=s' => \$_password, @@ -235,6 +250,8 @@ -e/--directory Hide directory redirects - e.g. http://www.w3.org/TR -> http://www.w3.org/TR/ -r/--recursive Check the documents linked from the first one. + -b/--breadthfirst Check the documents copletely at thw current + level before doing next level. -D/--depth n Check the documents linked from the first one to depth n. -l/--location uri Scope of the documents checked. 
@@ -309,6 +326,84 @@ # Check for broken links in a resource # ######################################## +sub check_uri_depth { + my ($uri, $html_header, $depth, $breadth_first) = @_; + + my @check_stack = ($uri); + + while (my $uri = shift @check_stack) { + next if (&already_processed($uri) != 0); + + my @links = check_uri($uri, $html_header, $depth); + + $depth--; + last unless $depth; + + $processed{$uri} = 1; + + my %to_check = (); + + if ($_base_location eq '.') { + # Get the name of the original directory + # e.g. http://www.w3.org/TR/html4/Overview.html + # should return http://www.w3.org/TR/html4/ + $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|; + $_base_location = $1; + } + + foreach my $u (@links) { + next if (! (# Check if it's in our scope for recursion + ($u =~ m|^$_base_location|) && + # and the link is not broken + $results{$u}{location}{success} && + # And it is a text/html or application/xhtml+xml + # resource + (($results{$u}{location}{type} =~ m|text/html|) || + ($results{$u}{location}{type} + =~ m|application/xhtml\+xml|)) + ) + ); + # Check if we have already processed the URI + next if (&already_processed($u) != 0); + + $to_check{$u} = 1; + } + + if ($breadth_first) { + push @check_stack, keys %to_check; + } + else { + unshift @check_stack, keys %to_check; + } + + if (@check_stack) { + print "\n"; + if (! 
$_html) { + my $i = 40; + while ($i--) { + print('-'); + } + } else { + # For the online version, wait for a while to avoid abuses + if (!$_cl) { + if ($doc_count == $_max_documents) { + print("<hr>\n<p><strong>Maximum number of documents reached!</strong></p>\n"); + last; + } + if ($doc_count >= $_max_documents) { + $doc_count++; + print("<p>Not checking <strong>$uri</strong></p>\n"); + next; + } + } + print('<hr>'); + sleep($_sleep_time); + } + print "\n"; + } + } +} + sub check_uri() { my ($uri, $html_header, $depth) = @_; # If $html_header equals 1, we need to generate a HTML header (first @@ -366,7 +461,7 @@ # Parse the document my $p = &parse_document($uri, $absolute_uri, $response->content(), 1, - $depth != 0); + $depth); my $base = URI->new($p->{base}); # Check anchors @@ -496,60 +591,7 @@ &anchors_summary($p->{Anchors}, \%errors); &links_summary(\%links, \%results, \%broken, \%redirects); - # Do we want to process other documents? - if ($depth != 0) { - if ($_base_location eq '.') { - # Get the name of the original directory - # e.g. http://www.w3.org/TR/html4/Overview.html - # should return http://www.w3.org/TR/html4/ - $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|; - $_base_location = $1; - } - foreach $u (keys %links) { - next if (! (# Check if it's in our scope for recursion - ($u =~ m|^$_base_location|) && - # and the link is not broken - $results{$u}{location}{success} && - # And it is a text/html or application/xhtml+xml - # resource - (($results{$u}{location}{type} =~ m|text/html|) || - ($results{$u}{location}{type} - =~ m|application/xhtml\+xml|)) - ) - ); - # Check if we have already processed the URI - next if (&already_processed($u) != 0); - # Do the job - print "\n"; - if (! 
$_html) { - my $i = 40; - while ($i--) { - print('-'); - } - } else { - # For the online version, wait for a while to avoid abuses - if (!$_cl) { - if ($doc_count == $_max_documents) { - print("<hr>\n<p><strong>Maximum number of documents reached!</strong></p>\n"); - } - if ($doc_count >= $_max_documents) { - $doc_count++; - print("<p>Not checking <strong>$u</strong></p>\n"); - $processed{$u} = 1; - next; - } - } - print('<hr>'); - sleep($_sleep_time); - } - print "\n"; - if ($depth < 0) { - &check_uri($u, 0, -1); - } else { - &check_uri($u, 0, $depth-1); - } - } - } + return (keys %links); } ####################################### @@ -1792,6 +1834,8 @@ <label><input type=\"checkbox\" name=\"recursive\"> Check linked documents recursively <small>(maximum: $_max_documents documents; sleeping $_sleep_time seconds between each document)</small></label> <br> <label>Depth of the recursion: <input type=\"text\" size=\"3\" name=\"depth\"><small>(-1 is the default and means unlimited)</small></label> + <br> + <label><input type=\"checkbox\" name=\"breadthfirst\"> Check linked documents recursively using breadth first method</label> </p> <p><input type=\"submit\" name=\"submit\" value=\"Check\"></p> </form>
Received on Saturday, 7 September 2002 12:38:53 UTC