- From: Stuart Clark <stuart.clark@jahingo.com>
- Date: Sat, 07 Sep 2002 17:39:54 +0100
- To: www-validator@w3.org
- Message-ID: <3D7A2BDA.3040802@Jahingo.com>
I have attached a patch to allow the user to choose between doing a depth first and a breadth first link traversal. -- Stuart Clark mailto:stuart.clark@Jahingo.com http://www.Jahingo.com/
--- checklink.pl Sat Sep 7 17:34:32 2002
+++ checklink.pl.new Sat Sep 7 17:28:13 2002
@@ -62,6 +62,7 @@
my $_local_dir = my $_remote_masqueraded_uri = '';
my $_hide_same_realm = 0;
my $_depth = 0; # -1 means unlimited recursion
+my $_breadth_first = 0;
# Restrictions for the online version
my $_sleep_time = 3;
@@ -98,7 +99,12 @@
}
# Transform the parameter into a URI
$uri = &urize($uri);
- &check_uri($uri, 0, $_depth);
+ if ($_depth) {
+ &check_uri_depth($uri, 0, $_depth, $_breadth_first);
+ }
+ else {
+ &check_uri($uri, 0, 0);
+ }
}
if (($doc_count > 0) && !$_summary) {
printf("\n%s\n", &global_stats());
@@ -154,6 +160,9 @@
if ($query->param('depth') && ($query->param('depth') != 0)) {
$_depth = $query->param('depth');
}
+ if ($query->param('breadthfirst')) {
+ $_breadth_first = 1;
+ }
$_html = 1;
my $uri;
if ($query->param('uri')) {
@@ -174,7 +183,12 @@
$uri = 'http://'.$uri;
}
}
- &check_uri($uri, 1, $_depth);
+ if ($_depth) {
+ &check_uri_depth($uri, 1, $_depth, $_breadth_first);
+ }
+ else {
+ &check_uri($uri, 1, 0);
+ }
&html_footer();
}
@@ -200,6 +214,7 @@
'h|html' => \$_html,
'n|noacclanguage' => sub { $_accept_language = 0; },
'r|recursive' => sub { $_depth = -1 if $_depth == 0; },
+ 'b|breadthfirst' => sub { $_breadth_first = 1 },
'l|location=s' => \$_base_location,
'u|user=s' => \$_user,
'p|password=s' => \$_password,
@@ -235,6 +250,8 @@
-e/--directory Hide directory redirects - e.g.
http://www.w3.org/TR -> http://www.w3.org/TR/
-r/--recursive Check the documents linked from the first one.
+ -b/--breadthfirst Check the documents completely at the current
+ level before doing the next level.
-D/--depth n Check the documents linked from the first one
to depth n.
-l/--location uri Scope of the documents checked.
@@ -309,6 +326,101 @@
# Check for broken links in a resource #
########################################
+# Check a document and, depth permitting, the documents it links to.
+#
+# Parameters:
+#   $uri           - URI of the first document to check
+#   $html_header   - handed to check_uri for the first document only;
+#                    documents reached through links always get 0
+#   $depth         - number of link levels to follow from $uri:
+#                    0 checks $uri alone, a negative value is unlimited
+#   $breadth_first - true: check every document of a level before going
+#                    one level deeper; false: follow the links of each
+#                    document right after it has been checked
+#
+# Every queue entry carries its own remaining depth, so the limit counts
+# link levels rather than the number of documents checked so far.
+sub check_uri_depth {
+    my ($uri, $html_header, $depth, $breadth_first) = @_;
+
+    # Pending documents, as [ URI, remaining depth ] pairs
+    my @queue = ([ $uri, $depth ]);
+    my $first = 1;
+
+    while (@queue) {
+        my ($current, $remaining) = @{ shift(@queue) };
+
+        # The same document can be queued from several places
+        next if (&already_processed($current) != 0);
+
+        if (! $first) {
+            # Separate this report from the previous one
+            print "\n";
+            if (! $_html) {
+                print('-' x 40);
+            }
+            else {
+                # For the online version, enforce the document limit and
+                # wait for a while to avoid abuses
+                if (!$_cl && ($doc_count >= $_max_documents)) {
+                    print("<hr>\n<p><strong>Maximum number of documents reached!</strong></p>\n");
+                    last;
+                }
+                print('<hr>');
+                sleep($_sleep_time);
+            }
+            print "\n";
+        }
+
+        # Called with & as elsewhere in this file: check_uri is declared
+        # with an empty prototype
+        my @links = &check_uri($current, ($first ? $html_header : 0),
+                               $remaining);
+        $first = 0;
+        $processed{$current} = 1;
+
+        # Links are only followed while there is depth left
+        next if ($remaining == 0);
+
+        if ($_base_location eq '.') {
+            # Get the name of the original directory
+            # e.g. http://www.w3.org/TR/html4/Overview.html
+            # should return http://www.w3.org/TR/html4/
+            my $base = $results{$current}{parsing}{base};
+            if (defined($base) && ($base =~ m|^(.*/)[^/]*|)) {
+                $_base_location = $1;
+            }
+        }
+
+        # A negative depth means unlimited recursion and stays negative
+        my $next_depth = ($remaining < 0) ? -1 : $remaining - 1;
+        my (%queued, @to_check);
+        foreach my $u (@links) {
+            # Check if it's in our scope for recursion; the location is
+            # a URI prefix, so it is matched literally
+            next unless ($u =~ m|^\Q$_base_location\E|);
+            # and the link is not broken
+            next unless $results{$u}{location}{success};
+            # And it is a text/html or application/xhtml+xml resource
+            my $type = $results{$u}{location}{type};
+            next unless (defined($type) &&
+                         ($type =~ m{text/html|application/xhtml\+xml}));
+            # Check if we have already processed the URI
+            next if (&already_processed($u) != 0);
+            next if $queued{$u}++;
+            push(@to_check, [ $u, $next_depth ]);
+        }
+
+        if ($breadth_first) {
+            push(@queue, @to_check);
+        }
+        else {
+            unshift(@queue, @to_check);
+        }
+    }
+    return;
+}
+
sub check_uri() {
my ($uri, $html_header, $depth) = @_;
# If $html_header equals 1, we need to generate a HTML header (first
@@ -366,7 +461,7 @@
# Parse the document
my $p = &parse_document($uri, $absolute_uri,
$response->content(), 1,
- $depth != 0);
+ $depth);
my $base = URI->new($p->{base});
# Check anchors
@@ -496,60 +591,7 @@
&anchors_summary($p->{Anchors}, \%errors);
&links_summary(\%links, \%results, \%broken, \%redirects);
- # Do we want to process other documents?
- if ($depth != 0) {
- if ($_base_location eq '.') {
- # Get the name of the original directory
- # e.g. http://www.w3.org/TR/html4/Overview.html
- # should return http://www.w3.org/TR/html4/
- $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|;
- $_base_location = $1;
- }
- foreach $u (keys %links) {
- next if (! (# Check if it's in our scope for recursion
- ($u =~ m|^$_base_location|) &&
- # and the link is not broken
- $results{$u}{location}{success} &&
- # And it is a text/html or application/xhtml+xml
- # resource
- (($results{$u}{location}{type} =~ m|text/html|) ||
- ($results{$u}{location}{type}
- =~ m|application/xhtml\+xml|))
- )
- );
- # Check if we have already processed the URI
- next if (&already_processed($u) != 0);
- # Do the job
- print "\n";
- if (! $_html) {
- my $i = 40;
- while ($i--) {
- print('-');
- }
- } else {
- # For the online version, wait for a while to avoid abuses
- if (!$_cl) {
- if ($doc_count == $_max_documents) {
- print("<hr>\n<p><strong>Maximum number of documents reached!</strong></p>\n");
- }
- if ($doc_count >= $_max_documents) {
- $doc_count++;
- print("<p>Not checking <strong>$u</strong></p>\n");
- $processed{$u} = 1;
- next;
- }
- }
- print('<hr>');
- sleep($_sleep_time);
- }
- print "\n";
- if ($depth < 0) {
- &check_uri($u, 0, -1);
- } else {
- &check_uri($u, 0, $depth-1);
- }
- }
- }
+ return (keys %links);
}
#######################################
@@ -1792,6 +1834,8 @@
<label><input type=\"checkbox\" name=\"recursive\"> Check linked documents recursively <small>(maximum: $_max_documents documents; sleeping $_sleep_time seconds between each document)</small></label>
<br>
<label>Depth of the recursion: <input type=\"text\" size=\"3\" name=\"depth\"><small>(-1 is the default and means unlimited)</small></label>
+ <br>
+ <label><input type=\"checkbox\" name=\"breadthfirst\"> Check linked documents recursively using breadth first method</label>
</p>
<p><input type=\"submit\" name=\"submit\" value=\"Check\"></p>
</form>
Received on Saturday, 7 September 2002 12:38:53 UTC