W3C home > Mailing lists > Public > www-validator@w3.org > September 2002

checklink.pl patch

From: Stuart Clark <stuart.clark@jahingo.com>
Date: Sat, 07 Sep 2002 17:39:54 +0100
Message-ID: <3D7A2BDA.3040802@Jahingo.com>
To: www-validator@w3.org
I have attached a patch to allow the user to choose between doing a 
depth first and a breadth first link traversal.

-- 
Stuart Clark
mailto:stuart.clark@Jahingo.com
http://www.Jahingo.com/

--- checklink.pl	Sat Sep  7 17:34:32 2002
+++ checklink.pl.new	Sat Sep  7 17:28:13 2002
@@ -62,6 +62,7 @@
 my $_local_dir = my $_remote_masqueraded_uri = '';
 my $_hide_same_realm = 0;
 my $_depth = 0; # -1 means unlimited recursion
+my $_breadth_first = 0;
 
 # Restrictions for the online version
 my $_sleep_time = 3;
@@ -98,7 +99,12 @@
         }
         # Transform the parameter into a URI
         $uri = &urize($uri);
-        &check_uri($uri, 0, $_depth);
+        if ($_depth) {
+            &check_uri_depth($uri, 0, $_depth, $_breadth_first);
+        }
+        else {
+            &check_uri($uri, 0, 0);
+        }
     }
     if (($doc_count > 0) && !$_summary) {
         printf("\n%s\n", &global_stats());
@@ -154,6 +160,9 @@
     if ($query->param('depth') && ($query->param('depth') != 0)) {
         $_depth = $query->param('depth');
     }
+    if ($query->param('breadthfirst')) {
+        $_breadth_first = 1;
+    }
     $_html = 1;
     my $uri;
     if ($query->param('uri')) {
@@ -174,7 +183,12 @@
             $uri = 'http://'.$uri;
         }
     }
-    &check_uri($uri, 1, $_depth);
+    if ($_depth) {
+        &check_uri_depth($uri, 1, $_depth, $_breadth_first);
+    }
+    else {
+        &check_uri($uri, 1, 0);
+    }
     &html_footer();
 }
 
@@ -200,6 +214,7 @@
                'h|html'          => \$_html,
                'n|noacclanguage' => sub { $_accept_language = 0; },
                'r|recursive'     => sub { $_depth = -1 if $_depth == 0; },
+               'b|breadthfirst'  => sub { $_breadth_first = 1 },
                'l|location=s'    => \$_base_location,
                'u|user=s'        => \$_user,
                'p|password=s'    => \$_password,
@@ -235,6 +250,8 @@
 	-e/--directory		Hide directory redirects - e.g.
 				http://www.w3.org/TR -> http://www.w3.org/TR/
 	-r/--recursive		Check the documents linked from the first one.
+	-b/--breadthfirst		Check the documents completely at the current
+				level before doing next level.
 	-D/--depth n		Check the documents linked from the first one
 				to depth n.
 	-l/--location uri	Scope of the documents checked.
@@ -309,6 +326,84 @@
 # Check for broken links in a resource #
 ########################################
 
+sub check_uri_depth {
+    my ($uri, $html_header, $depth, $breadth_first) = @_;
+
+    my @check_stack = ($uri);
+
+    while (my $uri = shift @check_stack) {
+        next if (&already_processed($uri) != 0);
+
+        my @links = check_uri($uri, $html_header, $depth);
+
+        $depth--;
+        last unless $depth;
+
+        $processed{$uri} = 1;
+
+        my %to_check = ();
+
+        if ($_base_location eq '.') {
+            # Get the name of the original directory
+            # e.g. http://www.w3.org/TR/html4/Overview.html
+            #      should return http://www.w3.org/TR/html4/
+            $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|;
+            $_base_location = $1;
+        }
+
+        foreach my $u (@links) {
+            next if (! (# Check if it's in our scope for recursion
+                        ($u =~ m|^$_base_location|) &&
+                        # and the link is not broken
+                        $results{$u}{location}{success} &&
+                        # And it is a text/html or application/xhtml+xml
+                        # resource
+                        (($results{$u}{location}{type} =~ m|text/html|) ||
+                        ($results{$u}{location}{type}
+                        =~ m|application/xhtml\+xml|))
+                        )
+                    );
+            # Check if we have already processed the URI
+            next if (&already_processed($u) != 0);
+
+            $to_check{$u} = 1;
+        }
+
+        if ($breadth_first) {
+            push @check_stack, keys %to_check;
+        }
+        else {
+            unshift @check_stack, keys %to_check;
+        }
+
+        if (@check_stack) {
+            print "\n";
+            if (! $_html) {
+                my $i = 40;
+                while ($i--) {
+                    print('-');
+                }
+            } else {
+                # For the online version, wait for a while to avoid abuses
+                if (!$_cl) {
+                    if ($doc_count == $_max_documents) {
+                        print("<hr>\n<p><strong>Maximum number of documents reached!</strong></p>\n");
+                        last;
+                    }
+                    if ($doc_count >= $_max_documents) {
+                        $doc_count++;
+                        print("<p>Not checking <strong>$uri</strong></p>\n");
+                        next;
+                    }
+                }
+                print('<hr>');
+                sleep($_sleep_time);
+            }
+            print "\n";
+        }
+    }
+}
+
 sub check_uri() {
     my ($uri, $html_header, $depth) = @_;
     # If $html_header equals 1, we need to generate a HTML header (first
@@ -366,7 +461,7 @@
     # Parse the document
     my $p = &parse_document($uri, $absolute_uri,
                             $response->content(), 1,
-			    $depth != 0);
+                            $depth);
     my $base = URI->new($p->{base});
 
     # Check anchors
@@ -496,60 +591,7 @@
     &anchors_summary($p->{Anchors}, \%errors);
     &links_summary(\%links, \%results, \%broken, \%redirects);
 
-    # Do we want to process other documents?
-    if ($depth != 0) {
-        if ($_base_location eq '.') {
-            # Get the name of the original directory
-            # e.g. http://www.w3.org/TR/html4/Overview.html
-            #      should return http://www.w3.org/TR/html4/
-            $results{$uri}{parsing}{base} =~ m|^(.*/)[^/]*|;
-            $_base_location = $1;
-        }
-        foreach $u (keys %links) {
-            next if (! (# Check if it's in our scope for recursion
-                        ($u =~ m|^$_base_location|) &&
-                        # and the link is not broken
-                        $results{$u}{location}{success} &&
-                        # And it is a text/html or application/xhtml+xml
-			# resource
-                        (($results{$u}{location}{type} =~ m|text/html|) ||
-			 ($results{$u}{location}{type}
-			  =~ m|application/xhtml\+xml|))
-                        )
-                     );
-            # Check if we have already processed the URI
-            next if (&already_processed($u) != 0);
-            # Do the job
-            print "\n";
-            if (! $_html) {
-                my $i = 40;
-                while ($i--) {
-                    print('-');
-                }
-            } else {
-                # For the online version, wait for a while to avoid abuses
-                if (!$_cl) {
-                    if ($doc_count == $_max_documents) {
-                        print("<hr>\n<p><strong>Maximum number of documents reached!</strong></p>\n");
-                    }
-                    if ($doc_count >= $_max_documents) {
-                        $doc_count++;
-                        print("<p>Not checking <strong>$u</strong></p>\n");
-                        $processed{$u} = 1;
-                        next;
-                    }
-                }
-                print('<hr>');
-                sleep($_sleep_time);
-            }
-            print "\n";
-            if ($depth < 0) {
-                &check_uri($u, 0, -1);
-            } else {
-                &check_uri($u, 0, $depth-1);
-            }
-        }
-    }
+    return (keys %links);
 }
 
 #######################################
@@ -1792,6 +1834,8 @@
   <label><input type=\"checkbox\" name=\"recursive\"> Check linked documents recursively <small>(maximum: $_max_documents documents; sleeping $_sleep_time seconds between each document)</small></label>
   <br>
   <label>Depth of the recursion: <input type=\"text\" size=\"3\" name=\"depth\"><small>(-1 is the default and means unlimited)</small></label>
+  <br>
+  <label><input type=\"checkbox\" name=\"breadthfirst\"> Check linked documents recursively using breadth first method</label>
 </p>
 <p><input type=\"submit\" name=\"submit\" value=\"Check\"></p>
 </form>
Received on Saturday, 7 September 2002 12:38:53 GMT

This archive was generated by hypermail 2.2.0+W3C-0.50 : Wednesday, 25 April 2012 12:14:04 GMT