checklink: problems with separators in quiet text output

The implementation of the -q flag to the checklink program has several
problems.  You can reproduce these by running, for example,
  checklink -q -r -e http://pag.csail.mit.edu/~mernst/

1. In quiet mode, checklink omits the "processing http://foo.com/my/webpage.html"
output, so the output gives no indication of which page the errors are on.

2. checklink isn't actually quiet:  it always prints the
"----------------------------------------" separator even if there is no
other output.  This makes it hard to scan the output looking for problems,
especially if very many pages are being processed.

The separator and the "processing" line should be output if, and only if,
checklink outputs other information regarding the given webpage.
Users who wish an indication regarding each page that is processed can
still use the --summary switch, as before.

The patch below corrects both problems.

                    -Michael Ernst



diff -u -b -r --exclude=CVS --exclude=.hg --exclude=.svn /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink-orig /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink
--- /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink-orig	2008-05-04 18:59:22.000000000 +0200
+++ /DS/home-0/mernst/bin/src/perl/W3C-LinkChecker/perl/modules/W3C/LinkChecker/bin/checklink	2008-06-28 14:56:35.240690771 +0200
@@ -365,6 +365,8 @@
 my $doc_count = 0;
 # Time stamp
 my $timestamp = &get_timestamp();
+# Per-document header; undefined if already printed.  See print_doc_header().
+my $doc_header;
 
 &parse_arguments() if $Opts{Command_Line};
 
@@ -802,18 +804,42 @@
     # Before fetching the document, we don't know if we'll be within the
     # recursion scope or not (think redirects).
     if (!&in_recursion_scope($response->{absolute_uri})) {
+      &hprintf("Not in recursion scope: %s\n",
+               $response->{absolute_uri}->as_string()) if ($Opts{Verbose});
       $response->content("");
       return;
     }
+  }
+
+  # Define the document header, and perhaps print it.
+  # (It might still be defined if the previous document had no errors;
+  # just redefine it in that case.)
+
+  if ($check_num != 1) {
+    if ($Opts{HTML}) {
+      $doc_header = "\n<hr />\n";  # self-closing, like the old separator
+    } else {
+      $doc_header = "\n" . ('-' x 40) . "\n";
+    }
+  }
+
+  my $absolute_uri = $response->{absolute_uri}->as_string();
+
+  if ($Opts{HTML}) {
+    $doc_header .= ("<h2>\nProcessing\t"
+                    . &show_url($absolute_uri)
+                    . "\n</h2>\n\n");
+  } else {
+    $doc_header .= "\nProcessing\t$absolute_uri\n\n";
+  }
 
-    print $Opts{HTML} ? '<hr />' : '-' x 40, "\n";
+  if (! $Opts{Quiet}) {
+    print_doc_header();
   }
 
   # We are checking a new document
   $doc_count++;
 
-  my $absolute_uri = $response->{absolute_uri}->as_string();
-
   my $result_anchor = 'results'.$doc_count;
 
   if ($check_num == 1 && !$Opts{HTML} && !$Opts{Summary_Only}) {
@@ -918,6 +944,7 @@
 
     if ($Opts{Masquerade}) {
       if ($abs_link_uri =~ m|^\Q$Opts{Masquerade_From}\E|) {
+        print_doc_header();
         printf("processing %s in base %s\n",
                $abs_link_uri, $Opts{Masquerade_To});
         my $nlink = $abs_link_uri;
@@ -1027,7 +1054,7 @@
       next if &already_processed($u, $uri);
 
       # Do the job
-      print "\n";
+      print "\n" unless $Opts{Quiet};
       if ($Opts{HTML}) {
         if (!$Opts{Command_Line}) {
           if ($doc_count == $Opts{Max_Documents}) {
@@ -1834,9 +1861,8 @@
     } else {
       print("Anchors\n\n");
     }
-    &hprintf("Found %d anchor%s.", $n, ($n == 1) ? '' : 's');
-    print('</p>') if $Opts{HTML};
-    print("\n");
+    &hprintf("Found %d anchor%s.\n", $n, ($n == 1) ? '' : 's');
+    print("</p>\n") if $Opts{HTML};
   }
   # List of the duplicates, if any.
   my @errors = keys %{$errors};
@@ -1846,6 +1872,7 @@
   }
   undef $n;
 
+  print_doc_header();
   print('<p>') if $Opts{HTML};
   print('List of duplicate and empty anchors');
   print <<EOF if $Opts{HTML};
@@ -1886,7 +1913,7 @@
   my ($links, $results, $broken, $redirects, $urls, $codes, $todo) = @_;
 
   print("\n<dl class=\"report\">") if $Opts{HTML};
-  print("\n");
+  print("\n") if (! $Opts{Quiet});
 
   # Process each URL
   my ($c, $previous_c);
@@ -2136,14 +2163,15 @@
   # Broken links and redirects
   if ($#urls < 0) {
     if (! $Opts{Quiet}) {
+      print_doc_header();
       if ($Opts{HTML}) {
-        print "<h3>Links</h3>\n<p>Valid links!</p>";
+        print "<h3>Links</h3>\n<p>Valid links!</p>\n";
       } else {
-        print "\nValid links.";
+        print "\nValid links.\n";
       }
-      print "\n";
     }
   } else {
+    print_doc_header();
     print('<h3>') if $Opts{HTML};
     print("\nList of broken links");
     #print(' and redirects') if $Opts{Redirects};
@@ -2207,6 +2235,7 @@
 
   # Show directory redirects
   if ($Opts{Dir_Redirects} && ($#dir_redirect_urls > -1)) {
+    print_doc_header();
     print('<h3>') if $Opts{HTML};
     print("\nList of redirects");
     print("</h3>\n<p>The links below are not broken, but the document does not use the exact URL, and the links were redirected. It may be a good idea to link to the final location, for the sake of speed.</p>") if $Opts{HTML};
@@ -2492,6 +2521,7 @@
 
 sub hprintf (@)
 {
+  print_doc_header();
   if (! $Opts{HTML}) {
     printf(@_);
   } else {
@@ -2500,6 +2530,19 @@
   return;
 }
 
+# Print the pending per-document header (separator and "Processing" line),
+# if any, then clear it so it is printed at most once per document.  Called
+# before other per-document output so quiet runs stay silent for clean pages.
+# No prototype: calls precede this definition and would warn if it had one.
+sub print_doc_header
+{
+  if (defined($doc_header)) {
+    print $doc_header;
+    undef($doc_header);
+  }
+}
+
+
 # Local Variables:
 # mode: perl
 # indent-tabs-mode: nil

Diff finished.  Sun Jun 29 20:01:01 2008

Received on Sunday, 29 June 2008 18:11:11 UTC