From: Olivier Thereaux via cvs-syncmail <cvsmail@w3.org>
Date: Mon, 01 Aug 2005 05:11:21 +0000
To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator
In directory hutz:/tmp/cvs-serv5421/lib/W3C/LogValidator
Added Files:
LinkChecker.pm
Log Message:
Adding a link checker module (not very robust since the link checker is not yet
modularized, but nevertheless working) for automatic discovery of the most important
documents with broken links.
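
For context, a minimal usage sketch of the interface the new module exposes, invoked
directly rather than through W3C::LogValidator::LogProcessor (the URLs and configuration
values below are purely illustrative):

    use W3C::LogValidator::LinkChecker;

    my %config = (
        verbose      => 2,                 # 0 is quiet, higher values are chattier
        MaxInvalid   => 10,                # stop after 10 documents with broken links (0 = no limit)
        MaxDocuments => 100,               # stop after checking 100 documents (0 = no limit)
        ServerName   => "www.example.org", # only used in the report text
        # checklink  => "/usr/local/bin/checklink", # optional explicit path to the checklink script
    );
    my $checker = W3C::LogValidator::LinkChecker->new(\%config);
    $checker->uris("http://www.example.org/", "http://www.example.org/Overview.html");
    my %result = $checker->process_list;   # intro, thead, trows, outro for the output modules
    print $result{intro}, "\n", $result{outro}, "\n";
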
--- NEW FILE: LinkChecker.pm ---
# Copyright (c) YYYY the World Wide Web Consortium :
# Keio University,
# European Research Consortium for Informatics and Mathematics
# Massachusetts Institute of Technology.
# written by Firstname Lastname <your@email.address> for W3C
#
# $Id: LinkChecker.pm,v 1.1 2005/08/01 05:11:19 ot Exp $
package W3C::LogValidator::LinkChecker;
use strict;
use warnings;
require Exporter;
our @ISA = qw(Exporter);
our %EXPORT_TAGS = ( 'all' => [ qw() ] );
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = qw();
our $VERSION = sprintf "%d.%03d",q$Revision: 1.1 $ =~ /(\d+)\.(\d+)/;
###########################
# usual package interface #
###########################
our $verbose = 1;
our %config;
sub new
{
    my $self = {};
    my $proto = shift;
    my $class = ref($proto) || $proto;
    # mandatory vars for the API: start with an empty list of URIs
    $self->{URIs} = [];
    # internal stuff here
    # $self->{FOO} = undef;
    # don't change this
    if (@_) { %config = %{(shift)}; }
    if (exists $config{verbose}) { $verbose = $config{verbose} }
    bless($self, $class);
    return $self;
}
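# accessor for the list of URIs to be checked: sets it when given arguments, returns it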
sub uris
{
    my $self = shift;
    if (@_) { @{$self->{URIs}} = @_ }
    return @{$self->{URIs}};
}
# internal routines
#sub foobar
#{
# my $self = shift;
# ...
#}
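# finds the checklink executable: honors the "checklink" configuration variable if set,
# then looks in a few usual installation paths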
sub path_checklink
{
    my $self = shift;
    my $cl_path;
    my $found = 0;
    if (exists $config{checklink}) {
        $cl_path = $config{checklink};
        if ((-e $cl_path) && (-r $cl_path) && (-x $cl_path)) {
            $found = 1;
            return $cl_path;
        }
    }
    if ($found == 0) {
        foreach ('/usr/bin/checklink', '/bin/checklink', '/usr/local/bin/checklink', './checklink') {
            $cl_path = $_;
            print "looking for checklink at: $cl_path..." if ($verbose > 1);
            if ((-e $cl_path) && (-r $cl_path) && (-x $cl_path)) {
                $found = 1;
                print "found!\n" if ($verbose > 1);
                return $cl_path;
            }
            else {
                print "\n" if ($verbose > 1);
            }
        }
    }
    if ($found == 0) { die("checklink not found") }
}
###############################################
# Actual subroutine to check the list of URIs #
###############################################
sub process_list
{
    my $self = shift;
    my $max_invalid = 0;
    if (exists $config{MaxInvalid}) { $max_invalid = $config{MaxInvalid} }
    my $max_documents = 0;
    if (exists $config{MaxDocuments}) { $max_documents = $config{MaxDocuments} }
    print "Now using the Link Checker module:\n" if $verbose;
    my $name = "";
    if (exists $config{ServerName}) { $name = $config{ServerName} }
    my @uris;
    my $uri;
    my $checklink = $self->path_checklink();
    my %hits;
    # Opening the file with the hits and URIs data
    if (defined($config{tmpfile}))
    {
        use DB_File;
        my $tmp_file = $config{tmpfile};
        tie(%hits, 'DB_File', "$tmp_file", O_RDONLY)
            || die("Cannot open $tmp_file");
        # most requested documents first
        @uris = sort { $hits{$b} <=> $hits{$a} } keys %hits;
    }
    elsif ($self->uris())
    {
        @uris = $self->uris();
        foreach my $uri (@uris) { $hits{$uri} = 0 }
    }
    print "\n (This may take a long time if you have many documents to check)\n" if ($verbose == 1);
    print "\n" if ($verbose > 2); # trying to breathe in the debug volume...
    # require W3C::LinkChecker; # TODO when the link checker is nicely modularized
    my @result;
    my @result_head = ("Rank", "Hits", "#Error(s)", "Address");
    my $total_census = 0;
    my $invalid_census = 0;
    my $last_invalid_position = 0;
    while ( (@uris)
            and (($invalid_census < $max_invalid) or (!$max_invalid))
            and (($total_census < $max_documents) or (!$max_documents)) )
    {
        $uri = shift(@uris);
        # $self->new_doc();
        my $uri_orig = $uri;
        $total_census++;
        print " processing #$total_census $uri..." if ($verbose > 1);
        # FIXME at some point we will use the library instead of running the script
        open(LINK, "$checklink $uri 2>/dev/null |")
            or die("Could not run $checklink on $uri: $!");
        my $num_errs = 0;
        print "\n" if ($verbose > 2);
        while (<LINK>) {
            my $line = $_;
            # count the lines where checklink reports a broken link or fragment
            if (($line =~ /To do: The link is broken/)
                or ($line =~ /To do: There are broken fragments/)
                or ($line =~ /To do: The hostname could not be resolved\. This link needs to be fixed/)) {
                $num_errs += 1;
                print $line if ($verbose > 2);
            }
        }
        close(LINK);
        print " " if ($verbose > 2);
        if ($num_errs > 0) {
            print " $num_errs broken link(s)\n" if ($verbose > 1);
            push @result, [$total_census, $hits{$uri_orig}, $num_errs, $uri_orig];
            $invalid_census++;
            $last_invalid_position = $total_census;
        }
        else {
            print " OK.\n" if ($verbose > 1);
        }
    }
print "Done!\n" if $verbose;
print "invalid_census $invalid_census \n" if ($verbose > 2 );
my $intro = "Here are the <census> most popular document(s) with broken links \nthat I could find in the logs for $name.";
my $outro;
if ($invalid_census) # we found invalid docs
{
if ($invalid_census eq 1) # let's repect grammar here
{
$intro=~ s/are/is/;
$intro=~ s/<census> //;
$intro=~ s/document\(s\)/document/;
}
$intro =~s/<census>/$invalid_census/;
my $ratio = 10000*$invalid_census/$total_census;
$ratio = int($ratio)/100;
if ($last_invalid_position eq $total_census )
# usual case
{
$outro="Conclusion :
I had to check $last_invalid_position document(s) in order to find $invalid_census HTML documents with broken links.
This means that about $ratio\% of your most popular documents needs fixing.";
}
else
# we didn't find as many invalid docs as requested
{
if ($max_invalid) {
$outro= "Conclusion :
You asked for $max_invalid document with broken links but I could only find $invalid_census
by processing (all the) $total_census document(s) in your logs.
This means that about $ratio\% of your most popular documents were invalid.";}
else # max_invalid set to 0, user asked for all invalid docs
{ $outro= "Conclusion :
I found $invalid_census documents with broken links
by processing (all the) $total_census document(s) in your logs.
This means that about $ratio\% of your most popular documents were invalid.";}
}
}
elsif (!$total_census)
{
$intro="There was nothing to check in this log.";
$outro="";
}
else # everything was actually OK!
{
$intro=~s/<census> //;
$outro="I couldn't find any document with broken links in this log. Congratulations!";
}
if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents
{
$outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n Maybe you could set MaxDocuments to a higher value?";
}
    if (defined($config{tmpfile}))
    {
        untie %hits;
    }
    # Here is what the module will return. The hash will be sent to
    # the output module.
    my %returnhash;
    # the name of the module
    $returnhash{"name"} = "Link Checker";
    # intro
    $returnhash{"intro"} = $intro;
    # headers for the result table
    @{$returnhash{"thead"}} = @result_head;
    # data for the result table
    @{$returnhash{"trows"}} = @result;
    # outro
    $returnhash{"outro"} = $outro;
    return %returnhash;
}
package W3C::LogValidator::LinkChecker;
1;
__END__

=head1 NAME

W3C::LogValidator::LinkChecker - [W3C Log Validator] finds the most popular documents with broken links in a Web server log.

=head1 DESCRIPTION

This module is part of the W3C::LogValidator suite. It combines the W3C link checker with a Web server log analysis tool, providing a way to fix documents with broken links little by little while focusing first on the documents that should have priority: the most popular ones.
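
As with the other W3C::LogValidator processing modules, C<process_list> returns a hash
which is handed over to the output modules. A sketch of its structure, with field names
taken from the code above and sample values that are purely illustrative:

    (
     name  => "Link Checker",
     intro => "Here are the 2 most popular documents with broken links ...",
     thead => ["Rank", "Hits", "#Error(s)", "Address"],
     trows => [
               [1, 1542, 3, "http://www.example.org/"],
               [5,  312, 1, "http://www.example.org/news"],
              ],
     outro => "Conclusion: ...",
    )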

=head1 AUTHOR

Template created by Olivier Thereaux <ot@w3.org> for W3C.
Module created by You <your@address>

=head1 SEE ALSO

W3C::LogValidator::LogProcessor, perl(1).

Up-to-date complete info at http://www.w3.org/QA/Tools/LogValidator/

=cut