- From: Olivier Thereaux via cvs-syncmail <cvsmail@w3.org>
- Date: Thu, 22 Jun 2006 05:33:42 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LogValidator/lib/W3C
In directory hutz:/tmp/cvs-serv8672/lib/W3C
Modified Files:
LogValidator.pm
Log Message:
- When log format holds referer info, store that
- When log format holds HTTP status code, store that
- When log format holds mime type info (usually, only in W3C extended log format), store that
All of this information is stored in temporary DB_File databases.
Index: LogValidator.pm
===================================================================
RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator.pm,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -d -r1.18 -r1.19
--- LogValidator.pm 9 Sep 2005 06:33:11 -0000 1.18
+++ LogValidator.pm 22 Jun 2006 05:33:40 -0000 1.19
@@ -8,7 +8,7 @@
package W3C::LogValidator;
use strict;
-
+no strict "refs";
require Exporter;
our @ISA = qw(Exporter);
our %EXPORT_TAGS = ( 'all' => [ qw() ] );
@@ -22,6 +22,9 @@
our $verbose;
our %cmdline_conf;
our %hits; # hash URI->hits
+our %referers;
+our %mimetypes;
+our %HTTPcodes;
our $output_proc;
###########################
@@ -95,6 +98,9 @@
use File::Temp qw/ /;
my $tmpdir = File::Spec->tmpdir;
$config{LogProcessor}{tmpfile} = File::Temp::tempnam( $tmpdir, "LogValidator-" );
+ $config{LogProcessor}{tmpfile_HTTP_codes} = File::Temp::tempnam( $tmpdir, "LogValidator-" );
+ $config{LogProcessor}{tmpfile_mime_types} = File::Temp::tempnam( $tmpdir, "LogValidator-" );
+ $config{LogProcessor}{tmpfile_referers} = File::Temp::tempnam( $tmpdir, "LogValidator-" );
bless($self, $class);
return $self;
}
@@ -144,6 +150,66 @@
}
}
+sub add_referer
+# Count one more hit on $uri coming from $referer in the %referers hash,
+# which is keyed by the string "$uri : $referer".
+# usage $self->add_referer($uri, $referer)
+# The referer may still carry the double quotes surrounding it in the log
+# line; they are stripped here. Nothing is recorded when the referer is
+# missing, is "-", or does not match $config{LogProcessor}{RefererMatch}.
+# An unset or empty RefererMatch accepts every referer.
+{
+    my $self = shift;
+    return unless @_;
+    my ($uri, $referer) = @_;
+    # short or malformed log records yield no referer field at all
+    return unless defined($referer);
+    $referer =~ s/^"(.*)"$/$1/;
+    return if ($referer eq "-");
+    my $preferedref = $config{LogProcessor}{RefererMatch};
+    $preferedref = "" unless defined($preferedref);
+    # the pattern is wrapped in (?:...) on purpose: a bare empty pattern makes
+    # perl re-use the last successfully matched regexp instead of matching
+    # every string
+    return unless ($referer =~ /(?:$preferedref)/);
+    # incrementing a missing entry starts it at 1, so no exists() test is needed
+    $referers{"$uri : $referer"}++;
+    return;
+}
+
+sub add_mime_type
+# record the mime type known for a given logged resource
+# usage $self->add_mime_type('http://foobar', "text/html")
+# Only the first mime type seen for a URI is kept in %mimetypes; later calls
+# for the same URI leave the stored value untouched. Calls without arguments
+# or with an undefined URI are ignored.
+{
+    my $self = shift;
+    return unless @_;
+    my ($uri, $mime_type) = @_;
+    # use "return" here: this sub contains no loop, so a loop-control
+    # statement such as "next" would act on whatever loop the caller is
+    # running and abandon the rest of that iteration
+    return unless defined($uri);
+    $mimetypes{$uri} = $mime_type unless exists($mimetypes{$uri});
+    return;
+}
+
+sub add_HTTP_code
+# record the returned HTTP Code for a given logged resource
+# usage $self->add_HTTP_code('http://foobar', "200")
+# Only the first code seen for a URI is kept in %HTTPcodes. Calls without
+# arguments or with an undefined URI are ignored.
+# NOTE: doesn't cover if that code changes throughout the log file - TODO fix that?
+{
+    my $self = shift;
+    return unless @_;
+    my ($uri, $HTTP_code) = @_;
+    # use "return" here: this sub contains no loop, so a loop-control
+    # statement such as "next" would act on whatever loop the caller is
+    # running and abandon the rest of that iteration
+    return unless defined($uri);
+    $HTTPcodes{$uri} = $HTTP_code unless exists($HTTPcodes{$uri});
+    return;
+}
+
sub read_logfiles
# just looping
{
@@ -153,6 +219,20 @@
my $tmp_file = $config{LogProcessor}{tmpfile};
tie (%hits, 'DB_File', "$tmp_file") ||
die ("Cannot create or open $tmp_file");
+
+ # TODO this should probably be triggered (on or off) by an option rather than always on
+
+ my $tmp_file_referers = $config{LogProcessor}{tmpfile_referers};
+ tie (%referers, 'DB_File', "$tmp_file_referers") ||
+ die ("Cannot create or open $tmp_file_referers");
+
+ my $tmp_file_mime_types = $config{LogProcessor}{tmpfile_mime_types};
+ tie (%mimetypes, 'DB_File', "$tmp_file_mime_types") ||
+ die ("Cannot create or open $tmp_file_mime_types");
+
+ my $tmp_file_HTTP_codes = $config{LogProcessor}{tmpfile_HTTP_codes};
+ tie (%HTTPcodes, 'DB_File', "$tmp_file_HTTP_codes") ||
+ die ("Cannot create or open $tmp_file_HTTP_codes");
print "Reading logfiles: " if ($verbose); #non-quiet mode
print "\n" if ($verbose >1); # verbose or above, we'll have details so linebreak
@@ -161,7 +241,12 @@
{
$self->read_logfile($current_logfile);
}
+
untie %hits;
+ untie %HTTPcodes;
+ untie %mimetypes;
+ untie %referers;
+
print "Done! \n" if ($verbose); #non-quiet mode
}
@@ -189,9 +274,17 @@
my $logtype = $config{LogProcessor}{LogType}{$logfile};
if ($tmp_record) # not a blank line
{
- $tmp_record = $self->find_uri($tmp_record, $logtype);
- #print "$tmp_record \n" if ($verbose >2);
- if ($self->no_cgi($tmp_record)) {$self->add_uri($tmp_record);}
+ my $tmp_record_uri = $self->find_uri($tmp_record, $logtype);
+ my $tmp_record_mime_type = $self->find_mime_type($tmp_record, $logtype);
+ my $tmp_record_HTTP_code = $self->find_HTTP_code($tmp_record, $logtype);
+ my $tmp_record_referer = $self->find_referer($tmp_record, $logtype);
+ if ($self->no_cgi($tmp_record)) {
+ $self->add_uri($tmp_record_uri);
+ $self->add_mime_type($tmp_record_uri, $tmp_record_mime_type);
+ $self->add_HTTP_code($tmp_record_uri,$tmp_record_HTTP_code);
+ $self->add_referer($tmp_record_uri,$tmp_record_referer);
+ }
+
}
$entriescounter++;
}
@@ -233,29 +326,98 @@
$tmprecord = $record_arry[0];
$tmprecord = $self->remove_duplicates($tmprecord);
}
- elsif ($logtype eq "w3") # our W3C in-house log format
+ else #common combined or full or w3c
{
- $tmprecord = $record_arry[4];
- # an oddity of W3C log formats
- my $serverstring = join ("",'http://',$config{LogProcessor}{ServerName});
- $tmprecord =~ s/$serverstring//;
- my $path = $config{LogProcessor}{DocumentRoot};
- $tmprecord =~ s/$path/\//;
+ $tmprecord = $record_arry[6];
$tmprecord = $self->remove_duplicates($tmprecord);
$tmprecord = join ("",'http://',$config{LogProcessor}{ServerName},$tmprecord);
+ }
+ #print "$tmprecord \n" if ($verbose > 2);
+ return $tmprecord;
+ }
+}
+
+sub find_HTTP_code
+# extracts the HTTP status code from one log record
+# usage $self->find_HTTP_code($record, $logtype)
+# returns "" for log type "plain"; for every other log type returns the
+# ninth whitespace-separated field of the record (undef if the record is
+# shorter than that)
+{
+    my $self = shift;
+    if (@_)
+    {
+        my ($record, $logtype) = @_;
+        # print "log type $logtype" if ($verbose > 2);
+        return "" if ($logtype eq "plain");
+        # field position hardcoded to most apache log formats, included
+        # common and combined, for the moment... TODO
+        my @fields = split(" ", $record);
+        #print "HTTP Code $fields[8] \n" if ($verbose > 2);
+        return $fields[8];
+    }
+}
+
+sub find_referer
+# extracts the referrer field from one log record
+# usage $self->find_referer($record, $logtype)
+# returns "" for log types "plain" and "common"; for every other log type
+# returns the eleventh whitespace-separated field of the record, left
+# exactly as found in the log line (undef if the record is shorter than that)
+{
+    my $self = shift;
+    if (@_)
+    {
+        my ($record, $logtype) = @_;
+        # print "log type $logtype" if ($verbose > 2);
+        my $has_no_referer = (($logtype eq "plain") or ($logtype eq "common"));
+        return "" if ($has_no_referer);
+        # field position hardcoded to most apache log formats for the
+        # moment... TODO
+        my @fields = split(" ", $record);
+        #print "referrer $fields[10] \n" if ($verbose > 2);
+        return $fields[10];
+    }
+}
+
+sub find_mime_type
+# find the mime type of the logged resource - only the W3C extended log format ("w3c") provides it, any other log type yields ""
+{
+ my $self = shift;
+ if (@_)
+ {
+ my $tmprecord = shift;
+ my @record_arry;
+ @record_arry = split(' ', $tmprecord);
+ # the mime type is taken to be the last whitespace-separated field of a
+ # "w3c" record - hardcoded for the moment... TODO
+ my $logtype = shift;
+ # print "log type $logtype" if ($verbose > 2);
+ if ($logtype eq "w3c")
+ {
+ $tmprecord = pop @record_arry;
}
- else #common combined or full
+ else # all other formats: no mime type available, return an empty string
{
- $tmprecord = $record_arry[6];
- $tmprecord = $self->remove_duplicates($tmprecord);
- $tmprecord = join ("",'http://',$config{LogProcessor}{ServerName},$tmprecord);
+ $tmprecord = "";
}
- #print "$tmprecord \n" if ($verbose > 2);
+ #print "mime type $tmprecord \n" if (($verbose > 2) and ($tmprecord ne ""));
return $tmprecord;
}
}
+
+
sub remove_duplicates
# removes "directory index" suffixes such as index.html, etc
# so that http://foobar/ and http://foobar/index.html be counted as one resource
Received on Thursday, 22 June 2006 05:33:54 UTC