- From: Olivier Thereaux via cvs-syncmail <cvsmail@w3.org>
- Date: Thu, 22 Jun 2006 05:33:42 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LogValidator/lib/W3C In directory hutz:/tmp/cvs-serv8672/lib/W3C Modified Files: LogValidator.pm Log Message: - When the log format holds referer info, store it. - When the log format holds the HTTP status code, store it. - When the log format holds MIME type info (usually only in the W3C extended log format), store it. All of this information is stored in temporary DB_File databases. Index: LogValidator.pm =================================================================== RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator.pm,v retrieving revision 1.18 retrieving revision 1.19 diff -u -d -r1.18 -r1.19 --- LogValidator.pm 9 Sep 2005 06:33:11 -0000 1.18 +++ LogValidator.pm 22 Jun 2006 05:33:40 -0000 1.19 @@ -8,7 +8,7 @@ package W3C::LogValidator; use strict; - +no strict "refs"; require Exporter; our @ISA = qw(Exporter); our %EXPORT_TAGS = ( 'all' => [ qw() ] ); @@ -22,6 +22,9 @@ our $verbose; our %cmdline_conf; our %hits; # hash URI->hits +our %referers; +our %mimetypes; +our %HTTPcodes; our $output_proc; ########################### @@ -95,6 +98,9 @@ use File::Temp qw/ /; my $tmpdir = File::Spec->tmpdir; $config{LogProcessor}{tmpfile} = File::Temp::tempnam( $tmpdir, "LogValidator-" ); + $config{LogProcessor}{tmpfile_HTTP_codes} = File::Temp::tempnam( $tmpdir, "LogValidator-" ); + $config{LogProcessor}{tmpfile_mime_types} = File::Temp::tempnam( $tmpdir, "LogValidator-" ); + $config{LogProcessor}{tmpfile_referers} = File::Temp::tempnam( $tmpdir, "LogValidator-" ); bless($self, $class); return $self; } @@ -144,6 +150,66 @@ } } +sub add_referer +# usage $self->add_referer($uri, $referer) +{ + my $self = shift; + if (@_) + { + my $uri = shift; + my $referer = shift; + $referer =~ s/^"(.*)"$/$1/; + my $preferedref = $config{LogProcessor}{RefererMatch}; + if (($referer ne "-") and ( $referer =~ /$preferedref/)) + { + + if (exists $referers{"$uri : $referer"}) + # nth time this referer is mentioned for $uri, incrementing + { + 
$referers{"$uri : $referer"} += 1; + } + else + # first time this referer is mentioned for $uri + { + $referers{"$uri : $referer"} = 1; + } + } + } +} + +sub add_mime_type +# record the mime type known for a given logged resource +# usage $self->add_mime_type('http://foobar', "text/html") +{ + my $self = shift; + if (@_) + { + my $uri = shift; + my $mime_type = shift; + next unless defined($uri); + if (! exists($mimetypes{$uri}) ) + { $mimetypes{$uri} = $mime_type; } + } +} + +sub add_HTTP_code +# record the returned HTTP Code for a given logged resource +# usage $self->add_HTTP_code('http://foobar', "200") +# NOTE: doesn't cover if that code changes throughout the log file - TODO fix that? +{ + my $self = shift; + if (@_) + { + my $uri = shift; + my $HTTP_code = shift; + next unless defined($uri); + if (! exists($HTTPcodes{$uri}) ) + { + $HTTPcodes{$uri} = $HTTP_code; + } + } +} + sub read_logfiles # just looping { @@ -153,6 +219,20 @@ my $tmp_file = $config{LogProcessor}{tmpfile}; tie (%hits, 'DB_File', "$tmp_file") || die ("Cannot create or open $tmp_file"); + + # TODO this should probably be triggered (on or off) by an option rather than always on + + my $tmp_file_referers = $config{LogProcessor}{tmpfile_referers}; + tie (%referers, 'DB_File', "$tmp_file_referers") || + die ("Cannot create or open $tmp_file_referers"); + + my $tmp_file_mime_types = $config{LogProcessor}{tmpfile_mime_types}; + tie (%mimetypes, 'DB_File', "$tmp_file_mime_types") || + die ("Cannot create or open $tmp_file_mime_types"); + + my $tmp_file_HTTP_codes = $config{LogProcessor}{tmpfile_HTTP_codes}; + tie (%HTTPcodes, 'DB_File', "$tmp_file_HTTP_codes") || + die ("Cannot create or open $tmp_file_HTTP_codes"); print "Reading logfiles: " if ($verbose); #non-quiet mode print "\n" if ($verbose >1); # verbose or above, we'll have details so linebreak @@ -161,7 +241,12 @@ { $self->read_logfile($current_logfile); } + untie %hits; + untie %HTTPcodes; + untie %mimetypes; + untie %referers; + print 
"Done! \n" if ($verbose); #non-quiet mode } @@ -189,9 +274,17 @@ my $logtype = $config{LogProcessor}{LogType}{$logfile}; if ($tmp_record) # not a blank line { - $tmp_record = $self->find_uri($tmp_record, $logtype); - #print "$tmp_record \n" if ($verbose >2); - if ($self->no_cgi($tmp_record)) {$self->add_uri($tmp_record);} + my $tmp_record_uri = $self->find_uri($tmp_record, $logtype); + my $tmp_record_mime_type = $self->find_mime_type($tmp_record, $logtype); + my $tmp_record_HTTP_code = $self->find_HTTP_code($tmp_record, $logtype); + my $tmp_record_referer = $self->find_referer($tmp_record, $logtype); + if ($self->no_cgi($tmp_record)) { + $self->add_uri($tmp_record_uri); + $self->add_mime_type($tmp_record_uri, $tmp_record_mime_type); + $self->add_HTTP_code($tmp_record_uri,$tmp_record_HTTP_code); + $self->add_referer($tmp_record_uri,$tmp_record_referer); + } + } $entriescounter++; } @@ -233,29 +326,98 @@ $tmprecord = $record_arry[0]; $tmprecord = $self->remove_duplicates($tmprecord); } - elsif ($logtype eq "w3") # our W3C in-house log format + else #common combined or full or w3c { - $tmprecord = $record_arry[4]; - # an oddity of W3C log formats - my $serverstring = join ("",'http://',$config{LogProcessor}{ServerName}); - $tmprecord =~ s/$serverstring//; - my $path = $config{LogProcessor}{DocumentRoot}; - $tmprecord =~ s/$path/\//; + $tmprecord = $record_arry[6]; $tmprecord = $self->remove_duplicates($tmprecord); $tmprecord = join ("",'http://',$config{LogProcessor}{ServerName},$tmprecord); + } + #print "$tmprecord \n" if ($verbose > 2); + return $tmprecord; + } +} + +sub find_HTTP_code +# finds the returned HTTP code from a log record, if available +{ + my $self = shift; + if (@_) + { + my $tmprecord = shift; + my @record_arry; + @record_arry = split(" ", $tmprecord); + # hardcoded to most apache log formats, included common and combined + # for the moment... 
TODO + my $logtype = shift; + # print "log type $logtype" if ($verbose > 2); + if ($logtype eq "plain") + { + $tmprecord = ""; + } + else #common combined full or w3c + { + $tmprecord = $record_arry[8]; + } + #print "HTTP Code $tmprecord \n" if (($verbose > 2) and ($tmprecord ne "")); + return $tmprecord; + } +} + +sub find_referer +# finds the referrer info from a log record, if available +{ + my $self = shift; + if (@_) + { + my $tmprecord = shift; + my @record_arry; + @record_arry = split(" ", $tmprecord); + # hardcoded to most apache log formats, included common and combined + # for the moment... TODO + my $logtype = shift; + # print "log type $logtype" if ($verbose > 2); + if ( ($logtype eq "plain") or ($logtype eq "common")) + { + $tmprecord = ""; + } + else #combined or full or w3c + { + $tmprecord = $record_arry[10]; + } + #print "referrer $tmprecord \n" if (($verbose > 2) and ($tmprecord ne "")); + return $tmprecord; + } +} + +sub find_mime_type +# only for W3c extended log format - find the mime type for the resource +{ + my $self = shift; + if (@_) + { + my $tmprecord = shift; + my @record_arry; + @record_arry = split(' ', $tmprecord); + # hardcoded to most apache log formats, included common and combined + # for the moment... TODO + my $logtype = shift; + # print "log type $logtype" if ($verbose > 2); + if ($logtype eq "w3c") + { + $tmprecord = pop @record_arry; } - else #common combined or full + else # all other formats { - $tmprecord = $record_arry[6]; - $tmprecord = $self->remove_duplicates($tmprecord); - $tmprecord = join ("",'http://',$config{LogProcessor}{ServerName},$tmprecord); + $tmprecord = ""; } - #print "$tmprecord \n" if ($verbose > 2); + #print "mime type $tmprecord \n" if (($verbose > 2) and ($tmprecord ne "")); return $tmprecord; } } + + sub remove_duplicates # removes "directory index" suffixes such as index.html, etc # so that http://foobar/ and http://foobar/index.html be counted as one resource
Received on Thursday, 22 June 2006 05:33:54 UTC