perl/modules/W3C/LogValidator/lib/W3C LogValidator.pm,1.18,1.19

Update of /sources/public/perl/modules/W3C/LogValidator/lib/W3C
In directory hutz:/tmp/cvs-serv8672/lib/W3C

Modified Files:
	LogValidator.pm 
Log Message:
- When log format holds referer info, store that
- When log format holds HTTP status code, store that
- When log format holds mime type info (usually, only in W3C extended log format), store that

All of this information is stored in temporary DB_File databases.



Index: LogValidator.pm
===================================================================
RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator.pm,v
retrieving revision 1.18
retrieving revision 1.19
diff -u -d -r1.18 -r1.19
--- LogValidator.pm	9 Sep 2005 06:33:11 -0000	1.18
+++ LogValidator.pm	22 Jun 2006 05:33:40 -0000	1.19
@@ -8,7 +8,7 @@
 
 package W3C::LogValidator;
 use strict;
-
+no strict "refs";
 require Exporter;
 our @ISA = qw(Exporter);
 our %EXPORT_TAGS = ( 'all' => [ qw() ] );
@@ -22,6 +22,9 @@
 our $verbose;
 our %cmdline_conf;
 our %hits; # hash URI->hits
+our %referers;
+our %mimetypes;
+our %HTTPcodes;
 our $output_proc;
 
 ###########################
@@ -95,6 +98,9 @@
 	use File::Temp qw/ /;
 	my $tmpdir = File::Spec->tmpdir;
 	$config{LogProcessor}{tmpfile} = File::Temp::tempnam( $tmpdir, "LogValidator-" );
+	$config{LogProcessor}{tmpfile_HTTP_codes} = File::Temp::tempnam( $tmpdir, "LogValidator-" );
+	$config{LogProcessor}{tmpfile_mime_types} = File::Temp::tempnam( $tmpdir, "LogValidator-" );
+	$config{LogProcessor}{tmpfile_referers} = File::Temp::tempnam( $tmpdir, "LogValidator-" );
 	bless($self, $class);
 	return $self;
 }
@@ -144,6 +150,66 @@
 	}
 }
 
+sub add_referer
+# Count one more hit for the pair ($uri, $referer) in %referers (key: "$uri : $referer").
+# Referers equal to "-" or not matching $config{LogProcessor}{RefererMatch} are ignored.
+# usage $self->add_referer($uri, $referer)
+{
+	my $self = shift;
+	if (@_)
+	{
+	    my $uri = shift;
+	    my $referer = shift;
+	    # strip the surrounding double quotes, if any
+	    $referer =~ s/^"(.*)"$/$1/;
+	    my $preferedref = $config{LogProcessor}{RefererMatch};
+	    # (?:...) keeps the pattern from ever being empty: an empty pattern would make
+	    # perl silently reuse the last successful regex (the quote-stripping one above)
+	    # instead of matching everything when RefererMatch is unset or empty
+	    if (($referer ne "-") and ( $referer =~ /(?:$preferedref)/))
+	    {
+	        my $key = "$uri : $referer";
+	        # first mention for $uri: start the counter at 0 before incrementing
+	        $referers{$key} = 0 unless (exists $referers{$key});
+	        # nth time this referer is mentioned for $uri, incrementing
+	        $referers{$key} += 1;
+	    }
+	}
+}
+
+sub add_mime_type
+# record in %mimetypes the first mime type seen for a logged resource; later values are ignored
+# usage $self->add_mime_type('http://foobar', "text/html")
+{
+	my $self = shift;
+	if (@_)
+	{
+		my $uri = shift;
+		my $mime_type = shift;
+		# "return", not "next": there is no loop here, "next" would jump into the caller's loop
+		return unless defined($uri);
+		if (! exists($mimetypes{$uri}) )
+		{ $mimetypes{$uri} = $mime_type; }
+	}
+}
+
+sub add_HTTP_code
+# record in %HTTPcodes the first HTTP code seen for a logged resource; later values are ignored
+# usage $self->add_HTTP_code('http://foobar', "200")
+# NOTE: doesn't cover if that code changes throughout the log file - TODO fix that?
+{
+	my $self = shift;
+	if (@_)
+	{
+		my $uri = shift;
+		my $HTTP_code = shift;
+		# "return", not "next": there is no loop here, "next" would jump into the caller's loop
+		return unless defined($uri);
+		if (! exists($HTTPcodes{$uri}) )
+		{
+		    $HTTPcodes{$uri} = $HTTP_code;
+		}
+	}
+}
+
 sub read_logfiles
 # just looping
 {
@@ -153,6 +219,20 @@
 	my $tmp_file = $config{LogProcessor}{tmpfile};
 	tie (%hits, 'DB_File', "$tmp_file") ||
 	die ("Cannot create or open $tmp_file");
+
+	# TODO this should probably be triggered (on or off) by an option rather than always on
+	
+	my $tmp_file_referers = $config{LogProcessor}{tmpfile_referers};
+	tie (%referers, 'DB_File', "$tmp_file_referers") ||
+	die ("Cannot create or open $tmp_file_referers");
+	
+	my $tmp_file_mime_types = $config{LogProcessor}{tmpfile_mime_types};
+	tie (%mimetypes, 'DB_File', "$tmp_file_mime_types") ||
+	die ("Cannot create or open $tmp_file_mime_types");
+	
+	my $tmp_file_HTTP_codes = $config{LogProcessor}{tmpfile_HTTP_codes};
+	tie (%HTTPcodes, 'DB_File', "$tmp_file_HTTP_codes") ||
+	die ("Cannot create or open $tmp_file_HTTP_codes");
 	
 	print "Reading logfiles: " if ($verbose); #non-quiet mode
 	print "\n" if ($verbose >1); # verbose or above, we'll have details so linebreak
@@ -161,7 +241,12 @@
 	{
 		$self->read_logfile($current_logfile);
 	}
+
 	untie %hits;
+	untie %HTTPcodes;
+	untie %mimetypes;
+	untie %referers;
+
 	print "Done! \n" if ($verbose); #non-quiet mode
 
 }
@@ -189,9 +274,17 @@
 				my $logtype = $config{LogProcessor}{LogType}{$logfile};
 				if ($tmp_record) # not a blank line
 				{
-					$tmp_record = $self->find_uri($tmp_record, $logtype);
-					#print "$tmp_record \n" if ($verbose >2);
-					if ($self->no_cgi($tmp_record)) {$self->add_uri($tmp_record);}
+					my $tmp_record_uri = $self->find_uri($tmp_record, $logtype);
+					my $tmp_record_mime_type = $self->find_mime_type($tmp_record, $logtype);
+					my $tmp_record_HTTP_code = $self->find_HTTP_code($tmp_record, $logtype);
+					my $tmp_record_referer = $self->find_referer($tmp_record, $logtype);
+					if ($self->no_cgi($tmp_record)) {
+						$self->add_uri($tmp_record_uri);
+						$self->add_mime_type($tmp_record_uri, $tmp_record_mime_type);
+						$self->add_HTTP_code($tmp_record_uri,$tmp_record_HTTP_code);
+						$self->add_referer($tmp_record_uri,$tmp_record_referer);
+					}
+
 				}
 				$entriescounter++;
 			}
@@ -233,29 +326,98 @@
 			$tmprecord = $record_arry[0];
 			$tmprecord = $self->remove_duplicates($tmprecord);
 		}
-		elsif ($logtype eq "w3") # our W3C in-house log format
+		else #common combined or full or w3c
 		{
-			$tmprecord = $record_arry[4];
-			# an oddity of W3C log formats
-			my $serverstring = join ("",'http://',$config{LogProcessor}{ServerName});
-			$tmprecord =~ s/$serverstring//;
-			my $path = $config{LogProcessor}{DocumentRoot};
-			$tmprecord =~ s/$path/\//;
+			$tmprecord = $record_arry[6];
 			$tmprecord = $self->remove_duplicates($tmprecord);
 			$tmprecord = join ("",'http://',$config{LogProcessor}{ServerName},$tmprecord);
+		}
+	#print "$tmprecord \n" if ($verbose > 2);
+	return $tmprecord;
+	}
+}
+
+sub find_HTTP_code
+# usage $self->find_HTTP_code($record, $logtype): returns the 9th whitespace-separated field, or "" for "plain" logs
+{
+	my $self = shift;
+	if (@_)
+	{
+		my $tmprecord = shift;
+		my @record_arry;
+		@record_arry = split(" ", $tmprecord);
+		# The field position (index 8) is hardcoded; presumably it matches the status
+		# column of the Apache common/combined layouts - TODO make this configurable
+		my $logtype = shift;
+		# print "log type $logtype" if ($verbose > 2);
+		if ($logtype eq "plain") 
+		{
+			$tmprecord = "";
+		}
+		else #common combined full or w3c
+		{
+			$tmprecord = $record_arry[8];
+		}
+	#print "HTTP Code $tmprecord \n" if (($verbose > 2) and ($tmprecord ne ""));
+	return $tmprecord;
+	}
+}
+
+sub find_referer
+# usage $self->find_referer($record, $logtype): returns the 11th whitespace-separated field, or "" for "plain"/"common" logs
+{
+	my $self = shift;
+	if (@_)
+	{
+		my $tmprecord = shift;
+		my @record_arry;
+		@record_arry = split(" ", $tmprecord);
+		# The field position (index 10) is hardcoded; presumably it matches the referer
+		# column of the Apache combined layout - TODO make this configurable
+		my $logtype = shift;
+		# print "log type $logtype" if ($verbose > 2);
+		if ( ($logtype eq "plain") or ($logtype eq "common"))
+		{
+			$tmprecord = "";
+		}
+		else #combined or full or w3c
+		{
+			$tmprecord = $record_arry[10];
+		}
+	#print "referrer $tmprecord \n" if (($verbose > 2) and ($tmprecord ne ""));
+	return $tmprecord;
+	}
+}
+
+sub find_mime_type 
+# only for W3c extended log format - find the mime type for the resource
+{
+	my $self = shift;
+	if (@_)
+	{
+		my $tmprecord = shift;
+		my @record_arry;
+		@record_arry = split(' ', $tmprecord);
+		# hardcoded to most apache log formats, included common and combined
+		# for the moment... TODO
+		my $logtype = shift;
+		# print "log type $logtype" if ($verbose > 2);
+		if ($logtype eq "w3c") 
+		{
 			
+			$tmprecord = pop @record_arry;
 		}
-		else #common combined or full
+		else # all other formats
 		{
-			$tmprecord = $record_arry[6];
-			$tmprecord = $self->remove_duplicates($tmprecord);
-			$tmprecord = join ("",'http://',$config{LogProcessor}{ServerName},$tmprecord);
+			$tmprecord = "";
 		}
-	#print "$tmprecord \n" if ($verbose > 2);
+	#print "mime type $tmprecord \n" if (($verbose > 2) and ($tmprecord ne ""));
 	return $tmprecord;
 	}
 }
 
+
+
 sub remove_duplicates
 # removes "directory index" suffixes such as index.html, etc
 # so that http://foobar/ and http://foobar/index.html be counted as one resource

Received on Thursday, 22 June 2006 05:33:54 UTC