W3C home > Mailing lists > Public > www-validator-cvs@w3.org > November 2008

perl/modules/W3C/LogValidator/lib/W3C LogValidator.pm,1.23,1.24

From: Olivier Thereaux via cvs-syncmail <cvsmail@w3.org>
Date: Tue, 18 Nov 2008 16:04:19 +0000
To: www-validator-cvs@w3.org
Message-Id: <E1L2T3r-0005pM-34@lionel-hutz.w3.org>

Update of /sources/public/perl/modules/W3C/LogValidator/lib/W3C
In directory hutz:/tmp/cvs-serv21547/lib/W3C

Modified Files:
	LogValidator.pm 
Log Message:
Patches by Martin B. Smith for the implementation of an ExcludeHosts option
http://www.w3.org/Bugs/Public/show_bug.cgi?id=5221



Index: LogValidator.pm
===================================================================
RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator.pm,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -d -r1.23 -r1.24
--- LogValidator.pm	14 Nov 2008 23:16:28 -0000	1.23
+++ LogValidator.pm	18 Nov 2008 16:04:17 -0000	1.24
@@ -259,6 +259,8 @@
 	my $self = shift;
 	my $tmp_record;
 	my $entriesperlogfile = $config{LogProcessor}{EntriesPerLogfile};
+	my $allskiphosts = ($config{LogProcessor}{ExcludeHosts}) ? $config{LogProcessor}{ExcludeHosts} : ""; # default to none
+	my @skiphostsregex = split(" ", $allskiphosts);
 	my $entriescounter=0;
 	if (@_)
 	{
@@ -274,6 +276,19 @@
 				my $logtype = $config{LogProcessor}{LogType}{$logfile};
 				if ($tmp_record) # not a blank line
 				{
+					my $tmp_record_remote_addr = $self->find_remote_addr($tmp_record, $logtype);
+					if ($tmp_record_remote_addr) # not a blank remote host or address
+					{
+						foreach my $skipexpression (@skiphostsregex)
+						{
+						     if( $tmp_record_remote_addr =~ /$skipexpression/ )
+						     {
+							print " Skipping " . $tmp_record_remote_addr . " because it matches the ExcludeHosts pattern " . $skipexpression. "\n" if ($verbose > 2);
+							next;
+						     }
+						}
+					}
+
 					my $tmp_record_uri = $self->find_uri($tmp_record, $logtype);
 					my $tmp_record_HTTP_method = $self->find_HTTP_Method($tmp_record, $logtype);
 					my $tmp_record_mime_type = $self->find_mime_type($tmp_record, $logtype);
@@ -337,6 +352,32 @@
 			$tmprecord = $self->remove_duplicates($tmprecord);
 			if( !( $tmprecord =~ m/^https?\:/ ) ) {
 				$tmprecord = join ("",'http://',$config{LogProcessor}{ServerName},$tmprecord);
+sub find_remote_addr
+# finds the remote host or address (first whitespace-separated field) of a log record; "" for "plain" logs
+{
+        my $self = shift;
+        if (@_)
+        {
+                my $tmprecord = shift;
+                my @record_arry;
+                @record_arry = split(" ", $tmprecord);
+                # hardcoded to most apache log formats, including common and combined,
+                # where the first field is taken as the remote host; for the moment... TODO
+                my $logtype = shift;
+                # print "log type $logtype" if ($verbose > 2);
+                if ($logtype eq "plain")
+                {
+                        $tmprecord = ""; # no remote host is extracted for this log type
+                }
+                else #common combined full or w3c
+                {
+                        $tmprecord = $record_arry[0];
+                }
+        #print "Remote Addr $tmprecord \n" if (($verbose > 2) and ($tmprecord ne ""));
+        return $tmprecord;
+        }
+}
+
 			}
 		}
 	#print "$tmprecord \n" if ($verbose > 2);
@@ -661,6 +702,10 @@
 =over 4
 
 =item $processor->process
+=item $processor->find_remote_addr
+
+Given a log record and the type of the log (common log format, flat list of URIs, etc.), extracts the remote host or IP address.
+
 
 Do-it-all method:
 Read configuration file (if any), parse log files, run them through processing modules, send result to output module.
Received on Tuesday, 18 November 2008 16:04:29 UTC

This archive was generated by hypermail 2.3.1 : Wednesday, 7 January 2015 15:17:34 UTC