- From: Olivier Thereaux via cvs-syncmail <cvsmail@w3.org>
- Date: Tue, 18 Nov 2008 16:04:19 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LogValidator/lib/W3C
In directory hutz:/tmp/cvs-serv21547/lib/W3C
Modified Files:
LogValidator.pm
Log Message:
Patches by Martin B. Smith for the implementation of an ExcludeHost option
http://www.w3.org/Bugs/Public/show_bug.cgi?id=5221
Index: LogValidator.pm
===================================================================
RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator.pm,v
retrieving revision 1.23
retrieving revision 1.24
diff -u -d -r1.23 -r1.24
--- LogValidator.pm 14 Nov 2008 23:16:28 -0000 1.23
+++ LogValidator.pm 18 Nov 2008 16:04:17 -0000 1.24
@@ -259,6 +259,8 @@
my $self = shift;
my $tmp_record;
my $entriesperlogfile = $config{LogProcessor}{EntriesPerLogfile};
+ my $allskiphosts = ($config{LogProcessor}{ExcludeHosts}) ? $config{LogProcessor}{ExcludeHosts} : ""; # default to none
+ my @skiphostsregex = split(" ", $allskiphosts);
my $entriescounter=0;
if (@_)
{
@@ -274,6 +276,19 @@
my $logtype = $config{LogProcessor}{LogType}{$logfile};
if ($tmp_record) # not a blank line
{
+ my $tmp_record_remote_addr = $self->find_remote_addr($tmp_record, $logtype);
+ if ($tmp_record_remote_addr) # not a blank remote host or address
+ {
+ foreach my $skipexpression (@skiphostsregex)
+ {
+ if( $tmp_record_remote_addr =~ /$skipexpression/ )
+ {
+ print " Skipping " . $tmp_record_remote_addr . " because it matches the ExcludeHosts pattern " . $skipexpression. "\n" if ($verbose > 2);
+ next;
+ }
+ }
+ }
+
my $tmp_record_uri = $self->find_uri($tmp_record, $logtype);
my $tmp_record_HTTP_method = $self->find_HTTP_Method($tmp_record, $logtype);
my $tmp_record_mime_type = $self->find_mime_type($tmp_record, $logtype);
@@ -337,6 +352,32 @@
$tmprecord = $self->remove_duplicates($tmprecord);
if( !( $tmprecord =~ m/^https?\:/ ) ) {
$tmprecord = join ("",'http://',$config{LogProcessor}{ServerName},$tmprecord);
+sub find_remote_addr
+# finds the remote host or address (the first whitespace-separated field) of a log record, if available
+{
+ my $self = shift;
+ if (@_) # with no arguments left, the sub falls through without an explicit return
+ {
+ my $tmprecord = shift;
+ my @record_arry;
+ @record_arry = split(" ", $tmprecord);
+ # hardcoded to most apache log formats, including common and combined
+ # for the moment... TODO
+ my $logtype = shift;
+ # print "log type $logtype" if ($verbose > 2);
+ if ($logtype eq "plain")
+ {
+ $tmprecord = ""; # no remote address is extracted for "plain" logs
+ }
+ else # common, combined, full or w3c: the remote host/address is taken from the first field
+ {
+ $tmprecord = $record_arry[0];
+ }
+ #print "Remote Addr $tmprecord \n" if (($verbose > 2) and ($tmprecord ne ""));
+ return $tmprecord; # empty string for "plain" logs, first field of the record otherwise
+ }
+}
+
}
}
#print "$tmprecord \n" if ($verbose > 2);
@@ -661,6 +702,10 @@
=over 4
=item $processor->process
+=item $processor->find_remote_addr
+
+Given a log record and the type of the log (common log format, flat list of URIs, etc.), extracts the remote host name or IP address.
+
Do-it-all method:
Read configuration file (if any), parse log files, run them through processing modules, send result to output module.
Received on Tuesday, 18 November 2008 16:04:29 UTC