- From: Olivier Thereaux <ot@dev.w3.org>
- Date: Tue, 08 Jun 2004 06:36:07 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator
In directory hutz:/tmp/cvs-serv13913
Modified Files:
CSSValidator.pm HTMLValidator.pm
Log Message:
- adding full support for MaxDocuments
- checking content-type of documents with no file ext
Index: HTMLValidator.pm
===================================================================
RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator/HTMLValidator.pm,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -d -r1.9 -r1.10
--- HTMLValidator.pm 7 Jun 2004 14:27:43 -0000 1.9
+++ HTMLValidator.pm 8 Jun 2004 06:36:04 -0000 1.10
@@ -105,6 +105,58 @@
$self->{VALID_SUCCESS} = undef;
$self->{VALID_HEAD} = undef;
}
+
+sub HEAD_check {
+## Returns 1 if a HEAD request for the given URI succeeds and reports an
+## HTML/XML Content-Type (text/html, application/xhtml+xml, text/xml), else 0.
+## Used for URIs without a file extension; costs one HTTP request per call.
+    my $self = shift;
+    my $check_uri = shift;
+    use LWP::UserAgent;
+    return 0 unless (defined $check_uri and length $check_uri); # nothing to check
+    my $ua = LWP::UserAgent->new;
+    my $request = HTTP::Request->new("HEAD", "$check_uri");
+    my $response = $ua->simple_request($request);
+    my $is_html = 0;
+    if ($response->is_success) # not an error, we could contact the server
+    {
+        my $type = $response->header('Content-Type');
+        # The header may be absent, so test definedness before matching.
+        # The "+" must be escaped, or application/xhtml+xml never matches.
+        if (defined $type and $type =~ /text\/html|application\/xhtml\+xml|text\/xml/)
+        {
+            $is_html = 1;
+        }
+    }
+    return $is_html;
+}
+
+sub trim_uris
+{
+## Filters a list of URIs, keeping those this module should validate.
+## A URI is kept if its extension (or "/" for a trailing slash) is listed in
+## $self->auth_ext, or, when it has neither, if HEAD_check accepts it.
+## Returns the kept URIs in their original order.
+    my $self = shift;
+    my %authorized = map { $_ => 1 } split(" ", $self->auth_ext);
+    my @trimmed_uris;
+    # Iterate over @_ rather than "while ($uri = shift)": a false URI
+    # ("" or "0") must not end the loop and drop everything after it.
+    foreach my $uri (@_)
+    {
+        my $uri_ext = "";
+        my $match = 0;
+        if ($uri =~ /(\.[0-9a-zA-Z]+)$/) { $uri_ext = $1; }
+        elsif ($uri =~ /\/$/) { $uri_ext = "/"; }
+        elsif ( $self->HEAD_check($uri) ) { $match = 1; }
+        # split(" ", ...) never yields "", so an extension-less URI cannot match here
+        $match = 1 if (exists $authorized{$uri_ext});
+        push @trimmed_uris,$uri if ($match);
+    }
+    return @trimmed_uris;
+}
+
+
#########################################
# Actual subroutine to check the list of uris #
#########################################
@@ -130,19 +182,12 @@
my $max_invalid = undef;
if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}}
else {$max_invalid = 0}
+ my $max_documents = undef;
+ if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}}
+ else {$max_documents = 0}
my $name = "";
if (exists $config{ServerName}) {$name = $config{ServerName}}
- my @trimmed_uris;
- foreach my $uri (@uris)
- {
- my @authorized_extensions = split(" ", $self->auth_ext);
- foreach my $ext (@authorized_extensions)
- {
- if ($uri=~ /$ext$/ )
- { push @trimmed_uris,$uri }
- }
- }
- @uris = @trimmed_uris;
+ @uris = $self->trim_uris(@uris);
my @result;
my @result_head;
my $intro="Here are the <census> most popular invalid document(s) that I could find in the
@@ -158,8 +203,7 @@
my $ua = new LWP::UserAgent;
# $ua->timeout([30]); # instead of 180. 3 minutes timeout is too long.
my $uri = undef;
- while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) )
- # if $max_invalid is 0, process everything
+ while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) and (($total_census < $max_documents) or (!$max_documents)) )
{
$uri = shift (@uris);
$self->new_doc();
@@ -247,6 +291,10 @@
$intro=~s/<census> //;
$outro="I couldn't find any invalid document in this log. Congratulations!";
}
+ if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents
+ {
+ $outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n Maybe you could set MaxDocuments to a higher value?";
+ }
untie %hits;
my %returnhash;
$returnhash{"name"}="HTMLValidator";
Index: CSSValidator.pm
===================================================================
RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator/CSSValidator.pm,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- CSSValidator.pm 8 Jun 2004 05:00:09 -0000 1.2
+++ CSSValidator.pm 8 Jun 2004 06:36:04 -0000 1.3
@@ -88,6 +88,56 @@
}
+sub HEAD_check {
+## Returns 1 if a HEAD request for the given URI succeeds and reports a
+## text/css Content-Type, else 0. Used for URIs without a file extension;
+## costs one HTTP request per call.
+    my $self = shift;
+    my $check_uri = shift;
+    use LWP::UserAgent;
+    return 0 unless (defined $check_uri and length $check_uri); # nothing to check
+    my $ua = LWP::UserAgent->new;
+    my $request = HTTP::Request->new("HEAD", "$check_uri");
+    my $response = $ua->simple_request($request);
+    my $is_css = 0;
+    if ($response->is_success) # not an error, we could contact the server
+    {
+        my $type = $response->header('Content-Type');
+        # The header may be absent: treat a missing Content-Type as "not CSS".
+        if (defined $type and $type =~ /text\/css/)
+        {
+            $is_css = 1;
+            print "URI with no extension $check_uri has content-type $type\n" if ($verbose > 2); # debug
+        }
+    }
+    return $is_css;
+}
+
+sub trim_uris
+{
+## Filters a list of URIs, keeping those this module should validate.
+## A URI is kept if its extension (or "/" for a trailing slash) is listed in
+## $self->auth_ext, or, when it has neither, if HEAD_check accepts it.
+## Returns the kept URIs in their original order.
+    my $self = shift;
+    my %authorized = map { $_ => 1 } split(" ", $self->auth_ext);
+    my @trimmed_uris;
+    # Iterate over @_ rather than "while ($uri = shift)": a false URI
+    # ("" or "0") must not end the loop and drop everything after it.
+    foreach my $uri (@_)
+    {
+        my $uri_ext = "";
+        my $match = 0;
+        if ($uri =~ /(\.[0-9a-zA-Z]+)$/) { $uri_ext = $1; }
+        elsif ($uri =~ /\/$/) { $uri_ext = "/"; }
+        elsif ( $self->HEAD_check($uri) ) { $match = 1; }
+        # split(" ", ...) never yields "", so an extension-less URI cannot match here
+        $match = 1 if (exists $authorized{$uri_ext});
+        push @trimmed_uris,$uri if ($match);
+    }
+    return @trimmed_uris;
+}
+
#########################################
# Actual subroutine to check the list of uris #
#########################################
@@ -98,6 +148,9 @@
my $self = shift;
my $max_invalid = undef;
if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}}
+ my $max_documents = undef;
+ if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}}
+ else {$max_documents = 0}
print "Now Using the CSS Validation module...\n" if $verbose;
use DB_File;
my $tmp_file = $config{tmpfile};
@@ -107,10 +160,6 @@
my @uris = sort { $hits{$b} <=> $hits{$a} } keys %hits;
my $name = "";
if (exists $config{ServerName}) {$name = $config{ServerName}}
-
- # do what pleases you!
-
- my @trimmed_uris;
my @result;
my @result_head;
push @result_head, "Rank";
@@ -122,24 +171,14 @@
logs for $name.";
my $outro;
- foreach my $uri (@uris)
- {
- my @authorized_extensions = split(" ", $self->auth_ext);
- foreach my $ext (@authorized_extensions)
- {
- if ($uri=~ /$ext$/ )
- { push @trimmed_uris,$uri }
- }
- }
- @uris = @trimmed_uris;
+ @uris = $self->trim_uris(@uris);
my $invalid_census = 0; # number of invalid docs
my $last_invalid_position = 0; # latest position at which we found an invalid doc
my $total_census = 0; # number of documents checked
my $uri = undef;
# bulk of validation
- while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) )
- # if $max_invalid is 0, process everything
+ while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) and (($total_census < $max_documents) or (!$max_documents)) )
{
$uri = shift (@uris);
my $uri_orig = $uri;
@@ -220,10 +259,10 @@
$intro=~s/<census> //;
$outro="I couldn't find any invalid document in this log. Congratulations!";
}
-
-
-
-
+ if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents
+ {
+ $outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n Maybe you could set MaxDocuments to a higher value?";
+ }
untie %hits;
# Here is what the module will return. The hash will be sent to
@@ -251,10 +290,7 @@
=head1 NAME
-W3C::LogValidator::CSSValidator -
-
-=head1 SYNOPSIS
-
+W3C::LogValidator::CSSValidator - Validates CSS style sheets from Web Server logs
=head1 DESCRIPTION
Received on Tuesday, 8 June 2004 02:36:08 UTC