- From: Olivier Thereaux <ot@dev.w3.org>
- Date: Tue, 08 Jun 2004 06:36:07 +0000
- To: www-validator-cvs@w3.org
Update of /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator In directory hutz:/tmp/cvs-serv13913 Modified Files: CSSValidator.pm HTMLValidator.pm Log Message: - adding full support for MaxDocuments - checking content-type of documents with no file ext Index: HTMLValidator.pm =================================================================== RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator/HTMLValidator.pm,v retrieving revision 1.9 retrieving revision 1.10 diff -u -d -r1.9 -r1.10 --- HTMLValidator.pm 7 Jun 2004 14:27:43 -0000 1.9 +++ HTMLValidator.pm 8 Jun 2004 06:36:04 -0000 1.10 @@ -105,6 +105,58 @@ $self->{VALID_SUCCESS} = undef; $self->{VALID_HEAD} = undef; } + +sub HEAD_check { +## Checking whether a document with no extension is actually an HTML/XML document +## causes a lot of requests, but internal - should be OK? + my $self = shift; + my $check_uri; + use LWP::UserAgent; + if (@_) { $check_uri = shift } + my $ua = new LWP::UserAgent; + my $method = "HEAD"; + my $request = new HTTP::Request("$method", "$check_uri"); + my $response = new HTTP::Response; + $response = $ua->simple_request($request); + my $is_html = 0; + if ($response->is_success) # not an error, we could contact the server + { + my $type = $response->header('Content-Type'); + if ($type =~ /text\/html|application\/xhtml+xml|text\/xml/) #should be enough for a start + { + $is_html = 1; + # print "URI with no extension $check_uri has content-type $type\n" if ($verbose > 2); # debug + } + } + return $is_html; +} + +sub trim_uris +{ + my $self = shift; + my @authorized_extensions = split(" ", $self->auth_ext); + my @trimmed_uris; + my $uri; + while ($uri = shift) + { + my $uri_ext = ""; + my $match = 0; + if ($uri =~ /(\.[0-9a-zA-Z]+)$/) + { + $uri_ext = $1; + } + elsif ($uri =~ /\/$/) { $uri_ext = "/";} + elsif ( $self->HEAD_check($uri) ) { $match = 1; } + foreach my $ext (@authorized_extensions) + { + if ($ext eq $uri_ext) { $match = 1; } + } + push @trimmed_uris,$uri if ($match); + } + return @trimmed_uris; +} + + ######################################### # Actual subroutine to check the list of uris # ######################################### @@ -130,19 +182,12 @@ my $max_invalid = undef; if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}} else {$max_invalid = 0} + my $max_documents = undef; + if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}} + else {$max_documents = 0} my $name = ""; if (exists $config{ServerName}) {$name = $config{ServerName}} - my @trimmed_uris; - foreach my $uri (@uris) - { - my @authorized_extensions = split(" ", $self->auth_ext); - foreach my $ext (@authorized_extensions) - { - if ($uri=~ /$ext$/ ) - { push @trimmed_uris,$uri } - } - } - @uris = @trimmed_uris; + @uris = $self->trim_uris(@uris); my @result; my @result_head; my $intro="Here are the <census> most popular invalid document(s) that I could find in the @@ -158,8 +203,7 @@ my $ua = new LWP::UserAgent; # $ua->timeout([30]); # instead of 180. 3 minutes timeout is too long. my $uri = undef; - while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) ) - # if $max_invalid is 0, process everything + while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) and (($total_census < $max_documents) or (!$max_documents)) ) { $uri = shift (@uris); $self->new_doc(); @@ -247,6 +291,10 @@ $intro=~s/<census> //; $outro="I couldn't find any invalid document in this log. Congratulations!"; } + if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents + { + $outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n Maybe you could set MaxDocuments to a higher value?"; + } untie %hits; my %returnhash; $returnhash{"name"}="HTMLValidator"; Index: CSSValidator.pm =================================================================== RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator/CSSValidator.pm,v retrieving revision 1.2 retrieving revision 1.3 diff -u -d -r1.2 -r1.3 --- CSSValidator.pm 8 Jun 2004 05:00:09 -0000 1.2 +++ CSSValidator.pm 8 Jun 2004 06:36:04 -0000 1.3 @@ -88,6 +88,56 @@ } +sub HEAD_check { +## Checking whether a document with no extension is actually a CSS file +## causes a lot of requests, but internal - should be OK? + my $self = shift; + my $check_uri; + use LWP::UserAgent; + if (@_) { $check_uri = shift } + my $ua = new LWP::UserAgent; + my $method = "HEAD"; + my $request = new HTTP::Request("$method", "$check_uri"); + my $response = new HTTP::Response; + $response = $ua->simple_request($request); + my $is_css = 0; + if ($response->is_success) # not an error, we could contact the server + { + my $type = $response->header('Content-Type'); + if ($type =~ /text\/css/) + { + $is_css = 1; + print "URI with no extension $check_uri has content-type $type\n" if ($verbose > 2); # debug + } + } + return $is_css; +} + +sub trim_uris +{ + my $self = shift; + my @authorized_extensions = split(" ", $self->auth_ext); + my @trimmed_uris; + my $uri; + while ($uri = shift) + { + my $uri_ext = ""; + my $match = 0; + if ($uri =~ /(\.[0-9a-zA-Z]+)$/) + { + $uri_ext = $1; + } + elsif ($uri =~ /\/$/) { $uri_ext = "/";} + elsif ( $self->HEAD_check($uri) ) { $match = 1; } + foreach my $ext (@authorized_extensions) + { + if ($ext eq $uri_ext) { $match = 1; } + } + push @trimmed_uris,$uri if ($match); + } + return @trimmed_uris; +} + ######################################### # Actual subroutine to check the list of uris # ######################################### @@ -98,6 +148,9 @@ my $self = shift; my $max_invalid = undef; if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}} + my $max_documents = undef; + if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}} + else {$max_documents = 0} print "Now Using the CSS Validation module...\n" if $verbose; use DB_File; my $tmp_file = $config{tmpfile}; @@ -107,10 +160,6 @@ my @uris = sort { $hits{$b} <=> $hits{$a} } keys %hits; my $name = ""; if (exists $config{ServerName}) {$name = $config{ServerName}} - - # do what pleases you! - - my @trimmed_uris; my @result; my @result_head; push @result_head, "Rank"; @@ -122,24 +171,14 @@ logs for $name."; my $outro; - foreach my $uri (@uris) - { - my @authorized_extensions = split(" ", $self->auth_ext); - foreach my $ext (@authorized_extensions) - { - if ($uri=~ /$ext$/ ) - { push @trimmed_uris,$uri } - } - } - @uris = @trimmed_uris; + @uris = $self->trim_uris(@uris); my $invalid_census = 0; # number of invalid docs my $last_invalid_position = 0; # latest position at which we found an invalid doc my $total_census = 0; # number of documents checked my $uri = undef; # bulk of validation - while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) ) - # if $max_invalid is 0, process everything + while ( (@uris) and (($invalid_census < $max_invalid) or (!$max_invalid)) and (($total_census < $max_documents) or (!$max_documents)) ) { $uri = shift (@uris); my $uri_orig = $uri; @@ -220,10 +259,10 @@ $intro=~s/<census> //; $outro="I couldn't find any invalid document in this log. Congratulations!"; } - - - - + if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents + { + $outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n Maybe you could set MaxDocuments to a higher value?"; + } untie %hits; # Here is what the module will return. The hash will be sent to @@ -251,10 +290,7 @@ =head1 NAME -W3C::LogValidator::CSSValidator - - -=head1 SYNOPSIS - +W3C::LogValidator::CSSValidator - Validates CSS style sheets from Web Server logs =head1 DESCRIPTION
Received on Tuesday, 8 June 2004 02:36:08 UTC