W3C home > Mailing lists > Public > www-validator-cvs@w3.org > June 2004

perl/modules/W3C/LogValidator/lib/W3C/LogValidator CSSValidator.pm,1.2,1.3 HTMLValidator.pm,1.9,1.10

From: Olivier Thereaux <ot@dev.w3.org>
Date: Tue, 08 Jun 2004 06:36:07 +0000
To: www-validator-cvs@w3.org
Message-Id: <20040608063608.F32F74A846@hutz.w3.org>

Update of /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator
In directory hutz:/tmp/cvs-serv13913

Modified Files:
	CSSValidator.pm HTMLValidator.pm 
Log Message:
- adding full support for MaxDocuments
- checking content-type of documents with no file ext


Index: HTMLValidator.pm
===================================================================
RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator/HTMLValidator.pm,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -d -r1.9 -r1.10
--- HTMLValidator.pm	7 Jun 2004 14:27:43 -0000	1.9
+++ HTMLValidator.pm	8 Jun 2004 06:36:04 -0000	1.10
@@ -105,6 +105,58 @@
         $self->{VALID_SUCCESS} = undef;
         $self->{VALID_HEAD} = undef;
 }
+
+sub HEAD_check {
+## Checking whether a document with no extension is actually an HTML/XML document
+## causes a lot of requests, but internal - should be OK?
+	my $self = shift;
+	my $check_uri;
+	use LWP::UserAgent;
+	if (@_) { $check_uri = shift }
+	my $ua = new LWP::UserAgent;
+	my $method = "HEAD";
+	my $request = new HTTP::Request("$method", "$check_uri");
+	my $response = new HTTP::Response;
+	$response = $ua->simple_request($request);
+	my $is_html = 0;
+	if ($response->is_success) # not an error, we could contact the server
+	{
+		my $type = $response->header('Content-Type');
+		if ($type =~ /text\/html|application\/xhtml+xml|text\/xml/) #should be enough for a start
+		{ 
+			$is_html = 1;
+			# print "URI with no extension $check_uri has content-type $type\n" if ($verbose > 2); # debug
+		}
+	}
+	return $is_html;
+}
+
+sub trim_uris 
+{
+        my $self = shift;
+        my @authorized_extensions = split(" ", $self->auth_ext);
+        my @trimmed_uris;
+        my $uri;
+        while ($uri = shift)
+        {
+                my $uri_ext = "";
+                my $match = 0;
+                if ($uri =~ /(\.[0-9a-zA-Z]+)$/)
+                {  
+                   $uri_ext = $1;
+                }
+                elsif ($uri =~ /\/$/) { $uri_ext = "/";}
+                elsif ( $self->HEAD_check($uri) ) { $match = 1; }
+                foreach my $ext (@authorized_extensions)
+                {   
+                    if ($ext eq $uri_ext) { $match = 1; }
+                }
+                push @trimmed_uris,$uri if ($match);
+        }
+        return @trimmed_uris;
+}
+
+
 #########################################
 # Actual subroutine to check the list of uris #
 #########################################
@@ -130,19 +182,12 @@
 	my $max_invalid = undef;
 	if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}}
 	else {$max_invalid = 0}
+	my $max_documents = undef;
+	if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}}
+	else {$max_documents = 0}
 	my $name = ""; 
 	if (exists $config{ServerName}) {$name = $config{ServerName}}
-	my @trimmed_uris;
-	foreach my $uri (@uris)
-	{
-		my @authorized_extensions = split(" ", $self->auth_ext);
-		foreach my $ext (@authorized_extensions)
-		{
-			if ($uri=~ /$ext$/ )
-			{ push @trimmed_uris,$uri }
-		}
-	}
-	@uris = @trimmed_uris;
+	@uris = $self->trim_uris(@uris);
 	my @result;
 	my @result_head;
 	my $intro="Here are the <census> most popular invalid document(s) that I could find in the 
@@ -158,8 +203,7 @@
 	my $ua = new LWP::UserAgent;
 #	$ua->timeout([30]); # instead of 180. 3 minutes timeout is too long.
 	my $uri = undef;
-	while ( (@uris) and  (($invalid_census < $max_invalid) or (!$max_invalid)) )
-	# if $max_invalid is 0, process everything
+	while ( (@uris) and  (($invalid_census < $max_invalid) or (!$max_invalid)) and (($total_census < $max_documents) or (!$max_documents)) )
 	{
 		$uri = shift (@uris);
 		$self->new_doc();
@@ -247,6 +291,10 @@
 		$intro=~s/<census> //;
 		$outro="I couldn't find any invalid document in this log. Congratulations!";
 	}
+	if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents
+	{
+		$outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n     Maybe you could set MaxDocuments to a higher value?";
+	}
 	untie %hits;
 	my %returnhash;
         $returnhash{"name"}="HTMLValidator";

Index: CSSValidator.pm
===================================================================
RCS file: /sources/public/perl/modules/W3C/LogValidator/lib/W3C/LogValidator/CSSValidator.pm,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -d -r1.2 -r1.3
--- CSSValidator.pm	8 Jun 2004 05:00:09 -0000	1.2
+++ CSSValidator.pm	8 Jun 2004 06:36:04 -0000	1.3
@@ -88,6 +88,56 @@
 }
 
 
+sub HEAD_check {
+## Checking whether a document with no extension is actually a CSS file
+## causes a lot of requests, but internal - should be OK?
+        my $self = shift;
+        my $check_uri;
+        use LWP::UserAgent;
+        if (@_) { $check_uri = shift }
+        my $ua = new LWP::UserAgent;
+        my $method = "HEAD";
+        my $request = new HTTP::Request("$method", "$check_uri");
+        my $response = new HTTP::Response;
+        $response = $ua->simple_request($request);
+        my $is_css = 0;
+        if ($response->is_success) # not an error, we could contact the server
+        {
+                my $type = $response->header('Content-Type');
+                if ($type =~ /text\/css/) 
+                {
+                        $is_css = 1;
+                        print "URI with no extension $check_uri has content-type $type\n" if ($verbose > 2); # debug
+                }
+        }
+        return $is_css;
+}
+
+sub trim_uris 
+{
+	my $self = shift;
+        my @authorized_extensions = split(" ", $self->auth_ext);
+	my @trimmed_uris;
+	my $uri;
+        while ($uri = shift)
+        {
+                my $uri_ext = "";
+                my $match = 0;
+                if ($uri =~ /(\.[0-9a-zA-Z]+)$/)
+                {
+                   $uri_ext = $1;
+                }
+                elsif ($uri =~ /\/$/) { $uri_ext = "/";}
+                elsif ( $self->HEAD_check($uri) ) { $match = 1; }
+                foreach my $ext (@authorized_extensions)
+                {
+                    if ($ext eq $uri_ext) { $match = 1; }
+                }
+                push @trimmed_uris,$uri if ($match);
+        }
+	return @trimmed_uris;
+}
+
 #########################################
 # Actual subroutine to check the list of uris #
 #########################################
@@ -98,6 +148,9 @@
 	my $self = shift;
 	my $max_invalid = undef;
 	if (exists $config{MaxInvalid}) {$max_invalid = $config{MaxInvalid}}
+        my $max_documents = undef;                                                                      
+        if (exists $config{MaxDocuments}) {$max_documents = $config{MaxDocuments}}                      
+        else {$max_documents = 0}
 	print "Now Using the CSS Validation module...\n" if $verbose;
 	use DB_File;                                                                  
         my $tmp_file = $config{tmpfile};
@@ -107,10 +160,6 @@
 	my @uris = sort { $hits{$b} <=> $hits{$a} } keys %hits;
 	my $name = "";
 	if (exists $config{ServerName}) {$name = $config{ServerName}}
-
-	# do what pleases you!
-
-	my @trimmed_uris;
        	my @result;
         my @result_head;
         push @result_head, "Rank";
@@ -122,24 +171,14 @@
 logs for $name.";
         my $outro;
 
-        foreach my $uri (@uris)
-        {
-                my @authorized_extensions = split(" ", $self->auth_ext);
-                foreach my $ext (@authorized_extensions)
-                {
-                        if ($uri=~ /$ext$/ )
-                        { push @trimmed_uris,$uri }
-                }
-        }
-        @uris = @trimmed_uris;
+        @uris = $self->trim_uris(@uris);
 	my $invalid_census = 0; # number of invalid docs
 	my $last_invalid_position = 0; # latest position at which we found an invalid doc
 	my $total_census = 0; # number of documents checked
 
         my $uri = undef;
 	# bulk of validation
-        while ( (@uris) and  (($invalid_census < $max_invalid) or (!$max_invalid)) )
-        # if $max_invalid is 0, process everything
+        while ( (@uris) and  (($invalid_census < $max_invalid) or (!$max_invalid)) and (($total_census < $max_documents) or (!$max_documents)) )
 	{
 		$uri = shift (@uris);
 		my $uri_orig = $uri;
@@ -220,10 +259,10 @@
                 $intro=~s/<census> //;
                 $outro="I couldn't find any invalid document in this log. Congratulations!";
         }
-
-
-
-
+        if (($total_census == $max_documents) and ($total_census)) # we stopped because of max_documents
+        {
+                $outro=$outro."\nNOTE: I stopped after processing $max_documents documents:\n      Maybe you could set MaxDocuments to a higher value?";
+        }
 	untie %hits;                                                                  
 	
 	# Here is what the module will return. The hash will be sent to 
@@ -251,10 +290,7 @@
 
 =head1 NAME
 
-W3C::LogValidator::CSSValidator - 
-
-=head1 SYNOPSIS
-
+W3C::LogValidator::CSSValidator - Validates CSS style sheets from Web Server logs
 
 
 =head1 DESCRIPTION
Received on Tuesday, 8 June 2004 02:36:08 UTC

This archive was generated by hypermail 2.4.0 : Friday, 17 January 2020 23:02:06 UTC