2006/ack ack.pl,1.7,1.8

Update of /sources/public/2006/ack
In directory hutz:/tmp/cvs-serv25775

Modified Files:
	ack.pl 
Log Message:
New checks: system IDs in doctype declarations, P3P policyrefs, PICS-Labels,
Link HTTP headers.


Index: ack.pl
===================================================================
RCS file: /sources/public/2006/ack/ack.pl,v
retrieving revision 1.7
retrieving revision 1.8
diff -u -d -r1.7 -r1.8
--- ack.pl	25 May 2006 13:45:29 -0000	1.7
+++ ack.pl	25 May 2006 15:21:46 -0000	1.8
@@ -21,6 +21,7 @@
 use Digest::MD5 qw(md5_hex);
 use File::Temp ();
 use HTML::Entities qw(encode_entities);
+use HTTP::Headers::Util qw(split_header_words);
 use HTML::LinkExtor ();
 use LWP::UserAgent ();
 use URI ();
@@ -38,12 +39,44 @@
         )->get($url, ":content_file" => "$file");
     $url = $res->request()->uri()->canonical();
 
-    my $p = HTML::LinkExtor->new(undef, $res->base());
+    my $base = $res->base();
+
+    my @extra_links;
+
+    # P3P policyrefs:
+    for (split_header_words($res->header("P3P"))) {
+      my %p3p = (@$_);
+      push(@extra_links, ["http", "p3p", URI->new_abs($p3p{policyref}, $base)])
+        if $p3p{policyref};
+    }
+
+    # PICS labels:
+    for my $label ($res->header("PICS-Label")) {
+      push(@extra_links, ["http", "pics-label", URI->new_abs($1, $base)])
+        if ($label =~ /^\(PICS-1\.\d+\s+"([^"]+)"/);
+    }
+
+    # Link headers:
+    for my $link ($res->header("Link")) {
+      push(@extra_links, ["http", "link", URI->new_abs($1, $base)])
+        if ($link =~ /^<([^>]+)>/);
+    }
+
+    my $p = HTML::LinkExtor->new(undef, $base);
+
+    # System ID's in doctype declarations:
+    $p->handler(declaration => sub {
+      (my ($self, $tag), local $_) = @_;
+      return unless $_;
+      if (/^<!DOCTYPE\s+html\s+(?:PUBLIC\s+"[^"]*"|SYSTEM)\s+"([^"]+)"/i) {
+        push(@extra_links, [$tag, system => URI->new_abs($1, $base)]);
+      }}, "self, tagname, text");
+
     $p->parse_file($file);
 
-    foreach ($p->links()) {
-	my ($tag, %foo) = @$_;
-	map { $links{$_->canonical()}++ } values(%foo);
+    foreach ($p->links(), @extra_links) {
+	my ($tag, %attrs) = @$_;
+	map { $links{$_->canonical()}++ } values(%attrs);
     }
 }
 

Received on Thursday, 25 May 2006 15:21:54 UTC