#!/usr/local/bin/perl
#
# top-invalid-docs: generate and mail a report of the most popular
#                   invalid HTML documents
#
# Gerald Oskoboiny, 3 Sep 1999
#
# [slightly altered version of
#  $Id: top-invalid-docs,v 1.13 2000/09/28 04:37:10 gerald Exp $ ]
#
# Outline:
#   1. Collect the last $days days of compressed server logs.
#   2. Extract the URI of every successful text/html response and rank
#      the URIs by hit count (via sort | uniq -c | sort -rn).
#   3. Walk the ranking from the top, asking validator.w3.org about each
#      URI, until $max invalid documents have been found.
#   4. Mail the report, threaded onto the previous report's Message-Id.
#

use strict;
use warnings;

my $me = $0;

# External programs.
my $date     = "/usr/local/bin/date";   # must accept --date "N days ago"
my $sort     = "/usr/local/bin/sort";
my $uniq     = "/usr/local/bin/uniq";
my $lynx     = "/usr/local/bin/lynx";
my $gunzip   = "/usr/local/bin/gunzip";
my $sendmail = "/usr/lib/sendmail";

# Report parameters.
my $temp      = "/u1/stats/tmp/tid.$$";  # ranked "count URI" lines
my $max       = 20;                      # stop after this many invalid docs
my $topmax    = 100;                     # size of the overall top list
my $days      = 4;                       # how many days of logs to read
my $msgidfile = "/u1/stats/top-invalid-docs.msgid";

# Mail settings.
# NOTE(review): the address part of $sender, and of the "please contact"
# sentence in the message footer, appears to be missing -- confirm the
# intended values.
my $sender     = "someone ";
my $env_sender = "foo\@example.org";
my $recipient  = "bar\@example.org";
my $cvsdate    = '$Date: 2000/09/28 04:37:10 $';

# Run date(1) with the given output format for a day $days_ago days in
# the past and return its output without the trailing newline.
sub date_ago {
    my ($format, $days_ago) = @_;
    my $stamp = `$date +$format --date "$days_ago days ago"`;
    die "couldn't run $date! $!" unless defined $stamp && length $stamp;
    chomp $stamp;
    return $stamp;
}

my $nice_ymd = date_ago( '%Y-%m-%d',     1 );
my $ymdhms   = date_ago( '%Y%m%d%H%M%S', 1 );
my $msgid    = "$ymdhms.tid-$$\@example.org";

my $total_html_pages  = 0;
my $valid_pageviews   = 0;
my $invalid_pageviews = 0;
my $valid_count       = 0;
my $invalid_count     = 0;

# ---------------------------------------------------------------------
# 1. Find the log files for each of the last $days days.
# ---------------------------------------------------------------------
my @logs;
foreach my $when ( 1 .. $days ) {
    my $yyyymm = date_ago( '%Y%m',   $when );
    my $ymd    = date_ago( '%Y%m%d', $when );
    push @logs, glob("/u1/stats/logs/*/$yyyymm/complete_log.$ymd.gz");
    push @logs, glob("/u1/stats/logs/*/$yyyymm/validator_log.$ymd.gz");
}

# Without any file arguments gunzip would read from stdin instead.
die "no log files found for the last $days days!\n" unless @logs;

# ---------------------------------------------------------------------
# 2. Extract the URIs and rank them by number of requests.
# ---------------------------------------------------------------------
open( my $out, '|-', "$sort | $uniq -c | $sort -rn > $temp" )
    || die "couldn't create OUT! $!";
# List form: the file names are never passed through a shell.
open( my $log, '-|', $gunzip, '-c', @logs )
    || die "couldn't open! $!";

while ( my $line = <$log> ) {
    chomp $line;

    my @f = split ' ', $line;
    next unless @f >= 12;                   # too short to be a log entry
    next unless $f[1]  =~ /^2/;
    next unless $f[11] =~ /text\/html/i;
    next if $f[4] =~ /^http:\/\//i;
    next if $f[4] =~ /^proxy:/i;
    next if $f[4] =~ m,/$,;     # skip invalid apache directoryindexes

    my ($server) = ( $line =~ /([^ ]+)$/ ); # grab the last word of the line
    $server =~ s/"//g;                      # remove quotes
    $server =~ s/:80\z//;                   # remove default port only
                                            # (leave e.g. ":8080" intact)

    $total_html_pages++;

    if ( $f[4] =~ m,afs/w3\.org/pub/WWW, ) {
        $server = "www.w3.org";     # kludge to avoid 'web3.w3.org' etc.
        $f[4] =~ s,^/afs/w3\.org/pub/WWW,,;
    }
    if ( $server eq "validator.w3.org" ) {
        $f[4] =~ s,^/usr/local/validator/htdocs,,;
        $f[4] =~ s,^/usr/local/validator/httpd,,;
    }
    if ( ( $server eq "web2.w3.org" ) &&
         ( $f[4] eq "/usr/local/apache/share/log/Overview.html" ) ) {
        $server = "www.w3.org";
        $f[4]   = "/1998/12/bridge/Overview.html";
    }
    if ( $server eq "search.w3.org" ) {
        $f[4] =~ s,^/export/home/search/htdocs,,;
    }
    if ( ( $server eq "-" ) || ( $server eq "localhost" ) ) {
        $server = "unknown";
    }
    if ( ( $server eq "lists.w3.org" ) ||
         ( $server eq "cgi.w3.org" )   ||
         ( $server eq "www19.w3.org" ) ) {
        $f[4] =~ s,^/usr/etc/apache/docs,,;
        $f[4] =~ s,^/usr/local/apache/htdocs,,;
    }

    print {$out} "http://" . $server . $f[4], "\n";
}

close( $log ) || die "couldn't close LOG! $!";
close( $out ) || die "couldn't close OUT! $!";

# ---------------------------------------------------------------------
# 3. Thread this report onto the previous one, if we know its id.
# ---------------------------------------------------------------------
my $prevmsgid      = "";
my $references     = "";
my $last_week_text = "";

if ( open( my $prev, '<', $msgidfile ) ) {
    my $stored = <$prev>;
    if ( defined $stored ) {
        chomp $stored;
        $prevmsgid = $stored;
    }
    close( $prev ) || warn "couldn't close msgid file: $!";
}
else {
    warn "couldn't read from msgid file, $msgidfile! $!";
}

if ( length( $prevmsgid ) ) {
    $references     = "\nIn-Reply-To: <$prevmsgid>\nReferences: <$prevmsgid>";
    $last_week_text = "\nLast week's report was: mid:$prevmsgid\n";
}

# ---------------------------------------------------------------------
# 4. Validate the most popular documents and write the report.
# ---------------------------------------------------------------------
open( my $top, '<', $temp )
    || die "couldn't read from top file, $temp! $!";
open( my $mail, '|-', $sendmail, '-t', "-f$env_sender" )
    || die "couldn't open pipe to sendmail! $!";

print {$mail} <<"EOHD";
Subject: Most popular documents on our site, $nice_ymd
Message-Id: <$msgid>$references
From: $sender
To: $recipient

Here are the most frequently-requested HTML documents on our site
that do not validate, with the their overall rank and the number of
times they were requested from our site in the last $days days.

  Rank    Hits URI
------ ------- ---------------------------------------------------------
EOHD

my @topdocs;
my $rank = 0;
while ( my $entry = <$top> ) {
    chomp $entry;
    $rank++;
    my ( $count, $uri ) = split ' ', $entry;
    next unless defined $uri;               # malformed ranking line

    # The URI comes from request logs, i.e. from whoever sent the
    # request, so it must never reach a shell: use list-form open.
    my @results;
    my $checker;
    if ( open( $checker, '-|', $lynx, '-source',
               "http://validator.w3.org/check?uri=$uri" ) ) {
        @results = <$checker>;
    }
    else {
        warn "couldn't open pipe to lynx for URI $uri! $!";
        undef $checker;
    }

    sleep( 30 );    # avoid killing validator.w3.org

    if ( defined $checker ) {
        close( $checker )
            || warn "couldn't close pipe to lynx for URI $uri! $!";
    }

    # uncomment this if/when directoryindexes are included in the report
    # $uri =~ s,/$,/ [ Apache DirectoryIndex ],;
    $uri =~ s,/Overview\.html$,/,;
    $uri =~ s,/index\.html$,/,;

    # The validator's output contains "Sorry" when the document fails.
    # A document whose check produced no output is counted as valid.
    if ( grep( /Sorry/, @results ) ) {
        printf {$mail} " %4d %8d %s\n", $rank, $count, $uri;
        $invalid_count++;
        $invalid_pageviews += $count;
    }
    else {
        $valid_pageviews += $count;
        $valid_count++;
    }

    push( @topdocs, sprintf( " %4d %8d %s\n", $rank, $count, $uri ) );

    last if $invalid_count == $max;
}

close( $top ) || warn "couldn't close pipe to TOP! $!";

my $total_checked     = $valid_count + $invalid_count;
my $checked_pageviews = $valid_pageviews + $invalid_pageviews;

# Guard both ratios: an empty ranking or empty logs must not abort the
# report with a division by zero.
my $valid_proportion = $checked_pageviews
    ? ( $valid_pageviews / $checked_pageviews ) * 100
    : 0;
my $valid_minbound = $total_html_pages
    ? ( $valid_pageviews / $total_html_pages ) * 100
    : 0;

$valid_proportion = sprintf( "%5.2f", $valid_proportion );
$valid_minbound   = sprintf( "%5.2f", $valid_minbound );

print {$mail} <<"EOHD";

I checked a total of $total_checked documents to find the $max above
that didn't validate.

Among the top $total_checked documents served, $valid_proportion% of the
page views were valid HTML. These documents account for $valid_minbound%
of the page views on our site, so $valid_minbound% is a minimum bound on
the amount of our traffic which is valid HTML.

(all the numbers above exclude Apache's invalid DirectoryIndexes.)

Here is a list of the top $topmax documents overall, valid or invalid:

  Rank    Hits URI
------ ------- ---------------------------------------------------------
EOHD

# Only as many entries as were actually checked are available.
my $shown = @topdocs > $topmax ? $topmax : scalar @topdocs;
print {$mail} join( "", @topdocs[ 0 .. $shown - 1 ] );

print {$mail} <<"EOHD";
$last_week_text
This message was generated automatically by $me
on www15.example.org, last modified $cvsdate.

If you have any questions about this report, please contact .
EOHD

close( $mail ) || die "couldn't close pipe to sendmail! $!";

# Remember this report's id only once the mail has been handed off.
open( my $idout, '>', $msgidfile )
    || die "couldn't write to msgid file, $msgidfile! $!";
print {$idout} $msgid, "\n";
close( $idout ) || die "couldn't close msgid file: $!";

unlink( $temp ) || warn "couldn't remove $temp: $!";

exit;