#!/usr/local/bin/perl -w
#
# W3C HTML Validation Service
# A CGI script to retrieve and validate an HTML file
#
# Copyright 1995-1999 Gerald Oskoboiny <gerald@w3.org>
#
# This source code is available under the license at:
#     http://www.w3.org/Consortium/Legal/copyright-software
#
# $Id: check,v 1.7 1999/10/18 00:49:19 link Exp $

#
# Load modules
use strict;
use LWP::UserAgent;
use URI::Escape;
use CGI::Carp;
use CGI qw(:cgi -newstyle_urls -private_tempfiles);
use Text::Wrap;

#
# Define global constants
use constant TRUE  => 1;
use constant FALSE => 0;
use constant UNDEF => undef;

#############################################################################
# Constant definitions
#############################################################################

my $cvsrevision	= '$Revision: 1.7 $';
my $cvsdate	= '$Date: 1999/10/18 00:49:19 $';

my $logfile	= "/var/log/httpd/val-svc";

my $uri_def_uri	= "http://www.w3.org/Addressing/#terms";
my $faqloc	= "http://www.cs.duke.edu/~dsb/kgv-faq/";
my $faqerrloc	= "${faqloc}errors.html";
my $abs_svc_uri	= "http://validator.w3.org/";
my $rel_img_uri	= "/images/";
my $abs_img_uri	= "${abs_svc_uri}images/";
my $maintainer	= 'gerald@w3.org';

my $sgmlstuff	= "/usr/local/src/validator/htdocs/sgml-lib";
my $sp		= "/usr/local/bin/nsgmls";
my $nkf		= "/usr/local/bin/nkf";

my $sgmldecl	= "$sgmlstuff/REC-html40-19980424/HTML4.decl";
my $xhtmldecl	= "$sgmlstuff/PR-xhtml1-19990824/xhtml1.dcl";
my $xmldecl	= "$sgmlstuff/sp-1.3/pubtext/xml.dcl";

my $revision	= $cvsrevision;
   $revision	=~ s/^\$Revision: //;
   $revision	=~ s/ \$$//;

my ( $validity, $version, $document_type, $xmlflags, %undef_frag,
    $meta_charset, $http_charset, $effective_charset, $charsets_differ,
    $codeconv, $lastmod, $decl, $catalog, $command, @fake_errors,
    $guessed_doctype, $doctype, $line, $col, $type, $msg, $extraspaces, $diff,
    $pos, $indent, $gifname, $alttext, $gifhw, $nicegifname, $pedanticflags,
    $pedantic_blurb, $level, $prevlevel, $i, $prevdata );

my $notice		= '';
			# "<p><strong>Note: This service will be ...</strong>";

umask( 022 );
my $weblint		= "/usr/bin/weblint";
my $html32_doctype	= qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN">};
my $html40t_doctype	= qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">};
my $html40f_doctype	= qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN" "http://www.w3.org/TR/REC-html40/frameset.dtd">};
my $xhtmlt_doctype	= qq{<!DOCTYPE HTML PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"};
my $temp		= "/tmp/validate.$$";
my $lt			= "\020";
my $gt			= "\021";
my $leftarrow		= "${lt}tt${gt}${lt}img src=\"/images/arrow_left.gif\" alt=\"^\"${gt}${lt}/tt${gt}";
my $rightarrow		= "${lt}tt${gt}${lt}img src=\"/images/arrow_right.gif\" alt=\"^\"${gt}${lt}/tt${gt}";
my $contchars		= "${lt}tt${gt}${lt}img src=\"/images/ellipsis.gif\" alt=\"[...]\"${gt}${lt}/tt${gt}";
my $gifborder		= " border=0";

my @options = qw(weblint pw outline ss sp noatt);

#############################################################################
# Array of FPIs -> plain text version strings
#############################################################################

my %pub_ids = (
    '-//IETF//DTD HTML Level 0//EN//2.0', 'HTML 0.0',
    '-//IETF//DTD HTML Strict Level 0//EN//2.0', 'Strict HTML 0.0',

    '-//IETF//DTD HTML 2.0 Level 1//EN', 'HTML 1.0',
    '-//IETF//DTD HTML 2.0 Strict Level 1//EN', 'Strict HTML 1.0',

    '-//IETF//DTD HTML 2.0 Strict//EN', 'Strict HTML 2.0',
    '-//IETF//DTD HTML 2.0//EN', 'HTML 2.0',
    '-//IETF//DTD HTML 2.1E//EN', 'HTML 2.1E',

  '-//AS//DTD HTML 3.0 asWedit + extensions//EN', 'HTML 3.0 (AdvaSoft version)',
    '-//IETF//DTD HTML 3.0//EN', 'HTML 3.0 (Beta)',
    '-//W3O//DTD W3 HTML Strict 3.0//EN//', 'Strict HTML 3.0 (Beta)',

    '-//Sun Microsystems Corp.//DTD HotJava HTML//EN', 'Hotjava-HTML',
'-//Sun Microsystems Corp.//DTD HotJava Strict HTML//EN', 'Strict Hotjava-HTML',
    '-//WebTechs//DTD Mozilla HTML 2.0//EN', 'Netscape-HTML',
    '-//Netscape Comm. Corp. Strict//DTD HTML//EN', 'Strict Netscape-HTML',
    '-//Microsoft//DTD Internet Explorer 2.0 HTML//EN', 'MSIE-HTML',
    '-//Microsoft//DTD Internet Explorer 2.0 HTML Strict//EN', 'Strict MSIE-HTML',
    '-//Microsoft//DTD Internet Explorer 3.0 HTML//EN', 'MSIE 3.0 HTML',
    '-//Microsoft//DTD Internet Explorer 3.0 HTML Strict//EN', 'Strict MSIE 3.0 HTML',
    '-//OReilly and Associates//DTD HTML Extended 1.0//EN', 'O\'Reilly HTML Extended v1.0',
    '-//OReilly and Associates//DTD HTML Extended Relaxed 1.0//EN', 'O\'Reilly HTML Extended Relaxed v1.0',

    '-//IETF//DTD HTML V2.2//EN', 'HTML 2.2',
    '-//W3C//DTD HTML 1996-01//EN', 'HTML 1996-01',
    '-//W3C//DTD HTML 3.2 Final//EN', '<a href="http://www.w3.org/TR/REC-html32">HTML 3.2</a>',
    '-//W3C//DTD HTML Experimental 970421//EN', '<a href="http://www.w3.org/TR/NOTE-html-970421.html">HTML 3.2 + Style</a>',
    '+//Silmaril//DTD HTML Pro v0r11 19970101//EN', '<a href="http://www.ucc.ie/doc/www/html/dtds/htmlpro.html">HTML Pro</a>',
    '-//Spyglass//DTD HTML 2.0 Extended//EN', 'Spyglass HTML 2.0 Extended',
    'http://www.w3.org/MarkUp/Cougar/Cougar.dtd', '<a href="http://www.w3.org/MarkUp/Cougar/">HTML Level "Cougar"</a>',
    '-//W3C//DTD HTML 4.0//EN', '<a href="http://www.w3.org/TR/REC-html40/">HTML 4.0</a> Strict',
    '-//W3C//DTD HTML 4.0 Transitional//EN', '<a href="http://www.w3.org/TR/REC-html40/">HTML 4.0</a> Transitional',
    '-//W3C//DTD HTML 4.0 Frameset//EN', '<a href="http://www.w3.org/TR/PR-html40/">HTML 4.0</a> Frameset',
    '-//W3C//DTD HTML 4.01//EN', '<a href="http://www.w3.org/TR/1999/PR-html40-19990824/">HTML 4.01</a> Strict',
    '-//W3C//DTD HTML 4.01 Transitional//EN', '<a href="http://www.w3.org/TR/1999/PR-html40-19990824/">HTML 4.01</a> Transitional',
    '-//W3C//DTD HTML 4.01 Frameset//EN', '<a href="http://www.w3.org/TR/1999/PR-html40-19990824/">HTML 4.01</a> Frameset',
    '-//W3C//DTD XHTML 1.0 Strict//EN', '<a href="http://www.w3.org/TR/1999/PR-xhtml1-19990824/">XHTML 1.0</a> Strict',
    '-//W3C//DTD XHTML 1.0 Transitional//EN', '<a href="http://www.w3.org/TR/1999/PR-xhtml1-19990824/">XHTML 1.0</a> Transitional',
    '-//W3C//DTD XHTML 1.0 Frameset//EN', '<a href="http://www.w3.org/TR/1999/PR-xhtml1-19990824/">XHTML 1.0</a> Frameset',
    'XML', '<a href="http://www.w3.org/TR/REC-xml">XML</a>'

);

#############################################################################
# Array of errors -> fragment identifiers for error explanation links
#############################################################################

my %frag = (
    'entity end not allowed in comment', 'unterm-comment-1',
    'name start character invalid only s and comment allowed in comment declaration', 'unterm-comment-2',
    'name character invalid only s and comment allowed in comment declaration', 'unterm-comment-2',
    'unknown declaration type FOO', 'bad-comment',
    'character FOO not allowed in attribute specification list', 'attr-char',
    'an attribute value must be a literal unless it contains only name characters', 'attr-quoted',
    'syntax of attribute value does not conform to declared value', 'bad-attr-char',
    'length of attribute value must not exceed LITLEN less NORMSEP', 'name-length',
    'element FOO undefined', 'undef-tag',
    'element FOO not allowed here', 'not-allowed',
    'there is no attribute FOO', 'undef-attr',
    'FOO is not a member of the group specified in the declared value of this attribute', 'undef-attr-val',
    'FOO is not a member of a group specified for any attribute', 'bad-abbrev-attr',
    'end tag for FOO omitted but its declaration does not permit this', 'no-end-tag',
    'end tag for element FOO which is not open', 'floating-close',
    'end tag for FOO which is not finished', 'omitted-content',
    'start tag for FOO omitted but its declaration does not permit this', 'no-start-tag',
    'general entity FOO not defined and no default entity', 'bad-entity',
    'non SGML character number', 'bad-char',
    'cannot generate system identifier for entity FOO', 'bad-pub-id'

#    'error', 'frag',
#    'character data is not allowed here', 'frag',

);

#############################################################################
# Set up some signal handlers in case we get killed (darned impatient people...)
#############################################################################

$SIG{'TERM'} = 'erase_stuff';
$SIG{'KILL'} = 'erase_stuff';
$SIG{'PIPE'} = 'IGNORE';
# $SIG{'CHLD'} = 'erase_stuff';

#############################################################################
# Process CGI variables
#############################################################################

#
# Create a new CGI object.
my $q = new CGI;

#
# Backwards compatibility; see
# http://lists.w3.org/Archives/Public/www-validator/1999JulSep/0197
# http://lists.w3.org/Archives/Public/www-validator/1999JulSep/0212
if (scalar $q->param) {
    foreach my $param ($q->param) {
        $q->param($param, TRUE) unless $q->param($param);
    }
}

#
# Futz the URI so "/referer" works.
if ($q->path_info eq '/referer') {
    $q->param('uri', $q->referer);
}

#
# Use "url" unless a "uri" was also given.
if ($q->param('url') and not $q->param('uri')) {
    $q->param('uri', $q->param('url'));
}

#
# Send them to the homepage unless we can extract a URI from either of the
# acceptable sources: uri, url or /referer.
&redirect_to_home_page unless length($q->param('uri')) > 5;

#
# Munge the URI to include commonly omitted prefixes/suffixes.
$q->param('uri', $q->param('uri') . '/')   unless $q->param('uri') =~ m(/);
$q->param('uri', 'http://' . $q->param('uri')) if $q->param('uri') =~ m(^www)i;

#############################################################################
# Output validation results
#############################################################################

my $header = <<"EOF";
Content-Type: text/html

$html40t_doctype
<html>

  <head>
    <title>W3C HTML Validation Service Results</title>
    <link rev="made" href="mailto:$maintainer">
  </head>

  <body bgcolor="#FFFFFF" text="#000000" link="#0000ee" vlink="#551a8b">

  <p>
    <a href="http://www.w3.org/"><img
       src="http://www.w3.org/Icons/WWW/w3c_home" height=48 border=0
       alt="W3C"></a>
  </p>

  <h1><a href="/">W3C HTML Validation Service</a> Results</h1>

$notice
EOF

unless($q->param('uri') =~ m(^http://)) {
     print $header;
     print <<"EOF";
<p>
  Sorry, this type of URI is not supported by this service.
</p>

<p>
  URIs should be in the form:
</p>

<blockquote>
  <code>$abs_svc_uri</code>
</blockquote>

<p>
  (There are other types of URIs, too, but only <code>http://</code> URIs
  are currently supported by this service.)
</p>

EOF

    &clean_up_and_exit;
}

my $ua = new LWP::UserAgent;
$ua->agent( "W3C_Validator/$revision " . $ua->agent );
$ua->parse_head(0);  # we want to parse the http-equiv stuff ourselves, for now
my $request = new HTTP::Request(GET => $q->param('uri'));

# if we got a Authorization header from the client, it means
# that the client is back at it after being prompted for
# a password: let's insert the header as is in the outgoing request
if($ENV{HTTP_AUTHORIZATION}){
    $request->headers->header(Authorization => $ENV{HTTP_AUTHORIZATION});
}

my $response = $ua->request($request);

if ( $response->code != 200 ) {
    if ( $response->code == 401 ) {
	$response->headers->www_authenticate =~ /Basic realm=\"([^\"]+)\"/;
	my $realm = $1;
	my $resource = $response->request->url;
	my $authHeader = $response->headers->www_authenticate;
	&print_401_auth_required_message( $resource, $realm, $authHeader );
    }
    else {
	print $header;
	&print_unknown_http_error_message( $q->param('uri'), $response->code,
	    $response->message );
    }
    &clean_up_and_exit;
}

my $content_type = $response->headers->header("Content-Type");

if ( ( $content_type =~ /text\/xml/i ) ||
     ( $content_type =~ /image\/svg/i ) ||
     ( $content_type =~ /application\/smil/i ) ||
     ( $content_type =~ /application\/xml/i ) ) {
    $document_type = "xml";
}
elsif ($content_type =~ /text\/html/i) {
    $document_type = "html";
}
else {
    print $header;
    print <<"EOF";

<p>
  Sorry, I am unable to validate this document because its returned
  content-type was <code>$content_type</code>, which is not
  currently supported by this service.
</p>
EOF

    &clean_up_and_exit;
}

my $jump_links = &build_jump_links;
my $count = 1; # @@ should loop over many uris instead

print $header;
print <<"EOF";
<h2><a name="doc$count">Document Checked</a></h2>

$jump_links
EOF

my @file = split '\n',$response->content;
if ( ( $document_type eq "html" ) || ( $document_type eq "xhtml" ) ) {
    ( $guessed_doctype, $doctype ) = &check_for_doctype( \@file );
}

if ( $doctype =~ /xhtml/i ) {
    $document_type = "xhtml";
}

foreach $line (@file) {
    # @@ needs to handle meta elements that span more than one line
    if ( $line =~ /<meta/i ) {
	if ( $line =~ /charset\s*=[\s"]*([^\s;">]*)/i ) {
	    $meta_charset = $1;
	    last;
	}
    }
}

( $http_charset ) = ( $content_type =~ /;\s*charset=(.*)/i );
$content_type =~ s/;.*$//;
$content_type =~ s/\s*$//g;
$http_charset =~ s/;.*//;
$http_charset =~ s/\s*//g;

if ( $http_charset ne '' ) {
    $effective_charset = $http_charset;
    if ( $meta_charset ne '' && $http_charset !~ /$meta_charset/i ) {
    	    # @@ the above needs work
        $charsets_differ = 1;
    }
}
else {
    if ( $meta_charset ne '' ) {
        $effective_charset = $meta_charset;
    }
    else {
        $effective_charset = "unknown";
    }
}

if ( $effective_charset =~ /iso-2022-jp/i ) {
    $codeconv = "$nkf -Jex | ";
}
elsif ( $effective_charset =~ /utf-8/i ) {
    $ENV{SP_CHARSET_FIXED}="YES";
    $ENV{SP_ENCODING}="utf-8";
}
elsif ( $effective_charset =~ /Shift_JIS/i ) {
    $codeconv = "$nkf -Sex | ";
}
else {
    $codeconv = "";
}

print qq(<ul>\n  <li><a href="$uri_def_uri">URI</a>: ),
    '<a href="', $q->param('uri'), '">', $q->param('uri'), qq(</a>\n);

if ( $lastmod = $response->headers->header("Last-Modified") ) {
    print qq{  <li>Last modified: $lastmod\n};
}

if ( defined $response->headers->server ) {
    print "  <li>Server: " . $response->headers->server . "\n";
}

if ( defined $response->content_length ) {
    print "  <li>Content length: " . $response->content_length . "\n";
}

if ( $document_type eq "xhtml" ) {
    $ENV{SP_CATALOG_FILES} = "$sgmlstuff/PR-xhtml1-19990824/xhtml.soc";
    $ENV{SGML_SEARCH_PATH} = "$sgmlstuff/PR-xhtml1-19990824/";
    $ENV{SP_CHARSET_FIXED}="YES";
    $ENV{SP_ENCODING}="XML";
    $xmlflags = "-wxml ";
    $decl = $xhtmldecl;
}
elsif ( $document_type eq "xml" ) {
    $ENV{SP_CATALOG_FILES} = "$sgmlstuff/sp-1.3/pubtext/xml.soc";
    $ENV{SGML_SEARCH_PATH} = "$sgmlstuff/sp-1.3/pubtext/";
    $ENV{SP_CHARSET_FIXED}="YES";
    $ENV{SP_ENCODING}="XML";
    $xmlflags = "-wxml -wno-valid ";
    $decl = $xmldecl;
}
else {			# must be HTML (for now)
    $decl = $sgmldecl;
    $catalog = "-c $sgmlstuff/catalog";
}

$command  = "$codeconv $sp -E0 $xmlflags $catalog $decl";

# print "  <li>nsgmls command line: <code>$command</code>\n";

open CHECKER, "|$command - >$temp.esis 2>$temp"
  or die "open(|$command - >$temp.esis 2>$temp) returned: $!\n";

print CHECKER "$doctype\n" if $guessed_doctype;
# this is a kludge for DOS users with their entire file on a single line
# like http://bogo.w3.org/test/samuels.html
if ( $#file == 0 ) {
    @file = (split(/
/,$file[0]));
    for (0..$#file) {
	$file[$_] .= "\n";
    }
}
# kludge for other DOS users with CRLFs
for (@file) {
    s/
+$//;
    print CHECKER $_, "\n";
}
close CHECKER                   or warn "close(CHECKER) returned: $!\n";

open ERRORS, "<$temp"           or die  "open($temp) returned: $!\n";
my @errors = <ERRORS>;
close ERRORS                    or warn "close(ERRORS) returned: $!\n";

my @esis;
open ESIS, "$temp.esis"         or die  "open($temp.esis) returned: $!\n";
while (<ESIS>) {
    next if / IMPLIED$/;
    next if /^ASDAFORM CDATA /;
    next if /^ASDAPREF CDATA /;
    chomp; # Removes trailing newlines
    push @esis, $_;
}
close ESIS                      or warn "close(ESIS) returned: $!";

my $fpi;
$version = "unknown";
if ( $document_type eq "xhtml" ) {
    $fpi = $doctype;
}
elsif ( $document_type eq "xml" ) {
    $fpi = "XML";
}
else {
    for (@esis) {
	next unless /^AVERSION CDATA (.*)/;
	$fpi = $1;
	last;
    }
    if ( ! defined $fpi && length( $doctype) ) {
	    # this is needed for HTML 4 strict, which doesn't have a
	    # version attribute on the HTML element
        $fpi = $doctype;
    }
}
$version = $pub_ids{$fpi} || "unknown";

if ( $guessed_doctype ) {
    push( @fake_errors, "nsgmls:<OSFD>0:2:1:E: Missing DOCTYPE declaration at start of document (${lt}a href=\"http://www.htmlhelp.org/tools/validator/doctype.html\"${gt}explanation...${lt}/a${gt})\n" );
}

print qq{  <li>Character encoding: $effective_charset\n};

if ( $charsets_differ ) {
    print <<"EOHD";
<br>
  <strong>Warning:</strong> the character encoding specified in the HTTP header
  (<code>$http_charset</code>) is different from the one specified in the META
  element (<code>$meta_charset</code>).
  I will use <code>$effective_charset</code> for this validation.

EOHD

}

print "  <li>Document type: <b>$version</b>.\n";

print "</ul>\n\n";

if ( $document_type eq "xml" ) {
print <<"EOHD";
  <p>
    <strong>Note: experimental XML support was added to this service
    on Aug 31, 1999, but it is not quite working yet; stay tuned to <a
    href="http://lists.w3.org/Archives/Public/www-validator/">the
    <code>www-validator</code> mailing list</a> for updates, and
    please do not trust this service\'s output for XML documents
    in the meantime.</strong>
  </p>
EOHD

}

print <<"EOHD";
  <p>
    Below are the results of attempting to parse this document with
    an SGML parser.
  </p>

EOHD

if ( $? || $guessed_doctype ) {
    print "<pre>\n";
    for ((@fake_errors,@errors)) {
	next if /^<OSFD>0:[0-9]+:[0-9]+:[^A-Z]/;
	next if / numbers exceeding 65535 not supported$/;
	next if ( $document_type eq "xhtml" ) && /:W: SGML declaration was not implied$/;
	s/.*<OSFD>//g;
	if ( ! (($line, $col, $type, $msg)=(/^[^:]*:([0-9]+):([0-9]+):([A-Z]?):? (.*)/))) {
	    print "Uh oh! I got the following unknown error:\n\n   $_\n\n";
	    print "Please make sure you specified the DOCTYPE properly!\n\n";
	    &output_doctype_spiel;
	    last;
	}
	if ( $msg =~ /^cannot generate system identifier for entity / ) {
	    print "   <b>Fatal error</b>! $msg\n";
	    print "</pre>\n<p>I couldn't parse this document, because it " .
		  "uses a public\n    identifier that's not in my <a\n " .
		  "    href=\"sgml-lib/catalog\">catalog</a>!\n  </p>\n";
	    &output_doctype_spiel;
	    print "<pre>";	# so the </pre> we print later gets re-started
	    last;
	}
	if ( $msg =~ /^cannot open / ) {
	    print "   Fatal error! $msg\n";
	    print "</pre>\n<p>I couldn't parse this document, because it " .
		  "makes reference to\n    a system-specific file instead of " .
		  "simply using a public identifier\n    to specify the " .
		  "level of HTML being used.\n  </p>\n";
	    &output_doctype_spiel;
	    print "<pre>";	# so the </pre> we print later gets re-started
	    last;
	}
	$extraspaces = "";	# in case we put "(truncated)" gif on LHS
	$line-- if $guessed_doctype;
	my $newline = $file[$line-1];
	$newline .= "\n";

	# make sure there's no ^P or ^Q's in the file, since we need to use
	# them to represent '<' and '>' temporarily.
	$newline =~ s/${lt}/P/go; $newline =~ s/${gt}/Q/g;

	if ( length( $newline ) > 70 ) {
	    if ( $col < 25 ) {
		# truncate source line at 70 chars (truncate right side only)
		$newline = substr( $newline, 0, 70 ) . "$contchars" . "\n";
	    }
	    elsif ( $col > 70 ) {
		# keep rightmost 70 chars; adjust $col accordingly
		# (truncate left side only)
		$diff = $col - 50;
		$newline = "$contchars" . substr( $newline, $diff, 70 );
		if ( length( $newline ) == (70 + length( "$contchars" )) ) {
		    $newline .= "$contchars" . "\n";
		}
		if ( $col > $diff ) {
		    $col -= $diff;
		}
		else {
		    $col -= 70;
		}
		$extraspaces = " " x 8;
	    }
	    else {
		# truncate source line on both sides; leave more source text
		# on left, and about 30 chars on right side. Also, adjust $col.
		if ( $col < 35 ) {
		    $newline = "$contchars" . substr( $newline, 0, 60 );
		}
		else {
		    $newline = "$contchars" . substr( $newline, $col - 35, 60 );
		    $col = 35;
		}
		if ( length( $newline ) == ( 60 + length( "$contchars" ))) {
		    $newline .= "$contchars" . "\n";
		}
		$extraspaces = " " x 8;
	    }
	}

	# figure out the index into the %frag associative array for the
	# "explanation..." links to the KGV FAQ.
	my $msgindex = $msg;
	$msgindex =~ s/"[^"]+"/FOO/g;
	$msgindex =~ s/[^A-Za-z ]//;

	my $out = "${lt}hr${gt}\n\nError at line $line:\n   $newline";
	if ( length( $msg ) < $col ) {    # does it fit in front?
	    $out .= "$extraspaces " . ' ' x ($col-length($msg)) .
		"$msg $rightarrow";
	}
	else {
	    if ( ( length( $msg ) + $col ) > 60 ) {
		if ( $msg =~ /,/ ) {
		    $msg =~ s/,/,\n         /;
		}
		else {
		    if ( ( (length( $msg ) / 2) + $col ) > 60 ) {
			$pos = index( $msg, ' ', length($msg)/4 );
			$indent = " " x (65-length($msg)*3/4);
		    }
		    else {
			$pos = index( $msg, ' ', length($msg)/2 );
			$indent = " " x ($col + 4);
		    }
		    $msg = substr( $msg, 0, $pos ) .
			"\n$indent" . substr( $msg, $pos );
		}
	    }
	    $out .= "$extraspaces   " . ' ' x ($col-1) . "$leftarrow $msg";
	}

	if ( defined $frag{$msgindex} ) {
	    $out .=
  " (${lt}a href=\"$faqerrloc#$frag{$msgindex}\"${gt}explanation...${lt}/a${gt})";
	}
	else { # remember msgindexes without frags, to get the KGV FAQ updated.
	    $undef_frag{$msgindex} = 1;
	}

	$out .= "\n";
	$out =~ s/&/&amp;/go; $out =~ s/</&lt;/go; $out =~ s/>/&gt;/go;
# the following kludge is so the <img src>s don't get sgml-proofed above
	$out =~ s/${lt}/</g; $out =~ s/${gt}/>/g;
	print $out;
    }
    print "</pre>\n";
    print "<hr>\n";
    if ( $version eq "unknown" ) {
	print "\n  <p>\n    Sorry, I can't validate this document.\n  </p>\n";
    }
    else {
	print "\n  <p>\n    Sorry, this document does not validate as $version.\n  </p>\n\n";
	&output_css_validator_blurb( $q->param('uri') );
    }
    $validity="invalid";
}
else {
    print "\n  <pre>\n    No errors found!</pre>\n\n";
    if ( $version ne "unknown" ) {
	if ( $version =~ /^HTML 2\.0$/ ) {
	    $gifname = "vh20";
	    $alttext = "Valid HTML 2.0!";
            $gifborder = "";
	}
	elsif ( $version =~ /HTML 3\.2</ ) {
	    $gifname = "vh32";
	    $alttext = "Valid HTML 3.2!";
            $gifhw   = " height=31 width=88";
	}
	elsif ( $version =~ /HTML 4\.0<\/a> Strict$/ ) {
	    $gifname = "vh40";
	    $alttext = "Valid HTML 4.0!";
            $gifborder = "";
            $gifhw   = " height=31 width=88";
	}
	elsif ( $version =~ /HTML 4\.0<\/a> / ) {
	    $gifname = "vh40";
	    $alttext = "Valid HTML 4.0!";
            $gifhw   = " height=31 width=88";
	}
	elsif ( $version =~ /HTML 4\.01<\/a> Strict$/ ) {
	    $gifname = "vh40";
	    $alttext = "Valid HTML 4.01!";
            $gifborder = "";
            $gifhw   = " height=31 width=88";
	}
	elsif ( $version =~ /HTML 4\.01<\/a> / ) {
	    $gifname = "vh40";
	    $alttext = "Valid HTML 4.01!";
            $gifhw   = " height=31 width=88";
	}
	elsif ( $version =~ /HTML 3\.0/ ) {
	    $gifname = "vh30";
	    $alttext = "Valid HTML 3.0!";
	}
	elsif ( $version =~ /Netscape/ ) {
	    $gifname = "vhns";
	    $alttext = "Valid Netscape-HTML!";
	}
	elsif ( $version =~ /Hotjava/ ) {
	    $gifname = "vhhj";
	    $alttext = "Valid Hotjava-HTML!";
	}
	if ( defined $gifname ) {
	    $nicegifname = $gifname;
	    $nicegifname =~ s/</\&lt;/g; $nicegifname =~ s/&/\&amp;/g;
	    print <<"EOHD";
  <p>
    <img src="$rel_img_uri$gifname" alt="$alttext"> Congratulations, this
    document validates as $version!
  </p>

  <p>
    To show your readers that you have taken the care to create an
    interoperable Web page, you may display this icon on any page
    that validates. Here is the HTML you could use to add this icon
    to your Web page:
  </p>
  <pre>
  &lt;p&gt;
    &lt;a href="${abs_svc_uri}check/referer"&gt;&lt;img$gifborder
        src="$abs_img_uri$nicegifname"
        alt="$alttext"$gifhw&gt;&lt;/a&gt;
  &lt;/p&gt;</pre>
  <p>
    If you like, you can <a href="$rel_img_uri$gifname">download a copy of this
    image</a> to keep in your local web directory, and change the HTML fragment
    above to reference your local image rather than the one on this server.
  </p>

EOHD
	}
    }
    if ( ( $version eq "unknown" ) || ( ! defined $gifname ) ) {
	print "  <p>\n    Congratulations, this document validates as the document type specified! (I don't have an icon for this one yet, sorry.)\n  </p>\n";
    }

    my $thispage = $q->self_url;

    &output_css_validator_blurb( $q->param('uri') );

    print <<"EOHD";
  <p>
    If you would like to create a link to <em>this</em> page (i.e., this
    validation result) to make it easier to re-validate this page in the
    future or to allow others to validate your page, the URI is:
  </p>

  <blockquote>
    <code>$thispage</code>
  </blockquote>

  <p>
    (Or, you can just add the current page to your bookmarks or hotlist.)
  </p>

EOHD

    $validity="valid";
}

if ( $q->param('weblint') ) {
    if ( $q->param('pw') ) {
	$pedanticflags  = '-pedantic -e mailto-link';
	$pedantic_blurb = ' (in "pedantic" mode)';
    }
    else {
	$pedanticflags = '';
    }

    print <<"EOF";
  <hr>
  <h2><a name="weblint">Weblint Results</a></h2>

  <p>
    Below are the results of running <a
    href="http://www.weblint.org/">Weblint</a>
    on this document$pedantic_blurb:
  </p>
EOF

    open( WEBLINT,
	"| $weblint -s $pedanticflags - 2>&1 >$temp.weblint" )
	    || die "couldn't open weblint: $!";
    for (@file) {
	print WEBLINT $_, "\n";
    }
    close( WEBLINT ) or warn "couldn't close weblint: $!";

    print "\n\n";
    if ( $? ) {
	print "  <ul>\n";

	    open( WEBLINTOUT, "$temp.weblint" )
		|| die "couldn't open weblint results in $temp: $!";

	    while (<WEBLINTOUT>) {
		s/ \(use "-x <extension>" to allow this\)\.$/./go;
		s/&/&amp;/go;
		s/</&lt;/go;
		s/>/&gt;/go;
		print "    <li>$_";
	    }

	    close( WEBLINTOUT ) || die "couldn't close weblint results: $!";
	    print "  </ul>\n";
    }
    else {
	print "\n  <blockquote>\n    Looks good to me!\n  </blockquote>\n";
    }
    print "\n\n";
}

if ($q->param('outline')) {
  print <<'EOF';
  <DIV id="outline" class="mtb">
    <HR>
    <H2><A name="outline">Outline</A></H2>
    <P>
      Below is an outline for this document, automatically generated from the
      heading tags (<CODE>&lt;H1&gt;</CODE> through <CODE>&lt;H6&gt;</CODE>.)
    </P>
EOF

  my $prevlevel = 0;
  my $indent    = 0;
  my $level     = 0;
  for (1 .. $#esis) {
    my $line = $esis[$_];
    next unless $line =~ /^\(H([1-6])$/i;
    $prevlevel = $level;
    $level     = $1;

    print "    </UL>\n" x ($prevlevel - $level); # perl is so cool.
    if ($level - $prevlevel == 1) {
      print "    <UL>\n";
    }
    foreach my $i (($prevlevel + 1) .. ($level - 1)) {
      print qq(  <UL>\n    <LI class="warning">A level $i heading is missing!\n);
    }
    if ($level - $prevlevel > 1) {
      print "    <UL>\n";
    }

    $line       = '';
    my $heading = '';
    until (substr($line, 0, 3) =~ /^\)H$level/i) {
      $line = $esis[$_++];
      if ($line =~ /^-/) {
	my $headcont = $line;
	substr($headcont, 0, 1) = " ";
	$headcont =~ s/\\n/ /g;
	$heading .= $headcont;
      } elsif ($line =~ /^AALT CDATA( .+)/) {
	my $headcont = $1;
	$headcont =~ s/\\n/ /g;
	$heading .= $headcont;
      }
    }

    $heading = substr($heading, 1); # chop the leading '-' or ' '.
    $heading =~ s/&/&amp;/go; $heading =~ s/</&lt;/go;
    print "    <LI>$heading</LI>\n";
  }
  print "    </UL>\n" x $level;
  print <<'EOF';
    <P>
      If this does not look like a real outline, it is likely that the
      heading tags are not being used properly. (Headings should reflect
      the logical structure of the document; they should not be used simply
      to add emphasis, or to change the font size.)
    </P>
EOF
}

if ( $q->param('ss') ) {
    print <<'EOF';
  <hr>
  <h2><a name="source">Source Listing</a></h2>

  <p>
    Below is the source input I used for this validation:
  </p>
EOF

    print "<pre>\n";
    if ( $guessed_doctype ) {
	my $gd = "$doctype\n";
	$gd =~ s/&/&amp;/go; $gd =~ s/</&lt;/go;
	printf "%4d: %s", 0, $gd;
    }
    $line = 1;
    for (@file) {
	s/&/&amp;/go; s/</&lt;/go;
	printf "%4d: %s\n", $line, $_;
	$line++;
    }
    print "</pre>\n";
}

if ($q->param('sp')) {
  print <<'EOF';
  <DIV id="parse" class="mtb">
    <HR>
    <H2><A name="parse">Parse Tree</A></H2>
    <P class="note">
      Note that this feature is still under construction! I am trying to make
      this easier to read somehow, with little success.
    </P>
EOF
    if ($q->param('noatt')) {
      print <<'EOF';
    <P class="note">
      I am excluding the attributes, as you requested.
    </P>
EOF
    } else {
      print <<'EOF';
    <P class="note">
      It helps a bit if you select the "don't show attributes" option on the
      <A href="./#byURI">form</A>.
    </P>
EOF
    }

  my $indent   = 0;
  my $prevdata = '';

  print "<PRE>\n";
  foreach my $line (@esis) {
    if ($q->param('noatt')) {
      next if $line =~ /^A/;
      next if $line =~ /^\(A$/;
      next if $line =~ /^\)A$/;
    }

    #
    # Experimental: skip data if it's only newlines and space.
    $line =~ s/\\n/ /g;
    $line =~ s/\\011/ /g;
    $line =~ s/\s+/ /g;
    next if $line =~ /^-\s*$/;

    if ($line =~ /^-/) {
      substr($line, 0, 1) = ' ';
      $prevdata .= $line;
      next;
    } elsif ($prevdata) {
      $prevdata =~ s/&/&amp;/go;
      $prevdata =~ s/</&lt;/go;
      $prevdata =~ s/\s+/ /go;
      print wrap(' ' x $indent, ' ' x $indent, $prevdata), "\n";
      undef $prevdata;
    }
    # End of experimental stuff.
    #

    # This is interesting when uncommented
    # next unless /^AHREF CDATA /;
    $line =~ s/&/&amp;/go;
    $line =~ s/</&lt;/go;
    if ($line =~ /^\)/) {
      $indent -= 2;
    }

    my $printme;
    chomp($printme = $line);
    $printme =~ s/^\((.*)/&lt;<STRONG>$1<\/STRONG>&gt;/;
    $printme =~ s/^\)(.*)/&lt;\/<STRONG>$1<\/STRONG>&gt;/;
    print ' ' x $indent, $printme, "\n";
    if ($line =~ /^\(/) {
      $indent += 2;
    }
  }
  print "</pre>\n";
}

&clean_up_and_exit;

sub output_doctype_spiel {

    print <<"EOF";

    <p>
      You should make the first line of your HTML document a DOCTYPE
      declaration, like this:
    </p>

    <pre>
      &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN"&gt;
      &lt;HTML&gt;
	&lt;HEAD&gt;
	  &lt;TITLE&gt;Title&lt;/TITLE&gt;
	&lt;/HEAD&gt;

	&lt;BODY&gt;
	  &lt;-- ... body of document ... --&gt;
	&lt;/BODY&gt;
      &lt;/HTML&gt;</pre>

    <p>
      Or, if you are using features from <a
      href="http://www.w3.org/TR/REC-html40/">HTML 4.0</a>,
      one of these:
    </p>

    <pre>
      &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0//EN"&gt;
      &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"&gt;
      &lt;!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Frameset//EN"&gt;</pre>

EOF

}

sub output_closing {

    print <<"EOF";

<hr>

<address>
  <a href="http://validator.w3.org/check/referer"><img
     src="http://validator.w3.org/images/vh40" height=31 width=88
     align=right border=0 alt="Valid HTML 4.0!"></a>
  <a href="/feedback.html">Gerald Oskoboiny</A><br>
  $cvsdate
</address>

</body>

</html>
EOF

}

sub erase_stuff {

    unlink $temp           or warn "unlink($temp) returned: $!\n";
    unlink "$temp.esis"    or warn "unlink($temp.esis) returned: $!\n";
    unlink "$temp.weblint" or warn "unlink($temp.weblint) returned: $!\n";

}

sub make_log_entry {

    my $msgindex;

    open(LOG,">>$logfile") || die "couldn't append to log: $!";
    print LOG "$ENV{REMOTE_HOST}\t$validity $version\t", $q->param('uri'), "\n";
    foreach $msgindex (keys %undef_frag) {
	print LOG "frag not defined for msgindex: $msgindex\n";
    }
    close( LOG ) || die "couldn't close log: $!";

}

sub clean_up_and_exit {

    &output_closing;
    &erase_stuff;
#    &make_log_entry;
    exit;

}

sub redirect_to_home_page {

    print "Status: 302 Moved Permanently\n";
    print "Content-Type: text/html\n";
    print "Location: http://validator.w3.org/\n\n";
    print "<title>Moved!</title>\n";
    print "<p>\n";
    print "  Please see <a href=\"http://validator.w3.org/\">the validation service's home page.</a>\n";
    print "</p>\n";

    &clean_up_and_exit;

}

sub build_jump_links {

    my $text;
    my $count = 0;

    $count++ if $q->param('ss');
    $count++ if $q->param('sp');
    $count++ if $q->param('weblint');
    $count++ if $q->param('outline');

    if ( $count ) {
	$text .= "  <p>\n    Jump to: ";
	if ( $q->param('weblint') ) {
	    $text .= "<a\n      href=\"#weblint\">Weblint Results</a>";
	    $count--;
	    $text .= " or " if ( $count == 1 );
	    $text .= ", "   if ( $count >  1 );
	}
	if ( $q->param('outline') ) {
	    $text .= "<a\n      href=\"#outline\">Outline</a>";
	    $count--;
	    $text .= " or " if ( $count == 1 );
	    $text .= ", "   if ( $count >  1 );
	}
	if ( $q->param('ss') ) {
	    $text .= "<a\n      href=\"#source\">Source Listing</a>";
	    $count--;
	    $text .= " or " if ( $count == 1 );
	    $text .= ", "   if ( $count >  1 );
	}
	if ( $q->param('sp') ) {
	    $text .= "<a\n      href=\"#parse\">Parse Tree</a>";
	}
	$text .= ".\n  </p>\n\n";
    }
    return $text;

}

sub check_for_doctype {
    # check if the document has a doctype; if it doesn't, try to
    # guess an appropriate one given the elements used
    #
    # returns 2 values:
    #
    # first value: 0 or 1:
    #     if 0, there was a doctype already present;
    #     if 1, there wasn't a doctype
    #
    # second value:
    #     the inferred doctype, if any

    my $fileref = shift;		# a reference to @file, for efficiency
    my @file = @$fileref;		# dereference $fileref

    foreach $count (0..$#file) {
	$line = $file[$count];

	# does an HTML element precede the doctype on the same line?
	last if $line =~ /<[a-z].*<!doctype/i;

	if ( $line =~ /<!doctype/i ) {	# found a doctype
	    my $dttext = join( "", @file[$count..$count+5] );
	    $dttext =~ s/\n//g;
	    $dttext =~ s/.*doctype\s+html\s+public\s*"//i;
	    $dttext =~ s/".*//;	# strip everything except the FPI
	    return 0, $dttext;
	}

	$line =~ s/<!(?:--(?:[^-]|-[^-])*--\s*)+>//go;	# strip comments,
	    # so the next line doesn't find commented-out markup etc.
	    # (this doesn't handle multi-line comments, unfortunately)

	last if ( $line =~ /<[a-z]/i );		# found an element

    }

    # do several loops of increasing lengths to avoid iterating over
    # the whole file if possible.
    #
    # these heuristics could be improved a lot.

    foreach $line (@file[0..20]) {
	return 1, $xhtmlt_doctype if $line =~ /xmlns\s*=/i;
    }

    foreach $line (@file[0..20]) {
	return 1, $html40f_doctype if $line =~ /<frame/i;
    }

    foreach $line (@file[0..20]) {
	return 1, $html40t_doctype if $line =~ /<(table|body )/i;
    }

    # go through the whole file
    foreach $line (@file) {
	return 1, $html40t_doctype if $line =~ /<(table|body )/i;
    }

    foreach $line (@file) {
	return 1, $html32_doctype if $line =~ /<center>/i;
	return 1, $html32_doctype if $line =~ /<[h0-9p]*\s*align\s*=\s*center>/i;
    }

    # no luck earlier; guess HTML 4.0 transitional
    return 1, $html40t_doctype;

}

sub print_401_auth_required_message {

    my $resource = shift;
    my $realm = shift;
    my $authHeader = shift;

    print <<"EOF";
Status: 401 Authorization Required
WWW-Authenticate: $authHeader
Connection: close
Content-Type: text/html

<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">
<HTML><HEAD>
<TITLE>401 Authorization Required</TITLE>
</HEAD><BODY>
<H1>Authorization Required</H1>
<p>
  Sorry, I am not authorized to access the specified URI.
</p>

<p>
  The URI you specified,
</p>

<blockquote>
  <code><a href="$resource">$resource</a></code>
</blockquote>

<p>
  returned a 401 "authorization required" response when I tried
  to download it.
</p>

<p>
  You should have been prompted by your browser for a
  username/password pair; if you had supplied this information, I
  would have forwarded it to your server for authorization to
  access the resource. You can use your browser's "reload" function
  to try again, if you wish.
</p>

<p>
  Of course, you may not want to trust me with this information,
  which is fine. I can tell you that I don't log it or do
  anything else nasty with it, and you can <a
  href="http://validator.w3.org/source/">download the source for
  this service</a> to see what it does, but you have no guarantee
  that this is actually the code I'm using; you basically have to
  decide whether to trust me or not. :-)
</p>

<p>
  Note that you shouldn't use HTTP Basic Authentication for
  anything which really needs to be private, since the password
  goes across the network unencrypted.
</p>
EOF

}

sub print_unknown_http_error_message {

    my $uri = shift;
    my $code = shift;
    my $message = shift;

    print <<"EOF";
  <p>
    I got the following unexpected response when trying to
    retrieve <code><a href="$uri">$uri</a></code>:
  </p>

  <blockquote>
    <code>$code $message</code>
  </blockquote>

  <p>
    Please make sure you have entered the URI correctly.
  </p>

EOF

}

sub output_css_validator_blurb {

    my $uri = shift;

    print <<"EOHD";
  <p>
    If you use <a href="http://www.w3.org/Style/css/">CSS</a>
    in your document, you should also <a
    href="http://jigsaw.w3.org/css-validator/validator?uri=$uri">check
    it for validity</a> using W3C's <a
    href="http://jigsaw.w3.org/css-validator/">CSS
    Validation Service</a>.
  </p>

EOHD

}

