Combo Walker/Validator [was: Conformance ratings]

Daniel W. Connolly (connolly@beach.w3.org)
Mon, 19 Feb 1996 00:02:26 -0500


Message-Id: <m0toNk3-0002UJC@beach.w3.org>
To: MegaZone <megazone@livingston.com>
Cc: www-html@w3.org
Subject: Combo Walker/Validator [was: Conformance ratings]
In-Reply-To: Your message of "Mon, 12 Feb 1996 19:36:27 PST."
             <199602130336.TAA23804@server.livingston.com> 
Date: Mon, 19 Feb 1996 00:02:26 -0500
From: "Daniel W. Connolly" <connolly@beach.w3.org>

In message <199602130336.TAA23804@server.livingston.com>, MegaZone writes:
> (I'd like to find a checker I can have recurse our site to
> check them all and just report errors.  I know my own code has some silly
> things (When tired I sometimes close tags that don't require closing) but
> moreso, some pages are done by someone in marketing and I find errors in
> her HTML often enough for it to be a concern for me as Webmaster.
>
> The tools I've tried are one page at a time.

The HTML validation service is based on James Clark's SGML parser
(available via www.jclark.com).

There are lots of web walkers. I'll attach one below.

20 points to the folks who glue them together and get them to work
for MegaZone (as an alpha tester) and eventually for everybody. Heck:
stick a cheesy GUI on it and, judging from the trade rags, it would
probably sell like hotcakes at $49/copy ;-)
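
Here's a rough, untested sketch of what that glue might look like:
subclass the robot below so every local HTML page is also run through
an SGML parser. The "sgmls -s" invocation is my assumption (as is a
catalog/DTD setup that lets sgmls find the HTML DTD); adjust for your
installation:

    import os, tempfile

    class ValidatingURLInfo(URLInfo):
        # Like URLInfo below, but also validate each HTML page.
        def process(self, info, text):
            URLInfo.process(self, info, text)
            if self.status == OK:
                self.validate(text)

        def validate(self, text):
            # Save the page to a temp file and run sgmls over it;
            # "-s" suppresses normal output, and the exit status is
            # nonzero if the parse turned up errors.
            fname = tempfile.mktemp()
            f = open(fname, 'w')
            f.write(text)
            f.close()
            sts = os.system("sgmls -s %s" % fname)
            os.unlink(fname)
            if sts:
                print "validation errors in", self.url

    class ValidatingRobot(Robot):
        infoclass = ValidatingURLInfo

The robot already indirects through its infoclass attribute, so
ValidatingRobot(roots, blacklist).run() is all it takes to get the
combo behavior; the cheesy GUI is left as an exercise.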

(courtesy of Guido van Rossum... requires Python 1.3. See www.python.org)


#! /usr/local/bin/python

"""A Web robot that checks out one or more trees and reports bad links.

Usage: robot [-v] [-x] [-b blacklist_url] ... [root_url] ...

Options:

-b blacklist_url -- don't descend below this URL; may be repeated.
                    Use "-b -" to suppress the default blacklist.
-v               -- verbose
-x               -- check external links as well (in last phase)

Arguments:

root_url -- The root(s) of the trees to be checked.
            Default: http://www.python.org/. 
            If no -b options are given, a number of default ones are assumed.

Algorithm:

Starting from the root URL(s), fetch each known local URL, parse any
HTML that comes back, and add the links found to the work list.  This
proceeds one "level" at a time (up to maxlevels, default 10), with
statistics and bad links reported after each level.  Non-local URLs
are saved up and only checked (with -x) after the local trees are
exhausted.

"""

__version__ = "1.2"

import urllib
import sys
import urlparse
import htmllib
import string
import time
from types import *
from formatter import NullFormatter


# Status enum type
Status = []
for st in "UNKNOWN", "OK", "NOHTML", "MOVED", "BAD":
    Status.append(st)
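    # exec binds each status name to a same-named string, e.g. OK = 'OK'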
    exec "%s = %s" % (st, `st`)
del st


class URLInfo:

    def __init__(self, url, manager, is_local):
	self.url = url
	self.manager = manager
	self.is_local = is_local
	self.status = UNKNOWN
	self.addinfo = None
	self.refs = []

    def add_ref(self, ref):
	if ref not in self.refs:
	    self.refs.append(ref)

    def examine(self):
	self.manager.message("examine %s", self.url)
	f = self.urlopen()
	if f:
	    try:
		info = f.info()
		text = f.read()
		self.process(info, text)
	    finally:
		f.close()

    def urlopen(self):
	self.manager.message("urlopen %s", self.url)
	try:
	    f = self.manager.urlopen(self)
	except IOError, msg:
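	    # urllib signals HTTP-level failures by raising IOError
	    # with the tuple ('http error', errcode, errmsg, headers).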
	    nurl = None
	    err = None
	    if type(msg) == TupleType and len(msg) == 4:
		err = msg[1]
	    if err == 302:
		m = msg[3]
		if m.has_key('location'):
		    nurl = m['location']
		elif m.has_key('uri'):
		    nurl = m['uri']
	    elif err == 500:
		# Come back later
		print "500 error:", self.url
		self.manager.add_url(self.url, self)
		time.sleep(10) # Oops! back off a while
		return
	    if nurl:
		self.manager.message("relocated %s\n\t-> %s",
				     self.url, nurl)
		self.status = MOVED
		self.addinfo = nurl
		self.manager.add_url(nurl, self)
		return
	    if err is not None:
		msg = msg[:3]
	    print "Error", self.url, msg
	    self.status = BAD
	    self.addinfo = msg
	    return
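	# Non-local URLs are fetched only to verify that they respond;
	# the body is not read or parsed.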
	if not self.is_local:
	    f.close()
	    return
	return f

    def process(self, info, text):
	self.manager.message("process %s", self.url)
	if info and info.has_key('content-type'):
	    ctype = string.lower(info['content-type'])
	else:
	    ctype = "text/unknown"
	doit = ctype == 'text/html'
	if doit:
	    self.status = OK
	    p = htmllib.HTMLParser(NullFormatter())
	    p.feed(text)
	    p.close()
	    for a in p.anchorlist:
		self.manager.add_url(a, self)
	else:
	    self.status = NOHTML
	    self.manager.message("skip %s for %s",
				 ctype, self.url)


class Robot:

    infoclass = URLInfo

    def __init__(self, roots, blacklist = [], debug = 0):
	self.roots = roots
	self.debug = debug
	self.blacklist = map(self.normalize, blacklist)
	self.table = {}
	self.todo = {}
	self.todo_later = {}
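	# self.todo holds local URLs for the level-by-level walk;
	# self.todo_later collects non-local URLs for the optional -x pass.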
	for root in self.roots:
	    self.add_url(root)
	self.urlopener = urllib.URLopener()
	self.addheaders = [
		("User-agent", "Python-robot/%s" % __version__),
		("Accept", "text/html"),
		]

    def run(self, maxlevels = 10):
	for i in range(maxlevels):
	    print
	    print "Level", i, "..."
	    print
	    if not self.todo:
		print "No new stuff"
		break
	    self.explore()
	    self.report()

    def run_externals(self):
	print
	print "Exploring non-local URLs"
	print
	self.debug = self.debug or 1
	self.todo = self.todo_later
	self.todo_later = {}
	self.explore()
	self.report()

    def explore(self):
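	# .values() gives a snapshot list, so deleting entries while
	# looping is safe; URLs added during examine() wait for the
	# next level.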
	for it in self.todo.values():
	    it.examine()
	    del self.todo[it.url]

    def report(self):
	self.report_stats()
	self.report_bad()

    def report_stats(self):
	stats = {}
	for st in Status: stats[st] = 0
	for it in self.table.values():
	    st = it.status
	    stats[st] = stats[st] + 1
	print
	print "STATISTICS"
	print "=========="
	print
	print "OK               %3d" % stats[OK]
	print "No HTML          %3d" % stats[NOHTML]
	print "Moved            %3d" % stats[MOVED]
	print "Bad              %3d" % stats[BAD]
	print "Examined        ---- +"
	print "                 %3d" % \
	      (stats[OK] + stats[NOHTML] + stats[MOVED] + stats[BAD])
	print
	print "To do local      %3d" % len(self.todo)
	print "Non-local        %3d" % (stats[UNKNOWN] - len(self.todo))
	print "                ---- +"
	print "Not yet examined %3d" % stats[UNKNOWN]

    def report_bad(self):
	# Find and report bad URLs
	bad = {}
	moved = {}
	for it in self.table.values():
	    if it.status is BAD:
		bad[it] = it.refs
	    elif it.status is MOVED:
		moved[it.url] = it
	if moved:
	    print
	    print "MOVED REFERENCES"
	    print "================"
	    print
	    movers = moved.keys()
	    movers.sort()
	    for m in movers:
		print m, "moved to", moved[m].addinfo
	if bad:
	    print
	    print "BAD REFERENCES"
	    print "=============="
	    print
	    badrefs = invert(bad)
	    bads = badrefs.keys()
	    bads.sort()
	    for b in bads:
		print b.url, "has bad refs to:"
		for br in badrefs[b]:
		    print "\t" + `br.url`

    def urlopen(self, it):
	self.urlopener.addheaders = self.addheaders[:]
	if it.refs:
	    self.urlopener.addheader("Referer", it.refs[0].url)
	return self.urlopener.open(it.url)

    def add_url(self, url, ref = None):
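	# Resolve relative URLs against the referring page, normalize,
	# and queue new ones as local (todo) or non-local (todo_later).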
	if ref:
	    url = urlparse.urljoin(ref.url, url)
	url = self.normalize(url)
	if self.table.has_key(url):
	    it = self.table[url]
	else:
	    l = self.is_local(url)
	    it = self.table[url] = self.infoclass(url, self, l)
	    if l:
		t = self.todo
	    else:
		t = self.todo_later
	    t[it.url] = it
	if ref:
	    it.add_ref(ref)

    def is_local(self, url):
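	# Local means: under one of the roots (the for/else falls
	# through when no root matches) and not blacklisted.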
	for root in self.roots:
	    if url[:len(root)] == root:
		break
	else:
	    return 0
	return not self.is_blacklisted(url)

    def is_blacklisted(self, url):
	for bl in self.blacklist:
	    if url[:len(bl)] == bl:
		return 1
	return 0

    def normalize(self, url):
	"""Normalize a URL.

	- convert the scheme and host to lower case (but not user/pass!)
	- drop the port if it is 80
	- drop the fragment

	XXX Could use dns to map host to its real name or IP address?

	"""

	(scheme, netloc, path, params, query, fr) = \
		 urlparse.urlparse(url)
	i = string.find(netloc, '@') + 1
	usernpass, hostport = netloc[:i], netloc[i:]
	hostport = string.lower(hostport)
	i = string.find(hostport, ':')
	if i >= 0:
	    host, port = hostport[:i], hostport[i:]
	    try:
		p = string.atoi(port[1:])
		if p == 80:
		    port = ''
            except ValueError:
		pass
	    hostport = host + port
	netloc = usernpass + hostport
	scheme = string.lower(scheme)
	nurl = urlparse.urlunparse(
		(scheme, netloc, path, params, query, ''))
	if self.debug > 1 and nurl != url:
	    self.message("normalize %s\n\t-> %s", url, nurl)
	return nurl

    def message(self, format, *args):
	if self.debug:
	    print format % args


def invert(table):
    index = {}                           # empty dictionary
    for key in table.keys():
	for value in table[key]:
	    if not index.has_key(value):
		index[value] = []    # empty list
	    index[value].append(key)
    return index


def main():
    import getopt
    opts, args = getopt.getopt(sys.argv[1:], 'b:vx')
    bldefault = [
	    'http://www.python.org/doc/ext/',
	    'http://www.python.org/doc/lib/',
	    'http://www.python.org/doc/ref/',
	    'http://www.python.org/doc/tut/',
	    ]
    argsdefault = [
	    'http://www.python.org/'
	    ]
    blacklist = []
    bl_nodefault = 0
    debug = 0
    externals = 0
    for o, a in opts:
	if o == '-b':
	    if a == '-':
		bl_nodefault = 1
	    else:
		blacklist.append(a)
	if o == '-v':
	    debug = debug + 1
	if o == '-x':
	    externals = 1
    if not args:
        args = argsdefault
        if not blacklist and not bl_nodefault:
            blacklist = bldefault
    r = Robot(args, blacklist, debug)
    r.run()
    if externals:
	r.run_externals()


if __name__ == '__main__':
    main()
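
To turn it loose on a site of your own (a hypothetical invocation,
assuming the script is saved as robot.py):

    python robot.py -v http://www.livingston.com/

The per-level reports flag the bad links; wiring in the validator as
sketched above is what earns the 20 points.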