- From: Daniel W. Connolly <connolly@beach.w3.org>
- Date: Mon, 19 Feb 1996 00:02:26 -0500
- To: MegaZone <megazone@livingston.com>
- Cc: www-html@w3.org
In message <199602130336.TAA23804@server.livingston.com>, MegaZone writes:
> (I'd like to find a checker I can have recurse through our site to
>check them all and just report errors.) I know my own code has some silly
>things (when tired I sometimes close tags that don't require closing) but
>more so, some pages are done by someone in marketing and I find errors in
>her HTML often enough for it to be a concern for me as Webmaster.
>
>The tools I've tried are one page at a time.
The HTML validation service is based on James Clark's SGML parser
(available via www.jclark.com).
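If you just want that same SGML-level check on a single file locally,
something along these lines ought to do it (a rough sketch only; it assumes
nsgmls from James Clark's SP -- or sgmls -- is on your path and can find an
HTML DTD through its catalog, so adjust the command to your installation):

import os

def check_html(filename):
    # Run the parser in "validate only" mode (-s suppresses normal output)
    # and collect whatever it prints on stderr; an empty string means the
    # parse was clean.
    pipe = os.popen("nsgmls -s %s 2>&1" % filename)
    errors = pipe.read()
    pipe.close()
    return errors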
There are lots of web walkers. I'll attach one below.
20 points to the folks that glue them together and get them to work
for MegaZone (as an alpha tester) and eventually for everybody. Heck:
stick a cheesy GUI on it, and from reading the trade rags, this would
probably sell like hotcakes at $49/copy ;-)
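To make the glue a little more concrete, here's a rough, untested sketch
(the class names are made up); it assumes the attached script is saved as
robot.py and that nsgmls is set up as above. It subclasses the robot so
every local HTML page it fetches also gets run through the parser:

import os
import robot                    # the attached script, saved as robot.py

class ValidatingURLInfo(robot.URLInfo):
    def process(self, info, text):
        # Let the robot extract links as usual.
        robot.URLInfo.process(self, info, text)
        # Then parse local HTML pages and report any SGML errors.
        if self.is_local and self.status == robot.OK:
            # Dump the page to a scratch file so nsgmls can read it.
            tmpfile = "/tmp/robot-check.html"
            f = open(tmpfile, "w")
            f.write(text)
            f.close()
            errors = os.popen("nsgmls -s %s 2>&1" % tmpfile).read()
            if errors:
                print self.url, "has SGML errors:"
                print errors

class ValidatingRobot(robot.Robot):
    infoclass = ValidatingURLInfo

Kick it off with something like
ValidatingRobot(['http://www.livingston.com/']).run() and each local page
gets link-checked and parsed in one pass.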
(courtesy of Guido van Rossum... requires Python 1.3. See www.python.org)
#! /usr/local/bin/python
"""A Web robot that checks out one or more trees and reports bad links.
Usage: robot [-v] [-x] [-b blacklist_url] ... [root_url] ...
Options:
-b blacklist_url -- don't descend below this URL; may be repeated
-v -- verbose
-x -- check external links as well (in last phase)
Arguments:
root_url -- The root(s) of the trees to be checked.
Default: http://www.python.org/.
If no -b options are given, a number of default ones are assumed.
Algorithm:
"""
__version__ = "1.2"
import urllib
import sys
import os
import urlparse
import htmllib
import string
import regex
import md5
import time
from types import *
from formatter import NullFormatter
# Status enum type: creates module-level string constants
# UNKNOWN, OK, NOHTML, MOVED and BAD, and collects them in Status
Status = []
for st in "UNKNOWN", "OK", "NOHTML", "MOVED", "BAD":
    Status.append(st)
    exec "%s = %s" % (st, `st`)
del st

class URLInfo:

    def __init__(self, url, manager, is_local):
        self.url = url
        self.manager = manager
        self.is_local = is_local
        self.status = UNKNOWN
        self.addinfo = None
        self.refs = []                  # URLInfo objects that link here

    def add_ref(self, ref):
        if ref not in self.refs:
            self.refs.append(ref)

    def examine(self):
        self.manager.message("examine %s", self.url)
        f = self.urlopen()
        if f:
            try:
                info = f.info()
                text = f.read()
                self.process(info, text)
            finally:
                f.close()

    def urlopen(self):
        self.manager.message("urlopen %s", self.url)
        try:
            f = self.manager.urlopen(self)
        except IOError, msg:
            # urllib reports HTTP errors as an IOError whose value is a
            # 4-tuple (message, errcode, errmsg, headers)
            nurl = None
            err = None
            if type(msg) == TupleType and len(msg) == 4:
                err = msg[1]
                if err == 302:
                    m = msg[3]
                    if m.has_key('location'):
                        nurl = m['location']
                    elif m.has_key('uri'):
                        nurl = m['uri']
                elif err == 500:
                    # Come back later
                    print "500 error:", self.url
                    self.manager.add_url(self.url, self)
                    time.sleep(10)      # Oops! back off a while
                    return
            if nurl:
                self.manager.message("relocated %s\n\t-> %s",
                                     self.url, nurl)
                self.status = MOVED
                self.addinfo = nurl
                self.manager.add_url(nurl, self)
                return
            if err is not None:
                msg = msg[:3]
            print "Error", self.url, msg
            self.status = BAD
            self.addinfo = msg
            return
        if not self.is_local:
            f.close()
            return
        return f

    def process(self, info, text):
        self.manager.message("process %s", self.url)
        if info and info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            ctype = "text/unknown"
        doit = ctype == 'text/html'
        if doit:
            self.status = OK
            p = htmllib.HTMLParser(NullFormatter())
            p.feed(text)
            p.close()
            for a in p.anchorlist:
                self.manager.add_url(a, self)
        else:
            self.status = NOHTML
            self.manager.message("skip %s for %s",
                                 ctype, self.url)

class Robot:

    infoclass = URLInfo

    def __init__(self, roots, blacklist = [], debug = 0):
        self.roots = roots
        self.debug = debug
        self.blacklist = map(self.normalize, blacklist)
        self.table = {}                 # url -> URLInfo for every URL seen
        self.todo = {}                  # local URLs still to be examined
        self.todo_later = {}            # non-local URLs, examined only with -x
        for root in self.roots:
            self.add_url(root)
        self.urlopener = urllib.URLopener()
        self.addheaders = [
            ("User-agent", "Python-robot/%s" % __version__),
            ("Accept", "text/html"),
        ]

    def run(self, maxlevels = 10):
        for i in range(maxlevels):
            print
            print "Level", i, "..."
            print
            if not self.todo:
                print "No new stuff"
                break
            self.explore()
            self.report()

    def run_externals(self):
        print
        print "Exploring non-local URLs"
        print
        self.debug = self.debug or 1
        self.todo = self.todo_later
        self.todo_later = {}
        self.explore()
        self.report()

    def explore(self):
        for it in self.todo.values():
            it.examine()
            del self.todo[it.url]

    def report(self):
        self.report_stats()
        self.report_bad()

    def report_stats(self):
        stats = {}
        for st in Status: stats[st] = 0
        for it in self.table.values():
            st = it.status
            stats[st] = stats[st] + 1
        print
        print "STATISTICS"
        print "=========="
        print
        print "OK               %3d" % stats[OK]
        print "No HTML          %3d" % stats[NOHTML]
        print "Moved            %3d" % stats[MOVED]
        print "Bad              %3d" % stats[BAD]
        print "Examined         ---- +"
        print "                 %3d" % \
              (stats[OK] + stats[NOHTML] + stats[MOVED] + stats[BAD])
        print
        print "To do local      %3d" % len(self.todo)
        print "Non-local        %3d" % (stats[UNKNOWN] - len(self.todo))
        print "                 ---- +"
        print "Not yet examined %3d" % stats[UNKNOWN]

    def report_bad(self):
        # Find and report bad URLs
        bad = {}
        moved = {}
        for it in self.table.values():
            if it.status is BAD:
                bad[it] = it.refs
            elif it.status is MOVED:
                moved[it.url] = it
        if moved:
            print
            print "MOVED REFERENCES"
            print "================"
            print
            movers = moved.keys()
            movers.sort()
            for m in movers:
                print m, "moved to", moved[m].addinfo
        if bad:
            print
            print "BAD REFERENCES"
            print "=============="
            print
            badrefs = invert(bad)
            bads = badrefs.keys()
            bads.sort()
            for b in bads:
                print b.url, "has bad refs to:"
                for br in badrefs[b]:
                    print "\t" + `br.url`

    def urlopen(self, it):
        self.urlopener.addheaders = self.addheaders[:]
        if it.refs:
            self.urlopener.addheader("Referer", it.refs[0].url)
        return self.urlopener.open(it.url)

    def add_url(self, url, ref = None):
        if ref:
            url = urlparse.urljoin(ref.url, url)
        url = self.normalize(url)
        if self.table.has_key(url):
            it = self.table[url]
        else:
            l = self.is_local(url)
            it = self.table[url] = self.infoclass(url, self, l)
            if l:
                t = self.todo
            else:
                t = self.todo_later
            t[it.url] = it
        if ref:
            it.add_ref(ref)

    def is_local(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                break
        else:
            return 0
        return not self.is_blacklisted(url)

    def is_blacklisted(self, url):
        for bl in self.blacklist:
            if url[:len(bl)] == bl:
                return 1
        return 0

    def normalize(self, url):
        """Normalize a URL.

        - convert the scheme and host to lower case (but not user/pass!)
        - drop the port if it is 80
        - drop the fragment
        XXX Could use dns to map host to its real name or IP address?
        """
        (scheme, netloc, path, params, query, fr) = \
            urlparse.urlparse(url)
        i = string.find(netloc, '@') + 1
        usernpass, hostport = netloc[:i], netloc[i:]
        hostport = string.lower(hostport)
        i = string.find(hostport, ':')
        if i >= 0:
            host, port = hostport[:i], hostport[i:]
            try:
                p = string.atoi(port[1:])
                if p == 80:
                    port = ''
            except:
                pass
            hostport = host + port
        hostport = string.lower(hostport)
        netloc = usernpass + hostport
        scheme = string.lower(scheme)
        nurl = urlparse.urlunparse(
            (scheme, netloc, path, params, query, ''))
        if self.debug > 1 and nurl != url:
            self.message("normalize %s\n\t-> %s", url, nurl)
        return nurl

    def message(self, format, *args):
        if self.debug:
            print format % args

def invert(table):
    index = {}                          # empty dictionary
    for key in table.keys():
        for value in table[key]:
            if not index.has_key(value):
                index[value] = []       # empty list
            index[value].append(key)
    return index

def main():
    import getopt
    opts, args = getopt.getopt(sys.argv[1:], 'b:vx')
    bldefault = [
        'http://www.python.org/doc/ext/',
        'http://www.python.org/doc/lib/',
        'http://www.python.org/doc/ref/',
        'http://www.python.org/doc/tut/',
    ]
    argsdefault = [
        'http://www.python.org/'
    ]
    blacklist = []
    bl_nodefault = 0
    debug = 0
    externals = 0
    for o, a in opts:
        if o == '-b':
            if a == '-':
                # "-b -" suppresses the default blacklist
                bl_nodefault = 1
            else:
                blacklist.append(a)
        if o == '-v':
            debug = debug + 1
        if o == '-x':
            externals = 1
    if not args:
        args = argsdefault
    if not blacklist and not bl_nodefault:
        blacklist = bldefault
    r = Robot(args, blacklist, debug)
    r.run()
    if externals:
        r.run_externals()


if __name__ == '__main__':
    main()
Received on Monday, 19 February 1996 00:02:30 UTC