From: Daniel W. Connolly <connolly@beach.w3.org>
Date: Mon, 19 Feb 1996 00:02:26 -0500
To: MegaZone <megazone@livingston.com>
Cc: www-html@w3.org
In message <199602130336.TAA23804@server.livingston.com>, MegaZone writes:
>(I'd like to find a checker I can have recurse our site to
>check them all and just report errors. I know my own code has some silly
>things (When tired I sometimes close tags that don't require closing) but
>moreso, some pages are done by someone in marketing and I find errors in
>her HTML often enough for it to be a concern for me as Webmaster.
>
>The tools I've tried are one page at a time.

The HTML validation service is based on James Clark's SGML parser
(available via www.jclark.com). There are lots of web walkers. I'll
attach one below.

20 points to the folks that glue them together and get them to work for
MegaZone (as an alpha tester) and eventually for everybody. Heck: stick
a cheesy GUI on it, and from reading the trade rags, this would probably
sell like hotcakes at $49/copy ;-)

(courtesy of Guido van Rossum... requires Python 1.3. See www.python.org)

#! /usr/local/bin/python

"""A Web robot that checks out one or more trees and reports bad links.

Usage: robot [-v] [-x] [-b blacklist_url] ... [root_url] ...

Options:

-b blacklist_url -- don't descend below this URL; may be repeated
-v               -- verbose
-x               -- check external links as well (in last phase)

Arguments:

root_url -- The root(s) of the trees to be checked.
            Default: http://www.python.org/.
            If no -b options are given, a number of default ones are assumed.

Algorithm:

"""

__version__ = "1.2"

import urllib
import sys
import os
import urlparse
import htmllib
import string
import regex
import md5
import time
from types import *
from formatter import NullFormatter

# Status enum type
Status = []
for st in "UNKNOWN", "OK", "NOHTML", "MOVED", "BAD":
    Status.append(st)
    exec "%s = %s" % (st, `st`)
del st


class URLInfo:

    def __init__(self, url, manager, is_local):
        self.url = url
        self.manager = manager
        self.is_local = is_local
        self.status = UNKNOWN
        self.addinfo = None
        self.refs = []

    def add_ref(self, ref):
        if ref not in self.refs:
            self.refs.append(ref)

    def examine(self):
        self.manager.message("examine %s", self.url)
        f = self.urlopen()
        if f:
            try:
                info = f.info()
                text = f.read()
                self.process(info, text)
            finally:
                f.close()

    def urlopen(self):
        self.manager.message("urlopen %s", self.url)
        try:
            f = self.manager.urlopen(self)
        except IOError, msg:
            nurl = None
            err = None
            if type(msg) == TupleType and len(msg) == 4:
                err = msg[1]
                if err == 302:
                    m = msg[3]
                    if m.has_key('location'):
                        nurl = m['location']
                    elif m.has_key('uri'):
                        nurl = m['uri']
                elif err == 500:
                    # Come back later
                    print "500 error:", self.url
                    self.manager.add_url(self.url, self)
                    time.sleep(10)      # Oops!  back off a while
                    return
            if nurl:
                self.manager.message("relocated %s\n\t-> %s", self.url, nurl)
                self.status = MOVED
                self.addinfo = nurl
                self.manager.add_url(nurl, self)
                return
            if err is not None:
                msg = msg[:3]
            print "Error", self.url, msg
            self.status = BAD
            self.addinfo = msg
            return
        if not self.is_local:
            f.close()
            return
        return f

    def process(self, info, text):
        self.manager.message("process %s", self.url)
        if info and info.has_key('content-type'):
            ctype = string.lower(info['content-type'])
        else:
            ctype = "text/unknown"
        doit = ctype == 'text/html'
        if doit:
            self.status = OK
            p = htmllib.HTMLParser(NullFormatter())
            p.feed(text)
            p.close()
            for a in p.anchorlist:
                self.manager.add_url(a, self)
        else:
            self.status = NOHTML
            self.manager.message("skip %s for %s", ctype, self.url)


class Robot:

    infoclass = URLInfo

    def __init__(self, roots, blacklist = [], debug = 0):
        self.roots = roots
        self.debug = debug
        self.blacklist = map(self.normalize, blacklist)
        self.table = {}
        self.todo = {}
        self.todo_later = {}
        for root in self.roots:
            self.add_url(root)
        self.urlopener = urllib.URLopener()
        self.addheaders = [
            ("User-agent", "Python-robot/%s" % __version__),
            ("Accept", "text/html"),
            ]

    def run(self, maxlevels = 10):
        for i in range(maxlevels):
            print
            print "Level", i, "..."
            print
            if not self.todo:
                print "No new stuff"
                break
            self.explore()
            self.report()

    def run_externals(self):
        print
        print "Exploring non-local URLs"
        print
        self.debug = self.debug or 1
        self.todo = self.todo_later
        self.todo_later = {}
        self.explore()
        self.report()

    def explore(self):
        for it in self.todo.values():
            it.examine()
            del self.todo[it.url]

    def report(self):
        self.report_stats()
        self.report_bad()

    def report_stats(self):
        stats = {}
        for st in Status:
            stats[st] = 0
        for it in self.table.values():
            st = it.status
            stats[st] = stats[st] + 1
        print
        print "STATISTICS"
        print "=========="
        print
        print "OK       %3d" % stats[OK]
        print "No HTML  %3d" % stats[NOHTML]
        print "Moved    %3d" % stats[MOVED]
        print "Bad      %3d" % stats[BAD]
        print "Examined ---- +"
        print "         %3d" % \
              (stats[OK] + stats[NOHTML] + stats[MOVED] + stats[BAD])
        print
        print "To do local      %3d" % len(self.todo)
        print "Non-local        %3d" % (stats[UNKNOWN] - len(self.todo))
        print "                 ---- +"
        print "Not yet examined %3d" % stats[UNKNOWN]

    def report_bad(self):
        # Find and report bad URLs
        bad = {}
        moved = {}
        for it in self.table.values():
            if it.status is BAD:
                bad[it] = it.refs
            elif it.status is MOVED:
                moved[it.url] = it
        if moved:
            print
            print "MOVED REFERENCES"
            print "================"
            print
            movers = moved.keys()
            movers.sort()
            for m in movers:
                print m, "moved to", moved[m].addinfo
        if bad:
            print
            print "BAD REFERENCES"
            print "=============="
            print
            badrefs = invert(bad)
            bads = badrefs.keys()
            bads.sort()
            for b in bads:
                print b.url, "has bad refs to:"
                for br in badrefs[b]:
                    print "\t" + `br.url`

    def urlopen(self, it):
        self.urlopener.addheaders = self.addheaders[:]
        if it.refs:
            self.urlopener.addheader("Referer", it.refs[0].url)
        return self.urlopener.open(it.url)

    def add_url(self, url, ref = None):
        if ref:
            url = urlparse.urljoin(ref.url, url)
        url = self.normalize(url)
        if self.table.has_key(url):
            it = self.table[url]
        else:
            l = self.is_local(url)
            it = self.table[url] = self.infoclass(url, self, l)
            if l:
                t = self.todo
            else:
                t = self.todo_later
            t[it.url] = it
        if ref:
            it.add_ref(ref)

    def is_local(self, url):
        for root in self.roots:
            if url[:len(root)] == root:
                break
        else:
            return 0
        return not self.is_blacklisted(url)

    def is_blacklisted(self, url):
        for bl in self.blacklist:
            if url[:len(bl)] == bl:
                return 1
        return 0

    def normalize(self, url):
        """Normalize a URL.

        - convert the scheme and host to lower case (but not user/pass!)
        - drop the port if it is 80
        - drop the fragment

        XXX Could use dns to map host to its real name or IP address?

        """
        (scheme, netloc, path, params, query, fr) = \
                 urlparse.urlparse(url)
        i = string.find(netloc, '@') + 1
        usernpass, hostport = netloc[:i], netloc[i:]
        hostport = string.lower(hostport)
        i = string.find(hostport, ':')
        if i >= 0:
            host, port = hostport[:i], hostport[i:]
            try:
                p = string.atoi(port[1:])
                if p == 80:
                    port = ''
            except:
                pass
            hostport = host + port
            hostport = string.lower(hostport)
        netloc = usernpass + hostport
        scheme = string.lower(scheme)
        nurl = urlparse.urlunparse(
            (scheme, netloc, path, params, query, ''))
        if self.debug > 1 and nurl != url:
            self.message("normalize %s\n\t-> %s", url, nurl)
        return nurl

    def message(self, format, *args):
        if self.debug:
            print format % args


def invert(table):
    index = {}                          # empty dictionary
    for key in table.keys():
        for value in table[key]:
            if not index.has_key(value):
                index[value] = []       # empty list
            index[value].append(key)
    return index


def main():
    import getopt
    opts, args = getopt.getopt(sys.argv[1:], 'b:vx')
    bldefault = [
        'http://www.python.org/doc/ext/',
        'http://www.python.org/doc/lib/',
        'http://www.python.org/doc/ref/',
        'http://www.python.org/doc/tut/',
        ]
    argsdefault = [
        'http://www.python.org/'
        ]
    blacklist = []
    debug = 0
    externals = 0
    for o, a in opts:
        if o == '-b':
            if a == '-':
                bl_nodefault = 1
            else:
                blacklist.append(a)
        if o == '-v':
            debug = debug + 1
        if o == '-x':
            externals = 1
    if not args:
        args = argsdefault
    if not blacklist:
        blacklist = bldefault
    r = Robot(args, blacklist, debug)
    r.run()
    if externals:
        r.run_externals()


if __name__ == '__main__':
    main()
Received on Monday, 19 February 1996 00:02:30 UTC