2002/ws/desc/tools .cvsignore,NONE,1.1 http_auth.py,NONE,1.1 http_head.py,NONE,1.1 nschecker,NONE,1.1 from Hugo Haas on 2004-08-04 (public-ws-desc-eds@w3.org from August 2004)

From: Hugo Haas <hugo@dev.w3.org>
Date: Wed, 04 Aug 2004 09:41:09 +0000
To: public-ws-desc-eds@w3.org
Message-ID: <E1BsIGo-0003vb-DD@lionel-hutz.w3.org>
Update of /sources/public/2002/ws/desc/tools
In directory hutz:/tmp/cvs-serv14919

Added Files:
	.cvsignore http_auth.py http_head.py nschecker 
Log Message:
Local version of nschecker


--- NEW FILE: http_head.py ---
""" $Id: http_head.py,v 1.1 2004/08/04 09:41:07 hugo Exp $
    Module to make an HTTP HEAD request
""" 

from urllib import *

class HEADURLopener(FancyURLopener):
    """URL opener whose HTTP requests use the HEAD method.

    Only ``open_http`` is overridden; error dispatch (``http_error``) and
    the extra request headers in ``self.addheaders`` are inherited.
    """

    def open_http(self, url):
        """Send a HEAD request for *url* and return the response object.

        *url* is either a string that ``splithost`` can split into host and
        selector, or a ``(host, full_url)`` pair -- the latter is the proxy
        case, where *host* is contacted and *full_url* is what is asked for.

        Returns ``urllib.addinfourl`` for a 200 reply; any other status is
        handed to ``self.http_error`` and its result is returned.

        Raises ``IOError('http error', 'no host given')`` when no host can
        be determined.
        """
        import httplib
        # addinfourl is not exported by ``from urllib import *``.
        import urllib

        user_passwd = None
        if isinstance(url, str):
            host, selector = splithost(url)
            if host:
                user_passwd, host = splituser(host)
                host = unquote(host)
            realhost = host
        else:
            host, selector = url
            urltype, rest = splittype(selector)
            url = rest
            # ``string`` is not exported by ``from urllib import *`` either,
            # so use str methods; splittype yields None when there is no
            # scheme at all.
            if urltype is None or urltype.lower() != 'http':
                realhost = None
            else:
                realhost, rest = splithost(rest)
                if realhost:
                    user_passwd, realhost = splituser(realhost)
                if user_passwd:
                    # Rebuild the selector without the credentials.
                    selector = "%s://%s%s" % (urltype, realhost, rest)
        if not host:
            raise IOError('http error', 'no host given')
        if user_passwd:
            import base64
            auth = base64.encodestring(user_passwd).strip()
        else:
            auth = None

        h = httplib.HTTP(host)
        h.putrequest('HEAD', selector)
        if auth:
            h.putheader('Authorization', 'Basic %s' % auth)
        if realhost:
            h.putheader('Host', realhost)
        for args in self.addheaders:
            h.putheader(*args)
        h.endheaders()
        errcode, errmsg, headers = h.getreply()
        fp = h.getfile()
        if errcode == 200:
            return urllib.addinfourl(fp, headers, "http:" + url)
        return self.http_error(url, fp, errcode, errmsg, headers)

--- NEW FILE: .cvsignore ---
*.pyc

--- NEW FILE: nschecker ---
#!/usr/local/bin/python
""" $Id: nschecker,v 1.1 2004/08/04 09:41:07 hugo Exp $
"""

import cgi
import sys
import os
import urlparse
import urllib
import http_head
import http_auth
import re
import popen2

# Opening part of every HTML page: document head, W3C logo, heading and
# tool description.  Takes two %s values, in order: a suffix for the
# <title> text and a suffix for the <h1> heading (both may be empty).
Page1 = """
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
<head>
<link href="http://www.w3.org/StyleSheets/base" rel="stylesheet"/>
<link href="http://www.w3.org/2001/11/results" rel="stylesheet" />
<title>Namespaces checker service%s</title></head>
<body>

<p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C"/></a></p>

<h1>Namespaces checker%s</h1>
<h2>Description</h2>
<p>This tool takes the URI of an (X)HTML document as input and outputs the "visible" URIs in it and make a HTTP HEAD on them to check their validity. The intent is to help the <a href="/Guide/pubrules">pubrules</a> checking. It <strong>does not</strong> check the validity of the anchors.</p>
<p>You may want to use <a href="http://validator.w3.org/checklink">the linkchecker tool</a> to check all your links in your document.</p>
"""

# Closing part of every HTML page: the query form and the footer.  Takes
# one %s value, used as the pre-filled content of the "uri" text input
# (it lands inside a double-quoted attribute).
Page2 = """
<form method="GET">
<label>URI(s) of the document(s) you want to check URIs: <input type="text" name="uri" value="%s"/></label>
<input type="submit" value="Get results"/>
</form>

<hr />
<address>
script $Revision: 1.1 $ of $Date: 2004/08/04 09:41:07 $<br />
by <a href="http://www.w3.org/People/Dom/">Dominique Hazael-Massieux</a><br />
</address>
</body>
</html>
"""

class myHEADURLopener(http_head.HEADURLopener):
    """HEAD opener that records each non-200 status as report HTML.

    Every status routed through ``http_error`` is formatted with
    ``formatHeaders`` and appended to ``res``; the caller reads ``res``
    and resets it between requests.
    """

    # HTML fragments accumulated by http_error since the last reset.
    res = ""

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Return None for any status that has no dedicated handler."""
        return None

    def http_error(self, url, fp, errcode, errmsg, headers, data=None):
        """Log the status into ``res``, then let urllib dispatch it."""
        self.res += formatHeaders(errcode, errmsg, headers)
        return urllib.URLopener.http_error(
            self, url, fp, errcode, errmsg, headers, data)

    def retry_http_basic_auth(self, url, realm, data=None):
        """Never retry with credentials; always return None."""
        return None

def formatHeaders(errcode,errmsg,headers):
    """Render an HTTP status line as an HTML fragment for the report.

    errcode -- numeric HTTP status; selects the CSS class of the span:
               200 -> 'yes', 301/302 -> no class, 401 -> 'tocheck',
               anything else -> 'no'.
    errmsg  -- reason phrase sent by the remote server.  It is untrusted
               text, so markup characters are escaped before insertion.
    headers -- accepted for interface compatibility; not used.

    Returns a string of the form
    "-> <span class='...'>CODE</span> (<span class='errmsg'>MSG</span>) ".
    """
    if errcode == 200:
        classe = " class='yes'"
    elif errcode in (301, 302):
        classe = ""
    elif errcode == 401:
        classe = " class='tocheck'"
    else:
        classe = " class='no'"
    # '&' must be replaced first so the other entities are not re-escaped.
    safe_msg = errmsg.replace("&", "&amp;")
    safe_msg = safe_msg.replace("<", "&lt;").replace(">", "&gt;")
    # str() rather than repr(): repr of a long would append an 'L'.
    return ("-> <span" + classe + ">" + str(errcode)
            + "</span> (<span class='errmsg'>" + safe_msg + "</span>) ")

def _escape_html(text):
    """Return text with & < > and both quote characters escaped for HTML."""
    for char, entity in (("&", "&amp;"), ("<", "&lt;"), (">", "&gt;"),
                         ('"', "&quot;"), ("'", "&#39;")):
        text = text.replace(char, entity)
    return text


def _count_uris(stream):
    """Return {uri: occurrences} for the http:// URIs read from stream."""
    pattern = "(http://[^><\s\"'\&\)]*)[>|<|\s|\"|'|\&\)]"
    uriMatcher = re.compile(pattern)
    foundUris = {}
    line = stream.readline()
    while line:
        for uri in uriMatcher.findall(line):
            # A trailing full stop is taken to be sentence punctuation.
            if uri[-1] == ".":
                uri = uri[:-1]
            foundUris[uri] = foundUris.get(uri, 0) + 1
        line = stream.readline()
    return foundUris


def serveRequest():
    """Handle one CGI request and write the response to standard output.

    Without a ``uri`` field the empty form is served.  Otherwise the
    document at that address is fetched, dumped as text by lynx, scanned
    for http:// URIs, and each distinct URI is checked with a HEAD request
    (example.org/.net/.com URIs are only labelled, not requested).

    Addresses with no real scheme, and ``file:`` addresses, are refused
    with a 403 so the service cannot be pointed at local files.
    """
    fields = cgi.FieldStorage()

    if not fields.has_key('uri'):
        print("Content-Type: text/html; charset=utf-8")
        print("")
        print(Page1 % ("", ""))
        print(Page2 % (""))
        return

    # getfirst copes with a repeated "uri" parameter, where fields['uri']
    # would be a list without a .value attribute.
    addr = fields.getfirst('uri')
    scheme = urlparse.urlparse(addr)[0].lower()
    if len(scheme) < 2 or scheme == 'file':
        print("Status: 403")
        print("Content-Type: text/plain")
        print("")
        print("sorry, I decline to handle file: addresses")
        return

    # Fetch before any output: the opener's 401 handler prints CGI header
    # lines of its own, which must precede the end of the header block.
    url_opener = http_auth.ProxyAuthURLopener()
    try:
        doc = url_opener.open(addr)
    except IOError:
        # urllib raises IOError with a varying number of arguments, so the
        # exception is deliberately not unpacked.
        doc = None

    print("Content-Type: text/html; charset=utf-8")
    print("")
    # addr is user input and is echoed into markup and attribute values.
    safe_addr = _escape_html(addr)
    title = " for %s" % (safe_addr)
    link = " for <a href='%s'>%s</a>" % (safe_addr, safe_addr)
    print(Page1 % (title, link))

    if doc:
        # Start lynx only once there is a document to feed it.
        command = "/usr/bin/lynx -nolist -dump -stdin"
        (piperfd, pipewfd, pipeErr) = popen2.popen3(command)
        pipewfd.write(doc.read())
        doc.close()
        pipewfd.close()
        foundUris = _count_uris(piperfd)
        piperfd.close()
        pipeErr.close()

        print('<p>URIs found :</p>\n<dl>\n')
        uris = foundUris.keys()
        uris.sort()
        head_request = myHEADURLopener()
        example_org = "(http://([^.]*\.)*example\.(net|org|com)($|/))"
        example_org_matcher = re.compile(example_org)
        for uri in uris:
            plural = ""
            if foundUris[uri] > 1:
                plural = "s"
            # The URI pattern excludes < > & and quotes, so uri is safe to
            # insert as is.
            print("<dt><a href='" + uri + "'>" + uri
                  + "</a> (%s occurence%s)</dt>\n" % (foundUris[uri], plural))
            if example_org_matcher.search(uri):
                print("<dd>Example URI</dd>")
            else:
                print("<dd>")
                try:
                    head = head_request.open(uri)
                except IOError:
                    # One unreachable host must not abort the whole report.
                    head = None
                    head_request.res = (head_request.res
                                        + "-> <span class='no'>no response</span> ")
                print(head_request.res)
                if head:
                    print(formatHeaders(200, "OK", ""))
                    head.close()
                head_request.res = ""
                if uri.startswith("http://www.w3.org/"):
                    print(""" www.w3.org URI, if a namespace, make sure it is <a href="http://www.w3.org/1999/10/nsuri">compliant with the NS rules</a>""")
                else:
                    print("""; this URI is not on www.w3.org, <span class="tocheck">make sure it's not a namespace</span>""")
                print("</dd>\n")
        print("</dl>\n")
    else:
        print("<p><span class='no'>An error</span> (%s) occured trying to get <a href='%s'>%s</a>.</p>" % (_escape_html(url_opener.error), safe_addr, safe_addr))
    print(Page2 % (safe_addr))

if __name__ == '__main__':
    # Serve a request only when the SCRIPT_NAME environment variable is
    # present; running the file without it does nothing.
    if 'SCRIPT_NAME' in os.environ:
        serveRequest()

--- NEW FILE: http_auth.py ---
import urllib
import os

class ProxyAuthURLopener(urllib.FancyURLopener):
    """URL opener that relays HTTP Basic credentials from the CGI request.

    When the remote server asks for Basic authentication, the value of the
    HTTP_AUTHORIZATION environment variable is forwarded once; if there is
    none, a 401 challenge is printed on standard output instead.
    """

    # "<status code> <reason>" of the last error seen by http_error_default.
    error = ""

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        """Remember the status in ``error`` and return None."""
        self.error = repr(errcode) + " " + errmsg
        return None

    def http_error_304(self,uri,fp,errocode,errmsg,headers):
        """Print a 304 status line and return None."""
        print('HTTP/1.1 304 Not Modified')
        return None

    def retry_http_basic_auth(self, url, realm, data=None):
        """Retry *url* with the caller's credentials, or challenge the caller.

        With a non-empty HTTP_AUTHORIZATION in the environment: add it as
        an Authorization header, delete the variable so it is not found on
        a later call, and return the result of re-opening 'http:' + url
        (passing *data* along when given).

        Otherwise: print 401 challenge headers for *realm*, store an
        explanatory page in the module-level ``Page`` and return None.
        """
        global Page
        credentials = os.environ.get('HTTP_AUTHORIZATION')
        if credentials:
            self.addheader('Authorization', credentials)
            del os.environ['HTTP_AUTHORIZATION']
            target = 'http:' + url
            if data is None:
                return self.open(target)
            return self.open(target, data)

        print('Status: 401 Authorization Required')
        print('WWW-Authenticate: Basic realm="%s"' % realm)
        print('Connection: close')
        Page = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
<head>
<title>401 Authorization Required</title>
</head>
<body>
<h1>Authorization Required</h1>
<p>You need %s access to http:%s to use this service.</p>
""" % (realm,url)
        return None
Received on Wednesday, 4 August 2004 05:41:29 UTC