soap rdfq hack for chump from Dan Brickley on 2001-08-19 (www-archive@w3.org from August 2001)

From: Dan Brickley <danbri@w3.org>
Date: Sun, 19 Aug 2001 13:21:25 -0400 (EDT)
To: <www-archive@w3.org>
Message-ID: <Pine.LNX.4.30.0108191320550.7957-100000@tux.w3.org>
# Copyright (c) 2001 by Matt Biddulph and Edd Dumbill, Useful Information Company
# All rights reserved.
#
# License is granted to use or modify this software ("Daily Chump") for
# commercial or non-commercial use provided the copyright of the author is
# preserved in any distributed or derivative work.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESSED OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

# daily chump v 1.0

# $Id: dailychump.py,v 1.7 2001/07/12 21:19:33 edmundd Exp $

import string
import re
import time
import tempfile
import shutil
import os
import os.path
from EntityEncoder import EntityEncoder
from xmllib import XMLParser, procclose, illegal, tagfind

class TimeFormatter:
    def format_time(self,formattime):
        return time.strftime("%Y-%m-%d %H:%M",time.gmtime(formattime))

class StyleSheetAwareXMLParser(XMLParser):
    """ This class is needed to override a bug in Py 1.5.2's xmllib
    which meant it would reject an xml-stylesheet PI """
    def parse_proc(self, i):
        rawdata = self.rawdata
        end = procclose.search(rawdata, i)
        if end is None:
            return -1
        j = end.start(0)
        if illegal.search(rawdata, i+2, j):
            self.syntax_error('illegal character in processing instruction')
        res = tagfind.match(rawdata, i+2)
        if res is None:
            raise RuntimeError, 'unexpected call to parse_proc'
        k = res.end(0)
        name = res.group(0)
        if name == 'xml:namespace':
            self.syntax_error('old-fashioned namespace declaration')
            self.__use_namespaces = -1
            # namespace declaration
            # this must come after the <?xml?> declaration (if any)
            # and before the <!DOCTYPE> (if any).
            if self.__seen_doctype or self.__seen_starttag:
                self.syntax_error('xml:namespace declaration too late in document')
            attrdict, namespace, k = self.parse_attributes(name, k, j)
            if namespace:
                self.syntax_error('namespace declaration inside namespace declaration')
            for attrname in attrdict.keys():
                if not self.__xml_namespace_attributes.has_key(attrname):
                    self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
            if not attrdict.has_key('ns') or not attrdict.has_key('prefix'):
                self.syntax_error('xml:namespace without required attributes')
            prefix = attrdict.get('prefix')
            if ncname.match(prefix) is None:
                self.syntax_error('xml:namespace illegal prefix value')
                return end.end(0)
            if self.__namespaces.has_key(prefix):
                self.syntax_error('xml:namespace prefix not unique')
            self.__namespaces[prefix] = attrdict['ns']
        else:
            if string.find(string.lower(name), 'xml ') >= 0:
                self.syntax_error('illegal processing instruction target name')
            self.handle_proc(name, rawdata[k:j])
        return end.end(0)

class Churn:
    def __init__(self,directory):
        self.database = {}
        self.directory = directory
        self.labelcount = 0
        self.set_update_time(time.time())
        self.topic=""
        self.stylesheet=""
        self.sheettype="text/css"

    def set_update_time(self,time):
        self.updatetime = time

    def get_topic(self):
        return self.topic

    def set_topic(self,topic):
        self.topic = topic
        self.save()

    def get_stylesheet(self):
        return self.stylesheet

    def get_stylesheettype(self):
        return self.sheettype

    def set_stylesheet(self,sheet):
        self.stylesheet=sheet
        if sheet[-3:] == "xsl":
            self.sheettype="text/xsl"
        else:
            self.sheettype="text/css"

    def delete_item(self, label):
        if self.database.has_key(label):
            del self.database[label]
            self.save()

    def view_item(self, label):
        entry = self.get_entry(label)
        if entry != None:
            if entry.title == '':
                return label + ": " + entry.item
            else:
                return label + ": " + entry.title + " (" + entry.item + ")"
        else:
            return 'Label '+label+' not found.'

    def view_recent_items(self, count=5):
        labels = self._timesorted_labels()
        labels = labels[0:count]
        labels.reverse()
        message = ''
        for l in labels:
            message = message + self.view_item(l) + "\n"
        return message

    def add_item(self, item, nick):
        entry = ChurnEntry(item,nick)
        label = self.get_next_label()
        self.set_entry(label, entry)
        self.save()
        return label

    def _filename(self):
        return self.directory + "/index.xml"

    def _timesorted_labels(self):
        labels = self.database.keys()
        times = []
        for l in labels:
            times.append(self.get_entry(l).time)

        # sort list of labels by the respective time entry
        # from the times list
        pairs = map(None,times,labels)
        pairs.sort()
        result = pairs[:]
        for i in xrange(len(result)):
            result[i] = result[i][1]
        result.reverse()
        return result

    def set_time_item(self,label,time):
        entry = self.get_entry(label)
        if entry != None:
            entry.set_time(time)
            self.save()

    def set_entry(self,label,entry):
        self.database[label] = entry

    def get_entry_count(self):
        return len(self.database.keys())

    def get_entry(self,label):
        if self.database.has_key(label):
            return self.database[label]
        else:
            return None

    def title_item(self,label,title):
        entry = self.get_entry(label)
        if entry != None:
            entry.set_title(title)
            self.save()
            return "titled item "+label
        else:
            return 'Label '+label+' not found.'

    def get_comments(self,label):
        entry = self.get_entry(label)
        if entry != None:
            if entry.title == '':
                return entry.item + "\n" + entry.get_comments()
            else:
                return entry.item + "\n" + entry.title + "\n" + entry.get_comments()
        else:
            return 'Label '+label+' not found.'

    def comment_item(self,label,comment,nick):
        entry = self.get_entry(label)
        if entry != None:
            entry.add_comment(comment,nick)
            self.save()
            return "commented item "+label
        else:
            return 'Label '+label+' not found.'

    def save(self):
        self.update()
        name = tempfile.mktemp()
        out_file = open(name,"w")
        out_file.write(self.serialize())
        out_file.write("\n")
        out_file.close()
        if os.path.isfile(self._filename()):
            os.remove(self._filename())
        shutil.copy(name,self._filename())
        os.unlink(name)

    def deserialize(self,data):
        c = ChurnParser()
        c.set_churn(self)
        c.feed(data)
        c.close()

    def serialize(self):
        encoder = EntityEncoder()
        serialized='<!DOCTYPE churn>\n'
        if self.get_stylesheet()!="":
            serialized = serialized + '<?xml-stylesheet href="'+\
                         encoder.encode_chars(self.get_stylesheet())+\
                         '" type="'+\
                         self.get_stylesheettype()+'"?>'+"\n"
        serialized = serialized + "<churn>\n"

        serialized = serialized + '<last-updated value="'
        serialized = serialized + "%f" % self.updatetime
        serialized = serialized + '">'
        serialized = serialized + encoder.encode_chars(TimeFormatter().format_time(self.updatetime))+"</last-updated>\n"

        serialized = serialized + '<itemcount value="'
        serialized = serialized + "%d" % self.get_entry_count()
        serialized = serialized + '" />\n'

        serialized = serialized + "<topic>"+encoder.encode_chars(self.topic)+"</topic>\n"

        for x in self._timesorted_labels():
            entry = self.get_entry(x)
            serialized = serialized + entry.serialize()
        serialized = serialized + "</churn>"
        return serialized

    def get_next_label(self):
        label = self.number_to_label(self.labelcount)
        self.labelcount = self.labelcount + 1
        return label

    def number_to_label(self,number):
        if number < 26:
            return chr(number + 65)

        if number == 26:
            return 'AA'

        count = number - 26
        label = ''
        while count > 0:
            label = chr((count % 26) + 65) + label
            count = count / 26

        if number < 52:
            return 'A' + label
        else:
            return label

    def update(self):
        self.set_update_time(time.time())

class ChurnEntry:
    def __init__(self,item,nick):
        self.item = item
        self.nick = nick
        self.comments = []
        self.set_time(time.time())
        self.title = ''

    def serialize(self):
        encoder = EntityEncoder()
        serialized = ''
        serialized = serialized + "<link"
        if self.item == 'blurb':
           serialized = serialized + ' type="blurb"'
        serialized = serialized + ">\n"
        serialized = serialized + '<time value="'
        serialized = serialized + "%f" % self.time
        serialized = serialized + '">'
        serialized = serialized + encoder.encode_chars(TimeFormatter().format_time(self.time))
        serialized = serialized + "</time>\n"
        if not self.item == 'blurb':
           serialized = serialized + "<url>"+encoder.encode_chars(self.item)+"</url>\n"
        serialized = serialized + "<nick>"+encoder.encode_chars(self.nick)+"</nick>\n"
        if self.title != '':
            serialized = serialized + "<title>"+encoder.encode_chars(self.title)+"</title>\n"
        for c in self.comments:
            nick = c[0]
            comment = c[1]
            serialized = serialized + self.serialize_comment(nick,comment,encoder)
        serialized = serialized + "</link>\n"
        return serialized

    def serialize_comment(self,nick,comment,encoder):
        comment_html = encoder.encode_chars(comment)

        italic_search = re.compile('\*([^*]+)\*')
        while italic_search.search(comment_html) != None:
            match = italic_search.search(comment_html)
            comment_html = comment_html[0:match.start(1) - 1] + '<i>' + comment_html[match.start(1):match.end(1)] + '</i>' + comment_html[match.end(1) + 1:]

        img_search = re.compile('\+\[(http[^|\]]+)\]')
        while img_search.search(comment_html) != None:
            match = img_search.search(comment_html)
            comment_html = comment_html[0:match.start(1) - 2] + '<img src="' + match.group(1) + '" />' + comment_html[match.end(1) + 1:]

        titled_img_search = re.compile('\+\[([^|]+)\|([^\]]+)\]')
        while titled_img_search.search(comment_html) != None:
            match = titled_img_search.search(comment_html)
            if string.find(match.group(1),"http") == 0: # begins with http
                url_index = 1
                title_index = 2
            else:
                url_index = 2
                title_index = 1

            comment_html = comment_html[0:match.start(1) - 2] + '<img src="' + match.group(url_index) + '" alt="' + match.group(title_index) + '" />' + comment_html[match.end(2) + 1:]

        url_search = re.compile('\[(http[^|\]]+)\]')
        while url_search.search(comment_html) != None:
            match = url_search.search(comment_html)
            comment_html = comment_html[0:match.start(1) - 1] + '<a href="' + match.group(1) + '">' + match.group(1) + '</a>' + comment_html[match.end(1) + 1:]

        titled_url_search = re.compile('\[([^|]+)\|([^\]]+)\]')
        while titled_url_search.search(comment_html) != None:
            match = titled_url_search.search(comment_html)
            if string.find(match.group(1),"http") == 0: # begins with http
                url_index = 1
                title_index = 2
            else:
                url_index = 2
                title_index = 1

            comment_html = comment_html[0:match.start(1) - 1] + '<a href="' + match.group(url_index) + '">' + match.group(title_index) + '</a>' + comment_html[match.end(2) + 1:]

        serialized = ''
        serialized = serialized + '<comment nick="' + nick+ '">'
        serialized = serialized + comment_html
        serialized = serialized + "</comment>\n"
        return serialized

    def set_time(self,time):
        self.time = time

    def add_comment(self,comment,nick):
        self.comments.append([nick,comment])

    def set_title(self,title):
        self.title = title

    def get_comments(self):
        comments = ''
        for c in self.comments:
            nick = c[0]
            comment = c[1]
            comments = comments + '(' + nick+ ') '
            comments = comments + comment
            comments = comments + "\n"
        return comments

class ChurnParser(StyleSheetAwareXMLParser):
    def __init__(self):
        XMLParser.__init__(self)
        self._data = ''
        self._in_a = 0
        self._a_title = ''
        self._a_href = ''
        self._entries = []
        self._current_entry = {}

    def set_churn(self,churn):
        self.churn = churn

    def get_churn(self):
        return self.churn

    def start_link(self,attrs):
        self._current_entry['title'] = ''
        self._current_entry['time'] = 0
        self._current_entry['nick'] = ''
        self._current_entry['item'] = ''
        self._current_entry['comments'] = []
        if attrs.has_key('type'):
            type = attrs['type']
            if type == 'blurb':
                self._current_entry['blurb'] = 1
        else:
            self._current_entry['blurb'] = 0

    def start_a(self,attrs):
        if self._in_comment == 1:
            if(attrs.has_key('href')):
                self._a_href = attrs['href']
            else:
                self._a_href = ''
            self._a_title = ''
            self._in_a = 1

    def start_img(self,attrs):
        if self._in_comment == 1:

            if(attrs.has_key('src')):
                self._img_src = attrs['src']
            else:
                self._img_src = ''

            if(attrs.has_key('alt')):
                self._img_title = attrs['alt']
            else:
                self._img_title = ''

    def start_i(self,attrs):
        self._data = self._data + '*'

    def end_i(self):
        self._data = self._data + '*'

    def end_img(self):
        if self._img_title != '' and self._img_src != '' and self._img_src != self._img_title:
            self._data = self._data + '+[' + self._img_title + '|' + self._img_src + ']'
        elif self._img_src != '':
            self._data = self._data + '+[' + self._img_src + ']'

    def end_a(self):
        if self._a_title != '' and self._a_href != '' and self._a_href != self._a_title:
            self._data = self._data + '[' + self._a_title + '|' + self._a_href + ']'
        elif self._a_href != '':
            self._data = self._data + '[' + self._a_href + ']'
        self._in_a = 0

    def start_itemcount(self,attrs):
        if attrs.has_key('value'):
            self._itemcount = string.atoi(attrs['value'])

    def unknown_starttag(self,tag,attrs):
        self._tag_name = tag
        self._data = ''
        if tag == 'last-updated':
            if attrs.has_key('value'):
                self._last_updated = string.atof(attrs['value'])
            else:
                self._last_updated = time.time()
        if tag == 'comment':
            self._current_entry['comment_nick'] = attrs['nick']
            self._in_comment = 1
        if tag == 'time':
            if attrs.has_key('value'):
                self._current_entry['time'] = string.atof(attrs['value'])
            else:
                self._current_entry['time'] = time.time()

    def end_title(self):
        self._current_entry['title'] = self._data

    def end_url(self):
        self._current_entry['item'] = self._data

    def end_nick(self):
        self._current_entry['nick'] = self._data

    def end_comment(self):
        self._in_comment = 0
        self._current_entry['comments'].append([self._data,self._current_entry['comment_nick']])

    def end_link(self):
        self._entries.append(self._current_entry)
        self._current_entry = {}

    def end_churn(self):
        self._entries.reverse()

        for a in self._entries:
            if a['blurb'] == 1:
                a['item'] = "blurb"
            label = self.churn.add_item(a['item'],a['nick'])
            if a['title'] != '':
                self.churn.title_item(label,a['title'])
            for c in a['comments']:
                self.churn.comment_item(label,c[0],c[1])
            self.churn.set_time_item(label,a['time'])

        self.churn.set_update_time(self._last_updated)

    def handle_data(self,text):
        if self._in_a == 1:
            self._a_title = self._a_title + text
        else:
            self._data = self._data + text


class LastUpdatedParser(StyleSheetAwareXMLParser):
    def unknown_starttag(self,tag,attrs):
        if tag == 'last-updated':
            self.lu = attrs['value']

    def get_last_updated(self):
        return string.atof(self.lu)

class DailyChump:
    def __init__(self, directory):
        self.archiver = FileArchiver(directory)
        self.churn = self.archiver.retrieve_churn()

    def set_topic(self,topic):
        self.churn = self.archiver.archive_if_necessary(self.churn)
        self.churn.set_topic(topic)

    def view_recent_items(self,count=5):
        return self.churn.view_recent_items(count)

    def get_database(self):
        return self.churn.serialize()

    def set_stylesheet(self, sheet):
        self.churn.set_stylesheet(sheet)

    def process_input(self,nick,msg):
        blurbmatch = re.compile("BLURB:\s*(.*)")
        urlmatch = re.compile("(https?:\/\/[^ ]+)")
        titlematch = re.compile("([A-Z]+):\|\s*(.*)")
        commentmatch = re.compile("([A-Z]+):\s*(.*)")
        wordnetmatch = re.compile("wordnet\s*(.*)")

        um = urlmatch.match(msg)
        bm = blurbmatch.match(msg)
        tm = titlematch.match(msg)
        cm = commentmatch.match(msg)
        wm = wordnetmatch.match(msg)

        if wm:
            term = wm.group(1)
	    import SOAP

	    query ="""SELECT ?x, ?l, ?c
  	     FROM     http://xmlns.com/wordnet/1.6/""" + term + """WHERE
	     (web::type ?x rdfs::Class)
	     (rdfs::label ?x ?l)
	     (rdfs::description ?x ?c)
	     USING web FOR http://www.w3.org/1999/02/22-rdf-syntax-ns#
	           rdfs FOR http://www.w3.org/2000/01/rdf-schema# """
	    data = """http://xmlns.com/wordnet/1.6/""" + term
	    server =SOAP.SOAPProxy("http://fireball.dyndns.org/cgi-bin/soap",namespace = "http://rdfweb.org/RDF/RDFWeb/SOAPDemo")
	    rs = server.squish(query, data)
            answer = "Results...: "
    	    for row in rs:
	      for item in row._asdict.items():
	        answer = answer + item[0] + ':' + item[1]  + " "
	    print answer
            answer = "asking wordnet web service about: "+ term +" for " +nick +"\n\n"+ answer
            return answer

        if um:
            self.churn = self.archiver.archive_if_necessary(self.churn)
            url = um.group(1)
            label = self.churn.add_item(url,nick)
            return label+": "+url+" from "+nick

        elif bm:
            self.churn = self.archiver.archive_if_necessary(self.churn)
            item = "blurb"
            title = bm.group(1)
            label = self.churn.add_item(item,nick)
            msg = self.churn.title_item(label,title)
            return label+": "+title+" from "+nick

        elif tm:
            label = tm.group(1)
            title = tm.group(2)
            return self.churn.title_item(label,title)

        elif cm:
            label = cm.group(1)
            comment = cm.group(2)
            if comment == '':
                return self.churn.get_comments(label)
            else:
                return self.churn.comment_item(label,comment,nick)

        else:
            return

class FileArchiver:
    def __init__(self, directory):
        self.filename = directory + os.sep + "index.xml"
        self.directory = directory

    def archive_if_necessary(self,churn):
        date = churn.updatetime
        if self.should_archive(date):
            #print "Archiving current file"
            destination = self.prepare_filename(self.filename)
            os.rename(self.filename,destination)
            churn = Churn(self.directory)
        return churn

    def retrieve_churn(self):
        #print "Seeding churn"
        churn = Churn(self.directory)

        if os.path.isfile(self.filename):
            date = self.get_date(self.filename)
            if self.should_archive(date):
                #print "Archiving current file"
                destination = self.prepare_filename(self.filename)
                os.rename(self.filename,destination)
            else:
                #print "Reading current file"
                file = open(self.filename,'r')
                data = file.read()
                file.close()
                churn.deserialize(data)

        churn.save()
        return churn

    def should_archive(self,date):
        date_components = time.gmtime(date)
        file_year = date_components[0]
        file_month = date_components[1]
        file_day = date_components[2]

        date_components = time.gmtime(time.time())
        year = date_components[0]
        month = date_components[1]
        day = date_components[2]

        if(year == file_year and month == file_month and day == file_day):
            return 0
        else:
            return 1

    def prepare_filename(self,old_filename):
        date = self.get_date(old_filename)
        date_components = time.gmtime(date)
        year = "%d" % date_components[0]
        month = "%02d" % date_components[1]
        day = "%02d" % date_components[2]
        dir = self.directory + os.sep + string.join([year, month, day],os.sep)
        if not os.path.isdir(dir):
            os.makedirs(dir)

        filename = string.join([year, month, day],"-")
        filename = dir + os.sep + filename
        if os.path.isfile(filename + ".xml"):
            count = 1
            while os.path.isfile(filename + "-" + "%d" % count + ".xml"):
                count = count + 1
            filename = filename + "-" + "%d" % count
        filename = filename + ".xml"
        #print filename
        return filename

    def get_date(self,filename):
        file = open(filename,'r')
        contents = file.read()
        file.close()

        parser = LastUpdatedParser()
        parser.feed(contents)

        return parser.get_last_updated()
Received on Sunday, 19 August 2001 13:21:24 UTC