# - From: Dan Brickley <danbri@w3.org>
# - Date: Sun, 19 Aug 2001 13:21:25 -0400 (EDT)
# - To: <www-archive@w3.org>
# Copyright (c) 2001 by Matt Biddulph and Edd Dumbill, Useful Information Company
# All rights reserved.
#
# License is granted to use or modify this software ("Daily Chump") for
# commercial or non-commercial use provided the copyright of the author is
# preserved in any distributed or derivative work.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESSED OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
# ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# daily chump v 1.0
# $Id: dailychump.py,v 1.7 2001/07/12 21:19:33 edmundd Exp $
import string
import re
import time
import tempfile
import shutil
import os
import os.path
from EntityEncoder import EntityEncoder
from xmllib import XMLParser, procclose, illegal, tagfind
class TimeFormatter:
    """Renders epoch timestamps as human-readable UTC strings for the XML output."""

    def format_time(self, formattime):
        """Return *formattime* (seconds since the epoch) as 'YYYY-MM-DD HH:MM' in UTC."""
        broken_down = time.gmtime(formattime)
        return time.strftime("%Y-%m-%d %H:%M", broken_down)
class StyleSheetAwareXMLParser(XMLParser):
    """ This class is needed to override a bug in Py 1.5.2's xmllib
    which meant it would reject an xml-stylesheet PI """
    # NOTE(review): parse_proc appears to be copied from xmllib with one
    # change: the target-name check near the bottom matches 'xml ' (with a
    # trailing space) instead of 'xml', so PI targets such as
    # 'xml-stylesheet' are no longer rejected — confirm against the
    # shipped xmllib source before touching this.
    def parse_proc(self, i):
        # Parse a processing instruction starting at index i of
        # self.rawdata; return the index just past the closing '?>', or
        # -1 when the PI is incomplete and more data is needed.
        rawdata = self.rawdata
        end = procclose.search(rawdata, i)
        if end is None:
            return -1
        j = end.start(0)
        if illegal.search(rawdata, i+2, j):
            self.syntax_error('illegal character in processing instruction')
        res = tagfind.match(rawdata, i+2)
        if res is None:
            raise RuntimeError, 'unexpected call to parse_proc'
        k = res.end(0)
        name = res.group(0)
        if name == 'xml:namespace':
            self.syntax_error('old-fashioned namespace declaration')
            # NOTE(review): the double-underscore attributes below are
            # name-mangled to _StyleSheetAwareXMLParser__*, not the
            # _XMLParser__* names the base class maintains, and 'ncname'
            # is not defined in this module — this branch would raise if
            # ever reached. It only triggers for the obsolete
            # xml:namespace PI, so it looks like dead code in practice;
            # verify before relying on it.
            self.__use_namespaces = -1
            # namespace declaration
            # this must come after the <?xml?> declaration (if any)
            # and before the <!DOCTYPE> (if any).
            if self.__seen_doctype or self.__seen_starttag:
                self.syntax_error('xml:namespace declaration too late in document')
            attrdict, namespace, k = self.parse_attributes(name, k, j)
            if namespace:
                self.syntax_error('namespace declaration inside namespace declaration')
            for attrname in attrdict.keys():
                if not self.__xml_namespace_attributes.has_key(attrname):
                    self.syntax_error("unknown attribute `%s' in xml:namespace tag" % attrname)
            if not attrdict.has_key('ns') or not attrdict.has_key('prefix'):
                self.syntax_error('xml:namespace without required attributes')
            prefix = attrdict.get('prefix')
            if ncname.match(prefix) is None:
                self.syntax_error('xml:namespace illegal prefix value')
                return end.end(0)
            if self.__namespaces.has_key(prefix):
                self.syntax_error('xml:namespace prefix not unique')
            self.__namespaces[prefix] = attrdict['ns']
        else:
            if string.find(string.lower(name), 'xml ') >= 0:
                self.syntax_error('illegal processing instruction target name')
            self.handle_proc(name, rawdata[k:j])
        return end.end(0)
class Churn:
    """The day's chump database: labelled ChurnEntry objects plus topic and
    stylesheet settings, persisted as XML in <directory>/index.xml."""

    def __init__(self, directory):
        self.database = {}    # label string (e.g. 'A', 'AB') -> ChurnEntry
        self.directory = directory
        self.labelcount = 0   # counter used to mint the next label
        self.set_update_time(time.time())
        self.topic = ""
        self.stylesheet = ""
        self.sheettype = "text/css"

    def set_update_time(self, time):
        # NB: the parameter shadows the 'time' module inside this method.
        self.updatetime = time

    def get_topic(self):
        return self.topic

    def set_topic(self, topic):
        self.topic = topic
        self.save()

    def get_stylesheet(self):
        return self.stylesheet

    def get_stylesheettype(self):
        return self.sheettype

    def set_stylesheet(self, sheet):
        """Record the stylesheet URL; the MIME type is guessed from the
        extension (*.xsl -> text/xsl, anything else -> text/css)."""
        self.stylesheet = sheet
        if sheet[-3:] == "xsl":
            self.sheettype = "text/xsl"
        else:
            self.sheettype = "text/css"

    def delete_item(self, label):
        if self.database.has_key(label):
            del self.database[label]
            self.save()

    def view_item(self, label):
        """Return a one-line rendering of the entry, or an error message."""
        entry = self.get_entry(label)
        if entry != None:
            if entry.title == '':
                return label + ": " + entry.item
            else:
                return label + ": " + entry.title + " (" + entry.item + ")"
        else:
            return 'Label '+label+' not found.'

    def view_recent_items(self, count=5):
        """Return the *count* newest items, one per line, oldest first."""
        labels = self._timesorted_labels()
        labels = labels[0:count]
        labels.reverse()
        message = ''
        for l in labels:
            message = message + self.view_item(l) + "\n"
        return message

    def add_item(self, item, nick):
        """Store a new entry under a fresh label, persist, return the label."""
        entry = ChurnEntry(item, nick)
        label = self.get_next_label()
        self.set_entry(label, entry)
        self.save()
        return label

    def _filename(self):
        return self.directory + "/index.xml"

    def _timesorted_labels(self):
        """Return all labels sorted newest-first by their entry's timestamp."""
        labels = self.database.keys()
        times = []
        for l in labels:
            times.append(self.get_entry(l).time)
        # sort list of labels by the respective time entry
        # from the times list
        pairs = map(None, times, labels)
        pairs.sort()
        result = pairs[:]
        for i in xrange(len(result)):
            result[i] = result[i][1]
        result.reverse()
        return result

    def set_time_item(self, label, time):
        entry = self.get_entry(label)
        if entry != None:
            entry.set_time(time)
            self.save()

    def set_entry(self, label, entry):
        self.database[label] = entry

    def get_entry_count(self):
        return len(self.database.keys())

    def get_entry(self, label):
        """Return the ChurnEntry for *label*, or None if unknown."""
        if self.database.has_key(label):
            return self.database[label]
        else:
            return None

    def title_item(self, label, title):
        entry = self.get_entry(label)
        if entry != None:
            entry.set_title(title)
            self.save()
            return "titled item "+label
        else:
            return 'Label '+label+' not found.'

    def get_comments(self, label):
        """Return the item, its title (if any) and all comments as text."""
        entry = self.get_entry(label)
        if entry != None:
            if entry.title == '':
                return entry.item + "\n" + entry.get_comments()
            else:
                return entry.item + "\n" + entry.title + "\n" + entry.get_comments()
        else:
            return 'Label '+label+' not found.'

    def comment_item(self, label, comment, nick):
        entry = self.get_entry(label)
        if entry != None:
            entry.add_comment(comment, nick)
            self.save()
            return "commented item "+label
        else:
            return 'Label '+label+' not found.'

    def save(self):
        """Persist the database to index.xml.

        Fix: the original wrote to an insecure tempfile.mktemp() path
        (race-prone) and deleted index.xml before copying the new version
        over, leaving a window with no file at all.  Writing a sibling
        ".tmp" file in the target directory and renaming it into place
        avoids both problems."""
        self.update()
        name = self._filename() + ".tmp"
        out_file = open(name, "w")
        out_file.write(self.serialize())
        out_file.write("\n")
        out_file.close()
        if os.path.isfile(self._filename()):
            # os.rename cannot replace an existing file on every platform
            os.remove(self._filename())
        os.rename(name, self._filename())

    def deserialize(self, data):
        """Rebuild this churn's entries from a serialized XML string."""
        c = ChurnParser()
        c.set_churn(self)
        c.feed(data)
        c.close()

    def serialize(self):
        """Render the whole database as the chump XML document (a string)."""
        encoder = EntityEncoder()
        serialized = '<!DOCTYPE churn>\n'
        if self.get_stylesheet() != "":
            serialized = serialized + '<?xml-stylesheet href="'+\
                encoder.encode_chars(self.get_stylesheet())+\
                '" type="'+\
                self.get_stylesheettype()+'"?>'+"\n"
        serialized = serialized + "<churn>\n"
        serialized = serialized + '<last-updated value="'
        serialized = serialized + "%f" % self.updatetime
        serialized = serialized + '">'
        serialized = serialized + encoder.encode_chars(TimeFormatter().format_time(self.updatetime))+"</last-updated>\n"
        serialized = serialized + '<itemcount value="'
        serialized = serialized + "%d" % self.get_entry_count()
        serialized = serialized + '" />\n'
        serialized = serialized + "<topic>"+encoder.encode_chars(self.topic)+"</topic>\n"
        for x in self._timesorted_labels():
            entry = self.get_entry(x)
            serialized = serialized + entry.serialize()
        serialized = serialized + "</churn>"
        return serialized

    def get_next_label(self):
        """Mint and return the next unused label, advancing the counter."""
        label = self.number_to_label(self.labelcount)
        self.labelcount = self.labelcount + 1
        return label

    def number_to_label(self, number):
        """Map a zero-based counter to a label: 0-25 -> 'A'..'Z', then
        'AA', 'AB', ... in a base-26 scheme."""
        if number < 26:
            return chr(number + 65)
        if number == 26:
            # special case: the general formula below would yield a bare 'A'
            return 'AA'
        count = number - 26
        label = ''
        while count > 0:
            label = chr((count % 26) + 65) + label
            count = count / 26  # integer division under Python 2
        if number < 52:
            return 'A' + label
        else:
            return label

    def update(self):
        self.set_update_time(time.time())
class ChurnEntry:
    """One chumped item: a URL (or the literal 'blurb'), who posted it,
    when, an optional title, and a list of comments."""

    def __init__(self, item, nick):
        self.item = item          # URL, or the literal string 'blurb'
        self.nick = nick          # poster's nick
        self.comments = []        # list of [nick, comment] pairs
        self.set_time(time.time())
        self.title = ''

    def serialize(self):
        """Render this entry as a <link> XML fragment."""
        encoder = EntityEncoder()
        serialized = ''
        serialized = serialized + "<link"
        if self.item == 'blurb':
            serialized = serialized + ' type="blurb"'
        serialized = serialized + ">\n"
        serialized = serialized + '<time value="'
        serialized = serialized + "%f" % self.time
        serialized = serialized + '">'
        serialized = serialized + encoder.encode_chars(TimeFormatter().format_time(self.time))
        serialized = serialized + "</time>\n"
        if not self.item == 'blurb':
            serialized = serialized + "<url>"+encoder.encode_chars(self.item)+"</url>\n"
        serialized = serialized + "<nick>"+encoder.encode_chars(self.nick)+"</nick>\n"
        if self.title != '':
            serialized = serialized + "<title>"+encoder.encode_chars(self.title)+"</title>\n"
        for c in self.comments:
            nick = c[0]
            comment = c[1]
            serialized = serialized + self.serialize_comment(nick, comment, encoder)
        serialized = serialized + "</link>\n"
        return serialized

    def serialize_comment(self, nick, comment, encoder):
        """Render one comment as a <comment> XML fragment, converting the
        chump inline markup (*italic*, [url], [title|url], +[img]) to HTML.
        Each while-loop rewrites the leftmost match until none remain."""
        comment_html = encoder.encode_chars(comment)
        # *word* -> <i>word</i>
        italic_search = re.compile('\*([^*]+)\*')
        while italic_search.search(comment_html) != None:
            match = italic_search.search(comment_html)
            comment_html = comment_html[0:match.start(1) - 1] + '<i>' + comment_html[match.start(1):match.end(1)] + '</i>' + comment_html[match.end(1) + 1:]
        # +[http://...] -> <img src="..." />
        img_search = re.compile('\+\[(http[^|\]]+)\]')
        while img_search.search(comment_html) != None:
            match = img_search.search(comment_html)
            comment_html = comment_html[0:match.start(1) - 2] + '<img src="' + match.group(1) + '" />' + comment_html[match.end(1) + 1:]
        # +[title|url] or +[url|title] -> <img src="url" alt="title" />
        titled_img_search = re.compile('\+\[([^|]+)\|([^\]]+)\]')
        while titled_img_search.search(comment_html) != None:
            match = titled_img_search.search(comment_html)
            if string.find(match.group(1),"http") == 0: # begins with http
                url_index = 1
                title_index = 2
            else:
                url_index = 2
                title_index = 1
            comment_html = comment_html[0:match.start(1) - 2] + '<img src="' + match.group(url_index) + '" alt="' + match.group(title_index) + '" />' + comment_html[match.end(2) + 1:]
        # [http://...] -> <a href="...">http://...</a>
        url_search = re.compile('\[(http[^|\]]+)\]')
        while url_search.search(comment_html) != None:
            match = url_search.search(comment_html)
            comment_html = comment_html[0:match.start(1) - 1] + '<a href="' + match.group(1) + '">' + match.group(1) + '</a>' + comment_html[match.end(1) + 1:]
        # [title|url] or [url|title] -> <a href="url">title</a>
        titled_url_search = re.compile('\[([^|]+)\|([^\]]+)\]')
        while titled_url_search.search(comment_html) != None:
            match = titled_url_search.search(comment_html)
            if string.find(match.group(1),"http") == 0: # begins with http
                url_index = 1
                title_index = 2
            else:
                url_index = 2
                title_index = 1
            comment_html = comment_html[0:match.start(1) - 1] + '<a href="' + match.group(url_index) + '">' + match.group(title_index) + '</a>' + comment_html[match.end(2) + 1:]
        serialized = ''
        # Fix: the nick lands inside an XML attribute, so it must be
        # entity-encoded like every other piece of user-supplied text;
        # previously a nick containing '"' or '&' produced malformed XML.
        serialized = serialized + '<comment nick="' + encoder.encode_chars(nick) + '">'
        serialized = serialized + comment_html
        serialized = serialized + "</comment>\n"
        return serialized

    def set_time(self, time):
        # NB: the parameter shadows the 'time' module inside this method.
        self.time = time

    def add_comment(self, comment, nick):
        self.comments.append([nick, comment])

    def set_title(self, title):
        self.title = title

    def get_comments(self):
        """Return all comments as '(nick) comment' lines."""
        comments = ''
        for c in self.comments:
            nick = c[0]
            comment = c[1]
            comments = comments + '(' + nick + ') '
            comments = comments + comment
            comments = comments + "\n"
        return comments
class ChurnParser(StyleSheetAwareXMLParser):
    """SAX-style parser that rebuilds a Churn database from its XML form.

    Feed it a document produced by Churn.serialize(); the collected
    entries are replayed into the target Churn when </churn> is seen."""

    def __init__(self):
        XMLParser.__init__(self)
        self._data = ''           # character data for the current element
        self._in_a = 0            # currently inside an <a> element?
        self._a_title = ''
        self._a_href = ''
        self._entries = []        # completed link dicts, in document order
        self._current_entry = {}  # link dict currently being populated
        # Fix: these were only assigned when a <comment> or <last-updated>
        # element appeared, so documents lacking them raised
        # AttributeError in start_a/start_img/end_churn.
        self._in_comment = 0
        self._last_updated = time.time()

    def set_churn(self, churn):
        self.churn = churn

    def get_churn(self):
        return self.churn

    def start_link(self, attrs):
        self._current_entry['title'] = ''
        self._current_entry['time'] = 0
        self._current_entry['nick'] = ''
        self._current_entry['item'] = ''
        self._current_entry['comments'] = []
        # Fix: default to non-blurb so end_churn never sees a missing
        # 'blurb' key when the <link> carries no type attribute.
        self._current_entry['blurb'] = 0
        if attrs.has_key('type'):
            type = attrs['type']
            if type == 'blurb':
                self._current_entry['blurb'] = 1

    def start_a(self, attrs):
        # Anchors only matter inside comment text, where they are turned
        # back into [title|href] chump markup by end_a().
        if self._in_comment == 1:
            if(attrs.has_key('href')):
                self._a_href = attrs['href']
            else:
                self._a_href = ''
            self._a_title = ''
            self._in_a = 1

    def start_img(self, attrs):
        if self._in_comment == 1:
            if(attrs.has_key('src')):
                self._img_src = attrs['src']
            else:
                self._img_src = ''
            if(attrs.has_key('alt')):
                self._img_title = attrs['alt']
            else:
                self._img_title = ''

    def start_i(self, attrs):
        self._data = self._data + '*'

    def end_i(self):
        self._data = self._data + '*'

    def end_img(self):
        # Reconstruct the +[title|src] / +[src] image markup.
        if self._img_title != '' and self._img_src != '' and self._img_src != self._img_title:
            self._data = self._data + '+[' + self._img_title + '|' + self._img_src + ']'
        elif self._img_src != '':
            self._data = self._data + '+[' + self._img_src + ']'

    def end_a(self):
        # Reconstruct the [title|href] / [href] link markup.
        if self._a_title != '' and self._a_href != '' and self._a_href != self._a_title:
            self._data = self._data + '[' + self._a_title + '|' + self._a_href + ']'
        elif self._a_href != '':
            self._data = self._data + '[' + self._a_href + ']'
        self._in_a = 0

    def start_itemcount(self, attrs):
        if attrs.has_key('value'):
            self._itemcount = string.atoi(attrs['value'])

    def unknown_starttag(self, tag, attrs):
        self._tag_name = tag
        self._data = ''
        if tag == 'last-updated':
            if attrs.has_key('value'):
                self._last_updated = string.atof(attrs['value'])
            else:
                self._last_updated = time.time()
        if tag == 'comment':
            self._current_entry['comment_nick'] = attrs['nick']
            self._in_comment = 1
        if tag == 'time':
            if attrs.has_key('value'):
                self._current_entry['time'] = string.atof(attrs['value'])
            else:
                self._current_entry['time'] = time.time()

    def end_title(self):
        self._current_entry['title'] = self._data

    def end_url(self):
        self._current_entry['item'] = self._data

    def end_nick(self):
        self._current_entry['nick'] = self._data

    def end_comment(self):
        self._in_comment = 0
        self._current_entry['comments'].append([self._data, self._current_entry['comment_nick']])

    def end_link(self):
        self._entries.append(self._current_entry)
        self._current_entry = {}

    def end_churn(self):
        # Entries were serialized newest-first; replay them oldest-first
        # so labels are minted in the original order.
        self._entries.reverse()
        for a in self._entries:
            if a['blurb'] == 1:
                a['item'] = "blurb"
            label = self.churn.add_item(a['item'], a['nick'])
            if a['title'] != '':
                self.churn.title_item(label, a['title'])
            for c in a['comments']:
                self.churn.comment_item(label, c[0], c[1])
            self.churn.set_time_item(label, a['time'])
        self.churn.set_update_time(self._last_updated)

    def handle_data(self, text):
        if self._in_a == 1:
            self._a_title = self._a_title + text
        else:
            self._data = self._data + text
class LastUpdatedParser(StyleSheetAwareXMLParser):
    """Minimal parser that captures the <last-updated value="..."> stamp."""

    def unknown_starttag(self, tag, attrs):
        # Only the last-updated element is of interest; every other tag
        # is silently ignored.
        if tag == 'last-updated':
            stamp = attrs['value']
            self.lu = stamp

    def get_last_updated(self):
        """Return the captured timestamp as a float (seconds since the epoch)."""
        return string.atof(self.lu)
class DailyChump:
    """Front-end for the bot: routes IRC-style command strings to the
    current day's Churn, rolling the database over via FileArchiver."""

    def __init__(self, directory):
        self.archiver = FileArchiver(directory)
        self.churn = self.archiver.retrieve_churn()

    def set_topic(self, topic):
        self.churn = self.archiver.archive_if_necessary(self.churn)
        self.churn.set_topic(topic)

    def view_recent_items(self, count=5):
        return self.churn.view_recent_items(count)

    def get_database(self):
        return self.churn.serialize()

    def set_stylesheet(self, sheet):
        self.churn.set_stylesheet(sheet)

    def process_input(self, nick, msg):
        """Interpret one message from *nick*.

        Recognized forms: a bare URL (new item), 'BLURB: text',
        'LABEL:| title', 'LABEL: comment' (empty comment reads the item
        back), and 'wordnet term'.  Returns a response string, or None
        when the message is not a command."""
        blurbmatch = re.compile("BLURB:\s*(.*)")
        urlmatch = re.compile("(https?:\/\/[^ ]+)")
        titlematch = re.compile("([A-Z]+):\|\s*(.*)")
        commentmatch = re.compile("([A-Z]+):\s*(.*)")
        wordnetmatch = re.compile("wordnet\s*(.*)")
        um = urlmatch.match(msg)
        bm = blurbmatch.match(msg)
        tm = titlematch.match(msg)
        cm = commentmatch.match(msg)
        wm = wordnetmatch.match(msg)
        if wm:
            term = wm.group(1)
            import SOAP
            # Fix: the original query ran the term straight into the
            # WHERE keyword ("...wordnet/1.6/dogWHERE"); a newline is
            # required to separate them.
            query = ("SELECT ?x, ?l, ?c\n"
                     "FROM http://xmlns.com/wordnet/1.6/" + term + "\n"
                     "WHERE\n"
                     "(web::type ?x rdfs::Class)\n"
                     "(rdfs::label ?x ?l)\n"
                     "(rdfs::description ?x ?c)\n"
                     "USING web FOR http://www.w3.org/1999/02/22-rdf-syntax-ns#\n"
                     "rdfs FOR http://www.w3.org/2000/01/rdf-schema# ")
            data = """http://xmlns.com/wordnet/1.6/""" + term
            server = SOAP.SOAPProxy("http://fireball.dyndns.org/cgi-bin/soap", namespace="http://rdfweb.org/RDF/RDFWeb/SOAPDemo")
            rs = server.squish(query, data)
            answer = "Results...: "
            for row in rs:
                for item in row._asdict.items():
                    answer = answer + item[0] + ':' + item[1] + " "
            # debug trace; parenthesized so it parses identically on 2.x and 3.x
            print(answer)
            answer = "asking wordnet web service about: " + term + " for " + nick + "\n\n" + answer
            return answer
        if um:
            self.churn = self.archiver.archive_if_necessary(self.churn)
            url = um.group(1)
            label = self.churn.add_item(url, nick)
            return label + ": " + url + " from " + nick
        elif bm:
            self.churn = self.archiver.archive_if_necessary(self.churn)
            item = "blurb"
            title = bm.group(1)
            label = self.churn.add_item(item, nick)
            msg = self.churn.title_item(label, title)
            return label + ": " + title + " from " + nick
        elif tm:
            label = tm.group(1)
            title = tm.group(2)
            return self.churn.title_item(label, title)
        elif cm:
            label = cm.group(1)
            comment = cm.group(2)
            if comment == '':
                # empty comment means "read the item back"
                return self.churn.get_comments(label)
            else:
                return self.churn.comment_item(label, comment, nick)
        else:
            return
class FileArchiver:
    """Rolls index.xml over into dated archive files, one per UTC day."""

    def __init__(self, directory):
        self.filename = directory + os.sep + "index.xml"
        self.directory = directory

    def archive_if_necessary(self, churn):
        """If *churn* was last touched on an earlier UTC day, archive the
        current index file and return a fresh Churn; otherwise return
        *churn* unchanged."""
        last_touched = churn.updatetime
        if self.should_archive(last_touched):
            archived_name = self.prepare_filename(self.filename)
            os.rename(self.filename, archived_name)
            churn = Churn(self.directory)
        return churn

    def retrieve_churn(self):
        """Seed a Churn from disk: archive a stale index file, or load a
        current one; always persist the result before returning it."""
        churn = Churn(self.directory)
        if os.path.isfile(self.filename):
            stamp = self.get_date(self.filename)
            if self.should_archive(stamp):
                archived_name = self.prepare_filename(self.filename)
                os.rename(self.filename, archived_name)
            else:
                handle = open(self.filename, 'r')
                contents = handle.read()
                handle.close()
                churn.deserialize(contents)
        churn.save()
        return churn

    def should_archive(self, date):
        """Return 1 when *date* falls on a different UTC day than now, else 0."""
        file_day = time.gmtime(date)[:3]     # (year, month, day)
        today = time.gmtime(time.time())[:3]
        if file_day == today:
            return 0
        return 1

    def prepare_filename(self, old_filename):
        """Build a unique dated archive path of the form
        <directory>/YYYY/MM/DD/YYYY-MM-DD[-n].xml, creating the
        intermediate directories as needed."""
        stamp = time.gmtime(self.get_date(old_filename))
        year = "%d" % stamp[0]
        month = "%02d" % stamp[1]
        day = "%02d" % stamp[2]
        archive_dir = self.directory + os.sep + year + os.sep + month + os.sep + day
        if not os.path.isdir(archive_dir):
            os.makedirs(archive_dir)
        base = archive_dir + os.sep + year + "-" + month + "-" + day
        candidate = base
        if os.path.isfile(candidate + ".xml"):
            # the plain name is taken: append the first free -n suffix
            suffix = 1
            while os.path.isfile(base + "-" + "%d" % suffix + ".xml"):
                suffix = suffix + 1
            candidate = base + "-" + "%d" % suffix
        return candidate + ".xml"

    def get_date(self, filename):
        """Return the <last-updated> timestamp stored in *filename*."""
        source = open(filename, 'r')
        text = source.read()
        source.close()
        extractor = LastUpdatedParser()
        extractor.feed(text)
        return extractor.get_last_updated()
# Received on Sunday, 19 August 2001 13:21:24 UTC