- From: Sean B. Palmer <sean+wa@infomesh.net>
- Date: Sat, 08 Jan 2005 22:52:59 +0000
- To: Dave Beckett <dave.beckett@bristol.ac.uk>
- CC: www-archive@w3.org
Here's an updated version of rdfdiff-vanilla.py annotated inline,
without documentation. The full unannotated version is available at:
http://inamidst.com/proj/rdf/rdfdiff-vanilla.py
import sys, re, weakref, md5, urllib
import ntriples
from ntriples import bNode
ntriples.r_uriref = re.compile(r'<([^\s"<>]+)>')
class Graph(object):
def __init__(self, uri):
self.triples = {}
self.cache = {}
self.parse(uri)
def parse(self, uri):
class Sink(object):
def triple(sink, s, p, o):
# Must use something that provides set-like functionality
# in order to make sure that there are no duplicate triples.
self.triples[(s, p, o)] = True
# Could also do "if not self.triples.has_key((s, p, o))..."
p = ntriples.NTriplesParser(sink=Sink())
u = urllib.urlopen(uri)
p.parse(u)
u.close()
def __hash__(self):
result = []
for (subj, pred, objt) in self.triples.iterkeys():
# Heeded your suggestion about using md5. I thought of
# other approaches such as hash(...)ing the terms and
# then bitwise XOR updating a content digest using the
# result, and interning the terms and using their IDs,
# but md5 seems the sanest approach though the performance
# may not be as good as Python's built-in hash function
# (which is more prone to collision, I believe).
if isinstance(subj, bNode):
tripleHash = md5.new(str(self.vhash(subj)))
else: tripleHash = md5.new(subj)
for term in (pred, objt):
# Without this, _:pq _:r _:s will give the same hash
# as _:p _:qr _:s, but I don't think there's any
# combination of legal N-Triples for which this'll
# actually be a bother given that subjects can't be
# literals and predicates can't be bNodes or
# literals, so I'm leaving it out for now:
# tripleHash.update('\t')
if isinstance(term, bNode):
# Use update instead of joining the terms together.
tripleHash.update(str(self.vhash(term)))
else: tripleHash.update(term)
result.append(tripleHash.digest())
result.sort()
return hash(tuple(result))
def vhashmemo(self, term, done=False):
# Now memoizing, with weakref, the result of a
# vhash method call per your suggestion.
if self.cache.has_key((term, done)):
obj = self.cache[(term, done)]()
if obj is not None:
return obj
result = self.vhash(term, done=done)
self.cache[(term, done)] = weakref.ref(result)
return result
# Nothing below here has changed.
def vhash(self, term, done=False):
result = []
for triple in self.triples:
if term in triple:
for pos in xrange(3):
if not isinstance(triple[pos], bNode):
result.append(triple[pos])
elif done or (triple[pos] == term):
result.append(pos)
else: result.append(self.vhash(triple[pos], done=True))
result.sort()
return tuple(result)
def compare(p, q):
return hash(Graph(p)) == hash(Graph(q))
def main():
result = compare(sys.argv[1], sys.argv[2])
print ('no', 'yes')[result]
if __name__=="__main__":
main()
Apologies for the delay,
--
Sean B. Palmer, http://inamidst.com/sbp/
Received on Saturday, 8 January 2005 22:53:35 UTC