- From: Sean B. Palmer <sean+wa@infomesh.net>
- Date: Sat, 08 Jan 2005 22:52:59 +0000
- To: Dave Beckett <dave.beckett@bristol.ac.uk>
- CC: www-archive@w3.org
Here's an updated version of rdfdiff-vanilla.py annotated inline, without documentation. The full unannotated version is available at: http://inamidst.com/proj/rdf/rdfdiff-vanilla.py import sys, re, weakref, md5, urllib import ntriples from ntriples import bNode ntriples.r_uriref = re.compile(r'<([^\s"<>]+)>') class Graph(object): def __init__(self, uri): self.triples = {} self.cache = {} self.parse(uri) def parse(self, uri): class Sink(object): def triple(sink, s, p, o): # Must use something that provides set-like functionality # in order to make sure that there are no duplicate triples. self.triples[(s, p, o)] = True # Could also do "if not self.triples.has_key((s, p, o))..." p = ntriples.NTriplesParser(sink=Sink()) u = urllib.urlopen(uri) p.parse(u) u.close() def __hash__(self): result = [] for (subj, pred, objt) in self.triples.iterkeys(): # Heeded your suggestion about using md5. I thought of # other approaches such as hash(...)ing the terms and # then bitwise XOR updating a content digest using the # result, and interning the terms and using their IDs, # but md5 seems the sanest approach though the performance # may not be as good as Python's built-in hash function # (which is more prone to collision, I believe). if isinstance(subj, bNode): tripleHash = md5.new(str(self.vhash(subj))) else: tripleHash = md5.new(subj) for term in (pred, objt): # Without this, _:pq _:r _:s will give the same hash # as _:p _:qr _:s, but I don't think there's any # combination of legal N-Triples for which this'll # actually be a bother given that subjects can't be # literals and predicates can't be bNodes or # literals, so I'm leaving it out for now: # tripleHash.update('\t') if isinstance(term, bNode): # Use update instead of joining the terms together. tripleHash.update(str(self.vhash(term))) else: tripleHash.update(term) result.append(tripleHash.digest()) result.sort() return hash(tuple(result)) def vhashmemo(self, term, done=False): # Now memoizing, with weakref, the result of a # vhash method call per your suggestion. if self.cache.has_key((term, done)): obj = self.cache[(term, done)]() if obj is not None: return obj result = self.vhash(term, done=done) self.cache[(term, done)] = weakref.ref(result) return result # Nothing below here has changed. def vhash(self, term, done=False): result = [] for triple in self.triples: if term in triple: for pos in xrange(3): if not isinstance(triple[pos], bNode): result.append(triple[pos]) elif done or (triple[pos] == term): result.append(pos) else: result.append(self.vhash(triple[pos], done=True)) result.sort() return tuple(result) def compare(p, q): return hash(Graph(p)) == hash(Graph(q)) def main(): result = compare(sys.argv[1], sys.argv[2]) print ('no', 'yes')[result] if __name__=="__main__": main() Apologies for the delay, -- Sean B. Palmer, http://inamidst.com/sbp/
Received on Saturday, 8 January 2005 22:53:35 UTC