Re: Redland Isomorphism Tool?

Here's an updated version of rdfdiff-vanilla.py annotated inline, 
without documentation. The full unannotated version is available at:

    http://inamidst.com/proj/rdf/rdfdiff-vanilla.py

import sys, re, weakref, md5, urllib
import ntriples
from ntriples import bNode

ntriples.r_uriref = re.compile(r'<([^\s"<>]+)>')

class Graph(object):
    def __init__(self, uri):
       self.triples = {}
       self.cache = {}
       self.parse(uri)

    def parse(self, uri):
       class Sink(object):
          def triple(sink, s, p, o):
             # Must use something that provides set-like functionality
             # in order to make sure that there are no duplicate triples.
             self.triples[(s, p, o)] = True
             # Could also do "if not self.triples.has_key((s, p, o))..."

       p = ntriples.NTriplesParser(sink=Sink())
       u = urllib.urlopen(uri)
       p.parse(u)
       u.close()

    def __hash__(self):
       result = []
       for (subj, pred, objt) in self.triples.iterkeys():
          # Heeded your suggestion about using md5. I thought of
          # other approaches such as hash(...)ing the terms and
          # then bitwise XOR updating a content digest using the
          # result, and interning the terms and using their IDs,
          # but md5 seems the sanest approach though the performance
          # may not be as good as Python's built-in hash function
          # (which is more prone to collision, I believe).
          if isinstance(subj, bNode):
             tripleHash = md5.new(str(self.vhash(subj)))
          else: tripleHash = md5.new(subj)

          for term in (pred, objt):
             # Without this, _:pq _:r _:s will give the same hash
             # as _:p _:qr _:s, but I don't think there's any
             # combination of legal N-Triples for which this'll
             # actually be a bother given that subjects can't be
             # literals and predicates can't be bNodes or
             # literals, so I'm leaving it out for now:
             # tripleHash.update('\t')
             if isinstance(term, bNode):
                # Use update instead of joining the terms together.
                tripleHash.update(str(self.vhash(term)))
             else: tripleHash.update(term)

          result.append(tripleHash.digest())
       result.sort()
       return hash(tuple(result))

    def vhashmemo(self, term, done=False):
       # Now memoizing, with weakref, the result of a
       # vhash method call per your suggestion.
       if self.cache.has_key((term, done)):
          obj = self.cache[(term, done)]()
          if obj is not None:
             return obj
       result = self.vhash(term, done=done)
       self.cache[(term, done)] = weakref.ref(result)
       return result

    # Nothing below here has changed.
    def vhash(self, term, done=False):
       result = []
       for triple in self.triples:
          if term in triple:
             for pos in xrange(3):
                if not isinstance(triple[pos], bNode):
                   result.append(triple[pos])
                elif done or (triple[pos] == term):
                   result.append(pos)
                else: result.append(self.vhash(triple[pos], done=True))
       result.sort()
       return tuple(result)

def compare(p, q):
    return hash(Graph(p)) == hash(Graph(q))

def main():
    result = compare(sys.argv[1], sys.argv[2])
    print ('no', 'yes')[result]

if __name__=="__main__":
    main()

Apologies for the delay,

-- 
Sean B. Palmer, http://inamidst.com/sbp/

Received on Saturday, 8 January 2005 22:53:35 UTC