Re: Redland Isomorphism Tool?

Thoughts so far as annotations to your code.  I don't have python 2.4
and don't need it for 1 feature, sets that aren't really exercised
here.  Redland provides triple set by default anyway.

vhash(term) is called way too many times, you need to cache the result

Dave


#!/usr/bin/env python
"""
Vanilla RDF Graph Isomorphism Tester
Author: Sean B. Palmer, inamidst.com
Uses the pyrple algorithm
Usage: ./rdfdiff-vanilla.py <ntriplesP> <ntriplesQ>
Requirements: 
   Python2.4+
   http://inamidst.com/proj/rdf/ntriples.py
References: 
   http://inamidst.com/proj/rdf/rdfdiff.py
   http://miscoranda.com/comments/129#id2005010405560004
"""

# Built-in function
# hash(object)
#    Return the hash value of the object (if it has one). Hash values
#    are integers. They are used to quickly compare dictionary keys
#    during a dictionary lookup. Numeric values that compare equal
#    have the same hash value (even if they are of different types,
#    as is the case for 1 and 1.0).

 
import sys, re, urllib
import ntriples
from ntriples import bNode

ntriples.r_uriref = re.compile(r'<([^\s"<>]+)>')

class Graph(object): 
   def __init__(self, uri): 
      # FIXME 2.4 set
      # self.triples = set()
      # changed to 2.3 list
      self.triples = []
      self.parse(uri)

   def parse(self, uri):
      """Read triples at uri into Graph"""
      class Sink(object): 
         def triple(sink, s, p, o):
            # FIXME 2.4 set add operation
            # self.triples.add((s, p, o))
            # changed to 2.3 list add
            self.triples.append((s, p, o))

      p = ntriples.NTriplesParser(sink=Sink())
      u = urllib.urlopen(uri)
      p.parse(u)
      u.close()

   def __hash__(self):
      """Make a graph hash based on the triples inside it"""

      # builds a sorted sequence of hashes of triples
      result = []

      # set iteration
      for triple in self.triples: 
         newTriple = []
         # for each part of the triple, s, p, o
         for term in triple: 
            if isinstance(term, bNode):
               # bnodes get a tuple equivalent to the use of the bnode
               # in the graph
               newTriple.append(self.vhash(term))
            else:
               # named items add themselves
               newTriple.append(term)

         # newTriple is redundant as it is immediately turned into an
         # integer hash.  Should just make a content digest (say md5)
         # instead of newTriple.append()
         tripleHash = hash(tuple(newTriple))
         result.append(tripleHash)
         
      # sort sequence by hash integer order
      # O(#triples)
      result.sort()

      # which immediately is used to make an integer
      h=hash(tuple(result))
      print "__hash__ result: '"+str(h)+"'"
      return h


   def vhash(self, term, done=False):
      """Return a tuple () for a blank node s,p,o term
      equivalent to the use of the blank node in the graph.

      This is called multiple times for the same term in every triple.
      It is a very expensive operation and the result MUST BE CACHED
      but isn't so far.

      It could just return a content digest that was equiv to the
      tuple's contents as the return value is only ever used as
      to form a content digest via hash().
      """
      result = []

      # set iteration
      for triple in self.triples:
         # if term is s, p or o in triple
         if term in triple:
            # for each part of the triple, pos 1(s),2(p),3(o)
            for pos in xrange(3): 
               if not isinstance(triple[pos], bNode):
                  # non-bnode, add the triple term
                  result.append(triple[pos])
               elif done or (triple[pos] == term):
                  # no recursion or match, add the position
                  result.append(pos)
               else:
                  # else recurse and add the vhash of this triple term
                  # which is itself a tuple
                  result.append(self.vhash(triple[pos], done=True))

      # sort sequence by hash integer order
      # O(#triples containing term)
      result.sort()

      v=tuple(result)

      print "vhash '"+term+"' result: '"+str(v)+"'"
      return v

def compare(p, q): 
   return hash(Graph(p)) == hash(Graph(q))

def main(): 
   result = compare(sys.argv[1], sys.argv[2])
   print ('no', 'yes')[result]

if __name__=="__main__": 
   main()

Received on Thursday, 6 January 2005 14:34:51 UTC