#!/usr/bin/env python
"""
Gleaning Resource Descriptions from Dialects of Languages (GRDDL)

That is, GRDDL provides a relatively inexpensive mechanism for bootstrapping
RDF content from uniform XML dialects; shifting the burden from formulating
RDF to creating transformation algorithms specifically for each dialect. XML
Transformation languages such as XSLT are quite versatile in their ability to
process, manipulate, and generate XML. The use of XSLT to generate XHTML from
single-purpose XML vocabularies is historically celebrated as a powerful idiom
for separating structured content from presentation. GRDDL shifts this idiom
to a different end: separating structured content from its authoritative
meaning (or semantics). GRDDL works by associating transformations for an
individual document, either through direct inclusion of references or
indirectly through profile and namespace documents.

See: http://4suite.org/docs/CoreManual.xml#xpath_query

Copyright (c) 2006, Chimezie Ogbuji
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of inamidst.com nor the names of its contributors may
      be used to endorse or promote products derived from this software
      without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

import time
import sys
import urllib2
import re
from sets import Set
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO
from pprint import pprint

from rdflib import Variable, BNode, URIRef, Literal, Namespace, RDF, RDFS
from rdflib.Collection import Collection
from rdflib.Graph import ConjunctiveGraph, QuotedGraph, ReadOnlyGraphAggregate, Graph
from rdflib.syntax.NamespaceManager import NamespaceManager
import Ft.Xml.Domlette
from Ft.Xml import Parse
from Ft.Xml.Xslt import Processor
from Ft.Xml import InputSource
from Ft.Lib.Uri import Absolutize

GRDDL_NS = Namespace("http://www.w3.org/2003/g/data-view#")
GRDDL_URI = u'http://www.w3.org/2003/g/data-view'
XHTML_NS = u"http://www.w3.org/1999/xhtml"

# Built-in list of namespace URIs that should terminate any recursive
# namespace dispatch
NSDispatchTermination = [XHTML_NS]

# When true, progress and failure messages are printed to stdout
DEBUG = False

# Flag to determine whether or not to only attempt to parse a URL as XML if
# its mime-type is appropriate
CHECK_XML_MIMETYPE = False


class Glean(object):
    """
    Handles all the GRDDL XML parsing and XSLT transformation from URLs

    Attributes set by the constructor:
      graph             -- the RDF graph that transformation results are
                           parsed into
      url               -- the URL of the GRDDL source document
      dom               -- the parsed source document, or None when the
                           source could not be retrieved / parsed
      appliedTransforms -- transformation URLs already applied by this
                           instance, in the order they were applied
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Prepares a glean of url into graph.

        If preParsedDOM is given (and is truthy) it is used as the source
        document; otherwise url is fetched and parsed.  A failure to fetch or
        parse is not raised: self.dom is simply left as None.

        >>> g = Glean(u'http://www.w3.org/2003/g/po-doc.xml',Graph())
        >>> g.dom.documentElement.localName
        u'purchaseOrder'
        """
        self.graph = graph
        self.url = url
        self.appliedTransforms = []
        if preParsedDOM:
            self.dom = preParsedDOM
        else:
            self.dom = self._fetchDOM(url)

    def _fetchDOM(self, url):
        """
        Retrieves url and parses the response body with the non-validating
        Domlette reader.  Returns the parsed document, or None if the
        resource is skipped (see CHECK_XML_MIMETYPE) or anything goes wrong.
        """
        try:
            response = urllib2.urlopen(urllib2.Request(url))
            try:
                # Only consult the response headers when the mime-type check
                # is enabled, so that a response without a Content-Type
                # header can still be parsed when the check is off.
                if CHECK_XML_MIMETYPE:
                    contentType = response.info().getheader('content-type', '')
                    if re.match(r'(?:text|application)/.*\+?xml', contentType) is None:
                        # What does the spec mandate about this scenario?
                        if DEBUG:
                            print("Ignoring non-xml information resource %s" % (url,))
                            print(contentType.split(';')[0])
                        return None
                return Ft.Xml.Domlette.NonvalidatingReader.parseString(response.read(), url)
            finally:
                # Release the connection whether or not parsing succeeded
                response.close()
        except Exception:
            # Unable to glean. Fail gracefully..
            if DEBUG:
                print("Unable to parse  %s %r" % (url, sys.exc_info()[1]))
            return None

    def transform(self, transformURLs):
        """
        Takes a space separated list of transform URLs and applies them
        against the pre-parsed DOM of the GRDDL source - making sure to avoid
        transformations already applied.  The result of each transformation
        is parsed into self.graph.

        Raises Exception("unsupported output type") when a stylesheet's
        output method is neither 'xml' nor 'text'.
        """
        for xformURL in transformURLs.split():
            if DEBUG:
                print("applying transformation %s" % (xformURL))
            if xformURL in self.appliedTransforms:
                continue
            self.appliedTransforms.append(xformURL)
            # A fresh processor per transformation: stylesheets appended for
            # an earlier transformation must not still be loaded when a later
            # one runs.
            processor = Processor.Processor()
            # The transform url is resolved against the source URL (to
            # accommodate relative urls)
            stylesheet = InputSource.DefaultFactory.fromUri(Absolutize(xformURL, self.url))
            processor.appendStylesheet(stylesheet)
            # see: http://www.w3.org/TR/grddl/#stylepi
            result = processor.runNode(self.dom, self.url, ignorePis=1)
            # get output method / media-type
            # How to accommodate @media-type?
            method = processor.outputParams.method[-1]
            if method == 'xml':
                self.graph.parse(StringIO(result), publicID=self.url)
            elif method == 'text':
                # Attempt a Notation 3 parse (covers NTriples, and Turtle)
                self.graph.parse(StringIO(result), format='n3', publicID=self.url)
            else:
                # HTML result - recursive GRDDL mechanism?
                raise Exception("unsupported output type")

    def _applyLinkedTransforms(self, subject, predicate):
        """
        Applies, to this GRDDL source, every transformation that self.graph
        names as the object of a (subject, predicate, ?) triple.  Because
        applying a transformation can add further such triples, the graph is
        re-queried until no unprocessed transformation remains.
        """
        # Transformations already handled - avoids infinite snooping cycles
        processed = set()
        while True:
            # Collect first, apply afterwards: transform() mutates the graph
            pending = set()
            for s, p, xform in self.graph.triples((subject, predicate, None)):
                if xform not in processed:
                    pending.add(xform)
            if not pending:
                break
            for xform in pending:
                self.transform(xform)
                processed.add(xform)


class GRDDLAgent:
    """
    The main entry point for the GRDDL agent

    Takes a url and a graph to store the GRDDL result and attempts to 'glean'
    in the 4 major ways that GRDDL mandates
    """
    def __init__(self, url, graph):
        if DEBUG:
            print("Attempting a comprehensive glean of  %s" % (url,))
        parsedSource = None
        for gleanMethod in [XMLGlean, XMLNSGlean, XHTMLProfileGlean, ValidXHTMLGlean]:
            # Don't reparse the GRDDL source: once one glean has parsed it,
            # hand the DOM to the remaining ones.  While parsedSource is still
            # None each glean attempts the fetch itself.
            parsedSource = gleanMethod(url, graph, preParsedDOM=parsedSource).dom


class XMLGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#grddl-xml - Adding GRDDL to well-formed XML

    The general form of associating a GRDDL transformation link with a
    well-formed XML document is by adorning the root element with a grddl
    namespace declaration and a grddl:transformation attribute whose value is
    a URI reference, or list of URI references, that refer to executable
    scripts or programs which are expected to transform the source document
    into RDF.
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Applies the transformations listed in the root element's
        grddl:transformation attribute, if any.  self.xforms holds the
        attribute value(s) found (empty when there is no parsed source).

        >>> g = XMLGlean(u'http://www.w3.org/2003/g/po-ex', Graph())
        >>> g.appliedTransforms[0]
        u'http://www.w3.org/2003/g/embeddedRDF.xsl'
        >>> pprint(list(g.graph))
        [(u'http://www.w3.org/2003/g/po-ex',
          u'http://www.w3.org/2003/g/data-view#namespaceTransformation',
          u'http://www.w3.org/2003/g/grokPO.xsl')]
        """
        super(XMLGlean, self).__init__(url, graph, preParsedDOM)
        # Always defined, so callers can read it even when nothing was parsed
        self.xforms = []
        if self.dom:
            attrs = self.dom.xpath(u'/*/@data-view:transformation',
                                   {u'data-view': u'http://www.w3.org/2003/g/data-view#'})
            self.xforms = [attr.value for attr in attrs]
            if self.xforms:
                self.transform(self.xforms[0])


class XMLNSGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#ns-bind - Using GRDDL with XML Namespace
    Documents

    Any resource available for retrieval from a namespace URI is a namespace
    document (cf. section 4.5.4. Namespace documents in [WEBARCH]). For
    example, a namespace document may have an XML Schema representation or an
    RDF Schema representation, or perhaps both.

    To associate a GRDDL transformation with a whole dialect, have the
    namespace document include the grddl:namespaceTransformation property.

        * if an information resource ?D has an XML representation whose root
          element has a namespace name ?NS then any GRDDL result of the
          resource identified by ?NS is a GRDDL result of ?D
        * if an information resource ?D has an XML representation whose root
          element has a namespace name ?NS and the gleaned result includes,
          for any ?TX, the RDF triple
          { ?NS grddl:namespaceTransformation ?TX }
          then ?TX is also a transformation of ?D
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Gleans the namespace document of the source's root element and then
        applies every grddl:namespaceTransformation recorded for that
        namespace.  self.nsURI is the root element's namespace URI (None when
        there is no parsed source).

        >>> g = XMLNSGlean(u'http://www.w3.org/2003/g/po-doc.xml', Graph())
        >>> g.nsURI
        u'http://www.w3.org/2003/g/po-ex'
        >>> len(g.graph)
        16
        """
        super(XMLNSGlean, self).__init__(url, graph, preParsedDOM)
        self.nsURI = None
        if self.dom:
            self.nsURI = self.dom.xpath(u'/*')[0].namespaceURI
            if self.nsURI and self.nsURI not in NSDispatchTermination:
                # glean GRDDL result from the namespace document - unless this
                # document *is* its own namespace document, in which case
                # dispatching to it again would recurse without end
                if self.nsURI != self.url:
                    GRDDLAgent(self.nsURI, self.graph)
                # apply the namespace transforms on the GRDDL source, merging
                # the GRDDL results as we go
                self._applyLinkedTransforms(self.nsURI, GRDDL_NS.namespaceTransformation)


class ValidXHTMLGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#grddl-xhtml - Using GRDDL with valid XHTML

    The general form of adding a GRDDL assertion to a valid XHTML document is
    by specifying the GRDDL profile in the profile attribute of the head
    element, and transformation as the value of the rel attribute of a link
    or a element whose href attribute value is a URI reference that refers to
    an executable script or program which is expected to transform the source
    document into RDF. This method is suitable for use with valid XHTML
    documents which are constrained by an XML DTD.

    Stated more formally:

        * An XHTML document whose metadata profiles include
          http://www.w3.org/2003/g/data-view has a GRDDL transformation for
          each resource identified by a link of type transformation.
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Applies the transformation referenced by each rel="transformation"
        link / a element of an XHTML source whose head profile mentions the
        GRDDL profile URI.
        """
        super(ValidXHTMLGlean, self).__init__(url, graph, preParsedDOM)
        if self.dom:
            xhtmlNSMap = {u'xhtml': XHTML_NS}
            for xform in self.dom.xpath(u'/xhtml:html[xhtml:head[contains(@profile, "%s")]]//xhtml:*[(local-name() = "a" or local-name() = "link") and @rel="transformation"]/@href'%GRDDL_URI, xhtmlNSMap):
                self.transform(xform.value)


class XHTMLProfileGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#profile-bind - GRDDL for HTML Profiles

    A more formal description on the relation between GRDDL and XHTML
    profiles follows:

        * if an information resource ?D has an XHTML representation whose
          profile attribute refers to ?PROFILE, then any GRDDL result of
          ?PROFILE is a GRDDL result of ?D
        * if an information resource ?D has an XHTML representation whose
          profile attribute refers to ?PROFILE and the gleaned result
          includes, for any ?TX, the RDF triple
          { ?PROFILE grddl:profileTransformation ?TX }
          then ?TX is also a GRDDL transformation of ?D
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        Gleans each profile document named by the source's head/@profile
        attribute (other than the GRDDL profile itself) and then applies
        every grddl:profileTransformation recorded for that profile.
        self.profiles is the list of profile URIs found.

        >>> g = XHTMLProfileGlean(u'http://www.w3.org/2003/g/data-view', Graph())
        >>> g.profiles
        >>> len(g.graph)
        """
        super(XHTMLProfileGlean, self).__init__(url, graph, preParsedDOM)
        self.profiles = []
        if self.dom:
            profileAttrs = self.dom.xpath(u'/xhtml:html/xhtml:head/@profile', {u'xhtml': XHTML_NS})
            if profileAttrs:
                # @profile is a whitespace separated list of URIs
                self.profiles = profileAttrs[0].value.split()
            for profile in self.profiles:
                if profile == GRDDL_URI:
                    continue
                if DEBUG:
                    print("processing profile url:  %s" % (profile,))
                # glean GRDDL result from the profile document - unless this
                # document names itself as a profile, in which case
                # dispatching to it again would recurse without end
                if profile != self.url:
                    GRDDLAgent(profile, self.graph)
                # apply the profile transforms on the GRDDL source, merging
                # the GRDDL results as we go
                self._applyLinkedTransforms(profile, GRDDL_NS.profileTransformation)


OWL_NS = Namespace("http://www.w3.org/2002/07/owl#")
Any = None
LOG = Namespace("http://www.w3.org/2000/10/swap/log#")


def main():
    """
    Gleans the URL given as the first command line argument and prints the
    resulting graph serialized as RDF/XML.
    """
    graph = Graph()
    GRDDLAgent(sys.argv[1], graph)
    print(graph.serialize(format='xml'))


def test():
    """Runs the doctests embedded in this module's docstrings."""
    import doctest
    doctest.testmod()


if __name__ == '__main__':
    # Runs the doctests by default; call main() instead to glean sys.argv[1]
    test()