#!/usr/bin/env python
"""
Gleaning Resource Descriptions from Dialects of Languages (GRDDL)

That is, GRDDL provides a relatively inexpensive mechanism for bootstrapping RDF content from
uniform XML dialects; shifting the burden from formulating RDF to creating transformation 
algorithms specifically for each dialect. XML Transformation languages such as XSLT are 
quite versatile in their ability to process, manipulate, and generate XML. The use of XSLT 
to generate XHTML from single-purpose XML vocabularies is historically celebrated as a powerful 
idiom for separating structured content from presentation.

GRDDL shifts this idiom to a different end: separating structured content from its authoritative 
meaning (or semantics). GRDDL works by associating transformations for an individual document, 
either through direct inclusion of references or indirectly through profile and namespace documents. 

See: http://4suite.org/docs/CoreManual.xml#xpath_query

Copyright (c) 2006, Chimezie Ogbuji
All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the name of inamidst.com nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
import time, sys, urllib2, re
from sets import Set
try:
    from cStringIO import StringIO
except ImportError:        
    from StringIO import StringIO
from pprint import pprint
from rdflib import Variable, BNode, URIRef, Literal, Namespace, RDF, RDFS
from rdflib.Collection import Collection
from rdflib.Graph import ConjunctiveGraph, QuotedGraph, ReadOnlyGraphAggregate, Graph
from rdflib.syntax.NamespaceManager import NamespaceManager
import  Ft.Xml.Domlette
from Ft.Xml import Parse
from Ft.Xml.Xslt import Processor
from Ft.Xml import InputSource
from Ft.Lib.Uri import Absolutize

#The GRDDL vocabulary namespace (note the trailing '#'); used to build the
#namespaceTransformation / profileTransformation predicates
GRDDL_NS = Namespace("http://www.w3.org/2003/g/data-view#")
#The GRDDL profile URI (no trailing '#'); matched against XHTML head/@profile values
GRDDL_URI = u'http://www.w3.org/2003/g/data-view'
#The XHTML namespace URI, bound to the 'xhtml' prefix in the XPath queries below
XHTML_NS = u"http://www.w3.org/1999/xhtml"
#Built-in list of namespace URIs that should terminate any recursive namespace dispatch
NSDispatchTermination = [XHTML_NS]
#When True, progress / diagnostic messages are printed to standard output
DEBUG = False
#When True, a retrieved resource is only parsed as XML if its Content-Type header
#looks like an XML media type; when False, parsing is always attempted
CHECK_XML_MIMETYPE = False

class Glean(object):
    """
    Handles all the GRDDL XML parsing and XSLT transformation from URLs
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        
        >>> g = Glean(u'http://www.w3.org/2003/g/po-doc.xml',Graph())
        >>> g.dom.documentElement.localName
        u'purchaseOrder'
        
        """
        self.graph = graph
        self.url = url
        if preParsedDOM:
            self.dom = preParsedDOM 
        else:
            req = urllib2.Request(url)
            try:
                #peek in response headers to determine content-type
                u = urllib2.urlopen(req)
                headers = u.info()
                if re.match(r'(?:text|application)/.*\+?xml', headers['content-type']) is None and CHECK_XML_MIMETYPE:
                    #What does the spec mandate about this scenario?
                    if DEBUG:
                        print "Ignoring non-xml information resource", url
                        print headers['content-type'].split(';')[0]
                    self.dom = None
                else:
                    rt = u.read()
                    self.dom = Ft.Xml.Domlette.NonvalidatingReader.parseString(rt, url)
            except Exception, e:
                if DEBUG:
                    print "Unable to parse ", url, repr(e)
                #Unable to glean.  Fail gracefully..
                self.dom = None
        self.appliedTransforms = []
        
    def transform(self, transformURLs):
        """
        Takes a space seperated list of transform url's and applies them against thhe
        pre-parsed DOM of the GRDDL source - making sure to avoid transformation already applied
        """        
        processor = Processor.Processor()
        for xformURL in transformURLs.split():
            if DEBUG:
                print "applying transformation %s" % (xformURL)
            if xformURL not in self.appliedTransforms:
                self.appliedTransforms.append(xformURL)
            #The transform url is resolved against the source URL (to accomodate relative urls)
            transform = InputSource.DefaultFactory.fromUri(Absolutize(xformURL, self.url))
            processor.appendStylesheet(transform)
            #see: http://www.w3.org/TR/grddl/#stylepi
            result = processor.runNode(self.dom, self.url, ignorePis=1)
            #get output method / media-type
#            <!-- Category: top-level-element -->
#            <xsl:output
#              method = "xml" | "html" | "text" | qname-but-not-ncname
#              version = nmtoken
#              encoding = string
#              omit-xml-declaration = "yes" | "no"
#              standalone = "yes" | "no"
#              doctype-public = string
#              doctype-system = string
#              cdata-section-elements = qnames
#              indent = "yes" | "no"
#              media-type = string />

            #How to accomodate @media-type?
            method = processor.outputParams.method[-1]
            if method == 'xml':
                self.graph.parse(StringIO(result), publicID=self.url)
            elif method == 'text':
                #Attempt a Notation 3 parse (covers NTriples, and Turtle)
                self.graph.parse(StringIO(result), format='n3', publicID=self.url)
            else:
                #HTML result - recursive GRDDL mechanism?
                raise Exception("unsupported output type")

class GRDDLAgent:
    """
    The main entry point for the GRDDL agent
    Takes a url and a graph to store the GRDDL result and attempts to 'glean'
    in the 4 major ways that GRDDL mandates
    """
    def __init__(self, url, graph):
        if DEBUG:
            print "Attempting a comprehensive glean of ", url
        parsedSource = None
        for gleanMethod in [XMLGlean, XMLNSGlean, XHTMLProfileGlean, ValidXHTMLGlean]:
            #Don't reparse the GRDDL source
            if not parsedSource:
                gleaned = gleanMethod(url, graph)
                parsedSource = gleaned.dom                
            elif parsedSource:
                gleanMethod(url, graph, preParsedDOM=parsedSource)            

class XMLGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#grddl-xml - Adding GRDDL to well-formed XML

    The general form of associating a GRDDL transformation link with a well-formed
    XML document is by adorning the root element with a grddl namespace declaration
    and a grddl:transformation attribute whose value is a URI reference, or list
    of URI references, that refer to executable scripts or programs which are
    expected to transform the source document into RDF.

    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        url          - the URL of the GRDDL source document
        graph        - the graph to merge GRDDL results into
        preParsedDOM - an already parsed DOM of the source (avoids fetching
                       and parsing the url again)

        self.xforms holds the values of the grddl:transformation attribute(s)
        found on the root element (empty when there is no parsed source).

        >>> g = XMLGlean(u'http://www.w3.org/2003/g/po-ex', Graph())
        >>> g.appliedTransforms[0]
        u'http://www.w3.org/2003/g/embeddedRDF.xsl'
        >>> pprint(list(g.graph))
        [(u'http://www.w3.org/2003/g/po-ex',
          u'http://www.w3.org/2003/g/data-view#namespaceTransformation',
          u'http://www.w3.org/2003/g/grokPO.xsl')]
        """
        #Forward the pre-parsed DOM so that the source is not fetched again
        super(XMLGlean, self).__init__(url, graph, preParsedDOM=preParsedDOM)
        #Always defined, even when the source could not be parsed
        self.xforms = []
        if self.dom:
            self.xforms = [attr.value for attr in self.dom.xpath(u'/*/@data-view:transformation', {u'data-view':u'http://www.w3.org/2003/g/data-view#'})]
            if self.xforms:
                #The attribute value is itself a space separated list of
                #transform URLs, which transform() splits and applies in turn
                self.transform(self.xforms[0])

class XMLNSGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#ns-bind - Using GRDDL with XML Namespace Documents

    Any resource available for retrieval from a namespace URI is a namespace document
    (cf. section 4.5.4. Namespace documents in [WEBARCH]). For example, a namespace
    document may have an XML Schema representation or an RDF Schema representation,
    or perhaps both

    To associate a GRDDL transformation with a whole dialect, have the namespace document
    include the grddl:namespaceTransformation property.

    * if an information resource ?D  has an XML representation whose root element has a
      namespace name ?NS then any GRDDL result of the resource identified by ?NS  is a GRDDL
      result of ?D
    * if an information resource ?D has an XML representation whose root element has a
      namespace name ?NSDOC** and ?D has a GRDDL result that includes, for any ?TX, the RDF
      triple { ?NSDOC <http://www.w3.org/2003/g/data-view#namespaceTransformation> ?TX } then
      ?TX is also a transformation of ?D

    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        url          - the URL of the GRDDL source document
        graph        - the graph to merge GRDDL results into
        preParsedDOM - an already parsed DOM of the source (avoids fetching
                       and parsing the url again)

        self.nsURI is the namespace URI of the source's root element, or None
        when there is no parsed source.

        >>> g = XMLNSGlean(u'http://www.w3.org/2003/g/po-doc.xml', Graph())
        >>> g.nsURI
        u'http://www.w3.org/2003/g/po-ex'
        >>> len(g.graph)
        16
        """
        #Forward the pre-parsed DOM so that the source is not fetched again
        super(XMLNSGlean, self).__init__(url, graph, preParsedDOM=preParsedDOM)
        self.nsURI = None
        if self.dom:
            self.nsURI = self.dom.xpath(u'/*')[0].namespaceURI
            if self.nsURI and self.nsURI not in NSDispatchTermination:
                #glean GRDDL result from the namespace document.  A document whose
                #root namespace is its own URL is skipped: dispatching on it would
                #glean this very document again, without end
                if self.nsURI != self.url:
                    GRDDLAgent(self.nsURI, self.graph)
                #Query with a proper RDF term rather than a bare string
                nsNode = URIRef(self.nsURI)
                #setup a set of processed transforms to avoid infinite namespace snooping cycles
                processedNSXForms = Set()
                #Repeatedly look for 'new' namespace transformations, since applying
                #one may add further namespaceTransformation triples to the graph
                while True:
                    #Collect first, apply afterwards: transform() adds to the graph
                    todoXForms = Set([xform for s, p, xform in
                                      self.graph.triples((nsNode, GRDDL_NS.namespaceTransformation, None))
                                      if xform not in processedNSXForms])
                    #continue only if we have xforms to apply
                    if not todoXForms:
                        break
                    #apply the new namespace transforms on the GRDDL source, merging the GRDDL results as we go
                    for newXForm in todoXForms:
                        self.transform(newXForm)
                        processedNSXForms.add(newXForm)

class ValidXHTMLGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#grddl-xhtml - Using GRDDL with valid XHTML

    The general form of adding a GRDDL assertion to a valid XHTML document
    is by specifying the GRDDL profile in the profile attribute of the head
    element, and transformation as the value of the rel attribute of a link
    or a element whose href attribute value is a URI reference that refers
    to an executable script or program which is expected to transform the
    source document into RDF. This method is suitable for use with valid
    XHTML documents which are constrained by an XML DTD.

    Stated more formally:

    * An XHTML document whose metadata profiles include
      http://www.w3.org/2003/g/data-view has a GRDDL transformation for each
      resource identified by a link of type transformation.

    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        url          - the URL of the GRDDL source document
        graph        - the graph to merge GRDDL results into
        preParsedDOM - an already parsed DOM of the source (avoids fetching
                       and parsing the url again)
        """
        #Forward the pre-parsed DOM so that the source is not fetched again
        super(ValidXHTMLGlean, self).__init__(url, graph, preParsedDOM=preParsedDOM)
        if self.dom:
            xhtmlNSMap = {u'xhtml':XHTML_NS}
            #@rel holds a space separated list of link types, so look for the
            #'transformation' token rather than requiring it to be the only value
            #(e.g. rel="alternate transformation")
            transformLinks = (u'/xhtml:html[xhtml:head[contains(@profile, "%s")]]'
                              u'//xhtml:*[(local-name() = "a" or local-name() = "link")'
                              u' and contains(concat(" ", normalize-space(@rel), " "), " transformation ")]'
                              u'/@href' % GRDDL_URI)
            for xform in self.dom.xpath(transformLinks, xhtmlNSMap):
                self.transform(xform.value)

class XHTMLProfileGlean(Glean):
    """
    http://www.w3.org/TR/grddl/#profile-bind - GRDDL for HTML Profiles
    
    A more formal description on the relation between GRDDL and XHTML profiles follows:

    * if an information resource ?D has an XHTML representation whose profile attribute 
      refers to ?PROFILE, then any GRDDL result of ?PROFILE is a GRDDL result of ?D
    * if an information resource ?D has an XHTML representation whose profile attribute 
      refers to ?PROFILE and ?D has a GRDDL result that includes, for any ?TX, the RDF triple 
      { ?PROFILE <http://www.w3.org/2003/g/data-view#profileTransformation> ?TX } then ?TX 
      is also a GRDDL transformation of ?D    
    
    """
    def __init__(self, url, graph, preParsedDOM=None):
        """
        >>> g = XHTMLProfileGlean(u'http://www.w3.org/2003/g/data-view', Graph())
        >>> g.profiles
        >>> len(g.graph)
        
        """
        super(XHTMLProfileGlean, self).__init__(url, graph)
        self.profiles = []
        if self.dom:
            profile = self.dom.xpath(u'/xhtml:html/xhtml:head/@profile', {u'xhtml':XHTML_NS})
            if profile:
                self.profiles = profile[0].value.split()
                for profile in self.profiles:
                    if profile == GRDDL_URI:
                        continue
                    if DEBUG:
                        print "processing profile url: ", profile
                    #glean GRDDL result from the profile document
                    GRDDLAgent(profile, self.graph)
                    continueRecursion = True
                    #setup a set of processed transforms to avoid infinite profile snooping cycles
                    processedProfileXForms = Set()
                    #Recursively find 'new' namespace transformations
                    while continueRecursion:
                        todoXForms = Set()
                        for s, p, xform in self.graph.triples((profile, GRDDL_NS.profileTransformation, None)):
                            if xform not in processedProfileXForms:
                                todoXForms.add(xform)
                        #continue only if we have xforms to apply
                        continueRecursion = bool(todoXForms)
                        #apply the new namespace transforms on the GRDDL source, merging the GRDDL results as we go
                        for newXForm in todoXForms:
                            self.transform(newXForm)
                            processedProfileXForms.add(newXForm)

        
#NOTE(review): none of the three names below is referenced by the code visible
#in this file - confirm whether other modules import them before removing
#The OWL vocabulary namespace
OWL_NS    = Namespace("http://www.w3.org/2002/07/owl#")
#Alias for None; presumably intended as a wildcard in triple patterns - confirm
Any = None
#The SWAP 'log' vocabulary namespace
LOG = Namespace("http://www.w3.org/2000/10/swap/log#")
    
def main():
    graph = Graph()
    g=GRDDLAgent(sys.argv[1], graph)
    print graph.serialize(format='xml')
    
def test():
    """
    Runs the doctest examples through doctest.testmod(), called with its
    default arguments.
    """
    from doctest import testmod
    testmod()
    #main() is deliberately not invoked from here

#When executed as a script, run the doctests (test() does not call main())
if __name__ == '__main__':
    test()