#!/usr/bin/env python

'''
Include external xml chunks into an XHTML file
according to two different schemes:
regexp: within an XHTML file <include> elements indicate
  tokens to be used with a regular expression
  to excerpt external XML found between comments tags.
  (Source document is invalid; easy to demarcate
   contiguous regions, fast.)
xml: within an XHTML file <?pi?>'s specify an XPointer
  which indicates what XML to excerpt.
  (Source document is valid, requires no change to 
   schema and examples, flexible expressions; slower (i.e. Xpath),
   not all XPath/XPointer expressions are supported presently.)
Usage: pin.py -m [r(egexp)|x(ml)] input output'''

def encode(s):
    '''
    Escape the markup characters of s so that XML source can be shown
    as text inside an XHTML document.

    "&", "<" and ">" become their entity references and every tab is
    expanded to two spaces.  The escaped string is returned.
    '''

    # "&" has to be handled first: otherwise the ampersands of the
    # entities produced for "<" and ">" would be escaped a second time.
    substitutions = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("\t", "  "),
    )
    escaped = s
    for raw, replacement in substitutions:
        escaped = escaped.replace(raw, replacement)
    return escaped

from xml.sax.handler import ContentHandler
class specHandler(ContentHandler):
    '''
    The event handler class that copies a document to the output while
    replacing include processing instructions with XML excerpted from
    an external resource (optionally narrowed by an XPath selection).

    Default() and PI() are meant to be installed as the DefaultHandler
    and ProcessingInstructionHandler of an expat parser.
    '''

    def __init__(self, infd, outfd):
        """Save the input/output file descriptors to self."""
        ContentHandler.__init__(self)
        self.infd = infd
        self.outfd = outfd

    def Default(self, data):
        '''Handler for everything but the PI: write data out unchanged.'''
        self.outfd.write(data)

    def PI(self, target, data):
        '''
        PI handler: output the appropriate XML from the external source.

        data is expected to look like
            class="CLASS" source="PATH[#xpointer(XPATH)]"
        The output is one or more <pre class="CLASS"> elements holding
        the escaped XML:
          - without a fragment, the whole external resource;
          - with an xpointer(...) fragment, one <pre> per node selected
            by the XPath expression (or a "not found" <pre> when the
            selection is an empty list).
        A PI whose data does not have that form is written through to
        the output as is.  Any other kind of fragment identifier
        prints a message and exits the program with status 1.
        '''

        import re, sys

        pattern = re.compile(r'\s*class="(?P<hclass>.*?)" source="(?P<url>.*?)"')
        match = pattern.match(data)
        if match is None:
            # Not an include instruction (e.g. <?xml-stylesheet ...?>):
            # hand it through instead of failing on match.group().
            if data:
                self.outfd.write('<?%s %s?>' % (target, data))
            else:
                self.outfd.write('<?%s?>' % target)
            return
        hclass, url = match.group('hclass', 'url')

        import urlparse
        url = urlparse.urlparse(url)
        loc = ''.join(url[0:3])    # the url absent a fragment
        frag = url[5]              # the fragment identifier, '' if none
        print("\n**** location='%s' frag='%s'" % (loc, frag))

        fd = open(loc)
        try:
            resource = fd.read()
        finally:
            fd.close()

        if frag == '':
            print("   returning external resource")
            chunk = '<pre class="%s">%s</pre>' % (hclass, encode(resource))
            self.outfd.write(chunk)
            return

        if frag[:9] != 'xpointer(':
            print("Sorry, I don't understand this fragment ID")
            sys.exit(1)

        # isolate the actual xpath expression: drop 'xpointer(' and
        # the closing ')'
        path = frag[9:-1]

        from xml.dom.ext.reader import PyExpat
        from xml.xpath import Compile
        from xml.xpath.Context import Context
        from xml.dom.ext import Canonicalize

        reader = PyExpat.Reader()
        dom = reader.fromString(resource)
        expression = Compile(path)
        print(' '.join(["    Expression: ", str(expression)]))
        context = Context(dom.documentElement)
        nodes = expression.evaluate(context)
        if nodes == []:
            print("Expression '%s' not found" % frag)
            self.outfd.write('<pre class="%s">Expression "%s" not found</pre>' % (hclass, frag))
        else:
            for node in nodes:
                # exc-c14n = no unneeded namespaces, but ugly
                # white-space (PrettyPrint is the opposite trade-off).
                chunk = Canonicalize(node, unsuppressedPrefixes=[])
                chunk = '<pre class="%s">%s</pre>' % (hclass, encode(chunk))
                self.outfd.write(chunk)

    
def xmlTransform(infd, outfd):
    '''
    This transform takes a PI within XHTML:
      <?include class="Code" 
        source="Schemas/xkms.xsd#xpointer(//complexType[@name='RespondWith'])"?>
    and subsitutes it with XML from the source 
    '''
    
    # Parse, instantiate and kick off the event handlers   
    from xml.parsers import expat
        
    print "infd is", infd
    print "outfd is", outfd
    sh = specHandler(infd, outfd)
    parser = expat.ParserCreate()
    
    parser.ProcessingInstructionHandler = sh.PI
    parser.DefaultHandler = sh.Default
    parser.ParseFile(infd)
    
    
    
    
def regexpTransform(infd, outfd):
    """
    This transform takes an include element within XHTML:
      <include class="Code" source="Schemas/xkms.xsd" section="MessageAbstractType"></include>
    and substitutes it with XML from the source as bounded by the comments:
      <!-- MessageAbstractType --> blah blah <!-- /MessageAbstractType -->

    infd is iterated line by line and may be any iterable of lines;
    every line is written to outfd.  The include element has to start
    its line (leading white-space allowed).  The whole matching line,
    newline included, is replaced by the string regexpFetch() returns.
    Attributes absent from the element are passed to regexpFetch() as ''.
    """
    import re
    pattern = re.compile(r'\s*<include class="(?P<hclass>.*?)" source="(?P<source>.*?)"( section="(?P<section>.*?)")?.*?></include>')

    # Iterate the file object directly rather than through the
    # deprecated xreadlines().
    for line in infd:
        match = pattern.match(line)
        if match:
            hclass = match.group('hclass') or ''
            source = match.group('source') or ''
            section = match.group('section') or ''
            line = regexpFetch(hclass, source, section)
        outfd.write(line)

        
def regexpFetch(hclass, source, section):
    '''
    Return a <pre class="hclass"> element holding escaped XML taken
    from the file named source.

    hclass:  value for the class attribute of the <pre> element.
    source:  path of the file to excerpt from.
    section: name of the region to excerpt, i.e. the text between
             <!-- section --> and <!-- /section -->; when '' the
             whole file is included.

    If the section is not found, a <pre> carrying a "not found"
    message is returned instead.  If source cannot be opened, a
    message is printed and the program exits with status 1.
    '''

    import re, sys

    try:
        fd = open(source)
    except (IOError, OSError):
        print("file %s doesn't exist" % source)
        sys.exit(1)
    try:
        resource = fd.read()
    finally:
        fd.close()

    print("hclass=%s, source=%s, section=%s" % (hclass, source, section))

    if section == '':
        # No section requested: include the contents of the file (not
        # its name).
        body = encode(resource)
    else:
        # The section name is matched literally, so escape any regular
        # expression metacharacters it may contain.
        marker = re.escape(section)
        match = re.search(r'<!--\s?%s\s?-->(.*?)<!--\s?/%s\s?-->' % (marker, marker),
            resource, re.DOTALL)
        if not match:
            return '<pre class="%s">%s not found.</pre>' % (hclass, section)
        body = encode(match.group(1))

    return '<pre class="%s">%s</pre>' % (hclass, body)
        

if __name__ == "__main__":
    # Command-line entry point:
    #   pin.py -m [r(egexp)|x(ml)] input output
    # A missing input/output argument falls back to stdin/stdout.

    import getopt, sys
    mode = 'xml'    # transform used when no -m option is given

    try:
        (options,files) = getopt.getopt (sys.argv[1:],"m:")
    except getopt.error:
        print "Error: Unknown option or missing argument."
        print "Usage: pin.py -m [r(egexp)|x(ml)] input output"
        # options and files were never bound, so going on would only
        # fail with a NameError below: stop with a usage error status.
        sys.exit(2)
    for (option,value) in options:
        if option == '-m':
            if value == 'r': mode = 'regexp'
            elif value == 'x': mode = 'xml'
    print "FILES: ", files
    try:
        infd = open(files[0])
    except IndexError:
        infd = sys.stdin
    try:
        outfd = open(files[1], 'w')
    except IndexError:
        outfd = sys.stdout

    # Close (and thereby flush) both files even when a transform fails.
    try:
        if mode == 'regexp' : regexpTransform(infd,outfd)
        elif mode == 'xml' : xmlTransform(infd,outfd)
        else: print "Sorry, I don't know what transform you want."
    finally:
        infd.close()
        outfd.close()
