#!/usr/bin/python
"""
NTriples Tools: Parses and serializes N-Triples documents.
http://infomesh.net/2001/10/ntriples/

Built on Aaron Swartz's RDF API: http://blogspace.com/rdf/rdfapi.txt
cf. http://www.w3.org/TR/2001/WD-rdf-testcases-20010912/#ntriples
"""

import re
import string
import sys
import urllib

import rdfapi as rdf

__author__ = "Sean B. Palmer with Aaron Swartz"
__version__ = '1.1'
__license__ = """
Copyright (C) 2001 Sean B. Palmer.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""

# Literal values are handed to the store as Unicode text. `unicode` only
# exists on Python 2; on Python 3 `str` already is Unicode text.
try:
    _text_type = unicode
except NameError:
    _text_type = str

# One term: <uri>, _:label, or "literal" (an escaped \" may appear inside).
_TERM = r'(<[^>]+>|_:[^\s]+|\"(?:\\\"|[^"])*\")'

# One statement: three terms separated by spaces/tabs, then a literal '.'.
# The dot is escaped on purpose: an unescaped '.' matches any character, which
# lets '<a> <b> _:abc' (no terminator) parse as a triple with bnode 'ab'.
_TRIPLE_RE = re.compile(
    r'[ \t]*' + _TERM + r'[ \t]+' + _TERM + r'[ \t]+' + _TERM
    + r'[ \t]*\.[ \t]*')

# Compiled once here instead of once per blank-node term inside the parse loop.
_BNODE_NAME_RE = re.compile(r'[A-Za-z][A-Za-z0-9]*', re.S)


def _parse_term(term, bnodes):
    """Convert one matched N-Triples term into the value given to the store.

    `bnodes` maps blank-node labels to the node objects already created for
    them; it is updated in place so that a label seen twice yields the same
    node object.

    Raises ValueError for a blank-node label that fails the name check, or for
    a term that is none of the three recognised forms.
    """
    if term[0] == '<' and term[-1] == '>':
        # URI reference: strip the angle brackets.
        return term[1:-1]
    if term[:2] == '_:':
        label = term[2:]
        # NOTE(review): match() only anchors at the start, so this rejects
        # labels that do not *begin* with a letter but accepts e.g. 'a-b'.
        # Left as-is to avoid rejecting documents that parse today.
        if not _BNODE_NAME_RE.match(label):
            raise ValueError('bnode: "' + label + '" is not a valid bNode')
        if label not in bnodes:
            bnodes[label] = rdf.node()
        return bnodes[label]
    if term[0] == '"' and term[-1] == '"':
        # Literal: strip the quotes; backslash escapes are passed through
        # undecoded.
        return _text_type(term[1:-1])
    raise ValueError('Term ' + str(term) + ' is not a valid NTriples term.')


def parse(document, store=None):
    """Parse an N-Triples document and add its triples to a store.

    document -- the N-Triples text. '\\r\\n' and '\\r' line endings are
                normalised to '\\n' before the text is split into lines.
    store    -- object whose triple(subject, predicate, object) method is
                called once per statement. When omitted, a new rdf.Store()
                is created for this call (the previous default was a single
                Store shared by every call, so triples accumulated).

    URI terms are passed to the store as strings without the angle brackets,
    literals as Unicode text without the quotes, and blank nodes as objects
    from rdf.node(); the same label within one call maps to the same object.

    Empty lines, whitespace-only lines and lines whose first non-blank
    character is '#' are skipped. Text following the terminating '.' of a
    statement is not checked.

    Returns the store.

    Raises ValueError if the document is empty, if a line is neither a
    statement, a comment nor blank, or if a term is invalid. Triples from
    earlier lines have already been added to the store when that happens.
    """
    if not document:
        raise ValueError('Document has no content')
    if store is None:
        store = rdf.Store()

    bnodes = {}
    document = document.replace('\r\n', '\n').replace('\r', '\n')

    for line in document.split('\n'):
        content = line.lstrip(' \t')
        if not content or content.startswith('#'):
            continue  # blank line or comment (possibly indented)
        match = _TRIPLE_RE.match(line)
        if match is None:
            raise ValueError('Line is invalid: %r' % (line,))
        triple = [_parse_term(term, bnodes) for term in match.groups()]
        store.triple(triple[0], triple[1], triple[2])
    return store


def serialize(store):
    """Return the triples in store.tripleList as N-Triples text (co-written).

    Each triple becomes one line of the form 'subject predicate object .'.
    For each position of a triple:

    * an object with no 'uri' attribute is written as a blank node '_:sN',
      where N counts up from 1 in order of first appearance;
    * a uri starting with 'data:,' is passed through rdf.URIToLiteral and
      written between double quotes;
    * any other uri is written between angle brackets.

    Lines are joined with '\\n' and there is no trailing newline; an empty
    tripleList gives an empty string.
    """
    uri_to_literal = rdf.URIToLiteral
    node_ids = {}
    lines = []
    for t in store.tripleList:
        parts = []
        for pos in (t.subject, t.predicate, t.object):
            if not hasattr(pos, 'uri'):
                # Dict membership instead of scanning a list of keys.
                if pos not in node_ids:
                    node_ids[pos] = 's%d' % (len(node_ids) + 1)
                parts.append('_:' + node_ids[pos])
            elif pos.uri[:6] == 'data:,':
                parts.append('"' + uri_to_literal(pos.uri) + '"')
            else:
                parts.append('<' + pos.uri + '>')
        lines.append(' '.join(parts) + ' .')
    return '\n'.join(lines)


def run():
    """Parse the file named by the first command-line argument and print it
    back out as N-Triples.

    Exits with status 1 and a usage message on stderr when no file name is
    given.
    """
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: %s FILE\n' % sys.argv[0])
        sys.exit(1)
    source = open(sys.argv[1])
    try:
        document = source.read()
    finally:
        source.close()  # do not leak the handle if read() fails
    print(serialize(parse(document)))


# Main program

if __name__ == "__main__":
    run()

# Phew