- From: Yosi Scharf <syosi@mit.edu>
- Date: Thu, 20 Jan 2005 18:56:20 -0500
- To: "Sean B. Palmer" <sean+cwm@infomesh.net>
- Cc: public-cwm-talk@w3.org
- Message-ID: <41F04524.10907@mit.edu>
I have been working for the last couple of days on getting cwm to use n3p.py as its n3 parser. This necessitated the following changes to n3p.py: - the regexp for whitespace was fixed so a comment could end the file, with no trailing newline - I had to create a way for an N3Parser to be created where it would just accept a string, instead of trying to open the n3 file itself - On larger files (notably http://www.w3.org/2000/10/swap/test/rdfcore-tests.n3 ), it was running into the recursion limit. I made it no longer recursive. - "has" was added to the initial list of known keywords. Attached is the diff. It is less clean than it could be, because I was running into indent spacing issues. The way to invoke N3Parser without it opening anything is an ugly hack. Strangely enough, the modifications to parse() actually work, as far as I can tell. Yosi Scharf
40c40 < r_whitespace = re.compile(r'[ \t\r\n]*(?:(?:#[^\n]*)?\r?\n)?') --- > r_whitespace = re.compile(r'[ \t\r\n]*(?:(?:#[^\n]*)?\r?(?:$|\n))?') 51,56c51,59 < def __init__(self, uri, branches, regexps): < if (uri != 'file:///dev/stdin'): < u = urllib.urlopen(uri) < self.data = u.read() < u.close() < else: self.data = sys.stdin.read() --- > def __init__(self, uri, branches, regexps): > if uri == 'nowhere': > pass > else: > if (uri != 'file:///dev/stdin'): > u = urllib.urlopen(uri) > self.data = u.read() > u.close() > else: self.data = sys.stdin.read() 61c64 < self.keywords = set(("a", "is", "of", "this")) --- > self.keywords = set(("a", "is", "of", "this", "has")) 65,102c68,116 < def parse(self, prod): < tok = self.token() < # Got an opened production < self.onStart(abbr(prod)) < if not tok: < return tok # EOF < < prodBranch = self.branches[prod] < sequence = prodBranch.get(tok, None) < if sequence is None: < print >> sys.stderr, 'prodBranch', prodBranch < raise Exception("Found %s when expecting a %s" % (tok, prod)) < for term in sequence: < if isinstance(term, unicode): < j = self.pos + len(term) < word = self.data[self.pos:j] < if word == term: < self.onToken(term, word) < self.pos = j < elif '@' + word[:-1] == term: < self.onToken(term, word[:-1]) < self.pos = j - 1 < else: raise Exception("Found %s; %s expected" % \ < (self.data[self.pos:self.pos+10], term)) < elif not self.regexps.has_key(term): < self.parse(term) < continue < else: < regexp = self.regexps[term] < m = regexp.match(self.data, self.pos) < if not m: < raise Exception("Token: %r should match %s" % \ < (self.data[self.pos:self.pos+10], regexp.pattern)) < end = m.end() < self.onToken(abbr(term), self.data[self.pos:end]) < self.pos = end < self.token() < self.onFinish() --- > def parse(self, prod): > todo_stack = [[prod, None]] > while todo_stack: > #print todo_stack > #prod = todo_stack.pop() > if todo_stack[-1][1] is None: > todo_stack[-1][1] = [] > tok = self.token() > # Got an opened production 
> self.onStart(abbr(todo_stack[-1][0])) > if not tok: > return tok # EOF > > prodBranch = self.branches[todo_stack[-1][0]] > sequence = prodBranch.get(tok, None) > if sequence is None: > print >> sys.stderr, 'prodBranch', prodBranch > raise Exception("Found %s when expecting a %s . todo_stack=%s" % (tok, todo_stack[-1][0], `todo_stack`)) > for term in sequence: > todo_stack[-1][1].append(term) > while todo_stack[-1][1]: > term = todo_stack[-1][1].pop(0) > if isinstance(term, unicode): > j = self.pos + len(term) > word = self.data[self.pos:j] > if word == term: > self.onToken(term, word) > self.pos = j > elif '@' + word[:-1] == term: > self.onToken(term, word[:-1]) > self.pos = j - 1 > else: raise Exception("Found %s; %s expected" % \ > (self.data[self.pos:self.pos+10], term)) > elif not self.regexps.has_key(term): > todo_stack.append([term, None]) > continue > else: > regexp = self.regexps[term] > m = regexp.match(self.data, self.pos) > if not m: > raise Exception("Token: %r should match %s" % \ > (self.data[self.pos:self.pos+10], regexp.pattern)) > end = m.end() > self.onToken(abbr(term), self.data[self.pos:end]) > self.pos = end > self.token() > while todo_stack[-1][1] == []: > todo_stack.pop() > self.onFinish()
Received on Thursday, 20 January 2005 23:56:55 UTC