notes from n3p / cwm integration

I have been working for the last couple of days on getting cwm to use 
n3p.py as its N3 parser.

This necessitated the following changes to n3p.py:
- the regexp for whitespace was fixed so a comment could end the file, 
with no trailing newline
- I had to create a way to construct an N3Parser that just accepts a 
string, instead of trying to open the n3 file itself
- On larger files (notably 
http://www.w3.org/2000/10/swap/test/rdfcore-tests.n3 ), it was running 
into the recursion limit. I made it no longer recursive.
-  "has" was added to the initial list of known keywords


Attached is the diff. It is less clean than it could be, because I was 
running into indent spacing issues.
The way to invoke N3Parser without it opening anything is an ugly hack. 
Strangely enough, the modifications to parse() actually work, as far as 
I can tell.


Yosi Scharf
40c40
< r_whitespace = re.compile(r'[ \t\r\n]*(?:(?:#[^\n]*)?\r?\n)?')
---
> r_whitespace = re.compile(r'[ \t\r\n]*(?:(?:#[^\n]*)?\r?(?:$|\n))?')
51,56c51,59
<    def __init__(self, uri, branches, regexps): 
<       if (uri != 'file:///dev/stdin'): 
<          u = urllib.urlopen(uri)
<          self.data = u.read()
<          u.close()
<       else: self.data = sys.stdin.read()
---
>    def __init__(self, uri, branches, regexps):
>       if uri == 'nowhere':
>           pass
>       else:
>           if (uri != 'file:///dev/stdin'): 
>              u = urllib.urlopen(uri)
>              self.data = u.read()
>              u.close()
>           else: self.data = sys.stdin.read()
61c64
<       self.keywords = set(("a", "is", "of", "this"))
---
>       self.keywords = set(("a", "is", "of", "this", "has"))
65,102c68,116
<    def parse(self, prod): 
<       tok = self.token()
<       # Got an opened production
<       self.onStart(abbr(prod))
<       if not tok: 
<          return tok # EOF
< 
<       prodBranch = self.branches[prod]
<       sequence = prodBranch.get(tok, None)
<       if sequence is None: 
<          print >> sys.stderr, 'prodBranch', prodBranch
<          raise Exception("Found %s when expecting a %s" % (tok, prod))
<       for term in sequence: 
<          if isinstance(term, unicode): 
<             j = self.pos + len(term)
<             word = self.data[self.pos:j]
<             if word == term: 
<                self.onToken(term, word)
<                self.pos = j
<             elif '@' + word[:-1] == term: 
<                self.onToken(term, word[:-1])
<                self.pos = j - 1
<             else: raise Exception("Found %s; %s expected" % \
<                          (self.data[self.pos:self.pos+10], term))
<          elif not self.regexps.has_key(term): 
<             self.parse(term)
<             continue
<          else: 
<             regexp = self.regexps[term]
<             m = regexp.match(self.data, self.pos)
<             if not m: 
<                raise Exception("Token: %r should match %s" % \
<                       (self.data[self.pos:self.pos+10], regexp.pattern))
<             end = m.end()
<             self.onToken(abbr(term), self.data[self.pos:end])
<             self.pos = end
<          self.token()
<       self.onFinish()
---
>    def parse(self, prod):
>       todo_stack = [[prod, None]]
>       while todo_stack:
>           #print todo_stack
>           #prod = todo_stack.pop()
>           if todo_stack[-1][1] is None:
>               todo_stack[-1][1] = []
>               tok = self.token()
>               # Got an opened production
>               self.onStart(abbr(todo_stack[-1][0]))
>               if not tok: 
>                  return tok # EOF
> 
>               prodBranch = self.branches[todo_stack[-1][0]]
>               sequence = prodBranch.get(tok, None)
>               if sequence is None: 
>                  print >> sys.stderr, 'prodBranch', prodBranch
>                  raise Exception("Found %s when expecting a %s . todo_stack=%s" % (tok, todo_stack[-1][0], `todo_stack`))
>               for term in sequence:
>                  todo_stack[-1][1].append(term)
>           while todo_stack[-1][1]:
>              term = todo_stack[-1][1].pop(0)
>              if isinstance(term, unicode): 
>                 j = self.pos + len(term)
>                 word = self.data[self.pos:j]
>                 if word == term: 
>                    self.onToken(term, word)
>                    self.pos = j
>                 elif '@' + word[:-1] == term: 
>                    self.onToken(term, word[:-1])
>                    self.pos = j - 1
>                 else: raise Exception("Found %s; %s expected" % \
>                              (self.data[self.pos:self.pos+10], term))
>              elif not self.regexps.has_key(term): 
>                 todo_stack.append([term, None])
>                 continue
>              else: 
>                 regexp = self.regexps[term]
>                 m = regexp.match(self.data, self.pos)
>                 if not m: 
>                    raise Exception("Token: %r should match %s" % \
>                           (self.data[self.pos:self.pos+10], regexp.pattern))
>                 end = m.end()
>                 self.onToken(abbr(term), self.data[self.pos:end])
>                 self.pos = end
>              self.token()
>           while todo_stack[-1][1] == []:
>               todo_stack.pop()
>               self.onFinish()

Received on Thursday, 20 January 2005 23:56:55 UTC