- From: <danbri@fireball.danbri.org>
- Date: Wed, 27 Feb 2002 19:58:28 -0500 (EST)
- To: www-archive@w3.org
#!/usr/bin/env ruby
#
# A rough Ruby Squish parser
# (an exercise in doing it wrong...)
# useful refs:
#http://www.io.com/~jimm/downloads/rubytalk/talk.html
#
#
#daml query links:
# http://www.daml.org/listarchive/joint-committee/0856.html
#
# http://www.hpl.hp.com/semweb/rdql.html
# http://www.hpl.hp.com/semweb/rdql-grammar.html
require 'digest/sha1'

# SquishQuery: a rough parser for Squish queries of the shape
#
#   SELECT ?a, ?b
#   [FROM source]
#   WHERE (pred subj obj) (pred subj obj) ...
#   [USING prefix FOR uri prefix FOR uri ...]
#
# The parser is a chain of "production" methods (select_keyword ->
# select_arglist -> from_keyword -> where_lpar -> pred_expr -> sub_expr ->
# obj_expr -> where_rpar -> ... -> using_keyword -> using_arglist).  Each one
# consumes the front of the remaining text with a regexp and hands the rest to
# the next production.  A production returns true when the rest of the query
# parsed, false otherwise; parse errors are printed with puts.
#
# Parsed state can be re-serialised as Squish (toSquish), Algae (toAlgae) or
# an SQL query over a triples/resources schema (toSQLQuery).
class SquishQuery
  # Parsed query state:
  #   select_args   raw text of the SELECT argument list
  #   from_args     raw text following FROM ("" when there is no FROM clause)
  #   using_args    raw text following USING
  #   where_args    declared for completeness; no production assigns it
  #   clauses       array of [predicate, subject, object] as written
  #   full_clauses  clauses with namespace prefixes expanded
  #   xmlns         prefix => namespace URI, from the USING clause
  #   all_vars      variable name (no '?') => list of clauses mentioning it
  attr_accessor :select_args, :from_args, :using_args, :where_args,
                :clauses, :xmlns, :full_clauses, :all_vars

  # Messages recorded by logwarning.
  attr_reader :warnings

  # Selected variable names, without the leading '?'.
  # Holds ["*"] for 'SELECT *'.
  def sels
    @select_arglist
  end

  def initialize
    @clauses = []
    @full_clauses = []
    @xmlns = {}
    @loginfo = []
    @verboselog = []
    @warnings = []
    @all_vars = {}
    @select_arglist = []
    @from_args = ""
    # Set by the productions while parsing:
    #   @select_args, @using_args
    # Scratch state shared by the WHERE productions:
    #   @lastpred, @lastsub, @lastobj
    # @remainder holds the unparsed text when where_lpar gives up.
  end

  # Build the trace message for one production step.
  #   production  name of the production reporting
  #   content     the text it consumed
  #   nextdata    the text still to be parsed (not included in the message)
  # Returns the message.  Recording into @loginfo/@verboselog is disabled,
  # as it was in the original, so this has no side effects.
  def loginfo(production, content, nextdata)
    "LOGINFO: ['#{production}'] got content: '#{content}'\n"
  end

  # Record a warning in #warnings and echo it on stdout.
  def logwarning(msg)
    (@warnings ||= []).push(msg)
    puts "PARSER WARNING: #{msg}\n"
  end

  # Parse a complete Squish query.  Line breaks are treated as spaces; the
  # caller's string is not modified.  After the productions have run,
  # all_vars and full_clauses are rebuilt from the parsed clauses.
  # Returns true when the whole query parsed, false otherwise.
  def parseFromText(text)
    flat = text.gsub(/[\r\n]/, " ")
    parse = select_keyword(flat)
    extractAllVars
    expandAllNamespaces
    parse
  end

  # select_keyword: entry point
  #   -> select_arglist
  def select_keyword(text)
    m = /\A\s*SELECT\s+(.*)/i.match(text)
    if m
      loginfo('select_keyword', 'SELECT', m[1])
      return select_arglist(m[1])
    end
    puts "Error: Expected first chars to be 'SELECT ' GOT: #{text}"
    false
  end

  # select_arglist: the variable list after SELECT.
  # The list ends at an (optional) FROM clause or directly at WHERE.  FROM is
  # only honoured when it appears before the first WHERE, so a 'from' token
  # inside a WHERE clause is not mistaken for the FROM keyword.
  #   -> from_keyword | where_lpar
  def select_arglist(text)
    from_m = /(?:\A|\s)FROM\s+/i.match(text)
    where_m = /\s+WHERE\s+/i.match(text)

    if from_m && (where_m.nil? || from_m.begin(0) < where_m.begin(0))
      content = text[0...from_m.begin(0)].strip
      nextdata = text[from_m.begin(0)..-1]
      @select_args = content
      loginfo('select_arglist (found FROM) ', content, nextdata)
      @select_arglist = parse_select_vars(content)
      return from_keyword(nextdata)
    end

    if where_m
      content = where_m.pre_match
      nextdata = where_m.post_match
      @select_args = content
      loginfo('select_arglist (omitted FROM) ', content, nextdata)
      @select_arglist = parse_select_vars(content)
      return where_lpar(nextdata)
    end

    # neither ended in FROM or WHERE
    puts "Error: Expected arg list for SELECT to end with FROM or WHERE. GOT: #{text}"
    false
  end

  # from_keyword: the optional FROM clause; everything up to WHERE is stored
  # verbatim in from_args.
  #   -> where_lpar
  def from_keyword(text)
    m = /\A\s*FROM\s+(.*?)\s+WHERE\s+(.*)/i.match(text)
    if m
      loginfo('from_keyword, next is where_lpar', m[1], m[2])
      @from_args = m[1]
      return where_lpar(m[2])
    end
    puts "Expecting (optional) FROM; didn't find it."
    false
  end

  # using_keyword: final clause (optional), deals with namespace expansions
  #   -> using_arglist
  def using_keyword(text)
    m = /\A\s*USING\b\s*(.*)/i.match(text)
    if m
      loginfo('using_keyword', 'USING', m[1])
      return using_arglist(m[1])
    end
    puts "Expecting 'USING' keyword, found: #{text} "
    false
  end

  # using_arglist: the 'prefix FOR uri ...' list after USING.  Stores the
  # trimmed text in using_args and the declared prefixes in xmlns.
  def using_arglist(text)
    content = text.to_s.strip
    unless content.empty?
      loginfo('using_arglist', content, '[end]')
      @using_args = content
      usedPrefixes(content)
      return true
    end
    puts "Expected arguments to USING, found nothing.\n"
    false
  end

  #### the 'WHERE' clause

  # where_lpar: '(' opening a clause, or the USING keyword ending the
  # WHERE section.
  #   -> pred_expr | using_keyword
  def where_lpar(text)
    m = /\A\s*\(\s*(.*)/.match(text)
    if m
      loginfo('where_lpar', ' ( ', m[1])
      return pred_expr(m[1])
    end
    return using_keyword(text) if text =~ /\A\s*USING\b/i

    puts " ====== Expected '(' or 'USING' Got: #{text}"
    # note: test1-bogusclause.squish tries 'ABUSING' clause
    @remainder = text
    false
  end

  # pred_expr: predicate term of a clause
  #   -> sub_expr
  def pred_expr(text)
    m = /\A\s*(\S+)\s+(.*)/.match(text)
    if m
      @lastpred = m[1]
      loginfo('pred_expr', m[1], m[2])
      return sub_expr(m[2])
    end
    puts "Error: pred_expr didn't find expected content \n"
    false
  end

  # sub_expr: subject term of a clause
  #   -> obj_expr
  def sub_expr(text)
    m = /\A\s*(\S+)\s+(.*)/.match(text)
    if m
      @lastsub = m[1]
      loginfo('sub_expr', m[1], m[2])
      return obj_expr(m[2])
    end
    puts "Error: sub_expr didn't find expected content in '#{text}'\n"
    false
  end

  # obj_expr: object term of a clause; it must be a single whitespace-free
  # token immediately followed (after optional spaces) by ')'.
  #   -> where_rpar
  def obj_expr(text)
    # \S+ is greedy so that an object containing ')' keeps it when a later
    # ')' closes the clause; it backtracks for the usual "?o)" form.
    m = /\A\s*(\S+)\s*(\).*)/.match(text)
    if m
      @lastobj = m[1]
      loginfo('obj_expr', m[1], m[2])
      return where_rpar(m[2])
    end
    puts "Error: obj_expr didn't find expected content in '#{text} \n"
    false
  end

  # where_rpar: ')' closing a clause.  The clause collected in
  # @lastpred/@lastsub/@lastobj is stored, whether or not anything follows.
  #   -> where_lpar (when text remains)
  def where_rpar(text)
    m = /\A\)\s*(.*)/.match(text)
    if m
      nextdata = m[1]
      loginfo('where_rpar', ' ) ', nextdata)
      # Note: qnames are expanded later, by expandAllNamespaces.
      clauses.push([@lastpred, @lastsub, @lastobj])
      return where_lpar(nextdata) if nextdata =~ /\S/

      loginfo('where_rpar', '[finished with the entire WHERE clause]', '')
      return true # we're done with this where
    end
    puts "Error: Expected ')'"
    false
  end

  # Output requested variables as the contents of a Squish SELECT clause,
  # eg: "?x, ?y, ?z".  Returns "*" for 'SELECT *' (select all named vars).
  def toVarQList
    return "*" if select_all?

    sels.map { |q| "?#{q}" }.join(", ")
  end

  # Serialise the parsed query back into Squish text.  The FROM and USING
  # lines are emitted only when the corresponding arguments are non-blank,
  # so the output can be fed back into parseFromText.
  def toSquish
    sq = "SELECT #{toVarQList} \n"
    sq += "FROM #{from_args} \n" if from_args =~ /\S/
    sq += "WHERE \n"
    clauses.each do |qt|
      sq += " ( #{qt[0]} #{qt[1]} #{qt[2]} ) \n" # todo: commas? whitespace?
    end
    sq += "USING #{using_args} \n" if using_args =~ /\S/
    sq
  end

  # Rebuild full_clauses from clauses with every term passed through expns.
  # Returns a one-line "Expanded: ..." summary of the expanded clauses.
  def expandAllNamespaces
    self.full_clauses = clauses.map { |qt| qt[0..2].map { |term| expns(term) } }
    summary = full_clauses.map { |pred, subj, obj| " ( #{pred} #{subj} #{obj} ) " }
    "Expanded: " + summary.join
  end

  # Expand a possibly qualified expression, eg dc::foo, using xmlns.
  # Returns the namespace URI followed by the local part, or the text
  # unchanged when it has no 'prefix::' or the prefix is undeclared (in
  # which case an error is printed).
  def expns(text)
    m = /(\w+)::(.*)/.match(text)
    return text unless m

    ns = xmlns[m[1]]
    if ns.nil?
      puts "Error: undeclared namespace #{m[1]}\n"
      return text
    end
    ns + m[2]
  end

  # Rebuild all_vars from the variables named in clauses.  A clause is listed
  # once per occurrence of the variable in it.  Returns all_vars.
  def extractAllVars
    self.all_vars = {}
    clauses.each do |qt|
      qt[0..2].each do |term|
        name = variable_name(term)
        (all_vars[name] ||= []).push(qt) if name
      end
    end
    all_vars
  end

  # Output in Algae syntax (URI for spec?)
  # 'SELECT *' is rendered by collecting every variable in all_vars.
  # todo: look at commas
  #  - warn if there's a FROM clause, or figure out Algae syntax
  #  - find out Algae notation for using ns prefixes
  def toAlgae
    names = select_all? ? all_vars.keys : sels
    vout = names.map { |q| "?#{q}" }

    a = "(ask '(\n"
    full_clauses.each do |qt|
      a += " ( #{qt[0]} #{qt[1]} #{qt[2]} ) \n"
    end
    a += " ) collect '( #{vout.join(', ')} )\n)\n" ## lose the commas?
    a
  end

  # Read 'prefix FOR uri' triples from the USING argument text into xmlns.
  # The middle word is not checked, and an incomplete trailing group is
  # ignored.  Returns xmlns.
  def usedPrefixes(text)
    text.split.each_slice(3) do |prefix, _for_keyword, uri|
      xmlns[prefix] = uri if uri
    end
    xmlns
  end

  ##############################################################################
  ##
  ## Squish2SQL facilities
  #
  # Translate the query into SQL.  Clause number n is matched against the
  # triples table under alias 'an'; each selected variable is looked up in
  # the resources table under an alias 'bk', where k is the variable's
  # 1-based position among all variables used in the clauses.
  #
  # The generated query has this shape:
  #
  #   SELECT DISTINCT b2.value AS mbox, b5.value AS thumb
  #   FROM triples a1, triples a2, ..., resources b2, resources b5
  #   WHERE
  #     b2.key=a1.object AND ...        (lookup of each selected variable)
  #     AND a1.predicate = '116868652'  (constant terms, as hash codes)
  #     AND a1.subject = a3.subject     (same variable used in two places)
  #
  # Constant terms are namespace-expanded and then compared by their
  # hashcodeIntFromString value.  'SELECT *' selects every variable.
  # The stored clauses are not modified.
  def toSQLQuery
    # RDBMS table and field names
    p_field = 'predicate'
    s_field = 'subject'
    o_field = 'object'
    main_table = 'triples'
    lookup_table = 'resources'

    # variable name => columns it occupies.  Keys are seeded in the order
    # extractAllVars meets them (clause by clause, predicate/subject/object),
    # because that order fixes the 'bk' numbering.
    columns_by_var = {}
    clauses.each do |clause|
      clause[0..2].each do |term|
        name = variable_name(term)
        columns_by_var[name] ||= [] if name
      end
    end

    # Constraints on the triples ("a") tables.
    where_triples = []
    clauses.each_with_index do |clause, idx|
      pred, subj, obj = clause[0..2]
      [[s_field, subj], [p_field, pred], [o_field, obj]].each do |field, term|
        column = "a#{idx + 1}.#{field}"
        name = variable_name(term)
        if name
          columns_by_var[name].push(column)
        else
          where_triples.push("#{column} = '#{hashcodeIntFromString(expns(term))}'")
        end
      end
    end

    # A variable used in several places: chain "a1.subject = a2.predicate".
    columns_by_var.each_value do |columns|
      columns.each_cons(2) { |left, right| where_triples.push("#{left} = #{right}") }
    end

    # Lookup ("b") tables for the selected variables, e.g. "b1.value AS x".
    # Each is keyed on the last column the variable occupies.
    everything = select_all?
    selectvars = []
    lookup_tables = []
    where_lookup = []
    columns_by_var.each_with_index do |(name, columns), idx|
      next unless everything || sels.include?(name)

      lookup = "b#{idx + 1}"
      selectvars.push("#{lookup}.value AS #{name}")
      lookup_tables.push("#{lookup_table} #{lookup}")
      where_lookup.push("#{lookup}.key=#{columns.last}")
    end

    tables = (1..clauses.size).map { |n| "#{main_table} a#{n}" } + lookup_tables
    conditions = where_lookup + where_triples

    sql = "SELECT DISTINCT #{selectvars.join(', ')} FROM #{tables.join(', ')}"
    sql += "\nWHERE\n\t#{conditions.join(' AND ')}" unless conditions.empty?
    sql
  end

  ## Other functions (should move to basicrdf.rb someday)

  # Signed 32-bit integer built from the first four bytes of the SHA-1
  # digest of data, least significant byte first.
  def hashcodeIntFromString(data)
    # "l<" = signed 32-bit little-endian; it reads only the first 4 bytes.
    Digest::SHA1.digest(data).unpack("l<").first
  end

  private

  # Split the SELECT argument text into variable names without '?'.
  def parse_select_vars(content)
    content.delete("?").split(",").map { |name| name.strip }.reject { |name| name.empty? }
  end

  # True when the query was 'SELECT *'.
  def select_all?
    sels.first.to_s.start_with?("*")
  end

  # Name of the variable written as '?name', or nil when term is not a
  # variable.
  def variable_name(term)
    m = /\A\?(.*)/.match(term)
    m && m[1]
  end
end # end of SquishQuery class definition
############################################################################
# Driver: read a Squish query from the file named by the first command-line
# argument (or from standard input when no argument is given), parse it and
# print the SQL translation.
# The file is read directly rather than through a `cat` subshell, so paths
# containing spaces or shell metacharacters are handled safely.
squish = ARGV[0] ? File.read(ARGV[0]) : $stdin.read
#puts "#{squish} \n\n"
query = SquishQuery.new
query.parseFromText(squish)
#puts "Query parser output was :\n #{query.toSquish} \n\n"
#query.expandAllNamespaces # move internal
#puts "Allvars: #{query.all_vars.keys} \n"
#puts "Algae: #{query.toAlgae} \n\n"
#puts query.inspect
puts "#{query.toSQLQuery} \n\n"
# we can round-trip our output. This is good...
#query2 = SquishQuery.new()
#query2.clauses=[]
#q3 = query2.parseFromText(squish)
#puts "Final Output:\n #{query2.toSquish}\n\n"
Received on Wednesday, 6 March 2002 12:45:02 UTC