- From: <danbri@fireball.danbri.org>
- Date: Wed, 27 Feb 2002 19:58:28 -0500 (EST)
- To: www-archive@w3.org
#!/usr/bin/env ruby
#
# A rough Ruby Squish parser
# (an exercise in doing it wrong...)
# useful refs:
#   http://www.io.com/~jimm/downloads/rubytalk/talk.html
#
# daml query links:
#   http://www.daml.org/listarchive/joint-committee/0856.html
#
#   http://www.hpl.hp.com/semweb/rdql.html
#   http://www.hpl.hp.com/semweb/rdql-grammar.html

require 'digest/sha1'

# Parser for the Squish RDF query language, with serialisers back to Squish,
# to Algae and to SQL.
#
# The parser is a chain of "production" methods, each of which consumes the
# head of the remaining query text with a regular expression and hands the
# rest to the next production:
#
#   select_keyword -> select_arglist -> [from_keyword] -> where_lpar
#     -> pred_expr -> sub_expr -> obj_expr -> where_rpar -> where_lpar ...
#     -> [using_keyword -> using_arglist]
#
# Parsed clauses are stored as [predicate, subject, object] triples of
# strings; variables keep their leading '?'.
class SquishQuery

  # Parsed query state.
  #   select_args  - raw text of the SELECT argument list
  #   from_args    - raw text of the FROM clause ("" when absent)
  #   using_args   - raw text of the USING clause
  #   clauses      - array of [predicate, subject, object] as written
  #   full_clauses - same as clauses, with namespace prefixes expanded
  #   xmlns        - prefix => namespace URI, from the USING clause
  #   all_vars     - variable name (no '?') => list of clauses mentioning it
  attr_accessor :select_args, :from_args, :using_args, :where_args,
                :clauses, :xmlns, :full_clauses, :all_vars

  # Warnings collected by logwarning.
  attr_reader :warnings

  # Names of the selected variables, without the leading '?'.
  # (A plain attr_accessor cannot be used for @select_arglist because
  # select_arglist is also the name of a parser production below.)
  def sels
    @select_arglist
  end

  def initialize
    @clauses = []
    @full_clauses = []
    @xmlns = {}
    @loginfo = []
    @verboselog = []
    @warnings = []       # was never initialised, so logwarning crashed
    @all_vars = {}
    @from_args = ""
    @select_arglist = [] # so sels is always an Array, even before parsing

    # state used in WHERE productions
    @lastpred = nil
    @lastsub = nil
    @lastobj = nil
    @remainder = nil
  end

  # Trace hook called by every production. Recording is disabled (as in the
  # original); the formatted message is simply returned.
  def loginfo(production, content, nextdata)
    "LOGINFO: ['#{production}'] got content: '#{content}'\n"
    # @loginfo.push(logmsg)
    # @verboselog.push logmsg + " remains: #{nextdata}\n\n"
  end

  # Records a warning and echoes it to stdout.
  def logwarning(msg)
    @warnings.push(msg)
    puts "PARSER WARNING: #{msg}\n"
  end

  # Parses a complete Squish query and populates the query state.
  #
  # text - the query; it is NOT modified (a normalised copy is parsed), so
  #        frozen strings are accepted.
  #
  # Returns a truthy value when the whole query parsed, false otherwise.
  def parseFromText(text)
    flattened = text.gsub(/[\r\n]/, " ")
    parse = select_keyword(flattened)
    extractAllVars
    expandAllNamespaces
    parse
  end

  # select_keyword:
  # entry point
  #
  # -> select_arglist
  #
  def select_keyword(text)
    if text =~ /\A\s*SELECT\s+(.*)/i
      nextdata = $1
      loginfo('select_keyword', 'SELECT', nextdata)
      return select_arglist(nextdata)
    end
    puts "Error: Expected first chars to be 'SELECT ' GOT: #{text}"
    false
  end

  ################
  # select_arglist:
  #
  # The SELECT arg list is terminated by an (optional) FROM clause,
  # or else directly by the WHERE clause.
  #
  def select_arglist(text)
    if text =~ /^\s*(.*?)\s*(FROM\s+.*)/i
      content = $1
      nextdata = $2
      @select_args = content
      loginfo('select_arglist (found FROM) ', content, nextdata)
      @select_arglist = parse_select_vars(content)
      return from_keyword(nextdata) # move to from_keyword production
    elsif text =~ /(.*)\s+WHERE\s+(.*)/i
      content = $1
      nextdata = $2
      @select_args = content
      loginfo('select_arglist (omitted FROM) ', content, nextdata)
      @select_arglist = parse_select_vars(content)
      # Report the real outcome of the WHERE parse (this used to return
      # true unconditionally).
      return where_lpar(nextdata)
    end
    # neither ended in FROM or WHERE
    puts "Error: Expected arg list for SELECT to end with FROM or WHERE. GOT: #{text}"
    false
  end

  # the optional FROM keyword
  # -> where_lpar (todo: to where_keyword)
  #
  def from_keyword(text)
    # Capture what follows the WHERE keyword directly instead of deleting
    # every "WHERE " occurrence from the remainder of the query.
    if text =~ /\A\s*FROM\s+(.*?)\s+WHERE\s+(.*)/i
      content = $1
      nextdata = $2
      loginfo('from_keyword, next is where_lpar', content, nextdata)
      @from_args = content
      return where_lpar(nextdata)
    end
    puts "Expecting (optional) FROM; didn't find it."
    false
  end

  # using_keyword:
  # final clause (optional), deals with namespace expansions
  #
  def using_keyword(text)
    if text =~ /\A\s*USING\b(.*)/i
      nextdata = $1
      loginfo('using_keyword', 'USING', nextdata)
      return using_arglist(nextdata)
    end
    puts "Expecting 'USING' keyword, found: #{text} "
    false
  end

  # Stores the USING arguments and extracts the prefix declarations.
  # Returns false when the keyword is followed by nothing.
  def using_arglist(text)
    content = text.to_s.strip
    if content.empty?
      puts "Expected arguments to USING, found nothing.\n"
      return false
    end
    loginfo('using_arglist', content, '[end]')
    @using_args = content
    usedPrefixes(content)
    true
  end

  #### the 'WHERE' clause

  # where_lpar: '('
  # where clause, left paren
  #
  # -> deal with each pred_expr or drop out of WHERE via using_keyword
  #
  def where_lpar(text)
    if text =~ /^\s*\(\s*(.*)/
      nextdata = $1
      loginfo('where_lpar', ' ( ', nextdata)
      return pred_expr(nextdata)
    end
    return using_keyword(text) if text =~ /\A\s*USING\b/i

    puts " ====== Expected '(' or 'USING' Got: #{text}"
    # note: test1-bogusclause.squish tries 'ABUSING' clause
    @remainder = text
    false
  end

  # pred_expr
  # where clause, predicate expression
  # -> sub_expr
  #
  def pred_expr(text)
    if text =~ /\s*(\S+)\s+(.*)/
      content = $1
      nextdata = $2
      @lastpred = content
      loginfo('pred_lpar', content, nextdata)
      return sub_expr(nextdata)
    end
    puts "Error: pred_expr didn't find expected content \n"
    false
  end

  # sub_expr
  # where clause, subject expression
  # -> obj_expr
  #
  def sub_expr(text)
    if text =~ /\s*(\S+)\s+(.*)\s*/
      content = $1
      nextdata = $2
      @lastsub = content
      loginfo('sub_expr', content, nextdata)
      return obj_expr(nextdata)
    end
    puts "Error: sub_expr didn't find expected content in '#{text}'\n"
    false
  end

  # obj_expr
  # where clause, object expression
  # -> where_rpar
  #
  def obj_expr(text)
    if text =~ /(\S+)\s*(\).*)/
      content = $1
      nextdata = $2
      @lastobj = content
      loginfo('obj_expr', content, nextdata)
      return where_rpar(nextdata)
    end
    puts "Error: obj_expr didn't find expected content in '#{text} \n"
    false
  end

  # where_rpar:
  # where clause, right paren
  # -> where_lpar
  #
  def where_rpar(text)
    if text =~ /^\)\s*(.*)\s*/
      nextdata = $1
      loginfo('where_rpar', ' ) ', nextdata)
      # Store the clause that has just been closed. This used to happen only
      # when more text followed, which lost the final clause of any query
      # without a USING clause.
      clauses.push([@lastpred, @lastsub, @lastobj])
      return where_lpar(nextdata) if nextdata =~ /\S/

      puts "[finished with the entire WHERE clause]\n\n"
      return true
      # Note: qnames need expanding later
    end
    puts "Error: Expected ')'"
    false
  end

  # output requested variables as contents of a Squish SELECT clause
  #
  # eg: "?x, ?y, ?z"
  #
  # todo: refactor to hide the * case from apps that want a clean list
  # of vars, ie. sharecode w/ algae function. self.sels is bad data.
  #
  def toVarQList
    return "*" if select_all? # special case for 'SELECT *'

    sels.map { |q| "?#{q}" }.join(", ")
  end

  # Serialises the parsed query back to Squish text. FROM and USING are only
  # written when they have content, so the output can be parsed again.
  def toSquish
    sq = "SELECT #{toVarQList} \n"
    sq += "FROM #{from_args} \n" if from_args =~ /\S/
    sq += "WHERE \n"
    clauses.each do |qt|
      sq += " ( #{qt[0]} #{qt[1]} #{qt[2]} ) \n" # todo: commas? whitespace?
    end
    sq += "USING #{using_args} \n" if using_args =~ /\S/
    sq
  end

  # expand namespaces and store in full_clauses
  #
  # full_clauses is rebuilt from scratch on every call. (The original
  # assigned to a local variable here, so repeated calls appended duplicates.)
  def expandAllNamespaces
    sq = ""
    self.full_clauses = []
    clauses.each do |qt|
      p = expns(qt[0])
      s = expns(qt[1])
      o = expns(qt[2])
      sq += " ( #{p} #{s} #{o} ) "
      full_clauses.push([p, s, o])
    end
    "Expanded: " + sq
  end

  # Expands a possibly qualified expression, eg dc::foo, using xmlns.
  # Returns the text unchanged when it has no prefix or the prefix is
  # undeclared (an error is printed in the latter case).
  def expns(text)
    if text =~ /\A(\w+)::(.*)/
      prefix = $1
      local = $2
      ns = xmlns[prefix]
      if ns.nil?
        puts "Error: undeclared namespace #{prefix}\n"
      else
        text = ns + local
      end
    end
    text
  end

  # update self.all_vars based on variables named in self.clauses
  #
  # The table is rebuilt from scratch so the method can be called repeatedly.
  def extractAllVars
    self.all_vars = {}
    clauses.each do |qt|
      qt.first(3).each do |term|
        name = term.to_s[/\A\?(.*)/, 1]
        next unless name

        (all_vars[name] ||= []).push(qt)
      end
    end
  end

  # Output in Algae syntax (URI for spec?)
  # todo: look at commas, throw exception for '*' or workaround as below
  # - warn if there's a FROM clause, or figure out Algae syntax
  # - find out Algae notation for using ns prefixes
  def toAlgae
    a = "(ask '(\n"
    full_clauses.each do |qt|
      a += " ( #{qt[0]} #{qt[1]} #{qt[2]} ) \n"
    end
    # Algae doesn't support the * selector; default is to collect all vars.
    names = select_all? ? all_vars.keys : sels
    vout = names.map { |q| "?#{q}" }
    a += " ) collect '( #{vout.join(', ')} )\n)\n" ## lose the commas?
    a
  end

  # Reads "prefix FOR uri" declarations (whitespace separated) into xmlns.
  # The middle keyword is not validated, matching the original behaviour.
  # Returns xmlns.
  def usedPrefixes(text)
    text.to_s.split.each_slice(3) do |prefix, _keyword, uri|
      if uri
        xmlns[prefix] = uri
      else
        logwarning("TODO: parse USING clause properly! (incomplete declaration near '#{prefix}')")
      end
    end
    xmlns
  end

  ################################################################################
  ##
  ## Squish2SQL facilities
  #
  # Translates the parsed query into a SQL SELECT over two tables:
  #   triples   (aliases a1..aN, one per WHERE clause) with integer columns
  #             subject / predicate / object holding hashed resource ids
  #   resources (aliases bM) mapping key (hashed id) => value (real content)
  #
  # The generated statement has this shape:
  #
  #   SELECT DISTINCT b2.value AS mbox, b5.value AS thumb
  #   FROM triples a1, triples a2, ..., resources b2, resources b5
  #   WHERE
  #     b2.key=a1.object AND ...          -- lookups for selected variables
  #     AND a1.predicate = '116868652'    -- constants, hashed
  #     AND a1.subject = a3.subject       -- shared variables join clauses
  #
  # The stored clauses are not modified.
  #
  # NOTE(review): variable names are copied from the query text into the SQL
  # as column aliases without validation - confirm that query text is trusted.
  def toSQLQuery
    # RDBMS table and field names
    p_field = 'predicate'
    s_field = 'subject'
    o_field = 'object'
    main_table = 'triples'
    lookup_table = 'resources'

    where_triples = []  # constraints on the triples ("a") aliases
    column_for = {}     # variable => last "aN.field" column it is bound to
    occurrences = {}    # variable => every "aN.field" column it appears in

    clauses.each_with_index do |clause, idx|
      pred, subj, obj = clause
      [[subj, s_field], [pred, p_field], [obj, o_field]].each do |term, field|
        column = "a#{idx + 1}.#{field}"
        name = term.to_s[/\A\?(.*)/, 1]
        if name
          column_for[name] = column
          (occurrences[name] ||= []).push(column)
        else
          # constants are matched through their hashed resource id
          where_triples.push("#{column} = '#{hashcodeIntFromString(expns(term))}'")
        end
      end
    end

    # a variable used in several places must have the same value everywhere:
    # this writes "a1.subject = a2.predicate" constraints.
    occurrences.each_value do |columns|
      columns.each_cons(2) { |left, right| where_triples.push("#{left} = #{right}") }
    end

    # selectvars is the list of b columns and the variables they stand for,
    # e.g. "b1.value AS x"
    wanted = select_all? ? column_for.keys : sels
    selectvars = []
    lookup_aliases = []
    where_lookup = []
    column_for.each_with_index do |(name, column), i|
      next unless wanted.include?(name)

      lookup = "b#{i + 1}"
      selectvars.push("#{lookup}.value AS #{name}")
      lookup_aliases.push(lookup)
      where_lookup.push("#{lookup}.key=#{column}")
    end

    # exactly one triples alias per clause (the original emitted one extra)
    tables = (1..clauses.size).map { |i| "#{main_table} a#{i}" }
    tables.concat(lookup_aliases.map { |v| "#{lookup_table} #{v}" })

    sql = "SELECT DISTINCT #{selectvars.join(', ')} FROM #{tables.join(', ')}"
    constraints = where_lookup + where_triples
    sql += "\nWHERE\n\t" + constraints.join(" AND ") unless constraints.empty?
    sql
  end

  ## Other functions (should move to basicrdf.rb someday)

  # Hashes a string to a signed 32 bit integer: the first four bytes of its
  # SHA-1 digest read as a little-endian two's-complement number.
  def hashcodeIntFromString(data)
    Digest::SHA1.digest(data.to_s).unpack('l<').first
  end

  private

  # True for 'SELECT *' (ie. select all named vars).
  def select_all?
    !(sels.first =~ /\A\*/).nil?
  end

  # Splits the raw SELECT argument text into variable names without the '?'.
  # Works on a copy, so @select_args keeps the text as written.
  def parse_select_vars(content)
    content.delete('?').split(/,\s*/).map(&:strip) # ws mandatory?
  end

end # end of SquishQuery class definition

############################################################################
# Command line use:  squish.rb QUERYFILE
# Prints the SQL translation of the Squish query stored in QUERYFILE.
# Nothing happens when no file is named.
if __FILE__ == $PROGRAM_NAME && ARGV[0]
  squish = File.read(ARGV[0])

  query = SquishQuery.new
  query.parseFromText(squish)

  #puts "Query parser output was :\n #{query.toSquish} \n\n"
  #puts "Allvars: #{query.all_vars.keys} \n"
  #puts "Algae: #{query.toAlgae} \n\n"
  #puts query.inspect

  puts "#{query.toSQLQuery} \n\n"

  # we can round-trip our output. This is good...
  #query2 = SquishQuery.new()
  #query2.parseFromText(query.toSquish)
  #puts "Final Output:\n #{query2.toSquish}\n\n"
end
Received on Wednesday, 6 March 2002 12:45:02 UTC