# [www-archive] <none>

#!/usr/bin/env ruby
#
# A rough Ruby Squish parser
# (an exercise in doing it wrong...)
# useful refs:
#http://www.io.com/~jimm/downloads/rubytalk/talk.html
#
#
#daml query links:
# http://www.daml.org/listarchive/joint-committee/0856.html
#
# http://www.hpl.hp.com/semweb/rdql.html
# http://www.hpl.hp.com/semweb/rdql-grammar.html



class SquishQuery


  # incl parsed query state
  attr_accessor :select_args, :from_args, :using_args, :where_args,  :clauses, :xmlns, :full_clauses, :all_vars

#, :select_arglist

  ## Hmm, don't understand why needed a method for an array field
  ## todo: read docs on attr_accessor
  # Reader for the parsed SELECT variable list.
  #
  # Returns whatever is currently held in @select_arglist (nil when nothing
  # has assigned it yet).
  def sels
    @select_arglist
  end
 
  # Create a query object with empty parser state.
  #
  # Every instance variable the parser touches is declared here so that the
  # fields are documented in one place and no method trips over a nil
  # collection before a query has been parsed.
  def initialize
    # Results of parsing
    @clauses = []         # triples pushed by where_rpar as [pred, sub, obj]
    @full_clauses = []    # filled by expandAllNamespaces
    @xmlns = {}           # prefix => URI, filled by usedPrefixes
    @all_vars = {}        # variable name => clauses using it (extractAllVars)
    @select_arglist = []  # SELECT variable list, exposed through #sels
    @select_args = nil    # raw SELECT argument text
    @from_args = ""       # raw FROM argument text
    @using_args = nil     # raw USING argument text
    @where_args = nil

    # Diagnostics
    @loginfo = []
    @verboselog = []
    @warnings = []        # logwarning appends here

    # Scratch state used while walking the WHERE productions
    @lastpred = nil
    @lastsub = nil
    @lastobj = nil
    @remainder = nil      # unparsed text recorded by where_lpar on failure
  end


  # Format the trace line for one grammar production.
  #
  # production - name of the production that matched
  # content    - the text that production consumed
  # nextdata   - the text still to be parsed; unused while the verbose log
  #              stays disabled
  #
  # Nothing is stored or printed: the formatted line is the return value.
  def loginfo(production, content, nextdata)
    "LOGINFO: ['#{production}'] got content: '#{content}'\n"
  end


  # Record a parser warning and echo it to stdout.
  #
  # msg - the warning text
  #
  # The list is created on demand so that the call never fails on a nil
  # @warnings.
  def logwarning(msg)
    (@warnings ||= []).push(msg)
    puts "PARSER WARNING: #{msg}\n"
  end


  # Parse a complete Squish query.
  #
  # text - the query source; it is NOT modified (a flattened copy is parsed),
  #        so frozen strings are accepted.
  #
  # Line breaks are replaced by spaces because the production regexps work on
  # a single line. After the grammar walk, the variable index and the
  # namespace-expanded clauses are rebuilt.
  #
  # Returns whatever select_keyword returns (the outcome of the grammar walk).
  def parseFromText(text)
    flattened = text.gsub(/[\r\n]/, " ")

    parse = select_keyword(flattened)
    extractAllVars
    expandAllNamespaces
    parse
  end

  # select_keyword:
  # entry point
  #
  # -> select_arglist
  #
  # Match the leading SELECT keyword (case-insensitive) and hand the rest of
  # the query to select_arglist.
  #
  # text - the flattened query text
  #
  # Returns the result of select_arglist, or false (after printing an error)
  # when the text does not start with SELECT.
  def select_keyword(text)
    if text =~ /^\s*SELECT\s+(.*)/i
      nextdata = $1
      loginfo('select_keyword', 'SELECT', nextdata)
      return select_arglist(nextdata)
    end
    puts "Error: Expected first chars to be 'SELECT ' GOT: #{text}"
    false
  end



  ################
  # select_arglist:
  #
  # Parse the SELECT variable list, which is terminated either by an
  # (optional) FROM clause or directly by WHERE.
  #
  # text - query text following the SELECT keyword
  #
  # Side effects:
  #   @select_args    - the raw list text as matched
  #   @select_arglist - the variable names with '?' removed, split on commas
  #                     and stripped of surrounding whitespace
  #
  # Returns the result of the next production (from_keyword when a FROM was
  # found, where_lpar otherwise), or false after printing an error when
  # neither FROM nor WHERE follows the list.
  def select_arglist(text)
    if (m = /^\s*(.*?)\s*(FROM\s+.*)/i.match(text))
      found_from = true
    elsif (m = /(.*)\s+WHERE\s+(.*)/i.match(text))
      found_from = false
    else
      puts "Error: Expected arg list for SELECT to end with FROM or WHERE. GOT: #{text}"
      return false
    end

    content = m[1]
    nextdata = m[2]
    @select_args = content

    label = found_from ? 'select_arglist (found FROM) ' : 'select_arglist (omitted FROM) '
    loginfo(label, content, nextdata)

    # String#delete treats '?' literally; a String pattern given to gsub! is
    # matched literally too, so "\\?" would look for a backslash.
    @select_arglist = content.delete('?').strip.split(/\s*,\s*/)

    # Propagate the outcome of the remaining productions instead of always
    # reporting success.
    found_from ? from_keyword(nextdata) : where_lpar(nextdata)
  end



  # the optional FROM keyword
  #  -> where_lpar (todo: to where_keyword)
  #
  # Parse "FROM <source> WHERE ..." and continue with the WHERE clause.
  #
  # text - query text starting at the FROM keyword
  #
  # Stores the text between FROM and WHERE in @from_args. Only the WHERE
  # keyword that ends the FROM arguments is consumed; later occurrences of
  # "WHERE " in the remaining text are left untouched.
  #
  # Returns the result of where_lpar, or false after printing a message when
  # the FROM ... WHERE shape is not found.
  def from_keyword(text)
    if text =~ /\s*FROM\s+(.*?)\s+WHERE\s+(.*)/i
      content = $1
      nextdata = $2
      loginfo('from_keyword, next is where_lpar', content, nextdata)
      @from_args = content
      return where_lpar(nextdata)
    end
    puts "Expecting (optional) FROM; didn't find it."
    false
  end


  # using_keyword:
  # final clause (optional), deals with namespace expansions
  #
  # Match the USING keyword that introduces the namespace declarations.
  #
  # text - remaining query text, expected to start with USING
  #
  # The keyword must be the first word of the text, so inputs such as
  # 'ABUSING ...' or 'USINGGG ...' are rejected.
  #
  # Returns the result of using_arglist, or false after printing a message.
  def using_keyword(text)
    if text =~ /^\s*USING\b(.*)/i
      nextdata = $1
      loginfo('using_keyword', 'USING', nextdata)
      return using_arglist(nextdata)
    end
    puts "Expecting 'USING' keyword, found: #{text} "
    false
  end
  

  # Parse the arguments of the USING clause (the prefix declarations).
  #
  # text - everything after the USING keyword
  #
  # Stores the trimmed argument text in @using_args and passes it to
  # usedPrefixes so that the prefix table is filled.
  #
  # Returns true on success, or false after printing a message when no
  # arguments follow the keyword.
  def using_arglist(text)
    content = text.to_s.strip
    if content.empty?
      puts "Expected arguments to USING, found nothing.\n"
      return false
    end

    loginfo('using_arglist', content, '[end]')
    @using_args = content
    usedPrefixes(content) # extract
    true
  end

  #### the 'WHERE' clause

  # where_lpar: '('
  # where clause, left paren
  # 
  # -> deal with each pred_expr or drop out of WHERE via using_keyword
  #
  # Match the '(' that opens one WHERE triple, or leave the WHERE clause when
  # the USING keyword comes next.
  #
  # text - remaining query text
  #
  # Returns the result of pred_expr (after '(') or using_keyword (before
  # USING). On anything else the text is remembered in @remainder, a message
  # is printed and false is returned.
  def where_lpar(text)
    if text =~ /^\s*\(\s*(.*)/
      nextdata = $1
      loginfo('where_lpar', ' ( ', nextdata)
      return pred_expr(nextdata)
    end
    return using_keyword(text) if text =~ /^\s*USING/i

    puts " ====== Expected '(' or 'USING' Got: #{text}"
    # note: test1-bogusclause.squish tries 'ABUSING' clause
    @remainder = text
    false
  end

  # pred_expr
  # where clause, predicate expression
  # -> sub_expr
  #
  # Read the predicate term of a WHERE triple.
  #
  # text - text following the opening '('
  #
  # The first whitespace-delimited token is stored in @lastpred and the rest
  # is passed to sub_expr.
  #
  # Returns the result of sub_expr, or false after printing an error when no
  # "token whitespace rest" shape is found.
  def pred_expr(text)
    if text =~ /\s*(\S+)\s+(.*)/
      content = $1
      nextdata = $2
      @lastpred = content
      loginfo('pred_expr', content, nextdata)
      return sub_expr(nextdata)
    end
    puts "Error: pred_expr didn't find expected content \n"
    false
  end


  # sub_expr
  # where clause, subject expression
  # -> obj_expr
  #
  # Read the subject term of a WHERE triple.
  #
  # text - text following the predicate
  #
  # The first whitespace-delimited token is stored in @lastsub and the rest
  # is passed to obj_expr.
  #
  # Returns the result of obj_expr, or false after printing an error.
  def sub_expr(text)
    if text =~ /\s*(\S+)\s+(.*)\s*/
      content = $1
      nextdata = $2
      @lastsub = content
      loginfo('sub_expr', content, nextdata)
      return obj_expr(nextdata)
    end
    puts "Error: sub_expr didn't find expected content in '#{text}'\n"
    false
  end


 # obj_expr
  # where clause, object expression
  # -> where_rpar
  #
  # Read the object term of a WHERE triple.
  #
  # text - text following the subject; must contain the closing ')'
  #
  # The token in front of the ')' is stored in @lastobj; the text from the
  # ')' onwards is passed to where_rpar.
  #
  # Returns the result of where_rpar, or false after printing an error.
  def obj_expr(text)
    if text =~ /(\S+)\s*(\).*)/
      content = $1
      nextdata = $2
      @lastobj = content
      loginfo('obj_expr', content, nextdata)
      return where_rpar(nextdata)
    end
    puts "Error: obj_expr didn't find expected content in '#{text}'\n"
    false
  end






  # where_rpar: 
  # where clause, right paren
  # -> where_lpar
  #
  # Match the ')' that closes a WHERE triple and store the finished triple.
  #
  # text - remaining query text, expected to start with ')'
  #
  # The triple collected in @lastpred / @lastsub / @lastobj is appended to
  # clauses as [pred, sub, obj] for EVERY closing parenthesis, including the
  # last one of a query that has no USING clause after it.
  # Note: qnames are stored unexpanded; they need expanding later.
  #
  # Returns the result of where_lpar when more text follows, true when the
  # query text is exhausted, or false after printing an error when ')' is
  # missing.
  def where_rpar(text)
    if text =~ /^\)\s*(.*)\s*/
      nextdata = $1
      loginfo('where_rpar', ' ) ', nextdata)

      clauses.push([@lastpred, @lastsub, @lastobj])
      return where_lpar(nextdata) if nextdata =~ /\S/

      puts "[finished with the entire WHERE clause]\n\n"
      return true # we're done with this where
    end
    puts "Error: Expected ')'"
    false
  end


# output requested variables as contents of a Squish SELECT clause
#
# eg: "?x, ?y, ?z"
# 
# todo: refactor to hide the * case from apps that want a clean list
# of vars, ie. sharecode w/ algae function. self.sels is bad data.
#
# Render the selected variables the way they appear in a Squish SELECT
# clause, e.g. "?x, ?y, ?z".
#
# Returns "*" when the first selected entry starts with '*' (SELECT *, i.e.
# all named variables), and an empty string when nothing has been selected
# yet (sels is nil or empty).
def toVarQList
  names = Array(sels)
  return "*" if names.first.to_s.start_with?("*")

  names.map { |name| "?#{name}" }.join(", ")
end

# Serialise the parsed query back to Squish text.
#
# The FROM line is written only when from_args contains non-whitespace, and
# the USING line only when using_args does, so a query without namespace
# declarations does not end in a dangling "USING".
#
# Returns the query as a String, one clause per line.
def toSquish
  lines = ["SELECT #{toVarQList} \n"]
  lines << "FROM #{from_args} \n" if from_args =~ /\S/
  lines << "WHERE \n"
  clauses.each do |qt|
    lines << " ( #{qt[0]} #{qt[1]} #{qt[2]} ) \n" # todo: commas? whitespace?
  end
  lines << "USING #{using_args} \n" if using_args =~ /\S/
  lines.join
end


# expand namespace prefixes in every clause and store the result in full_clauses
#
# Rebuild full_clauses from clauses with every term passed through expns.
#
# full_clauses is replaced on each call (through the attribute writer), so
# calling this method repeatedly does not accumulate duplicates.
#
# Returns a summary String: "Expanded: " followed by " ( p s o ) " for each
# expanded clause.
def expandAllNamespaces
  self.full_clauses = clauses.map do |qt|
    [expns(qt[0]), expns(qt[1]), expns(qt[2])]
  end

  summary = full_clauses.map { |pred, sub, obj| " ( #{pred} #{sub} #{obj} ) " }.join
  "Expanded: " + summary
end

# Expand a possibly qualified name such as dc::foo.
#
# text - a clause term
#
# When the term contains "prefix::rest", the prefix is looked up in xmlns
# and the result is that namespace value followed by rest. A term without
# "::" is returned as is. An unknown prefix prints an error and the term is
# returned unchanged.
def expns(text)
  qualified = /(\w+)::(.*)/.match(text)
  return text unless qualified

  prefix = qualified[1]
  local = qualified[2]
  if xmlns[prefix].nil?
    puts "Error: undeclared namespace #{prefix}\n"
    return text
  end
  xmlns[prefix] + local
end


# update self.all_vars based on variables named in self.clauses
#
# Index the variables used in the WHERE clauses.
#
# For every clause term that starts with '?', the clause is appended to the
# list kept in all_vars under the variable name (the text after the '?').
# Existing entries are kept; nothing is reset first.
def extractAllVars
  clauses.each do |triple|
    triple[0..3].each do |term|
      next unless term =~ /^\?(.*)/

      name = $1
      all_vars[name] = [] if all_vars[name].nil?
      all_vars[name].push(triple)
    end
  end
end
  
  

# Output in Algae syntax (URI for spec?)
# todo: look at commas, throw exception for '*' or workaround as below
#       - for this, need an query.allvarnames() method 
#	- warn if there's a FROM clause, or figure out Algae syntax
#       - find out Algae notation for using ns prefixes
# Render the query in Algae syntax:
#
#   (ask '(
#    ( pred sub obj )
#    ) collect '( ?x, ?y )
#   )
#
# The triples come from full_clauses (namespace-expanded). For SELECT * the
# collect list names every variable in all_vars; otherwise it is the SELECT
# list produced by toVarQList.
#
# Returns the Algae text as a String.
def toAlgae
  triples = full_clauses.map { |qt| " ( #{qt[0]} #{qt[1]} #{qt[2]} ) \n" }.join

  vars = toVarQList
  if vars =~ /\*/
    # Algae has no '*' selector here (todo), so collect all named variables.
    collected = all_vars.keys.map { |q| "?#{q}" }.join(', ')
  else
    # toVarQList already returns one formatted String.
    collected = vars
  end

  "(ask '(\n#{triples} ) collect '( #{collected} )\n)\n" ## lose the commas?
end


# Fill the prefix table from the arguments of a USING clause.
#
# text - whitespace-separated groups of three tokens: prefix, a keyword
#        (written 'for' in queries; its spelling is not checked) and the
#        namespace URI
#
# Leading and trailing whitespace is ignored. A trailing group with fewer
# than three tokens is reported and skipped.
#
# Returns the xmlns table (prefix => URI).
def usedPrefixes(text)
  text.to_s.split.each_slice(3) do |prefix, _keyword, uri|
    if uri
      xmlns[prefix] = uri
    else
      puts "TODO: parse USING clause properly!\n"
    end
  end
  xmlns
end



################################################################################
##
## Squish2SQL facilities
#

# Translate the parsed query into SQL over two tables:
#
#   triples   (subject, predicate, object) - aliased a1..aN, one alias per
#                                            WHERE clause
#   resources (key, value)                 - aliased bK, one alias per
#                                            selected variable
#
# Shape of the result (for a seven-clause query):
#
#   SELECT DISTINCT b2.value AS mbox, b5.value AS thumb, b7.value AS name
#   FROM triples a1, ..., triples a7, resources b2, resources b5, resources b7
#   WHERE
#       b2.key = a2.object AND ...            (resolve selected variables)
#       AND a1.predicate = '116868652' ...    (constants, as hash codes)
#       AND a1.subject = a3.subject ...       (variable shared between clauses)
#
# Terms starting with '?' are variables; every other term is a constant that
# is namespace-expanded with expns and compared through
# hashcodeIntFromString. K in bK is the position of the variable in order of
# first appearance in the clauses. SELECT * selects every variable.
#
# clauses is not modified. The WHERE part is omitted when there are no
# conditions.
#
# NOTE(review): variable names are copied into the SQL text unescaped; treat
# the result as untrusted if the query text comes from an untrusted source.
#
# Returns the SQL as a String.
def toSQLQuery
  # RDBMS table and field names
  p_field = 'predicate'
  s_field = 'subject'
  o_field = 'object'
  main_table = 'triples'
  lookup_table = 'resources'

  var_order = []       # variable names, in order of first appearance
  columns_for = {}     # variable name => every "aN.field" it occupies
  constant_tests = []  # "aN.field = '<hash>'" for each constant term

  clauses.each_with_index do |clause, idx|
    triple_alias = "a#{idx + 1}"
    pred, subj, obj = clause[0..2].map { |term| expns(term) }

    # Number the variables in stored (pred, sub, obj) order.
    [pred, subj, obj].each do |term|
      name = term.to_s[/\A\?(.*)/, 1]
      var_order << name if name && !var_order.include?(name)
    end

    [[subj, s_field], [pred, p_field], [obj, o_field]].each do |term, field|
      column = "#{triple_alias}.#{field}"
      name = term.to_s[/\A\?(.*)/, 1]
      if name
        (columns_for[name] ||= []) << column
      else
        constant_tests << "#{column} = '#{hashcodeIntFromString(term)}'"
      end
    end
  end

  # A variable used in several places forces those columns to be equal;
  # chaining neighbouring pairs is enough.
  join_tests = []
  var_order.each do |name|
    columns_for[name].each_cons(2) { |left, right| join_tests << "#{left} = #{right}" }
  end

  # Selected variables are resolved through the lookup table.
  wanted = Array(sels).map { |v| v.to_s.sub(/\A\?/, '') }
  select_all = wanted.first.to_s.start_with?('*')

  select_exprs = []
  lookup_tables = []
  lookup_tests = []
  var_order.each_with_index do |name, idx|
    next unless select_all || wanted.include?(name)

    lookup_alias = "b#{idx + 1}"
    select_exprs << "#{lookup_alias}.value AS #{name}"
    lookup_tables << "#{lookup_table} #{lookup_alias}"
    # Any column of the variable would do; the last one is used.
    lookup_tests << "#{lookup_alias}.key = #{columns_for[name].last}"
  end

  # Exactly one triples alias per clause.
  tables = (1..clauses.size).map { |n| "#{main_table} a#{n}" } + lookup_tables
  conditions = lookup_tests + constant_tests + join_tests

  sql = "SELECT DISTINCT #{select_exprs.join(', ')} FROM #{tables.join(', ')}"
  sql += "\nWHERE\n\t#{conditions.join(' AND ')}" unless conditions.empty?
  sql
end


# [1] unless *, should fix this

## Other functions (should move to basicrdf.rb someday)


# Compute the integer hash code of a string: the first four bytes of its
# SHA-1 digest read as a little-endian signed 32-bit integer (byte 0 is the
# least significant).
#
# data - the String to hash
#
# Returns an Integer in the range -2**31 .. 2**31 - 1.
def hashcodeIntFromString(data)
  # Loaded here because the file has no top-level require section.
  require 'digest/sha1'
  # 'l<' decodes exactly the first four bytes as signed 32-bit little-endian.
  Digest::SHA1.digest(data).unpack('l<').first
end




end # end of SquishQuery class definition




############################################################################



# Command-line driver: read a Squish query from the file named by the first
# argument and print its SQL translation.
#
# The file is read directly rather than through a shell, so the file name is
# never interpreted as a shell command. The driver runs only when this file
# is executed as a script, not when it is loaded as a library.
if __FILE__ == $0
  path = ARGV[0]
  abort "Usage: #{$0} QUERYFILE" if path.nil?

  begin
    squish = File.read(path)
  rescue SystemCallError => e
    abort "Cannot read query file: #{e.message}"
  end
  #puts  "#{squish}  \n\n"

  query = SquishQuery.new
  query.parseFromText(squish)

  #puts "Query parser output was :\n #{query.toSquish}  \n\n"

  #query.expandAllNamespaces # move internal
  #puts "Allvars: #{query.all_vars.keys} \n"
  #puts "Algae: #{query.toAlgae} \n\n"

  #puts query.inspect

  puts "#{query.toSQLQuery} \n\n"
end



# we can round-trip our output. This is good...
#query2 = SquishQuery.new()
#query2.clauses=[]
#q3 = query2.parseFromText(squish)
#puts "Final Output:\n #{query2.toSquish}\n\n"

# Received on Wednesday, 6 March 2002 12:45:02 UTC