Add more flexibility in how whitespace is specified.
[gazelle.git] / compiler / grammar.lua
blob98b8742ed42724227b225e869f56aaafe43908a4
1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
5 grammar.lua
7 This is the top-level data structure representing a Gazelle grammar.
9 It contains all DFAs for all levels of the grammar, as well as all
10 metadata about the grammar, like what the start symbol is.
12 It is created and initially populated by the grammar parser. The
13 parser adds RTNs for each rule and IntFAs for each terminal, as well
14 as metadata explicitly or implicitly contained in the source file.
16 It is annotated with lookahead information in the form of GLAs by
17 the LL lookahead calculation.
19 The IntFAs for each terminal are combined into the grammar's master
20 IntFAs by the IntFA-combining step.
22 Finally, it is written out to bytecode by the bytecode emitting step.
24 Copyright (c) 2008 Joshua Haberman. See LICENSE for details.
26 --------------------------------------------------------------------]]--
28 require "data_structures"
29 require "fa_algorithms"
30 require "intfa_combine"
32 Grammar = {name="Grammar"}
34 function Grammar:new()
35 local obj = newobject(self)
36 obj.rtns = OrderedMap:new()
37 obj.terminals = {}
38 obj.master_intfas = OrderedSet:new()
39 obj.start = nil -- what rule the entire grammar starts on
40 return obj
41 end
43 -- Add a nonterminal and its associated RTN to the grammar.
44 -- TODO: how should redefinition be caught and warned/errored?
45 function Grammar:add_nonterm(name, rtn, slot_count, text)
46 rtn.name = name
47 rtn.slot_count = slot_count
48 rtn.text = text
49 for state in each(rtn:states()) do
50 state.rtn = rtn
51 end
52 self.rtns:add(name, rtn)
53 end
55 -- Add a terminal and its associated IntFA to the grammar.
56 -- TODO: how should redefinition be caught and warned/errored?
57 function Grammar:add_terminal(name, intfa)
58 self.terminals[name] = intfa
59 end
61 function Grammar:add_allow(what_to_allow, start_nonterm, end_nonterms)
62 -- kind of a hack to do this here.
63 self:determinize_rtns()
65 local children_func = function(rule_name)
66 if not end_nonterms:contains(rule_name) then
67 print(rule_name)
68 local rtn = self.rtns:get(rule_name)
69 if not rtn then
70 error(string.format("Error computing ignore: rule %s does not exist", rule_name))
71 end
73 -- get sub-rules
74 local subrules = Set:new()
75 for state in each(rtn:states()) do
76 for edge_val, dest_state, properties in state:transitions() do
77 if fa.is_nonterm(edge_val) and properties.slotnum ~= -1 then
78 subrules:add(edge_val.name)
79 end
80 end
81 end
83 -- add self-transitions for every state
84 for state in each(rtn:states()) do
85 state:add_transition(what_to_allow, state, {name=what_to_allow.name, slotnum=-1})
86 end
88 return subrules
89 end
90 end
92 depth_first_traversal(start_nonterm, children_func)
93 end
95 function Grammar:check_defined()
96 for name, rtn in each(self.rtns) do
97 for rtn_state in each(rtn:states()) do
98 for edge_val in rtn_state:transitions() do
99 if fa.is_nonterm(edge_val) and not self.rtns:contains(edge_val.name) then
100 error(string.format("Rule '%s' was referred to but never defined.", edge_val.name))
107 function Grammar:get_rtn_states_needing_intfa()
108 local states = Set:new()
109 for name, rtn in each(self.rtns) do
110 for state in each(rtn:states()) do
111 if state:needs_intfa() then
112 states:add(state)
116 return states
119 function Grammar:get_rtn_states_needing_gla()
120 local states = Set:new()
121 for name, rtn in each(self.rtns) do
122 for state in each(rtn:states()) do
123 if state:needs_gla() then
124 states:add(state)
128 return states
131 function copy_attributes(rtn, new_rtn)
132 new_rtn.name = rtn.name
133 new_rtn.slot_count = rtn.slot_count
134 new_rtn.text = rtn.text
135 for state in each(new_rtn:states()) do
136 state.rtn = new_rtn
140 function Grammar:assign_priorities()
141 -- For each non-epsilon transition in the grammar, we want to find the epsilon
142 -- closure (within the rule -- no following nonterminal or final transitions)
143 -- of all reverse transitions and assign any priorities the epsilon
144 -- transitions have to the non-epsilon transition.
146 -- Begin by building a list of reverse epsilon transitions. For each state
147 -- that has at least one epsilon transition going into it, we build a 2-tuple
148 -- of {set of states that are the source of an epsilon transitions,
149 -- map of priority_class -> priority in that class}
150 for name, rtn in each(self.rtns) do
151 local reverse_epsilon_transitions = {}
152 for state in each(rtn:states()) do
153 for edge_val, dest_state, properties in state:transitions() do
154 if edge_val == fa.e then
155 reverse_epsilon_transitions[dest_state] = reverse_epsilon_transitions[dest_state] or {Set:new(), {}}
156 reverse_epsilon_transitions[dest_state][1]:add(state)
157 if properties and properties.priority_class then
158 if reverse_epsilon_transitions[dest_state][2][properties.priority_class] then
159 error("Unexpected.")
161 reverse_epsilon_transitions[dest_state][2][properties.priority_class] = properties.priority
167 local priorities = {}
168 local children = function(state, stack)
169 local reverse_transitions = reverse_epsilon_transitions[state]
170 if reverse_transitions then
171 local child_states, priority_classes = unpack(reverse_transitions)
172 for priority_class, priority in pairs(priority_classes) do
173 if priorities[priority_class] then
174 error("Unexpected please report the grammar that triggered this error!")
176 priorities[priority_class] = priority
178 return child_states
181 for state in each(rtn:states()) do
182 priorities = {}
183 depth_first_traversal(state, children)
184 for edge_val, dest_state, properties in state:transitions() do
185 if edge_val ~= fa.e then
186 -- non-epsilon transitions should always have properties assigned,
187 -- because they always have slotnums and slotnames.
188 properties.priorities = priorities
191 if state.final then
192 state.final = {priorities = priorities}
198 function Grammar:determinize_rtns()
199 local new_rtns = OrderedMap:new()
200 for name, rtn in each(self.rtns) do
201 local new_rtn = nfa_to_dfa(rtn)
202 copy_attributes(rtn, new_rtn)
203 new_rtns:add(name, new_rtn)
205 self.rtns = new_rtns
208 function Grammar:minimize_rtns()
209 local new_rtns = OrderedMap:new()
210 for name, rtn in each(self.rtns) do
211 local new_rtn = hopcroft_minimize(rtn)
212 copy_attributes(rtn, new_rtn)
213 new_rtns:add(name, new_rtn)
215 self.rtns = new_rtns
218 function Grammar:generate_intfas()
219 -- first generate the set of states that need an IntFA: some RTN
220 -- states and all nonfinal GLA states.
221 local states = self:get_rtn_states_needing_intfa()
222 for rtn_state in each(self:get_rtn_states_needing_gla()) do
223 for gla_state in each(rtn_state.gla:states()) do
224 if not gla_state.final then
225 states:add(gla_state)
230 -- All states in the "states" set are nonfinal and have only
231 -- terminals as transitions. Create a list of:
232 -- {state, set of outgoing terms}
233 -- pairs for the states.
234 local state_term_pairs = {}
235 for state in each(states) do
236 local terms = Set:new()
237 for edge_val in state:transitions() do
238 if fa.is_nonterm(edge_val) or terms:contains(edge_val) then
239 error(string.format("Internal error with state %s, edge %s", serialize(state, 6, " "), serialize(edge_val)))
241 if edge_val ~= fa.eof then -- EOF doesn't need to be lexed in the IntFAs
242 terms:add(edge_val)
245 table.insert(state_term_pairs, {state, terms})
248 self.master_intfas = intfa_combine(self.terminals, state_term_pairs)
251 --[[--------------------------------------------------------------------
253 The remaining methods are for linearizing all the graph-like data
254 structures into an ordered form, in preparation for emitting them to
255 an output format like bytecode.
257 --------------------------------------------------------------------]]--
260 --[[--------------------------------------------------------------------
262 Grammar:get_strings(): Returns an ordered set that contains all strings
263 needed by any part of the grammar as it currently stands.
265 --------------------------------------------------------------------]]--
267 function Grammar:get_strings()
268 local strings = Set:new()
270 -- add the names of all the terminals
271 for term, _ in pairs(self.terminals) do
272 strings:add(term)
275 -- add the names of the rtns, and of named edges with the rtns.
276 for name, rtn in each(grammar.rtns) do
277 strings:add(name)
279 for rtn_state in each(rtn:states()) do
280 for edge_val, target_state, properties in rtn_state:transitions() do
281 if properties then
282 strings:add(properties.name)
288 -- sort the strings for deterministic output
289 strings = strings:to_array()
290 table.sort(strings)
291 strings_ordered_set = OrderedSet:new()
292 for string in each(strings) do
293 strings_ordered_set:add(string)
296 return strings_ordered_set
300 --[[--------------------------------------------------------------------
302 Grammar:get_flattened_rtn_list(): Creates and returns a list of all
303 the RTNs, states, and transitions in a particular and stable order,
304 ready for emitting to the outside world. The RTNs themselves are
305 returned in the order they were defined, except that the start RTN
306 is always emitted first.
308 Returns:
309 OrderedMap: {rtn_name -> {
310 states=OrderedSet {RTNState},
311 transitions={RTNState -> {list of {edge_val, target_state, properties}}
312 slot_count=slot_count
315 --------------------------------------------------------------------]]--
317 function Grammar:get_flattened_rtn_list()
318 local rtns = OrderedMap:new()
320 -- create each RTN with a list of states
321 for name, rtn in self.rtns:each() do
323 -- order states such that the start state is emitted first.
324 -- TODO (maybe): make this completely stable.
325 local states = rtn:states()
326 states:remove(rtn.start)
327 states = states:to_array()
328 table.insert(states, 1, rtn.start)
329 states = OrderedSet:new(states)
331 -- ensure that start RTN is emitted first
332 if name == self.start then
333 rtns:insert_front(name, {states=states, transitions={}, slot_count=rtn.slot_count})
334 else
335 rtns:add(name, {states=states, transitions={}, slot_count=rtn.slot_count})
339 -- create a list of transitions for every state
340 for name, rtn in each(rtns) do
341 for state in each(rtn.states) do
342 transitions = {}
343 for edge_val, target_state, properties in state:transitions() do
344 table.insert(transitions, {edge_val, target_state, properties})
347 -- sort RTN transitions thusly:
348 -- 1. terminal transitions come before nonterminal transitions
349 -- 2. terminal transitions are sorted by the low integer of the range
350 -- 3. nonterminal transitions are sorted by the name of the nonterminal
351 -- 4. transitions with the same edge value are sorted by their order
352 -- in the list of states (which is stable) (TODO: no it's not, yet)
353 sort_func = function (a, b)
354 if not fa.is_nonterm(a[1]) and fa.is_nonterm(b[1]) then
355 return true
356 elseif fa.is_nonterm(a[1]) and not fa.is_nonterm(b[1]) then
357 return false
358 elseif fa.is_nonterm(a[1]) then
359 if a[1] == b[1] then
360 return rtn.states:offset_of(a[2]) < rtn.states:offset_of(b[2])
361 else
362 return a[1].name < b[1].name
364 else
365 if a[1].low == b[1].low then
366 return rtn.states:offset_of(a[2]) < rtn.states:offset_of(b[2])
367 else
368 return a[1].low < b[1].low
372 table.sort(transitions, sort_func)
374 rtn.transitions[state] = transitions
378 return rtns
381 function Grammar:get_flattened_gla_list()
382 local glas = OrderedSet:new()
384 for name, rtn in each(self.rtns) do
385 for state in each(rtn:states()) do
386 if state.gla then
387 glas:add(state.gla)
392 return glas
395 -- vim:et:sts=2:sw=2