1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
7 This is the top-level data structure representing a Gazelle grammar.
9 It contains all DFAs for all levels of the grammar, as well as all
10 metadata about the grammar, like what the start symbol is.
12 It is created and initially populated by the grammar parser. The
13 parser adds RTNs for each rule and IntFAs for each terminal, as well
14 as metadata explicitly or implicitly contained in the source file.
16 It is annotated with lookahead information in the form of GLAs by
17 the LL lookahead calculation.
19 The IntFAs for each terminal are combined into the grammar's master
20 IntFAs by the IntFA-combining step.
22 Finally, it is written out to bytecode by the bytecode emitting step.
24 Copyright (c) 2008 Joshua Haberman. See LICENSE for details.
26 --------------------------------------------------------------------]]--
28 require
"data_structures"
29 require
"fa_algorithms"
30 require
"intfa_combine"
32 Grammar
= {name
="Grammar"}
34 function Grammar
:new()
35 local obj
= newobject(self
)
36 obj
.rtns
= OrderedMap
:new()
38 obj
.master_intfas
= OrderedSet
:new()
39 obj
.start
= nil -- what rule the entire grammar starts on
43 -- Add a nonterminal and its associated RTN to the grammar.
44 -- TODO: how should redefinition be caught and warned/errored?
45 function Grammar
:add_nonterm(name
, rtn
, slot_count
, text
)
47 rtn
.slot_count
= slot_count
49 for state
in each(rtn
:states()) do
52 self
.rtns
:add(name
, rtn
)
55 -- Add a terminal and its associated IntFA to the grammar.
56 -- TODO: how should redefinition be caught and warned/errored?
57 function Grammar
:add_terminal(name
, intfa
)
58 self
.terminals
[name
] = intfa
61 function Grammar
:add_allow(what_to_allow
, start_nonterm
, end_nonterms
)
62 -- kind of a hack to do this here.
63 self
:determinize_rtns()
65 local children_func
= function(rule_name
)
66 if not end_nonterms
:contains(rule_name
) then
68 local rtn
= self
.rtns
:get(rule_name
)
70 error(string.format("Error computing ignore: rule %s does not exist", rule_name
))
74 local subrules
= Set
:new()
75 for state
in each(rtn
:states()) do
76 for edge_val
, dest_state
, properties
in state
:transitions() do
77 if fa
.is_nonterm(edge_val
) and properties
.slotnum
~= -1 then
78 subrules
:add(edge_val
.name
)
83 -- add self-transitions for every state
84 for state
in each(rtn
:states()) do
85 state
:add_transition(what_to_allow
, state
, {name
=what_to_allow
.name
, slotnum
=-1})
92 depth_first_traversal(start_nonterm
, children_func
)
95 function Grammar
:check_defined()
96 for name
, rtn
in each(self
.rtns
) do
97 for rtn_state
in each(rtn
:states()) do
98 for edge_val
in rtn_state
:transitions() do
99 if fa
.is_nonterm(edge_val
) and not self
.rtns
:contains(edge_val
.name
) then
100 error(string.format("Rule '%s' was referred to but never defined.", edge_val
.name
))
107 function Grammar
:get_rtn_states_needing_intfa()
108 local states
= Set
:new()
109 for name
, rtn
in each(self
.rtns
) do
110 for state
in each(rtn
:states()) do
111 if state
:needs_intfa() then
119 function Grammar
:get_rtn_states_needing_gla()
120 local states
= Set
:new()
121 for name
, rtn
in each(self
.rtns
) do
122 for state
in each(rtn
:states()) do
123 if state
:needs_gla() then
131 function copy_attributes(rtn
, new_rtn
)
132 new_rtn
.name
= rtn
.name
133 new_rtn
.slot_count
= rtn
.slot_count
134 new_rtn
.text
= rtn
.text
135 for state
in each(new_rtn
:states()) do
140 function Grammar
:assign_priorities()
141 -- For each non-epsilon transition in the grammar, we want to find the epsilon
142 -- closure (within the rule -- no following nonterminal or final transitions)
143 -- of all reverse transitions and assign any priorities the epsilon
144 -- transitions have to the non-epsilon transition.
146 -- Begin by building a list of reverse epsilon transitions. For each state
147 -- that has at least one epsilon transition going into it, we build a 2-tuple
148 -- of {set of states that are the source of an epsilon transitions,
149 -- map of priority_class -> priority in that class}
150 for name
, rtn
in each(self
.rtns
) do
151 local reverse_epsilon_transitions
= {}
152 for state
in each(rtn
:states()) do
153 for edge_val
, dest_state
, properties
in state
:transitions() do
154 if edge_val
== fa
.e
then
155 reverse_epsilon_transitions
[dest_state
] = reverse_epsilon_transitions
[dest_state
] or {Set
:new(), {}}
156 reverse_epsilon_transitions
[dest_state
][1]:add(state
)
157 if properties
and properties
.priority_class
then
158 if reverse_epsilon_transitions
[dest_state
][2][properties
.priority_class
] then
161 reverse_epsilon_transitions
[dest_state
][2][properties
.priority_class
] = properties
.priority
167 local priorities
= {}
168 local children
= function(state
, stack
)
169 local reverse_transitions
= reverse_epsilon_transitions
[state
]
170 if reverse_transitions
then
171 local child_states
, priority_classes
= unpack(reverse_transitions
)
172 for priority_class
, priority
in pairs(priority_classes
) do
173 if priorities
[priority_class
] then
174 error("Unexpected please report the grammar that triggered this error!")
176 priorities
[priority_class
] = priority
181 for state
in each(rtn
:states()) do
183 depth_first_traversal(state
, children
)
184 for edge_val
, dest_state
, properties
in state
:transitions() do
185 if edge_val
~= fa
.e
then
186 -- non-epsilon transitions should always have properties assigned,
187 -- because they always have slotnums and slotnames.
188 properties
.priorities
= priorities
192 state
.final
= {priorities
= priorities
}
198 function Grammar
:determinize_rtns()
199 local new_rtns
= OrderedMap
:new()
200 for name
, rtn
in each(self
.rtns
) do
201 local new_rtn
= nfa_to_dfa(rtn
)
202 copy_attributes(rtn
, new_rtn
)
203 new_rtns
:add(name
, new_rtn
)
208 function Grammar
:minimize_rtns()
209 local new_rtns
= OrderedMap
:new()
210 for name
, rtn
in each(self
.rtns
) do
211 local new_rtn
= hopcroft_minimize(rtn
)
212 copy_attributes(rtn
, new_rtn
)
213 new_rtns
:add(name
, new_rtn
)
218 function Grammar
:generate_intfas()
219 -- first generate the set of states that need an IntFA: some RTN
220 -- states and all nonfinal GLA states.
221 local states
= self
:get_rtn_states_needing_intfa()
222 for rtn_state
in each(self
:get_rtn_states_needing_gla()) do
223 for gla_state
in each(rtn_state
.gla
:states()) do
224 if not gla_state
.final
then
225 states
:add(gla_state
)
230 -- All states in the "states" set are nonfinal and have only
231 -- terminals as transitions. Create a list of:
232 -- {state, set of outgoing terms}
233 -- pairs for the states.
234 local state_term_pairs
= {}
235 for state
in each(states
) do
236 local terms
= Set
:new()
237 for edge_val
in state
:transitions() do
238 if fa
.is_nonterm(edge_val
) or terms
:contains(edge_val
) then
239 error(string.format("Internal error with state %s, edge %s", serialize(state
, 6, " "), serialize(edge_val
)))
241 if edge_val
~= fa
.eof
then -- EOF doesn't need to be lexed in the IntFAs
245 table.insert(state_term_pairs
, {state
, terms
})
248 self
.master_intfas
= intfa_combine(self
.terminals
, state_term_pairs
)
251 --[[--------------------------------------------------------------------
253 The remaining methods are for linearizing all the graph-like data
254 structures into an ordered form, in preparation for emitting them to
255 an output format like bytecode.
257 --------------------------------------------------------------------]]--
260 --[[--------------------------------------------------------------------
262 Grammar:get_strings(): Returns an ordered set that contains all strings
263 needed by any part of the grammar as it currently stands.
265 --------------------------------------------------------------------]]--
267 function Grammar
:get_strings()
268 local strings
= Set
:new()
270 -- add the names of all the terminals
271 for term
, _
in pairs(self
.terminals
) do
275 -- add the names of the rtns, and of named edges with the rtns.
276 for name
, rtn
in each(grammar
.rtns
) do
279 for rtn_state
in each(rtn
:states()) do
280 for edge_val
, target_state
, properties
in rtn_state
:transitions() do
282 strings
:add(properties
.name
)
288 -- sort the strings for deterministic output
289 strings
= strings
:to_array()
291 strings_ordered_set
= OrderedSet
:new()
292 for string in each(strings
) do
293 strings_ordered_set
:add(string)
296 return strings_ordered_set
300 --[[--------------------------------------------------------------------
302 Grammar:get_flattened_rtn_list(): Creates and returns a list of all
303 the RTNs, states, and transitions in a particular and stable order,
304 ready for emitting to the outside world. The RTNs themselves are
305 returned in the order they were defined, except that the start RTN
306 is always emitted first.
309 OrderedMap: {rtn_name -> {
310 states=OrderedSet {RTNState},
311 transitions={RTNState -> {list of {edge_val, target_state, properties}}
312 slot_count=slot_count
315 --------------------------------------------------------------------]]--
317 function Grammar
:get_flattened_rtn_list()
318 local rtns
= OrderedMap
:new()
320 -- create each RTN with a list of states
321 for name
, rtn
in self
.rtns
:each() do
323 -- order states such that the start state is emitted first.
324 -- TODO (maybe): make this completely stable.
325 local states
= rtn
:states()
326 states
:remove(rtn
.start
)
327 states
= states
:to_array()
328 table.insert(states
, 1, rtn
.start
)
329 states
= OrderedSet
:new(states
)
331 -- ensure that start RTN is emitted first
332 if name
== self
.start
then
333 rtns
:insert_front(name
, {states
=states
, transitions
={}, slot_count
=rtn
.slot_count
})
335 rtns
:add(name
, {states
=states
, transitions
={}, slot_count
=rtn
.slot_count
})
339 -- create a list of transitions for every state
340 for name
, rtn
in each(rtns
) do
341 for state
in each(rtn
.states
) do
343 for edge_val
, target_state
, properties
in state
:transitions() do
344 table.insert(transitions
, {edge_val
, target_state
, properties
})
347 -- sort RTN transitions thusly:
348 -- 1. terminal transitions come before nonterminal transitions
349 -- 2. terminal transitions are sorted by the low integer of the range
350 -- 3. nonterminal transitions are sorted by the name of the nonterminal
351 -- 4. transitions with the same edge value are sorted by their order
352 -- in the list of states (which is stable) (TODO: no it's not, yet)
353 sort_func
= function (a
, b
)
354 if not fa
.is_nonterm(a
[1]) and fa
.is_nonterm(b
[1]) then
356 elseif fa
.is_nonterm(a
[1]) and not fa
.is_nonterm(b
[1]) then
358 elseif fa
.is_nonterm(a
[1]) then
360 return rtn
.states
:offset_of(a
[2]) < rtn
.states
:offset_of(b
[2])
362 return a
[1].name
< b
[1].name
365 if a
[1].low
== b
[1].low
then
366 return rtn
.states
:offset_of(a
[2]) < rtn
.states
:offset_of(b
[2])
368 return a
[1].low
< b
[1].low
372 table.sort(transitions
, sort_func
)
374 rtn
.transitions
[state
] = transitions
381 function Grammar
:get_flattened_gla_list()
382 local glas
= OrderedSet
:new()
384 for name
, rtn
in each(self
.rtns
) do
385 for state
in each(rtn
:states()) do