1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
7 Algorithms that operate on finite automata (NFAs and/or DFAs). For
8 the most part these can work on any of the FA types, since they do
9 not interpret the meaning of the edges. It is nice to keep these
10 algorithms separate from the FA data structure in fa.lua.
12 Copyright (c) 2008 Joshua Haberman. See LICENSE for details.
14 --------------------------------------------------------------------]]--
17 --[[--------------------------------------------------------------------
19 NFA to DFA conversion.
21 --------------------------------------------------------------------]]--
23 function epsilon_closure(state
)
24 return depth_first_traversal(state
, function(s
) return s
:transitions_for(fa
.e
, "ALL") end)
27 -- we as input an array of NFAs, one for each token we want to match simultaneously,
28 -- and strings describing each token. So:
29 -- { {nfa1, "token string 1"},
30 -- {nfa2, "token string 2", etc} }
31 function nfas_to_dfa(nfa_string_pairs
, ambiguous_ok
)
32 ambiguous_ok
= ambiguous_ok
or false
33 -- First we need to mark all final states and capture groups with the token string
36 for i
, nfa_string_pair
in ipairs(nfa_string_pairs
) do
37 local nfa
, token_string
= unpack(nfa_string_pair
)
38 table.insert(nfas
, nfa
)
40 -- Mark the nfa fragment's final state as the final state for this *token*
41 nfa
.final
.final
= token_string
44 -- Now combine all the nfas with alternation
45 local final_nfa
= nfa_construct
.alt(nfas
)
46 return nfa_to_dfa(final_nfa
, ambiguous_ok
)
49 function new_dfa_state(nfa
, nfa_states
, ambiguous_ok
)
50 local dfa_state
= nfa
:new_state()
52 -- If this is a final state for one or more of the nfas, make it an
53 -- (appropriately labeled) final state for the dfa
54 for nfa_state
in nfa_states
:each() do
55 if nfa_state
.final
then
56 if dfa_state
.final
and dfa_state
.final
~= nfa_state
.final
then
58 if type(dfa_state
.final
) ~= "table" then dfa_state
.final
= Set
:new({dfa_state
.final
}) end
59 dfa_state
.final
:add(nfa_state
.final
)
61 error("Ambiguous finality not supported yet!! (" .. tostring(dfa_state
.final
) .. " and " .. tostring(nfa_state
.final
.. ")"))
64 dfa_state
.final
= nfa_state
.final
72 function nfa_to_dfa(nfa
, ambiguous_ok
)
73 -- The sets of NFA states we need to process for outgoing transitions
74 local first_nfa_states
= epsilon_closure(nfa
.start
)
75 local queue
= Queue
:new(first_nfa_states
)
77 local dfa
= nfa
:new_graph
{start
= new_dfa_state(nfa
, first_nfa_states
, ambiguous_ok
)}
78 -- The DFA states we create from sets of NFA states
79 local dfa_states
= {[first_nfa_states
:hash_key()] = dfa
.start
}
81 while not queue
:isempty() do
82 local nfa_states
= queue
:dequeue()
83 local dfa_state
= dfa_states
[nfa_states
:hash_key()]
85 -- Generate a list of symbols that transition out of this set of NFA states.
86 -- We prefer this to iterating over the entire symbol space because it's
87 -- vastly more efficient in the case of a large symbol space (eg. Unicode)
88 local symbol_tuples
= nfa
:get_outgoing_edge_values(nfa_states
)
90 -- For each output symbol, generate the list of destination NFA states that
91 -- recognizing this symbol could put you in (including epsilon transitions).
92 for symbol_tuple
in each(symbol_tuples
) do
93 local symbol
, properties
= unpack(symbol_tuple
)
94 local dest_nfa_states
= Set
:new()
95 for nfa_state
in nfa_states
:each() do
96 -- equivalence classes dictate that this character represents what will
97 -- happen to ALL characters in the set
98 local target_states
= nfa_state
:transitions_for(symbol
, properties
)
100 if target_states
then
101 for target_state
in each(target_states
) do
102 dest_nfa_states
:add(target_state
)
103 dest_nfa_states
:add_collection(epsilon_closure(target_state
):to_array())
108 -- this is necessary because (at the moment) get_outgoing_edge_values will generate
109 -- combinations of symbol/properties that don't *actually* transtion anywhere
110 -- TODO: get rid of that shortcoming
111 if not dest_nfa_states
:isempty() then
113 -- create a DFA state for this set of NFA states, if one does not
115 local dest_dfa_state
= dfa_states
[dest_nfa_states
:hash_key()]
116 if dest_dfa_state
== nil then
117 dest_dfa_state
= new_dfa_state(nfa
, dest_nfa_states
, ambiguous_ok
)
118 dfa_states
[dest_nfa_states
:hash_key()] = dest_dfa_state
119 queue
:enqueue(dest_nfa_states
)
122 -- create a transition from the current DFA state into the new one
123 dfa_state
:add_transition(symbol
, dest_dfa_state
, properties
)
133 --[[--------------------------------------------------------------------
137 hopcroft_minimize(dfa): transform a DFA into an equivalent DFA with
138 the minimal number of states. Uses Hopcroft's algorithm, which is
139 O(n lg n) in the number of states, as explained by both Hopcroft and
140 Gries (see BIBLIOGRAPHY for details).
142 --------------------------------------------------------------------]]--
144 function hopcroft_minimize(dfa
)
145 -- First create the alphabet and an inverse transition table.
146 local alphabet
= dfa
:get_outgoing_edge_values(dfa
:states())
147 local inverse_transitions
= {}
149 for state
in each(dfa
:states()) do
150 for symbol
, dest_state
, properties
in state
:transitions() do
151 inverse_transitions
[dest_state
] = inverse_transitions
[dest_state
] or dfa
:new_state()
152 inverse_transitions
[dest_state
]:add_transition(symbol
, state
, properties
)
156 -- Create initial blocks, grouped by finality.
157 local initial_blocks
= {}
158 for state
in each(dfa
:states()) do
159 local finality
= state
.final
or "NONE"
160 initial_blocks
[finality
] = initial_blocks
[finality
] or {}
161 table.insert(initial_blocks
[finality
], state
)
164 local blocks
= Set
:new()
165 local work_queue
= Queue
:new()
166 local work_queue_set
= Set
:new()
167 for finality
, states
in pairs(initial_blocks
) do
168 local block
= Set
:new(states
)
170 for state
in each(states
) do
173 for symbol_tuple
in each(alphabet
) do
174 local symbol
, properties
= unpack(symbol_tuple
)
175 work_queue
:enqueue({block
, symbol
, properties
})
176 work_queue_set
:add(tostring(block
) .. tostring(symbol
) .. tostring(properties
))
180 local num_iterations
= 0
181 while (not work_queue
:isempty()) do
182 num_iterations
= num_iterations
+ 1
183 local block
, symbol
, properties
= unpack(work_queue
:dequeue())
184 work_queue_set
:remove(tostring(block
) .. tostring(symbol
) .. tostring(properties
))
186 -- determine what blocks need to be split
187 local states_to_split
= Set
:new()
188 for state
in each(block
) do
189 if inverse_transitions
[state
] then
190 states_to_split
:add_collection(inverse_transitions
[state
]:transitions_for(symbol
, properties
))
196 for state_to_split
in each(states_to_split
) do
197 for state
in each(state_to_split
.block
) do
198 local dest_state
= state
:transitions_for(symbol
, properties
):to_array()[1]
199 if not (dest_state
and dest_state
.block
== block
) then
200 if not new_twins
[state
.block
] then
201 local new_twin
= Set
:new()
203 new_twins
[state
.block
] = new_twin
205 new_twins
[state
.block
]:add(state
)
210 -- fix work queue according to splits
211 for old_block
, new_twin
in pairs(new_twins
) do
212 for state
in each(new_twin
) do
213 state
.block
:remove(state
)
214 state
.block
= new_twin
218 if old_block
:count() < new_twin
:count() then
219 smaller_block
= old_block
221 smaller_block
= new_twin
224 for alphabet_symbol_tuple
in each(alphabet
) do
225 local alphabet_symbol
, alphabet_properties
= unpack(alphabet_symbol_tuple
)
226 if work_queue_set
:contains(tostring(old_block
) .. tostring(alphabet_symbol
) .. tostring(alphabet_properties
)) then
227 work_queue
:enqueue({new_twin
, alphabet_symbol
, alphabet_properties
})
228 work_queue_set
:add(tostring(new_twin
) .. tostring(alphabet_symbol
) .. tostring(alphabet_properties
))
230 work_queue
:enqueue({smaller_block
, alphabet_symbol
, alphabet_properties
})
231 work_queue_set
:add(tostring(smaller_block
) .. tostring(alphabet_symbol
) .. tostring(alphabet_properties
))
237 -- the blocks are the new states
239 for block
in blocks
:each() do
240 states
[block
] = dfa
:new_state()
241 for state
in each(block
) do
243 states
[block
].final
= state
.final
248 local minimal_dfa
= dfa
:new_graph()
249 minimal_dfa
.start
= states
[dfa
.start
.block
]
250 for block
in blocks
:each() do
251 for state
in each(block
) do
252 for symbol
, dest_state
, properties
in state
:transitions() do
253 states
[block
]:add_transition(symbol
, states
[dest_state
.block
], properties
)
258 -- print("Num states: " .. tostring(dfa:states():count()) ..
259 -- ", alphabet size: " .. tostring(#alphabet) ..
260 -- ", num iterations: " .. tostring(num_iterations))
265 --[[--------------------------------------------------------------------
269 fa_isequal(fa1, fa2): Returns true if the given finite automata are
270 equal, false otherwise.
272 --------------------------------------------------------------------]]--
274 function fa_isequal(fa1
, fa2
)
275 local equivalent_states
={[fa1
.start
]=fa2
.start
}
276 local queue
= Queue
:new(fa1
.start
)
277 local fa2_seen_states
= Set
:new()
278 fa2_seen_states
:add(fa2
.start
)
280 while not queue
:isempty() do
281 local s1
= queue
:dequeue()
282 local s2
= equivalent_states
[s1
]
284 if s1
:num_transitions() ~= s2
:num_transitions() then
288 if (s1
.final
or s2
.final
) then
289 if not (s1
.final
and s2
.final
) then
293 if type(s1
.final
) ~= type(s2
.final
) then
297 if type(s1
.final
) == "table" then
298 if not table_shallow_eql(s1
.final
, s2
.final
) then
301 elseif s1
.final
~= s2
.final
then
306 for edge_val
, dest_state
in s1
:transitions() do
307 local s2_dest_state
= s2
:dest_state_for(edge_val
)
308 if not s2_dest_state
then
310 elseif equivalent_states
[dest_state
] then
311 if equivalent_states
[dest_state
] ~= s2_dest_state
then
314 elseif fa2_seen_states
:contains(s2_dest_state
) then
315 -- we have seen this state before, but not as equivalent to
319 equivalent_states
[dest_state
] = s2_dest_state
320 fa2_seen_states
:add(s2_dest_state
)
321 queue
:enqueue(dest_state
)
330 --[[--------------------------------------------------------------------
334 fa_longest_path(fa): Returns an integer representing how long the
335 longest path from the start state to a final state can be. Returns
336 math.huge if the graph has cycles.
338 --------------------------------------------------------------------]]--
340 function fa_longest_path(fa
)
342 local current_depth
= 0
343 local seen
= Set
:new()
344 function dfs_helper(state
)
346 if state
.final
and current_depth
> longest
then
347 longest
= current_depth
350 for edge_val
, dest_state
in state
:transitions() do
351 if seen
:contains(dest_state
) then
354 current_depth
= current_depth
+ 1
355 dfs_helper(dest_state
)
356 current_depth
= current_depth
- 1