Add more flexibility in how whitespace is specified.
[gazelle.git] / compiler / bytecode.lua
blob1f4e1e6e7d3e838e9d00f88f0b6ec50ff0ac33bd
1 --[[--------------------------------------------------------------------
3 Gazelle: a system for building fast, reusable parsers
5 bytecode.lua
7 Code that takes the final optimized parsing structures and emits them
8 to bytecode (in Bitcode format).
10 Copyright (c) 2007-2008 Joshua Haberman. See LICENSE for details.
12 --------------------------------------------------------------------]]--
14 require "bc"
16 -- See FILEFORMAT for details about what these constants mean
17 BC_INTFAS = 8
18 BC_INTFA = 9
19 BC_STRINGS = 10
20 BC_RTNS = 11
21 BC_RTN = 12
22 BC_GLAS = 13
23 BC_GLA = 14
25 BC_INTFA_STATE = 0
26 BC_INTFA_FINAL_STATE = 1
27 BC_INTFA_TRANSITION = 2
28 BC_INTFA_TRANSITION_RANGE = 3
30 BC_STRING = 0
32 BC_RTN_INFO = 0
33 BC_RTN_STATE_WITH_INTFA = 2
34 BC_RTN_STATE_WITH_GLA = 3
35 BC_RTN_TRIVIAL_STATE = 4
36 BC_RTN_TRANSITION_TERMINAL = 5
37 BC_RTN_TRANSITION_NONTERM = 6
39 BC_GLA_STATE = 0
40 BC_GLA_FINAL_STATE = 1
41 BC_GLA_TRANSITION = 2
43 if not print_verbose then
44 function print_verbose(str)
45 print(str)
46 end
47 end
49 function write_bytecode(grammar, outfilename)
50 -- write Bitcode header
51 bc_file = bc.File:new(outfilename, "GH")
52 abbrevs = define_abbrevs(bc_file)
54 -- Obtain linearized representations of all the DFAs from the Grammar object.
55 local strings = grammar:get_strings()
56 local rtns = grammar:get_flattened_rtn_list()
57 local glas = grammar:get_flattened_gla_list()
58 local intfas = grammar.master_intfas
60 -- emit the strings
61 print_verbose(string.format("Writing %d strings...", strings:count()))
62 bc_file:enter_subblock(BC_STRINGS)
63 for string in each(strings) do
64 bc_file:write_abbreviated_record(abbrevs.bc_string, string)
65 end
66 bc_file:end_subblock(BC_STRINGS)
68 -- emit the intfas
69 print_verbose(string.format("Writing %d IntFAs...", intfas:count()))
70 bc_file:enter_subblock(BC_INTFAS)
71 for intfa in each(intfas) do
72 emit_intfa(intfa, strings, bc_file, abbrevs)
73 end
74 bc_file:end_subblock(BC_INTFAS)
76 -- emit the GLAs
77 print_verbose(string.format("Writing %d GLAs...", glas:count()))
78 bc_file:enter_subblock(BC_GLAS)
79 for gla in each(glas) do
80 emit_gla(gla, strings, rtns, intfas, bc_file, abbrevs)
81 end
82 bc_file:end_subblock(BC_GLAS)
84 -- emit the RTNs
85 bc_file:enter_subblock(BC_RTNS)
86 print_verbose(string.format("Writing %d RTNs...", rtns:count()))
87 for name, rtn in each(rtns) do
88 emit_rtn(name, rtn, rtns, glas, intfas, strings, bc_file, abbrevs)
89 end
90 bc_file:end_subblock(BC_RTNS)
92 end
95 function emit_intfa(intfa, strings, bc_file, abbrevs)
96 bc_file:enter_subblock(BC_INTFA)
98 local intfa_state_offsets = {}
99 local intfa_transitions = {}
101 -- order the states such that the start state is emitted first
102 local states = intfa:states()
103 states:remove(intfa.start)
104 states = states:to_array()
105 table.insert(states, 1, intfa.start)
107 -- do a first pass over the states that records their order and builds
108 -- each state's list of transitions.
109 local state_transitions = {}
110 for i, state in ipairs(states) do
111 intfa_state_offsets[state] = i - 1
113 state_transitions[state] = {}
114 for edge_val, target_state, properties in state:transitions() do
115 for range in edge_val:each_range() do
116 table.insert(state_transitions[state], {range, target_state})
120 -- sort the transitions into a stable order, to make the output
121 -- more deterministic.
122 table.sort(state_transitions[state], function (a, b) return a[1].low < b[1].low end)
124 -- add this state's transitions to the global list of transitions for the IntFA
125 for t in each(state_transitions[state])
126 do table.insert(intfa_transitions, t)
130 print_verbose(string.format(" %d states, %d transitions", #states, #intfa_transitions))
132 -- emit the states
133 for state in each(states) do
134 if state.final then
135 bc_file:write_abbreviated_record(abbrevs.bc_intfa_final_state,
136 #state_transitions[state],
137 strings:offset_of(state.final))
138 else
139 bc_file:write_abbreviated_record(abbrevs.bc_intfa_state, #state_transitions[state])
143 -- emit the transitions
144 for transition in each(intfa_transitions) do
145 local range, target_state = unpack(transition)
146 target_state_offset = intfa_state_offsets[target_state]
147 if range.low == range.high then
148 bc_file:write_abbreviated_record(abbrevs.bc_intfa_transition, range.low, target_state_offset)
149 else
150 local high = range.high
151 if high == math.huge then high = 255 end -- temporary ASCII-specific hack
152 bc_file:write_abbreviated_record(abbrevs.bc_intfa_transition_range, range.low, high, target_state_offset)
156 bc_file:end_subblock(BC_INTFA)
159 function emit_gla(gla, strings, rtns, intfas, bc_file, abbrevs)
160 bc_file:enter_subblock(BC_GLA)
162 local states = OrderedSet:new()
163 states:add(gla.start)
164 for state in each(gla:states()) do
165 if state ~= gla.start then
166 states:add(state)
170 -- emit states
171 for state in each(states) do
172 if state.final then
173 -- figure out the offset of the RTN transition this GLA final state implies.
174 local ordered_rtn = rtns:get(gla.rtn_state.rtn.name)
175 local transitions = ordered_rtn.transitions[gla.rtn_state]
176 local transition_offset = nil
177 if state.final[1] == 0 and state.final[2] == 0 then
178 -- This is a "return" prediction
179 transition_offset = 0
180 else
181 for i=1,#transitions do
182 if transitions[i][1] == state.final[1] and transitions[i][2] == state.final[2] then
183 transition_offset = i
184 break
189 if transition_offset == nil then
190 error("GLA final state indicated a state that was not found in the RTN state.")
192 bc_file:write_abbreviated_record(abbrevs.bc_gla_final_state, transition_offset)
193 else
194 bc_file:write_abbreviated_record(abbrevs.bc_gla_state,
195 intfas:offset_of(state.intfa),
196 state:num_transitions())
200 -- emit transitions
201 for state in each(states) do
202 for edge_val, dest_state in state:transitions() do
203 local edge_num = 0
204 if edge_val ~= fa.eof then
205 edge_num = strings:offset_of(edge_val) + 1
207 bc_file:write_abbreviated_record(abbrevs.bc_gla_transition,
208 edge_num,
209 states:offset_of(dest_state))
213 bc_file:end_subblock(BC_GLA)
216 function emit_rtn(name, rtn, rtns, glas, intfas, strings, bc_file, abbrevs)
217 -- emit RTN name
218 bc_file:enter_subblock(BC_RTN)
219 bc_file:write_abbreviated_record(abbrevs.bc_rtn_info, strings:offset_of(name), rtn.slot_count)
221 -- emit states
222 for state in each(rtn.states) do
223 local is_final
224 if state.final then
225 is_final = 1
226 else
227 is_final = 0
230 if state.gla then
231 bc_file:write_abbreviated_record(abbrevs.bc_rtn_state_with_gla,
232 #rtn.transitions[state],
233 is_final,
234 glas:offset_of(state.gla))
235 elseif state.intfa then
236 bc_file:write_abbreviated_record(abbrevs.bc_rtn_state_with_intfa,
237 #rtn.transitions[state],
238 is_final,
239 intfas:offset_of(state.intfa))
240 else
241 bc_file:write_abbreviated_record(abbrevs.bc_rtn_trivial_state,
242 #rtn.transitions[state],
243 is_final)
247 -- emit transitions
248 for state in each(rtn.states) do
249 for transition in each(rtn.transitions[state]) do
250 local edge_val, dest_state, properties = unpack(transition)
251 local bytecode_slotnum
252 if properties.slotnum == -1 then
253 bytecode_slotnum = 0
254 else
255 bytecode_slotnum = properties.slotnum
257 if fa.is_nonterm(edge_val) then
258 bc_file:write_abbreviated_record(abbrevs.bc_rtn_transition_nonterm,
259 rtns:offset_of_key(edge_val.name),
260 rtn.states:offset_of(dest_state),
261 strings:offset_of(properties.name),
262 bytecode_slotnum)
263 else
264 bc_file:write_abbreviated_record(abbrevs.bc_rtn_transition_terminal,
265 strings:offset_of(edge_val),
266 rtn.states:offset_of(dest_state),
267 strings:offset_of(properties.name),
268 bytecode_slotnum)
273 bc_file:end_subblock(BC_RTN)
277 function define_abbrevs(bc_file)
278 abbrevs = {}
280 -- Enter a BLOCKINFO record to define abbreviations for all our records.
281 -- See FILEFORMAT for a description of what all the record types mean.
282 bc_file:enter_subblock(bc.BLOCKINFO)
284 -- IntFA abbreviations
285 bc_file:write_unabbreviated_record(bc.SETBID, BC_INTFA)
287 abbrevs.bc_intfa_final_state = bc_file:define_abbreviation(4,
288 bc.LiteralOp:new(BC_INTFA_FINAL_STATE),
289 bc.VBROp:new(5),
290 bc.VBROp:new(5))
292 abbrevs.bc_intfa_state = bc_file:define_abbreviation(5,
293 bc.LiteralOp:new(BC_INTFA_STATE),
294 bc.VBROp:new(5))
296 abbrevs.bc_intfa_transition = bc_file:define_abbreviation(6,
297 bc.LiteralOp:new(BC_INTFA_TRANSITION),
298 bc.VBROp:new(8),
299 bc.VBROp:new(6))
301 abbrevs.bc_intfa_transition_range = bc_file:define_abbreviation(7,
302 bc.LiteralOp:new(BC_INTFA_TRANSITION_RANGE),
303 bc.VBROp:new(8),
304 bc.VBROp:new(8),
305 bc.VBROp:new(6))
307 -- Strings abbreviations
308 bc_file:write_unabbreviated_record(bc.SETBID, BC_STRINGS)
310 abbrevs.bc_string = bc_file:define_abbreviation(4,
311 bc.LiteralOp:new(BC_STRING),
312 bc.ArrayOp:new(bc.FixedOp:new(7)))
314 -- RTN abbreviations
315 bc_file:write_unabbreviated_record(bc.SETBID, BC_RTN)
317 abbrevs.bc_rtn_info = bc_file:define_abbreviation(4,
318 bc.LiteralOp:new(BC_RTN_INFO),
319 bc.VBROp:new(6),
320 bc.VBROp:new(4))
322 abbrevs.bc_rtn_state_with_intfa = bc_file:define_abbreviation(5,
323 bc.LiteralOp:new(BC_RTN_STATE_WITH_INTFA),
324 bc.VBROp:new(4),
325 bc.FixedOp:new(1),
326 bc.VBROp:new(4))
328 abbrevs.bc_rtn_state_with_gla = bc_file:define_abbreviation(6,
329 bc.LiteralOp:new(BC_RTN_STATE_WITH_GLA),
330 bc.VBROp:new(4),
331 bc.FixedOp:new(1),
332 bc.VBROp:new(4))
334 abbrevs.bc_rtn_trivial_state = bc_file:define_abbreviation(7,
335 bc.LiteralOp:new(BC_RTN_TRIVIAL_STATE),
336 bc.FixedOp:new(1),
337 bc.FixedOp:new(1))
339 abbrevs.bc_rtn_transition_terminal = bc_file:define_abbreviation(8,
340 bc.LiteralOp:new(BC_RTN_TRANSITION_TERMINAL),
341 bc.VBROp:new(6),
342 bc.VBROp:new(5),
343 bc.VBROp:new(5),
344 bc.VBROp:new(4))
346 abbrevs.bc_rtn_transition_nonterm = bc_file:define_abbreviation(9,
347 bc.LiteralOp:new(BC_RTN_TRANSITION_NONTERM),
348 bc.VBROp:new(6),
349 bc.VBROp:new(5),
350 bc.VBROp:new(5),
351 bc.VBROp:new(4))
353 -- GLA abbreviations
354 bc_file:write_unabbreviated_record(bc.SETBID, BC_GLA)
356 abbrevs.bc_gla_state = bc_file:define_abbreviation(4,
357 bc.LiteralOp:new(BC_GLA_STATE),
358 bc.VBROp:new(4),
359 bc.VBROp:new(4))
361 abbrevs.bc_gla_final_state = bc_file:define_abbreviation(5,
362 bc.LiteralOp:new(BC_GLA_FINAL_STATE),
363 bc.VBROp:new(4))
365 abbrevs.bc_gla_transition = bc_file:define_abbreviation(6,
366 bc.LiteralOp:new(BC_GLA_TRANSITION),
367 bc.VBROp:new(4),
368 bc.VBROp:new(4))
370 bc_file:end_subblock(bc.BLOCKINFO)
372 return abbrevs
375 -- vim:et:sts=2:sw=2