initial
[prop.git] / lib-src / automata / lexergen.cc
blob8440c3ea267d2bb4e1b8363b34993f1e184a5e75
1 //////////////////////////////////////////////////////////////////////////////
2 // NOTICE:
3 //
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
9 // your programs.
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
16 // code.
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
21 // Allen Leung
22 // 1994
23 //////////////////////////////////////////////////////////////////////////////
25 #include <assert.h>
26 #include <string.h>
27 #include <ctype.h>
28 #include <new.h>
29 #include <AD/automata/lexergen.h> // Lexical scanner generator
30 #include <AD/automata/nfa.h> // NFAs
31 #include <AD/automata/nfa_node.h> // NFA nodes
32 #include <AD/automata/gentable.h> // Table emitter
33 #include <AD/strings/charesc.h> // Escape sequence parsing
34 #include <AD/memory/mempool.h> // Memory pools
35 #include <AD/hash/dchash.h> // Direct chaining hash table
36 #include <AD/contain/vararray.h> // Generic arrays
37 #include <AD/contain/bitset.h> // bit set
38 #include <AD/contain/fbitset.h> // fast bit set
40 ////////////////////////////////////////////////////////////////////////////
41 // Make hidden types visible
42 ////////////////////////////////////////////////////////////////////////////
43 typedef LexerGen::Symbol Symbol;
44 typedef LexerGen::State State;
45 typedef LexerGen::Rule Rule;
46 typedef LexerGen::Options Options;
48 ////////////////////////////////////////////////////////////////////////////
49 // Constructor and destructor
50 ////////////////////////////////////////////////////////////////////////////
51 LexerGen::LexerGen()
52 : bytes_used(0), bad_rule(-1), error_msg(0), rule(0), equiv_classes(0) {}
53 LexerGen::~LexerGen() { delete [] rule; delete [] equiv_classes; }
55 ////////////////////////////////////////////////////////////////////////////
56 // To create a set of tables
57 ////////////////////////////////////////////////////////////////////////////
58 void LexerGen::create_tables(int size, int states, Symbol min, Symbol max)
59 { Super::create_tables(size,states,min,max);
60 rule = new Rule [states];
63 ////////////////////////////////////////////////////////////////////////////
64 // To grow the number of states
65 ////////////////////////////////////////////////////////////////////////////
66 void LexerGen::grow_states(int increment)
67 { Rule * new_rule = new Rule [ number_of_states + increment ];
68 memcpy(new_rule,rule,sizeof(Rule) * number_of_states);
69 delete [] rule;
70 rule = new_rule;
71 Super::grow_states(increment);
74 ////////////////////////////////////////////////////////////////////////////
75 // The main entry point of the lexical scanner compiler
76 ////////////////////////////////////////////////////////////////////////////
77 void LexerGen::compile
78 ( int n, const char * const * regexps, const char * const * contexts,
79 int max_ch, int options
82 ////////////////////////////////////////////////////////////////////////
83 // Here are the tables and maps that we'll need.
84 ////////////////////////////////////////////////////////////////////////
85 MemPool mem(4096); // Memory pool with page size of 4K
86 DCHashTable<FastBitSet *,State> dstates(1023,1.0);
87 // Map sets of nfa states to dfa states
88 VarArray<FastBitSet *> nstates; // Inverse of above
89 VarArray<Rule> accept_rule; // Accept rule of state, if any
90 NFA nfa(mem); // create the nfa
92 ////////////////////////////////////////////////////////////////////////
93 // Compile the regexps into an NFA
94 ////////////////////////////////////////////////////////////////////////
95 nfa.compile(n, regexps, contexts, max_ch, options & CaseInsensitive);
96 bad_rule = nfa.bad();
97 error_msg = nfa.error_msg();
98 if (! ok()) return;
100 ////////////////////////////////////////////////////////////////////////
101 // Compile the equivalence class table.
102 ////////////////////////////////////////////////////////////////////////
103 max_char = max_ch;
104 equiv_classes = nfa.get_equiv_classes();
106 ////////////////////////////////////////////////////////////////////////
107 // Start generating the tables.
108 ////////////////////////////////////////////////////////////////////////
109 create_tables(nfa.state_count() * nfa.equiv_class_count(),
110 nfa.state_count(), 0, nfa.equiv_class_count() - 1);
111 start();
113 ////////////////////////////////////////////////////////////////////////
114 // Create an error state. This state always jams
115 ////////////////////////////////////////////////////////////////////////
116 { FastBitSet * err = new (mem) FastBitSet(mem, nfa.state_count());
117 dstates.insert(err,(State)0);
118 nstates[0] = err;
119 rule[0] = (options & Backtracking) ? -1 : 0;
122 ////////////////////////////////////////////////////////////////////////
123 // Create two start states for each context.
124 ////////////////////////////////////////////////////////////////////////
125 { for (State s = 1; s <= nfa.context_count() * 2 + 2; s++)
126 { NFA_Node * theNFA = nfa.context_nfa(s-1);
127 if (theNFA == 0) {
128 nstates[s] = 0;
129 accept_rule[s] = 0;
130 } else {
131 FastBitSet * set = new (mem) FastBitSet(mem, nfa.state_count());
132 theNFA->closure(set);
133 accept_rule[s] = NFA_Node::accept_thinning(set,n);
134 dstates.insert(set,s);
135 nstates[s] = set;
140 ////////////////////////////////////////////////////////////////////////
141 // Out going transitions indexed by equiv classes.
142 ////////////////////////////////////////////////////////////////////////
143 FastBitSet ** trans =
144 (FastBitSet**)mem.m_alloc
145 (sizeof(FastBitSet *) * nfa.equiv_class_count());
146 State * delta = // out going state
147 (State *)mem.c_alloc(sizeof(State) * nfa.equiv_class_count());
148 { for (int i = nfa.equiv_class_count() - 1; i >= 0; i--)
149 trans[i] = new (mem) FastBitSet(mem, nfa.state_count());
152 //{ for (int i = 0; i < nfa.state_count(); i++)
153 // if (nfa[i]) cerr << "State " << i << " = " << *nfa[i]->delta() << '\n';
156 ////////////////////////////////////////////////////////////////////////
157 // Compute the rest of the states
158 ////////////////////////////////////////////////////////////////////////
159 State number_of_dfa_states = nfa.context_count() * 2 + 3;
160 for (State s = 1; s < number_of_dfa_states; s++)
161 { register FastBitSet * set = nstates[s];
162 if (set == 0) continue;
163 Bool is_jammed = true; // assume no out transition for now.
165 /////////////////////////////////////////////////////////////////////
166 // Compute transitions from this state
167 /////////////////////////////////////////////////////////////////////
168 { FastBitSet::Iter iter(*set);
169 while (1)
170 { register int i = iter.next();
171 if (i < 0) break;
172 NFA_Delta * node = nfa[i];
173 if (node) // delta node
174 { Symbol c = node->symbol();
175 if (c >= 0)
176 { node->add_delta(trans[equiv_classes[c]]);
178 else
179 { const FastBitSet& chars = *nfa.char_class_map(c);
180 int ch;
181 for (ch = nfa.equiv_class_count() - 1; ch >= 0; ch--)
182 { if (chars[ch]) node->add_delta(trans[ch]); }
188 /////////////////////////////////////////////////////////////////////
189 // Update delta.
190 /////////////////////////////////////////////////////////////////////
191 for (int i = nfa.equiv_class_count() - 1; i >= 0; i--)
192 { Ix d = dstates.lookup(trans[i]);
193 if (d)
194 { // use old state
195 delta[i] = dstates.value(d);
196 if (delta[i] != 0) is_jammed = false;
197 trans[i]->clear();
198 } else
199 { // create a new state
200 Rule r = NFA_Node::accept_thinning(trans[i],n);
201 dstates.insert(trans[i], number_of_dfa_states);
202 nstates[number_of_dfa_states] = trans[i];
203 accept_rule[number_of_dfa_states] = r;
204 trans[i] = new (mem) FastBitSet(mem, nfa.state_count());
205 delta[i] = number_of_dfa_states++;
206 is_jammed = false;
207 // cerr << "State " << s << " = " << *set
208 // << " n = " << n << " rule = " << r << "\n";
212 /////////////////////////////////////////////////////////////////////
213 // insert transition into table.
214 /////////////////////////////////////////////////////////////////////
215 add_state(s, delta);
217 /////////////////////////////////////////////////////////////////////
218 // The rule table actually contains information concerning whether
219 // a state has any out going states.
220 /////////////////////////////////////////////////////////////////////
221 if ((options & Backtracking) && is_jammed)
222 rule[s] = -accept_rule[s]-1;
223 else
224 rule[s] = accept_rule[s];
227 ////////////////////////////////////////////////////////////////////////
228 // Finish the table compression
229 ////////////////////////////////////////////////////////////////////////
230 finish();
232 ////////////////////////////////////////////////////////////////////////
233 // Compute statistics
234 ////////////////////////////////////////////////////////////////////////
235 bytes_used = mem.bytes_used();
236 equiv_class_count = nfa.equiv_class_count();
239 ////////////////////////////////////////////////////////////////////////////
240 // Emit C++ code
241 ////////////////////////////////////////////////////////////////////////////
242 ostream& LexerGen::gen_code(ostream& out, const char name[]) const
243 { if (ok()) {
244 Super::gen_code(out,name);
245 TablePrinter pr;
246 pr.print(out, (const char *)equiv_classes,
247 max_char + 1, sizeof(Symbol),
248 "static const unsigned char ", name, "equiv_classes", true);
249 pr.print(out, (const char *)rule,
250 max_state + 1, sizeof(Rule),
251 "static const DFATables::Rule ", name, "accept_rule");
253 return out;
256 ////////////////////////////////////////////////////////////////////////////
257 // Method to generate a report.
258 ////////////////////////////////////////////////////////////////////////////
259 ostream& LexerGen::print_report(ostream& log) const
260 { log << "Lexer statistics:"
261 "\nNumber of states = " << (max_state + 1)
262 << "\nNumber of next/check entries = " << (max_check + 1)
263 << "\nNumber of equivalence character classes = " << equiv_class_count
264 << "\nBytes used = "
265 << ((bytes_used + 512)/1024) << "(k)\n\n";
266 return log;