initial
[prop.git] / lib-src / automata / lexergen2.cc
blob4689407b7cdffdf6cf91b93fbff467951d3dfbb5
1 //////////////////////////////////////////////////////////////////////////////
2 // NOTICE:
3 //
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
9 // your programs.
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
16 // code.
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
21 // Allen Leung
22 // 1994
23 //////////////////////////////////////////////////////////////////////////////
25 #include <assert.h>
26 #include <string.h>
27 #include <ctype.h>
28 #include <new.h>
29 #include <AD/automata/lexergen.h> // Lexical scanner generator
30 #include <AD/automata/gentable.h> // Table emitter
31 #include <AD/strings/charesc.h> // Escape sequence parsing
32 #include <AD/memory/mempool.h> // Memory pools
33 #include <AD/hash/dchash.h> // Direct chaining hash table
34 #include <AD/contain/vararray.h> // Generic arrays
35 #include <AD/contain/bitset.h> // bit set
37 ////////////////////////////////////////////////////////////////////////////
38 // Make hidden types visible
39 ////////////////////////////////////////////////////////////////////////////
40 typedef LexerGen::Symbol Symbol;
41 typedef LexerGen::State State;
42 typedef LexerGen::Rule Rule;
43 typedef LexerGen::Options Options;
45 ////////////////////////////////////////////////////////////////////////////
46 // Constructor and destructor
47 ////////////////////////////////////////////////////////////////////////////
48 LexerGen::LexerGen() : rule(0), bad_rule(-1), equiv_classes(0) {}
49 LexerGen::~LexerGen() { delete [] rule; delete [] equiv_classes; }
51 ////////////////////////////////////////////////////////////////////////////
52 // Some flags and constants
53 ////////////////////////////////////////////////////////////////////////////
54 #define nil 0 // nil pointer
55 #define RANGE_BIT 0x80000000 // marks a range in character class
56 #define COMPLEMENT 0x40000000 // marks a set complement in the same
57 #define END_CHAR_CLASS 0x20000000 // marks the end of a character class
59 ////////////////////////////////////////////////////////////////////////////
60 // To create a set of tables
61 ////////////////////////////////////////////////////////////////////////////
62 void LexerGen::create_tables(int size, int states, Symbol min, Symbol max)
63 { Super::create_tables(size,states,min,max);
64 rule = new Rule [states];
67 ////////////////////////////////////////////////////////////////////////////
68 // To grow the number of states
69 ////////////////////////////////////////////////////////////////////////////
70 void LexerGen::grow_states(int increment)
71 { Rule * new_rule = new Rule [ number_of_states + increment ];
72 memcpy(new_rule,rule,sizeof(Rule) * number_of_states);
73 delete [] rule;
74 rule = new_rule;
75 Super::grow_states(increment);
78 ////////////////////////////////////////////////////////////////////////////
79 // A node in the nfa.
80 ////////////////////////////////////////////////////////////////////////////
81 struct Nfa {
82 enum { Accept, Delta, Epsilon } tag;
83 BitSet * epsilon_closure;
84 union {
85 struct Dummy2 {
86 State state; // nfa state number
87 Symbol c; // symbol to branch on
88 Nfa * out; // out transition
89 BitSet * delta_set; //
90 } n;
91 Nfa * epsilon[2]; // out transitions
94 void * operator new (int, MemPool& mem)
95 { Nfa * nfa = (Nfa*)mem[sizeof(Nfa)];
96 nfa->epsilon_closure = nil; return nfa;
99 void * operator new (int, MemPool& mem, Nfa * x, Nfa * y)
100 { if (x == nil) return y;
101 Nfa * nfa = (Nfa*)mem[sizeof(Nfa)];
102 nfa->tag = Epsilon; nfa->epsilon_closure = nil;
103 nfa->epsilon[0] = x; nfa->epsilon[1] = y; return nfa;
107 ////////////////////////////////////////////////////////////////////////////
108 // Parse a regular expression and construct a sub-nfa
109 ////////////////////////////////////////////////////////////////////////////
110 static Nfa * construct_nfa
111 ( const char * regexp, Rule rule, MemPool& mem,
112 int& number_of_nfa_states, Bool partitions[], Bool singletons[],
113 Symbol ** char_classes, Symbol * character_classes[],
114 int& char_class_num, const char * const * contexts, int number_of_contexts,
115 Nfa * context_nfas[], Bool foldcase
117 { Nfa * stack[LexerGen::Max_star_height];
118 register const char * p; // current pointer to regular expression
119 register Symbol c; // current character
120 register Nfa * root; // the root of the current sub-nfa
121 register Nfa * current; // the current point of the sub-nfa
122 register Nfa * next; // the next availble node of the nfa
123 Nfa ** sp = stack;
124 char ch; int i;
125 Bool in_context[LexerGen::Max_contexts];
126 Bool anchored = false; // pattern must start at beginning of line??
127 Bool in_any_context = false;
130 // Determine the context, if any
132 if (regexp[0] == '<') {
133 char name[256];
134 char * q;
135 int i;
136 memset(in_context,0,sizeof(in_context));
137 if (contexts == nil) goto syntax_error;
138 for (p = regexp+1, q = name;; ) {
139 switch (c = *p++) {
140 case '\0': goto syntax_error;
141 case '>': case ',':
142 q = '\0';
143 for (i = 0; contexts[i]; i++)
144 if (strcmp(contexts[i], name) == 0) goto found;
145 goto syntax_error;
146 found: q = name;
147 in_context[i] = true;
148 in_any_context = true;
149 if (c == '>') goto finish;
150 break;
151 default: *q++ = c; break;
154 finish: ;
155 } else p = regexp;
158 // Now scan the pattern and parse
160 root = current = next = new(mem) Nfa;
161 for (;;) {
162 Nfa * next_node = new(mem) Nfa; // Get a new node
163 switch (c = (unsigned char)*p++) {
164 case '(': // grouping
165 case '|': // disjunction
166 { sp[0] = root; sp[1] = next; sp[2] = c == '|' ? (Nfa*)1 : nil;
167 sp += 3; root = current = next = next_node;
168 } break;
169 case '\0': case ')': // end of pattern or end of grouping
170 { while (sp > stack && sp[-1]) {
171 Nfa * old_root = sp[-3];
172 Nfa * old_next = sp[-2];
173 sp -= 3;
174 Nfa * n = new(mem) Nfa;
175 old_next->tag = Nfa::Epsilon;
176 old_next->epsilon[0] = next;
177 old_next->epsilon[1] = nil;
178 n->tag = Nfa::Epsilon;
179 n->epsilon[0] = old_root;
180 n->epsilon[1] = root;
181 root = n;
183 if (c == ')') {
184 if (sp == stack) goto syntax_error;
185 Nfa * old_next = sp[-2];
186 Nfa * old_root = sp[-3];
187 sp -= 3;
188 old_next->tag = Nfa::Epsilon;
189 old_next->epsilon[0] = root;
190 old_next->epsilon[1] = nil;
191 root = current = old_root;
192 } else {
193 if (sp > stack) goto syntax_error;
194 goto done;
196 } break;
197 case '^': if (p-1 > regexp) goto build_delta;
198 anchored = true; break;
199 case '$': c = '\n'; goto build_delta;
200 case '*': if (root == next) goto syntax_error;
201 { *next_node = *current;
202 current->tag = Nfa::Epsilon;
203 current->epsilon[0] = next_node;
204 current->epsilon[1] = next;
205 next->tag = Nfa::Epsilon;
206 next->epsilon[0] = next_node;
207 next->epsilon[1] = new(mem) Nfa;
208 next = next->epsilon[1];
209 } break;
210 case '+': if (root == next) goto syntax_error;
211 { next->tag = Nfa::Epsilon;
212 next->epsilon[0] = current;
213 next->epsilon[1] = next_node;
214 next = next_node;
215 } break;
216 case '?': if (root == next) goto syntax_error;
217 { *next_node = *current;
218 current->tag = Nfa::Epsilon;
219 current->epsilon[0] = next_node;
220 current->epsilon[1] = next;
221 } break;
222 case '.': // wild card
223 { Symbol * char_class = *char_classes;
224 *char_class++ = '\n' | COMPLEMENT;
225 *char_class++ = END_CHAR_CLASS;
226 singletons['\n'] = true;
227 character_classes[char_class_num++] = *char_classes;
228 *char_classes = char_class;
229 c = -char_class_num;
230 goto build_delta;
231 } break;
232 case '[': // process a character class
233 { Symbol start_range = -1;
234 const char * start = p;
235 Bool complement = false;
236 Symbol last_char;
237 Symbol * char_class = *char_classes;
238 for (last_char = -1; (c = (unsigned char)*p++) != ']'; ) {
239 switch (c) {
240 case '\0': goto syntax_error;
241 case '-': if (last_char < 0) goto syntax_error;
242 start_range = last_char;
243 break;
244 case '^': if (p-1 == start) { complement = true; break; }
245 case '\\': p = parse_char(p-1,ch); c = (unsigned char)ch;
246 default:
247 if (foldcase) c = toupper(c);
248 *char_class++ = c;
249 if (start_range >= 0) {
250 if (start_range >= c) goto syntax_error;
251 char_class[-2] |= RANGE_BIT;
252 start_range = -1;
253 last_char = -1;
254 partitions[c+1] = true;
255 } else {
256 partitions[c] = true;
257 last_char = c;
258 if (*p != '-') singletons[c] = true;
260 break;
263 *char_class++ = END_CHAR_CLASS;
264 if (complement) (*char_classes)[0] |= COMPLEMENT;
265 character_classes[char_class_num++] = *char_classes;
266 *char_classes = char_class;
267 c = -char_class_num;
268 goto build_delta;
270 case '\\': p = parse_char(p-1,ch); c = ch;
271 build_delta: default: // process normal characters
272 { next->tag = Nfa::Delta;
273 next->n.state = number_of_nfa_states++;
274 if (foldcase && c >= 0) c = toupper(c);
275 next->n.c = c;
276 next->n.out = next_node;
277 current = next; next = next->n.out;
278 if (c >= 0) singletons[c] = true;
279 } break;
282 done:
283 next->tag = Nfa::Accept;
284 next->n.state = rule;
285 for (i = 0; i < number_of_contexts; i++) {
286 if (in_context[i]) {
287 if (! anchored)
288 context_nfas[2*i+2] = new(mem,context_nfas[2*i+2],root) Nfa;
289 context_nfas[2*i+3] = new(mem,context_nfas[2*i+3],root) Nfa;
292 if (! in_any_context) {
293 if (! anchored)
294 context_nfas[0] = new(mem,context_nfas[0],root) Nfa;
295 context_nfas[1] = new(mem,context_nfas[1],root) Nfa;
297 return root;
299 syntax_error:
300 return nil;
303 ////////////////////////////////////////////////////////////////////////////
304 // Compute the epsilon closure for one state
305 ////////////////////////////////////////////////////////////////////////////
306 static void compute_epsilon_closure(Nfa * nfa, BitSet * closure)
307 { if (nfa == nil) return;
308 switch (nfa->tag) {
309 case Nfa::Accept:
310 case Nfa::Delta: closure->add(nfa->n.state); return;
311 case Nfa::Epsilon:
312 compute_epsilon_closure(nfa->epsilon[0],closure);
313 compute_epsilon_closure(nfa->epsilon[1],closure);
314 return;
315 default:
316 assert("bug: compute_epsilon_closure()");
320 ////////////////////////////////////////////////////////////////////////////
321 // Find the accept rule of the state, if any
322 ////////////////////////////////////////////////////////////////////////////
323 inline Rule accept_thinning(BitSet * set, int n)
324 { for (int i = 0; i < n; i++)
325 if ((*set)[i]) {
326 for (int j = i+1; j < n; j++) set->remove(j);
327 return i+1;
329 return 0;
332 ////////////////////////////////////////////////////////////////////////////
333 // Compute epsilon closure for each state of the nfa.
334 ////////////////////////////////////////////////////////////////////////////
335 static BitSet * epsilon_closure
336 ( Nfa * nfa, MemPool& mem, int number_of_nfa_states, Nfa * nfa_states[], int n)
337 { BitSet * set;
338 if (nfa == nil) return nil;
339 if (nfa->epsilon_closure) return nfa->epsilon_closure;
340 switch (nfa->tag) {
341 case Nfa::Accept:
342 nfa_states[nfa->n.state] = nfa;
343 set = nfa->epsilon_closure = new(mem,number_of_nfa_states) BitSet;
344 set->add(nfa->n.state);
345 return set;
346 case Nfa::Delta:
347 nfa_states[nfa->n.state] = nfa;
348 set = nfa->epsilon_closure = new(mem,number_of_nfa_states) BitSet;
349 set->add(nfa->n.state);
350 nfa->n.delta_set = new(mem,number_of_nfa_states) BitSet;
351 compute_epsilon_closure(nfa->n.out,nfa->n.delta_set);
352 epsilon_closure(nfa->n.out,mem,number_of_nfa_states,nfa_states,n);
353 return set;
354 case Nfa::Epsilon:
355 set = nfa->epsilon_closure = new(mem,number_of_nfa_states) BitSet;
356 compute_epsilon_closure(nfa->epsilon[0],set);
357 compute_epsilon_closure(nfa->epsilon[1],set);
358 accept_thinning(set, n);
359 epsilon_closure(nfa->epsilon[0],mem,number_of_nfa_states,nfa_states,n);
360 epsilon_closure(nfa->epsilon[1],mem,number_of_nfa_states,nfa_states,n);
361 return set;
362 default:
363 assert("bug: epsilon_closure()");
364 return nil;
368 ////////////////////////////////////////////////////////////////////////////
369 // Compute transition from a set of nfa states
370 ////////////////////////////////////////////////////////////////////////////
371 inline void compute_transitions
372 (Nfa * nfa, Nfa * nfa_states[], BitSet * transitions[],
373 BitSet * char_class_map[], int number_of_equiv_classes,
374 Symbol equiv_classes[])
375 { register int s;
376 register BitSet& set = *nfa->epsilon_closure;
377 foreach_bit (s, set) {
378 Nfa * node = nfa_states[s];
379 if (node->tag == Nfa::Delta) {
380 Symbol c = node->n.c;
381 if (c >= 0) {
382 transitions[equiv_classes[c]]->Union(*node->n.delta_set);
383 } else {
384 BitSet& chars = *char_class_map[-c-1];
385 for (int ch = 0; ch < number_of_equiv_classes; ch++)
386 if (chars[ch]) transitions[ch]->Union(*node->n.delta_set);
392 ////////////////////////////////////////////////////////////////////////////
393 // The main entry point of the lexical scanner compiler
394 ////////////////////////////////////////////////////////////////////////////
395 void LexerGen::compile
396 ( int n, const char * const * regexp, const char * const * contexts,
397 int max_ch, int options
401 // Here are the tables and maps that we'll need
403 MemPool mem(4096); // Memory pool with page size of 4K
404 DCHashTable<BitSet *,State> dstates(257); // Map sets of nfa states to dfa states
405 VarArray<BitSet *> nstates; // Inverse of above
406 VarArray<Rule> accept_rule; // Accept rule of state, if any
407 Nfa ** context_nfas; // Sub-nfas of each context
410 // Local variables
412 int i; // loop index
413 State processed; // current state to be processed
414 State number_of_dfa_states = 1;
415 int number_of_nfa_states = n;
416 int number_of_equiv_classes = 0;
417 int number_of_character_classes = 0;
418 int character_class_size = 0;
419 int number_of_contexts = 0;
422 // Count the number of contexts and allocate space
424 if (contexts) while (contexts[number_of_contexts]) number_of_contexts++;
425 context_nfas =
426 (Nfa**)mem.c_alloc(sizeof(Nfa*) * (number_of_contexts + 1) * 2);
429 // Estimate the number of character classes and their combined size
431 for (i = 0; i < n; i++) {
432 character_class_size += strlen(regexp[i]) + 1;
433 for (register const char * p = regexp[i]; *p; p++)
434 if (*p == '[' || *p == '.') number_of_character_classes++;
438 // Allocate storage for the character classes
440 Bool partitions[257];
441 Bool singletons[257];
442 Symbol * char_classes =
443 (Symbol*)mem[sizeof(Symbol) * character_class_size ];
444 Symbol ** character_classes =
445 (Symbol**)mem[sizeof(Symbol*) * number_of_character_classes];
446 number_of_character_classes = 0;
449 // Set the partition map to all equivalent initially
451 memset(partitions,0,sizeof(partitions));
452 memset(singletons,0,sizeof(singletons));
456 // Parse the regular expressions and create a non-deterministic
457 // automaton.
459 Nfa * the_nfa = nil;
460 for (bad_rule = -1, i = 0; i < n; i++) {
461 Nfa * sub_nfa =
462 construct_nfa(regexp[i],i,mem,number_of_nfa_states,
463 partitions,singletons,
464 &char_classes,character_classes,
465 number_of_character_classes,contexts,
466 number_of_contexts, context_nfas,
467 options & CaseInsensitive
469 if (sub_nfa == nil) { bad_rule = i; return; }
470 the_nfa = new(mem,the_nfa,sub_nfa) Nfa;
474 // Compute the equivalence class map
476 int partition_number = -1;
477 max_char = max_ch;
478 equiv_classes = new Symbol [max_char + 1];
479 for (i = 0; i <= max_char; i++) {
480 if ((options & CaseInsensitive) && i >= 'a' && i <= 'z')
481 { equiv_classes[i] = equiv_classes[i - 'a' + 'A']; continue; }
482 if (partitions[i] || partition_number < 0)
483 partition_number = number_of_equiv_classes++;
484 if (singletons[i])
485 { equiv_classes[i] = number_of_equiv_classes++; continue; }
486 equiv_classes[i] = partition_number;
490 // Compute the characteristic map for each character class
492 BitSet ** char_class_map =
493 (BitSet**)mem[sizeof(BitSet*) * number_of_character_classes];
494 for (i = 0; i < number_of_character_classes; i++) {
495 BitSet * map = char_class_map[i] =
496 new(mem,number_of_equiv_classes) BitSet;
497 register Symbol * p;
498 for (p = character_classes[i]; *p != END_CHAR_CLASS; p++) {
499 if (*p & RANGE_BIT) {
500 for (int a = (*p & 0xff), b = (*++p & 0xff); a <= b; a++)
501 map->add(equiv_classes[a]);
502 } else map->add(equiv_classes[*p & 0xff]);
504 if (character_classes[i][0] & COMPLEMENT) map->complement();
508 // Compute epsilon closure for each state
510 Nfa ** nfa_states =
511 (Nfa**) mem.c_alloc(sizeof(Nfa*) * number_of_nfa_states);
512 epsilon_closure(the_nfa, mem, number_of_nfa_states, nfa_states, n);
515 // Start emitting table
517 create_tables(number_of_nfa_states * number_of_equiv_classes,
518 number_of_nfa_states, 0, number_of_equiv_classes-1);
519 start();
522 // Some auxiliary tables
524 BitSet * transitions [256]; // Outgoing state indexed by equiv class
525 State delta [256]; // DFA state number of outgoing state
527 for (i = number_of_equiv_classes - 1; i >= 0; i--)
528 transitions[i] = new(mem,number_of_nfa_states) BitSet;
531 // Create an error state (state number 0). This state always jams.
533 BitSet * err = new (mem,number_of_nfa_states) BitSet;
534 dstates.insert(err,(State)0);
535 nstates[0] = err;
536 rule[0] = (options & Backtracking) ? -1 : 0;
539 // Create two start states for each context, one for the context
540 // and one for the context anchored to the beginning of the line.
542 for (State s = 1; s < number_of_contexts * 2 + 3; s++) {
543 if (context_nfas[s-1] == nil) continue;
544 BitSet * set;
545 set = new(mem,number_of_nfa_states) BitSet;
546 compute_epsilon_closure(context_nfas[s-1],set);
547 accept_rule[s] = accept_thinning(set,n);
548 dstates.insert(set,s);
549 nstates[s] = set;
552 number_of_dfa_states = number_of_contexts * 2 + 3;
555 // Compute new states
557 for (processed = 1; processed < number_of_dfa_states; processed++) {
558 BitSet * set = nstates[processed];
559 Bool is_jammed = true; // no out transitions??
560 if (set == nil) continue;
561 int j, s;
562 for (i = number_of_equiv_classes - 1; i >= 0; i--)
563 transitions[i]->clear();
564 foreach_bit (s, *set)
565 compute_transitions(nfa_states[s],nfa_states,transitions,
566 char_class_map,number_of_equiv_classes,
567 equiv_classes);
568 for (i = number_of_equiv_classes - 1; i >= 0; i--) {
569 Rule accept = accept_thinning(transitions[i],n);
570 Ix d = dstates.lookup(transitions[i]);
571 if (d == nil) {
572 dstates.insert(transitions[i],number_of_dfa_states);
573 nstates[number_of_dfa_states] = transitions[i];
574 accept_rule[number_of_dfa_states] = accept;
575 delta[i] = number_of_dfa_states++;
576 transitions[i] = new(mem,number_of_nfa_states) BitSet;
577 is_jammed = false;
578 } else {
579 delta[i] = dstates.value(d);
580 if (delta[i] != 0) is_jammed = false;
583 add_state(processed,delta);
585 ///////////////////////////////////////////////////////////////////
586 // The rule table actually contains more information than
587 // just the rule number. It encodes the following information:
588 // (a) the accept rule number $r$ if the state is an accept state
589 // (b) whether the state is jammed, i.e. having no out going state
590 ///////////////////////////////////////////////////////////////////
591 if (is_jammed && (options & Backtracking))
592 rule[processed] = -accept_rule[processed]-1;
593 else
594 rule[processed] = accept_rule[processed];
598 // Finish emitting table
600 finish();
603 ////////////////////////////////////////////////////////////////////////////
604 // Emit C++ code
605 ////////////////////////////////////////////////////////////////////////////
606 ostream& LexerGen::gen_code(ostream& out, const char name[]) const
607 { if (ok()) {
608 Super::gen_code(out,name);
609 TablePrinter pr;
610 pr.print(out, (const char *)equiv_classes,
611 max_char + 1, sizeof(Symbol),
612 "static const unsigned char ", name, "equiv_classes", true);
613 pr.print(out, (const char *)rule,
614 max_state + 1, sizeof(Rule),
615 "static const DFATables::Rule ", name, "accept_rule");
617 return out;