1 //////////////////////////////////////////////////////////////////////////////
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
23 //////////////////////////////////////////////////////////////////////////////
29 #include <AD/automata/lexergen.h> // Lexical scanner generator
30 #include <AD/automata/gentable.h> // Table emitter
31 #include <AD/strings/charesc.h> // Escape sequence parsing
32 #include <AD/memory/mempool.h> // Memory pools
33 #include <AD/hash/dchash.h> // Direct chaining hash table
34 #include <AD/contain/vararray.h> // Generic arrays
35 #include <AD/contain/bitset.h> // bit set
37 ////////////////////////////////////////////////////////////////////////////
38 // Make hidden types visible
39 ////////////////////////////////////////////////////////////////////////////
40 typedef LexerGen::Symbol Symbol
;
41 typedef LexerGen::State State
;
42 typedef LexerGen::Rule Rule
;
43 typedef LexerGen::Options Options
;
45 ////////////////////////////////////////////////////////////////////////////
46 // Constructor and destructor
47 ////////////////////////////////////////////////////////////////////////////
48 LexerGen::LexerGen() : rule(0), bad_rule(-1), equiv_classes(0) {}
49 LexerGen::~LexerGen() { delete [] rule
; delete [] equiv_classes
; }
51 ////////////////////////////////////////////////////////////////////////////
52 // Some flags and constants
53 ////////////////////////////////////////////////////////////////////////////
54 #define nil 0 // nil pointer
55 #define RANGE_BIT 0x80000000 // marks a range in character class
56 #define COMPLEMENT 0x40000000 // marks a set complement in the same
57 #define END_CHAR_CLASS 0x20000000 // marks the end of a character class
59 ////////////////////////////////////////////////////////////////////////////
60 // To create a set of tables
61 ////////////////////////////////////////////////////////////////////////////
62 void LexerGen::create_tables(int size
, int states
, Symbol min
, Symbol max
)
63 { Super::create_tables(size
,states
,min
,max
);
64 rule
= new Rule
[states
];
67 ////////////////////////////////////////////////////////////////////////////
68 // To grow the number of states
69 ////////////////////////////////////////////////////////////////////////////
70 void LexerGen::grow_states(int increment
)
71 { Rule
* new_rule
= new Rule
[ number_of_states
+ increment
];
72 memcpy(new_rule
,rule
,sizeof(Rule
) * number_of_states
);
75 Super::grow_states(increment
);
78 ////////////////////////////////////////////////////////////////////////////
80 ////////////////////////////////////////////////////////////////////////////
82 enum { Accept
, Delta
, Epsilon
} tag
;
83 BitSet
* epsilon_closure
;
86 State state
; // nfa state number
87 Symbol c
; // symbol to branch on
88 Nfa
* out
; // out transition
89 BitSet
* delta_set
; //
91 Nfa
* epsilon
[2]; // out transitions
94 void * operator new (int, MemPool
& mem
)
95 { Nfa
* nfa
= (Nfa
*)mem
[sizeof(Nfa
)];
96 nfa
->epsilon_closure
= nil
; return nfa
;
99 void * operator new (int, MemPool
& mem
, Nfa
* x
, Nfa
* y
)
100 { if (x
== nil
) return y
;
101 Nfa
* nfa
= (Nfa
*)mem
[sizeof(Nfa
)];
102 nfa
->tag
= Epsilon
; nfa
->epsilon_closure
= nil
;
103 nfa
->epsilon
[0] = x
; nfa
->epsilon
[1] = y
; return nfa
;
107 ////////////////////////////////////////////////////////////////////////////
108 // Parse a regular expression and construct a sub-nfa
109 ////////////////////////////////////////////////////////////////////////////
110 static Nfa
* construct_nfa
111 ( const char * regexp
, Rule rule
, MemPool
& mem
,
112 int& number_of_nfa_states
, Bool partitions
[], Bool singletons
[],
113 Symbol
** char_classes
, Symbol
* character_classes
[],
114 int& char_class_num
, const char * const * contexts
, int number_of_contexts
,
115 Nfa
* context_nfas
[], Bool foldcase
117 { Nfa
* stack
[LexerGen::Max_star_height
];
118 register const char * p
; // current pointer to regular expression
119 register Symbol c
; // current character
120 register Nfa
* root
; // the root of the current sub-nfa
121 register Nfa
* current
; // the current point of the sub-nfa
122 register Nfa
* next
; // the next availble node of the nfa
125 Bool in_context
[LexerGen::Max_contexts
];
126 Bool anchored
= false; // pattern must start at beginning of line??
127 Bool in_any_context
= false;
130 // Determine the context, if any
132 if (regexp
[0] == '<') {
136 memset(in_context
,0,sizeof(in_context
));
137 if (contexts
== nil
) goto syntax_error
;
138 for (p
= regexp
+1, q
= name
;; ) {
140 case '\0': goto syntax_error
;
143 for (i
= 0; contexts
[i
]; i
++)
144 if (strcmp(contexts
[i
], name
) == 0) goto found
;
147 in_context
[i
] = true;
148 in_any_context
= true;
149 if (c
== '>') goto finish
;
151 default: *q
++ = c
; break;
158 // Now scan the pattern and parse
160 root
= current
= next
= new(mem
) Nfa
;
162 Nfa
* next_node
= new(mem
) Nfa
; // Get a new node
163 switch (c
= (unsigned char)*p
++) {
164 case '(': // grouping
165 case '|': // disjunction
166 { sp
[0] = root
; sp
[1] = next
; sp
[2] = c
== '|' ? (Nfa
*)1 : nil
;
167 sp
+= 3; root
= current
= next
= next_node
;
169 case '\0': case ')': // end of pattern or end of grouping
170 { while (sp
> stack
&& sp
[-1]) {
171 Nfa
* old_root
= sp
[-3];
172 Nfa
* old_next
= sp
[-2];
174 Nfa
* n
= new(mem
) Nfa
;
175 old_next
->tag
= Nfa::Epsilon
;
176 old_next
->epsilon
[0] = next
;
177 old_next
->epsilon
[1] = nil
;
178 n
->tag
= Nfa::Epsilon
;
179 n
->epsilon
[0] = old_root
;
180 n
->epsilon
[1] = root
;
184 if (sp
== stack
) goto syntax_error
;
185 Nfa
* old_next
= sp
[-2];
186 Nfa
* old_root
= sp
[-3];
188 old_next
->tag
= Nfa::Epsilon
;
189 old_next
->epsilon
[0] = root
;
190 old_next
->epsilon
[1] = nil
;
191 root
= current
= old_root
;
193 if (sp
> stack
) goto syntax_error
;
197 case '^': if (p
-1 > regexp
) goto build_delta
;
198 anchored
= true; break;
199 case '$': c
= '\n'; goto build_delta
;
200 case '*': if (root
== next
) goto syntax_error
;
201 { *next_node
= *current
;
202 current
->tag
= Nfa::Epsilon
;
203 current
->epsilon
[0] = next_node
;
204 current
->epsilon
[1] = next
;
205 next
->tag
= Nfa::Epsilon
;
206 next
->epsilon
[0] = next_node
;
207 next
->epsilon
[1] = new(mem
) Nfa
;
208 next
= next
->epsilon
[1];
210 case '+': if (root
== next
) goto syntax_error
;
211 { next
->tag
= Nfa::Epsilon
;
212 next
->epsilon
[0] = current
;
213 next
->epsilon
[1] = next_node
;
216 case '?': if (root
== next
) goto syntax_error
;
217 { *next_node
= *current
;
218 current
->tag
= Nfa::Epsilon
;
219 current
->epsilon
[0] = next_node
;
220 current
->epsilon
[1] = next
;
222 case '.': // wild card
223 { Symbol
* char_class
= *char_classes
;
224 *char_class
++ = '\n' | COMPLEMENT
;
225 *char_class
++ = END_CHAR_CLASS
;
226 singletons
['\n'] = true;
227 character_classes
[char_class_num
++] = *char_classes
;
228 *char_classes
= char_class
;
232 case '[': // process a character class
233 { Symbol start_range
= -1;
234 const char * start
= p
;
235 Bool complement
= false;
237 Symbol
* char_class
= *char_classes
;
238 for (last_char
= -1; (c
= (unsigned char)*p
++) != ']'; ) {
240 case '\0': goto syntax_error
;
241 case '-': if (last_char
< 0) goto syntax_error
;
242 start_range
= last_char
;
244 case '^': if (p
-1 == start
) { complement
= true; break; }
245 case '\\': p
= parse_char(p
-1,ch
); c
= (unsigned char)ch
;
247 if (foldcase
) c
= toupper(c
);
249 if (start_range
>= 0) {
250 if (start_range
>= c
) goto syntax_error
;
251 char_class
[-2] |= RANGE_BIT
;
254 partitions
[c
+1] = true;
256 partitions
[c
] = true;
258 if (*p
!= '-') singletons
[c
] = true;
263 *char_class
++ = END_CHAR_CLASS
;
264 if (complement
) (*char_classes
)[0] |= COMPLEMENT
;
265 character_classes
[char_class_num
++] = *char_classes
;
266 *char_classes
= char_class
;
270 case '\\': p
= parse_char(p
-1,ch
); c
= ch
;
271 build_delta
: default: // process normal characters
272 { next
->tag
= Nfa::Delta
;
273 next
->n
.state
= number_of_nfa_states
++;
274 if (foldcase
&& c
>= 0) c
= toupper(c
);
276 next
->n
.out
= next_node
;
277 current
= next
; next
= next
->n
.out
;
278 if (c
>= 0) singletons
[c
] = true;
283 next
->tag
= Nfa::Accept
;
284 next
->n
.state
= rule
;
285 for (i
= 0; i
< number_of_contexts
; i
++) {
288 context_nfas
[2*i
+2] = new(mem
,context_nfas
[2*i
+2],root
) Nfa
;
289 context_nfas
[2*i
+3] = new(mem
,context_nfas
[2*i
+3],root
) Nfa
;
292 if (! in_any_context
) {
294 context_nfas
[0] = new(mem
,context_nfas
[0],root
) Nfa
;
295 context_nfas
[1] = new(mem
,context_nfas
[1],root
) Nfa
;
303 ////////////////////////////////////////////////////////////////////////////
304 // Compute the epsilon closure for one state
305 ////////////////////////////////////////////////////////////////////////////
306 static void compute_epsilon_closure(Nfa
* nfa
, BitSet
* closure
)
307 { if (nfa
== nil
) return;
310 case Nfa::Delta
: closure
->add(nfa
->n
.state
); return;
312 compute_epsilon_closure(nfa
->epsilon
[0],closure
);
313 compute_epsilon_closure(nfa
->epsilon
[1],closure
);
316 assert("bug: compute_epsilon_closure()");
320 ////////////////////////////////////////////////////////////////////////////
321 // Find the accept rule of the state, if any
322 ////////////////////////////////////////////////////////////////////////////
323 inline Rule
accept_thinning(BitSet
* set
, int n
)
324 { for (int i
= 0; i
< n
; i
++)
326 for (int j
= i
+1; j
< n
; j
++) set
->remove(j
);
332 ////////////////////////////////////////////////////////////////////////////
333 // Compute epsilon closure for each state of the nfa.
334 ////////////////////////////////////////////////////////////////////////////
335 static BitSet
* epsilon_closure
336 ( Nfa
* nfa
, MemPool
& mem
, int number_of_nfa_states
, Nfa
* nfa_states
[], int n
)
338 if (nfa
== nil
) return nil
;
339 if (nfa
->epsilon_closure
) return nfa
->epsilon_closure
;
342 nfa_states
[nfa
->n
.state
] = nfa
;
343 set
= nfa
->epsilon_closure
= new(mem
,number_of_nfa_states
) BitSet
;
344 set
->add(nfa
->n
.state
);
347 nfa_states
[nfa
->n
.state
] = nfa
;
348 set
= nfa
->epsilon_closure
= new(mem
,number_of_nfa_states
) BitSet
;
349 set
->add(nfa
->n
.state
);
350 nfa
->n
.delta_set
= new(mem
,number_of_nfa_states
) BitSet
;
351 compute_epsilon_closure(nfa
->n
.out
,nfa
->n
.delta_set
);
352 epsilon_closure(nfa
->n
.out
,mem
,number_of_nfa_states
,nfa_states
,n
);
355 set
= nfa
->epsilon_closure
= new(mem
,number_of_nfa_states
) BitSet
;
356 compute_epsilon_closure(nfa
->epsilon
[0],set
);
357 compute_epsilon_closure(nfa
->epsilon
[1],set
);
358 accept_thinning(set
, n
);
359 epsilon_closure(nfa
->epsilon
[0],mem
,number_of_nfa_states
,nfa_states
,n
);
360 epsilon_closure(nfa
->epsilon
[1],mem
,number_of_nfa_states
,nfa_states
,n
);
363 assert("bug: epsilon_closure()");
368 ////////////////////////////////////////////////////////////////////////////
369 // Compute transition from a set of nfa states
370 ////////////////////////////////////////////////////////////////////////////
371 inline void compute_transitions
372 (Nfa
* nfa
, Nfa
* nfa_states
[], BitSet
* transitions
[],
373 BitSet
* char_class_map
[], int number_of_equiv_classes
,
374 Symbol equiv_classes
[])
376 register BitSet
& set
= *nfa
->epsilon_closure
;
377 foreach_bit (s
, set
) {
378 Nfa
* node
= nfa_states
[s
];
379 if (node
->tag
== Nfa::Delta
) {
380 Symbol c
= node
->n
.c
;
382 transitions
[equiv_classes
[c
]]->Union(*node
->n
.delta_set
);
384 BitSet
& chars
= *char_class_map
[-c
-1];
385 for (int ch
= 0; ch
< number_of_equiv_classes
; ch
++)
386 if (chars
[ch
]) transitions
[ch
]->Union(*node
->n
.delta_set
);
392 ////////////////////////////////////////////////////////////////////////////
393 // The main entry point of the lexical scanner compiler
394 ////////////////////////////////////////////////////////////////////////////
395 void LexerGen::compile
396 ( int n
, const char * const * regexp
, const char * const * contexts
,
397 int max_ch
, int options
401 // Here are the tables and maps that we'll need
403 MemPool
mem(4096); // Memory pool with page size of 4K
404 DCHashTable
<BitSet
*,State
> dstates(257); // Map sets of nfa states to dfa states
405 VarArray
<BitSet
*> nstates
; // Inverse of above
406 VarArray
<Rule
> accept_rule
; // Accept rule of state, if any
407 Nfa
** context_nfas
; // Sub-nfas of each context
413 State processed
; // current state to be processed
414 State number_of_dfa_states
= 1;
415 int number_of_nfa_states
= n
;
416 int number_of_equiv_classes
= 0;
417 int number_of_character_classes
= 0;
418 int character_class_size
= 0;
419 int number_of_contexts
= 0;
422 // Count the number of contexts and allocate space
424 if (contexts
) while (contexts
[number_of_contexts
]) number_of_contexts
++;
426 (Nfa
**)mem
.c_alloc(sizeof(Nfa
*) * (number_of_contexts
+ 1) * 2);
429 // Estimate the number of character classes and their combined size
431 for (i
= 0; i
< n
; i
++) {
432 character_class_size
+= strlen(regexp
[i
]) + 1;
433 for (register const char * p
= regexp
[i
]; *p
; p
++)
434 if (*p
== '[' || *p
== '.') number_of_character_classes
++;
438 // Allocate storage for the character classes
440 Bool partitions
[257];
441 Bool singletons
[257];
442 Symbol
* char_classes
=
443 (Symbol
*)mem
[sizeof(Symbol
) * character_class_size
];
444 Symbol
** character_classes
=
445 (Symbol
**)mem
[sizeof(Symbol
*) * number_of_character_classes
];
446 number_of_character_classes
= 0;
449 // Set the partition map to all equivalent initially
451 memset(partitions
,0,sizeof(partitions
));
452 memset(singletons
,0,sizeof(singletons
));
456 // Parse the regular expressions and create a non-deterministic
460 for (bad_rule
= -1, i
= 0; i
< n
; i
++) {
462 construct_nfa(regexp
[i
],i
,mem
,number_of_nfa_states
,
463 partitions
,singletons
,
464 &char_classes
,character_classes
,
465 number_of_character_classes
,contexts
,
466 number_of_contexts
, context_nfas
,
467 options
& CaseInsensitive
469 if (sub_nfa
== nil
) { bad_rule
= i
; return; }
470 the_nfa
= new(mem
,the_nfa
,sub_nfa
) Nfa
;
474 // Compute the equivalence class map
476 int partition_number
= -1;
478 equiv_classes
= new Symbol
[max_char
+ 1];
479 for (i
= 0; i
<= max_char
; i
++) {
480 if ((options
& CaseInsensitive
) && i
>= 'a' && i
<= 'z')
481 { equiv_classes
[i
] = equiv_classes
[i
- 'a' + 'A']; continue; }
482 if (partitions
[i
] || partition_number
< 0)
483 partition_number
= number_of_equiv_classes
++;
485 { equiv_classes
[i
] = number_of_equiv_classes
++; continue; }
486 equiv_classes
[i
] = partition_number
;
490 // Compute the characteristic map for each character class
492 BitSet
** char_class_map
=
493 (BitSet
**)mem
[sizeof(BitSet
*) * number_of_character_classes
];
494 for (i
= 0; i
< number_of_character_classes
; i
++) {
495 BitSet
* map
= char_class_map
[i
] =
496 new(mem
,number_of_equiv_classes
) BitSet
;
498 for (p
= character_classes
[i
]; *p
!= END_CHAR_CLASS
; p
++) {
499 if (*p
& RANGE_BIT
) {
500 for (int a
= (*p
& 0xff), b
= (*++p
& 0xff); a
<= b
; a
++)
501 map
->add(equiv_classes
[a
]);
502 } else map
->add(equiv_classes
[*p
& 0xff]);
504 if (character_classes
[i
][0] & COMPLEMENT
) map
->complement();
508 // Compute epsilon closure for each state
511 (Nfa
**) mem
.c_alloc(sizeof(Nfa
*) * number_of_nfa_states
);
512 epsilon_closure(the_nfa
, mem
, number_of_nfa_states
, nfa_states
, n
);
515 // Start emitting table
517 create_tables(number_of_nfa_states
* number_of_equiv_classes
,
518 number_of_nfa_states
, 0, number_of_equiv_classes
-1);
522 // Some auxiliary tables
524 BitSet
* transitions
[256]; // Outgoing state indexed by equiv class
525 State delta
[256]; // DFA state number of outgoing state
527 for (i
= number_of_equiv_classes
- 1; i
>= 0; i
--)
528 transitions
[i
] = new(mem
,number_of_nfa_states
) BitSet
;
531 // Create an error state (state number 0). This state always jams.
533 BitSet
* err
= new (mem
,number_of_nfa_states
) BitSet
;
534 dstates
.insert(err
,(State
)0);
536 rule
[0] = (options
& Backtracking
) ? -1 : 0;
539 // Create two start states for each context, one for the context
540 // and one for the context anchored to the beginning of the line.
542 for (State s
= 1; s
< number_of_contexts
* 2 + 3; s
++) {
543 if (context_nfas
[s
-1] == nil
) continue;
545 set
= new(mem
,number_of_nfa_states
) BitSet
;
546 compute_epsilon_closure(context_nfas
[s
-1],set
);
547 accept_rule
[s
] = accept_thinning(set
,n
);
548 dstates
.insert(set
,s
);
552 number_of_dfa_states
= number_of_contexts
* 2 + 3;
555 // Compute new states
557 for (processed
= 1; processed
< number_of_dfa_states
; processed
++) {
558 BitSet
* set
= nstates
[processed
];
559 Bool is_jammed
= true; // no out transitions??
560 if (set
== nil
) continue;
562 for (i
= number_of_equiv_classes
- 1; i
>= 0; i
--)
563 transitions
[i
]->clear();
564 foreach_bit (s
, *set
)
565 compute_transitions(nfa_states
[s
],nfa_states
,transitions
,
566 char_class_map
,number_of_equiv_classes
,
568 for (i
= number_of_equiv_classes
- 1; i
>= 0; i
--) {
569 Rule accept
= accept_thinning(transitions
[i
],n
);
570 Ix d
= dstates
.lookup(transitions
[i
]);
572 dstates
.insert(transitions
[i
],number_of_dfa_states
);
573 nstates
[number_of_dfa_states
] = transitions
[i
];
574 accept_rule
[number_of_dfa_states
] = accept
;
575 delta
[i
] = number_of_dfa_states
++;
576 transitions
[i
] = new(mem
,number_of_nfa_states
) BitSet
;
579 delta
[i
] = dstates
.value(d
);
580 if (delta
[i
] != 0) is_jammed
= false;
583 add_state(processed
,delta
);
585 ///////////////////////////////////////////////////////////////////
586 // The rule table actually contains more information than
587 // just the rule number. It encodes the following information:
588 // (a) the accept rule number $r$ if the state is an accept state
589 // (b) whether the state is jammed, i.e. having no out going state
590 ///////////////////////////////////////////////////////////////////
591 if (is_jammed
&& (options
& Backtracking
))
592 rule
[processed
] = -accept_rule
[processed
]-1;
594 rule
[processed
] = accept_rule
[processed
];
598 // Finish emitting table
603 ////////////////////////////////////////////////////////////////////////////
605 ////////////////////////////////////////////////////////////////////////////
606 ostream
& LexerGen::gen_code(ostream
& out
, const char name
[]) const
608 Super::gen_code(out
,name
);
610 pr
.print(out
, (const char *)equiv_classes
,
611 max_char
+ 1, sizeof(Symbol
),
612 "static const unsigned char ", name
, "equiv_classes", true);
613 pr
.print(out
, (const char *)rule
,
614 max_state
+ 1, sizeof(Rule
),
615 "static const DFATables::Rule ", name
, "accept_rule");