1 //////////////////////////////////////////////////////////////////////////////
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
23 //////////////////////////////////////////////////////////////////////////////
25 #ifndef labeled_tree_for_regular_expressions_h
26 #define labeled_tree_for_regular_expressions_h
29 // The class |LabeledTree| is used to construct a labeled position
30 // tree representing a regular expression. This auxiliary data structure
31 // is used by the lexical scanner generation algorithm.
34 #include <AD/generic/generic.h> // Generic definitions
40 // Each leaf node in the labeled tree is marked by a unique ``position''
41 // marker, which is basically a positive integer. To conserve storage,
42 // this position type is limited to 2 bytes, in the range of 1 to 65535.
43 // Since the number of nodes is also limited to the number of
44 // positions, the type |NodeIndex| is also kept to 2-byte number.
47 typedef ShortWord Position
; // Position number of labeled tree
48 typedef ShortWord NodeIndex
; // Index to a node
51 // The following three defined constants are used to annotate a
52 // character set in compact form. In this compact form,
53 // character sets are represented as an array of |short|'s linearly.
54 // For example, the char set [a-zA-Z] is represented as the array
55 // { 'a' | START_RANGE, 'z', 'A' | START_RANGE, 'Z', END_CHAR_SET }
56 // while the set [^allen] is represented as the array
57 // { COMPLEMENT, 'a', 'l', 'l', 'e', 'n', END_CHAR_SET }.
61 enum LabTree_constants
{
62 END_CHAR_SET
= -1, // mark the end of a char set.
63 COMPLEMENT
= -2, // mark the set as complemented.
64 START_RANGE
= 256, // bit to mark the start of range
65 ACCEPT
= 256 // Accept state character number
69 // The following are the possible types of labeled tree nodes
72 NODE_CHAR
= 0, // character node
73 NODE_EPSILON
= 1, // epsilon node, i.e. null string
74 NODE_ACCEPT
= 2, // special node representing an accept state
75 NODE_CHAR_CLASS
= 3, // character class node
76 NODE_WILD_CARD
= 4, // node representing a wild card
77 NODE_STAR
= 5, // Kleene star '*'
78 NODE_PLUS
= 6, // non-empty Kleene star '+'
79 NODE_APPEND
= 7, // appending two trees
80 NODE_OR
= 8, // disjunction
81 NODE_BEGIN_LINE
= 9, // begin of line matching '^'
82 MARKED_NODE
= 16 // marked node for recursive traversal
86 // Each node of a labeled tree is represented by the following
87 // variant type. This type should be 8 bytes large on most
91 char tag
; // Type of node; use one byte to save space
92 Position pos
; // position number of node
94 unsigned char c
; // character of a delta node
95 int accept_rule
; // rule number of accept node
96 int char_class
; // index to character class
97 NodeIndex child
; // pointer to child of '*' or '+' nodes
99 NodeIndex left
, right
; // pointers for append or '|' nodes
105 // The type |Union| defined bellow is a dag structure
106 // that represents a finite union of position sets.
108 enum UnionType
{ UNION
, SINGLETON
};
110 UnionType tag
; // Is it a union or a set
112 Position
* set
; // pointer to a set of positions
113 struct { Union
* left
, * right
; } u
; // union of two sets
115 Union(Position
* set
) : tag(SINGLETON
) { u
.set
= set
; }
116 Union(Union
* a
, Union
* b
) : tag(UNION
) { u
.u
.left
= a
; u
.u
.right
= b
; }
121 // Information about the labeled tree
122 LabNode
* tree
; // nodes within a position tree
123 LabNode
* limit
; // next available node
124 LabNode
* forest
; // the forest of all rules
125 Char
** charsets
; // pointers to character sets
126 Char
** next_charset
; // next available character set
127 Char
* chars
; // contents of character set
128 Char
* next_chars
; // next available character set array
129 Position next_position
; // next available position number
131 // Information concerning the positions
132 Char
* char_at_position
; // The character or charset at position 'n'
133 Bool
* nullable
; // Is the tree at node nullable ?
134 int * first
; // Positions that comes first at node
135 int * last
; // Positions that comes last at node
136 int * follow
; // Positions that can following position 'n'
138 Position
* position_sets
; //
139 Position
* next_pos_set
; //
140 Position
* position_sets_limit
; //
142 int bad_rule
; // number of the first bad rule
144 LabNode
* parse_one_rule(int rule_number
, const char * pattern
);
145 void compute_tables();
146 Position
* expand_sets(Position
*);
150 LabeledTree(const char * regular_expressions
[], int count
= -1);
153 LabNode
* root() { return forest
; }
154 LabNode
* child(LabNode
* t
) { return tree
+ t
->node
.child
; }
155 LabNode
* left(LabNode
* t
) { return tree
+ t
->node
.branch
.left
; }
156 LabNode
* right(LabNode
* t
) { return tree
+ t
->node
.branch
.right
; }
157 Char
* charsetOf(LabNode
* t
) { return charsets
[t
->node
.char_class
]; }
159 int bad() const { return bad_rule
; }
160 Position
number_of_positions() const { return next_position
; }
162 int compute_equiv_classes(short []);
163 void print(const LabNode
*);