simple.cc - generated code example
[prop.git] / include / AD / automata / labtree.h
blob90c4b8995e111b4c7e82ba31eb97922197685aaf
1 //////////////////////////////////////////////////////////////////////////////
2 // NOTICE:
3 //
4 // ADLib, Prop and their related set of tools and documentation are in the
5 // public domain. The author(s) of this software reserve no copyrights on
6 // the source code and any code generated using the tools. You are encouraged
7 // to use ADLib and Prop to develop software, in both academic and commercial
8 // settings, and are free to incorporate any part of ADLib and Prop into
9 // your programs.
11 // Although you are under no obligation to do so, we strongly recommend that
12 // you give away all software developed using our tools.
14 // We also ask that credit be given to us when ADLib and/or Prop are used in
15 // your programs, and that this notice be preserved intact in all the source
16 // code.
18 // This software is still under development and we welcome any suggestions
19 // and help from the users.
21 // Allen Leung
22 // 1994
23 //////////////////////////////////////////////////////////////////////////////
25 #ifndef labeled_tree_for_regular_expressions_h
26 #define labeled_tree_for_regular_expressions_h
29 // The class |LabeledTree| is used to construct a labeled position
30 // tree representing a regular expression. This auxiliary data structure
31 // is used by the lexical scanner generation algorithm.
34 #include <AD/generic/generic.h> // Generic definitions
36 class LabeledTree {
37 public:
40 // Each leaf node in the labeled tree is marked by a unique ``position''
41 // marker, which is basically a positive integer. To conserve storage,
42 // this position type is limited to 2 bytes, in the range of 1 to 65535.
43 // Since the number of nodes is also limited to the number of
44 // positions, the type |NodeIndex| is also kept to 2-byte number.
47 typedef ShortWord Position; // Position number of labeled tree
48 typedef ShortWord NodeIndex; // Index to a node
51 // The following three defined constants are used to annotate a
52 // character set in compact form. In this compact form,
53 // character sets are represented as an array of |short|'s linearly.
54 // For example, the char set [a-zA-Z] is represented as the array
55 // { 'a' | START_RANGE, 'z', 'A' | START_RANGE, 'Z', END_CHAR_SET }
56 // while the set [^allen] is represented as the array
57 // { COMPLEMENT, 'a', 'l', 'l', 'e', 'n', END_CHAR_SET }.
59 typedef short Char;
61 enum LabTree_constants {
62 END_CHAR_SET = -1, // mark the end of a char set.
63 COMPLEMENT = -2, // mark the set as complemented.
64 START_RANGE = 256, // bit to mark the start of range
65 ACCEPT = 256 // Accept state character number
69 // The following are the possible types of labeled tree nodes
71 enum LabeledTag {
72 NODE_CHAR = 0, // character node
73 NODE_EPSILON = 1, // epsilon node, i.e. null string
74 NODE_ACCEPT = 2, // special node representing an accept state
75 NODE_CHAR_CLASS = 3, // character class node
76 NODE_WILD_CARD = 4, // node representing a wild card
77 NODE_STAR = 5, // Kleene star '*'
78 NODE_PLUS = 6, // non-empty Kleene star '+'
79 NODE_APPEND = 7, // appending two trees
80 NODE_OR = 8, // disjunction
81 NODE_BEGIN_LINE = 9, // begin of line matching '^'
82 MARKED_NODE = 16 // marked node for recursive traversal
86 // Each node of a labeled tree is represented by the following
87 // variant type. This type should be 8 bytes large on most
88 // architectures.
90 struct LabNode {
91 char tag; // Type of node; use one byte to save space
92 Position pos; // position number of node
93 union {
94 unsigned char c; // character of a delta node
95 int accept_rule; // rule number of accept node
96 int char_class; // index to character class
97 NodeIndex child; // pointer to child of '*' or '+' nodes
98 struct {
99 NodeIndex left, right; // pointers for append or '|' nodes
100 } branch;
101 } node;
105 // The type |Union| defined bellow is a dag structure
106 // that represents a finite union of position sets.
108 enum UnionType { UNION, SINGLETON };
109 struct Union {
110 UnionType tag; // Is it a union or a set
111 union {
112 Position * set; // pointer to a set of positions
113 struct { Union * left, * right; } u; // union of two sets
114 } u;
115 Union(Position * set) : tag(SINGLETON) { u.set = set; }
116 Union(Union * a, Union * b) : tag(UNION) { u.u.left = a; u.u.right = b; }
119 private:
121 // Information about the labeled tree
122 LabNode * tree; // nodes within a position tree
123 LabNode * limit; // next available node
124 LabNode * forest; // the forest of all rules
125 Char ** charsets; // pointers to character sets
126 Char ** next_charset; // next available character set
127 Char * chars; // contents of character set
128 Char * next_chars; // next available character set array
129 Position next_position; // next available position number
131 // Information concerning the positions
132 Char * char_at_position; // The character or charset at position 'n'
133 Bool * nullable; // Is the tree at node nullable ?
134 int * first; // Positions that comes first at node
135 int * last; // Positions that comes last at node
136 int * follow; // Positions that can following position 'n'
138 Position * position_sets; //
139 Position * next_pos_set; //
140 Position * position_sets_limit; //
142 int bad_rule; // number of the first bad rule
144 LabNode * parse_one_rule(int rule_number, const char * pattern);
145 void compute_tables();
146 Position * expand_sets(Position *);
148 public:
150 LabeledTree(const char * regular_expressions[], int count = -1);
151 ~LabeledTree();
153 LabNode * root() { return forest; }
154 LabNode * child(LabNode * t) { return tree + t->node.child; }
155 LabNode * left(LabNode * t) { return tree + t->node.branch.left; }
156 LabNode * right(LabNode * t) { return tree + t->node.branch.right; }
157 Char * charsetOf(LabNode * t) { return charsets[t->node.char_class]; }
159 int bad() const { return bad_rule; }
160 Position number_of_positions() const { return next_position; }
162 int compute_equiv_classes(short []);
163 void print(const LabNode *);
166 #endif