Update mojo sdk to rev 1dc8a9a5db73d3718d99917fadf31f5fb2ebad4f
[chromium-blink-merge.git] / third_party / cython / src / Cython / Plex / Lexicons.py
blob88074666b014ced0daa542c7c5accceb6c119e61
1 #=======================================================================
3 # Python Lexical Analyser
5 # Lexical Analyser Specification
7 #=======================================================================
9 import types
11 import Actions
12 import DFA
13 import Errors
14 import Machines
15 import Regexps
# Bit values for the debug_flags argument of the Lexicon constructor.
# They may be OR-ed together; output goes to the constructor's |debug| stream.
DUMP_NFA = 1  # same bit as the `debug_flags & 1` test guarding the NFA dump
DUMP_DFA = 2  # same bit as the `debug_flags & 2` test guarding the DFA dump
class State(object):
    """
    Groups a list of token definitions under a named scanner state, for
    use as one item of a Plex.Lexicon specification.

    Constructor:

       State(name, token_specifications)
    """

    # Class-level defaults; __init__ overwrites both on every instance.
    name = tokens = None

    def __init__(self, name, tokens):
        self.name, self.tokens = name, tokens
class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

       1) A token definition, which is a tuple:

             (pattern, action)

          The |pattern| is a regular expression built using the
          constructors defined in the Plex module.

          The |action| is the action to be performed when this pattern
          is recognised (see below).

       2) A state definition:

             State(name, tokens)

          where |name| is a character string naming the state,
          and |tokens| is a list of token definitions as
          above. The meaning and usage of states is described
          below.

    Actions
    -------

    The |action| in a token specification may be one of three things:

       1) A function, which is called as follows:

             function(scanner, text)

          where |scanner| is the relevant Scanner instance, and |text|
          is the matched text. If the function returns anything
          other than None, that value is returned as the value of the
          token. If it returns None, scanning continues as if the IGNORE
          action were specified (see below).

       2) One of the following special actions:

          IGNORE means that the recognised characters will be treated as
                 white space and ignored. Scanning will continue until
                 the next non-ignored token is recognised before returning.

          TEXT   causes the scanned text itself to be returned as the
                 value of the token.

       3) Any other value, which is returned as the value of the token.

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

       1) Using Begin(state_name) as the action of a token.

       2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
    """

    machine = None  # Machine
    tables = None  # StateTableMachine

    def __init__(self, specifications, debug=None, debug_flags=7, timings=None):
        """
        Build an NFA from |specifications|, convert it to a DFA and keep
        the DFA in self.machine.

        specifications -- list of (pattern, action) tuples and/or State
                          instances.
        debug          -- optional stream (needs write()) that receives
                          the machine dumps selected by |debug_flags|.
        debug_flags    -- bit mask: bit 1 (DUMP_NFA) dumps the NFA, bit 2
                          (DUMP_DFA) dumps the DFA; when both are set the
                          debug stream is also handed to the NFA-to-DFA
                          conversion.
        timings        -- optional stream (needs write()) that receives
                          the construction timings.

        Raises Errors.InvalidScanner if |specifications| is not a list,
        and Errors.InvalidToken for an item that is neither a tuple nor
        a State.
        """
        # isinstance (rather than an exact type comparison) also accepts
        # list subclasses and does not rely on the Python-2-only
        # types.ListType name.
        if not isinstance(specifications, list):
            raise Errors.InvalidScanner("Scanner definition is not a list")
        if timings:
            from Timing import time
            total_time = 0.0
            time1 = time()
        nfa = Machines.Machine()
        default_initial_state = nfa.new_initial_state('')
        # Tokens are numbered in specification order, across all states;
        # the number is used for error messages and for match priority.
        token_number = 1
        for spec in specifications:
            if isinstance(spec, State):
                user_initial_state = nfa.new_initial_state(spec.name)
                for token in spec.tokens:
                    self.add_token_to_machine(
                        nfa, user_initial_state, token, token_number)
                    token_number += 1
            elif isinstance(spec, tuple):
                self.add_token_to_machine(
                    nfa, default_initial_state, spec, token_number)
                token_number += 1
            else:
                raise Errors.InvalidToken(
                    token_number,
                    "Expected a token definition (tuple) or State instance")
        if timings:
            time2 = time()
            total_time = total_time + (time2 - time1)
            time3 = time()
        if debug and (debug_flags & 1):
            debug.write("\n============= NFA ===========\n")
            nfa.dump(debug)
        # The conversion only gets the debug stream when both dump bits
        # are set; otherwise it receives a false value.
        dfa = DFA.nfa_to_dfa(nfa, debug=(debug_flags & 3) == 3 and debug)
        if timings:
            time4 = time()
            total_time = total_time + (time4 - time3)
        if debug and (debug_flags & 2):
            debug.write("\n============= DFA ===========\n")
            dfa.dump(debug)
        if timings:
            timings.write("Constructing NFA : %5.2f\n" % (time2 - time1))
            timings.write("Converting to DFA: %5.2f\n" % (time4 - time3))
            timings.write("TOTAL            : %5.2f\n" % total_time)
        self.machine = dfa

    def add_token_to_machine(self, machine, initial_state, token_spec, token_number):
        """
        Add the token definition |token_spec| to |machine|, starting from
        |initial_state|.

        The action part of the definition is used as is when it is an
        Actions.Action, wrapped in Actions.Call when it is callable, and
        wrapped in Actions.Return otherwise.

        Any Errors.PlexError raised on the way is re-raised as the same
        class with the token number prefixed to its message.
        """
        try:
            (re, action_spec) = self.parse_token_definition(token_spec)
            # Disabled this -- matching empty strings can be useful
            #if re.nullable:
            #    raise Errors.InvalidToken(
            #        token_number, "Pattern can match 0 input symbols")
            if isinstance(action_spec, Actions.Action):
                action = action_spec
            elif callable(action_spec):
                action = Actions.Call(action_spec)
            else:
                action = Actions.Return(action_spec)
            final_state = machine.new_state()
            re.build_machine(machine, initial_state, final_state,
                             match_bol=1, nocase=0)
            # Negated so that tokens defined earlier get the higher
            # priority value.
            final_state.set_action(action, priority=-token_number)
        except Errors.PlexError as e:
            raise e.__class__("Token number %d: %s" % (token_number, e))

    def parse_token_definition(self, token_spec):
        """
        Validate |token_spec| and return it as a (pattern, action) pair.

        Raises Errors.InvalidToken if |token_spec| is not a 2-tuple or
        its first item is not a Regexps.RE instance.
        """
        # NOTE(review): InvalidToken is constructed with
        # (token_number, message) in __init__ but with a single argument
        # here -- confirm against the signature declared in Errors.
        if not isinstance(token_spec, tuple):
            raise Errors.InvalidToken("Token definition is not a tuple")
        if len(token_spec) != 2:
            raise Errors.InvalidToken("Wrong number of items in token definition")
        pattern, action = token_spec
        if not isinstance(pattern, Regexps.RE):
            raise Errors.InvalidToken("Pattern is not an RE instance")
        return (pattern, action)

    def get_initial_state(self, name):
        """Return the initial state called |name| of the built machine."""
        return self.machine.get_initial_state(name)