#=======================================================================
#
#   Python Lexical Analyser
#
#   Lexical Analyser Specification
#
#=======================================================================

# debug_flags for Lexicon constructor
class State(object):
    """
    This class is used as part of a Plex.Lexicon specification to
    introduce a user-defined state.

    Constructor:

       State(name, token_specifications)
    """

    # name   -- character string naming the state (read by Lexicon.__init__)
    # tokens -- list of (pattern, action) token definitions for this state
    name = None
    tokens = None

    def __init__(self, name, tokens):
        # Plain value container: Lexicon.__init__ reads .name and .tokens
        # when it encounters a State instance in the specification list.
        self.name = name
        self.tokens = tokens
class Lexicon(object):
    """
    Lexicon(specification) builds a lexical analyser from the given
    |specification|. The specification consists of a list of
    specification items. Each specification item may be either:

       1) A token definition, which is a tuple:

             (pattern, action)

          The |pattern| is a regular expression built using the
          constructors defined in the Plex module.

          The |action| is the action to be performed when this pattern
          is recognised (see below).

       2) A state definition:

             State(name, tokens)

          where |name| is a character string naming the state,
          and |tokens| is a list of token definitions as
          above. The meaning and usage of states is described
          below.

    Actions
    -------

    The |action| in a token specification may be one of three things:

       1) A function, which is called as follows:

             function(scanner, text)

          where |scanner| is the relevant Scanner instance, and |text|
          is the matched text. If the function returns anything
          other than None, that value is returned as the value of the
          token. If it returns None, scanning continues as if the IGNORE
          action were specified (see below).

       2) One of the following special actions:

          IGNORE means that the recognised characters will be treated as
                 white space and ignored. Scanning will continue until
                 the next non-ignored token is recognised before returning.

          TEXT   causes the scanned text itself to be returned as the
                 value of the token.

       3) Any other value, which is returned as the value of the token.

    States
    ------

    At any given time, the scanner is in one of a number of states.
    Associated with each state is a set of possible tokens. When scanning,
    only tokens associated with the current state are recognised.

    There is a default state, whose name is the empty string. Token
    definitions which are not inside any State definition belong to
    the default state.

    The initial state of the scanner is the default state. The state can
    be changed in one of two ways:

       1) Using Begin(state_name) as the action of a token.

       2) Calling the begin(state_name) method of the Scanner.

    To change back to the default state, use '' as the state name.
    """

    machine = None  # Machine -- the DFA built by __init__
    tables = None   # StateTableMachine
113 def __init__(self
, specifications
, debug
= None, debug_flags
= 7, timings
= None):
114 if type(specifications
) != types
.ListType
:
115 raise Errors
.InvalidScanner("Scanner definition is not a list")
117 from Timing
import time
120 nfa
= Machines
.Machine()
121 default_initial_state
= nfa
.new_initial_state('')
123 for spec
in specifications
:
124 if isinstance(spec
, State
):
125 user_initial_state
= nfa
.new_initial_state(spec
.name
)
126 for token
in spec
.tokens
:
127 self
.add_token_to_machine(
128 nfa
, user_initial_state
, token
, token_number
)
129 token_number
= token_number
+ 1
130 elif type(spec
) == types
.TupleType
:
131 self
.add_token_to_machine(
132 nfa
, default_initial_state
, spec
, token_number
)
133 token_number
= token_number
+ 1
135 raise Errors
.InvalidToken(
137 "Expected a token definition (tuple) or State instance")
140 total_time
= total_time
+ (time2
- time1
)
142 if debug
and (debug_flags
& 1):
143 debug
.write("\n============= NFA ===========\n")
145 dfa
= DFA
.nfa_to_dfa(nfa
, debug
= (debug_flags
& 3) == 3 and debug
)
148 total_time
= total_time
+ (time4
- time3
)
149 if debug
and (debug_flags
& 2):
150 debug
.write("\n============= DFA ===========\n")
153 timings
.write("Constructing NFA : %5.2f\n" % (time2
- time1
))
154 timings
.write("Converting to DFA: %5.2f\n" % (time4
- time3
))
155 timings
.write("TOTAL : %5.2f\n" % total_time
)
158 def add_token_to_machine(self
, machine
, initial_state
, token_spec
, token_number
):
160 (re
, action_spec
) = self
.parse_token_definition(token_spec
)
161 # Disabled this -- matching empty strings can be useful
163 # raise Errors.InvalidToken(
164 # token_number, "Pattern can match 0 input symbols")
165 if isinstance(action_spec
, Actions
.Action
):
170 except AttributeError:
171 action
= Actions
.Return(action_spec
)
173 action
= Actions
.Call(action_spec
)
174 final_state
= machine
.new_state()
175 re
.build_machine(machine
, initial_state
, final_state
,
176 match_bol
= 1, nocase
= 0)
177 final_state
.set_action(action
, priority
= -token_number
)
178 except Errors
.PlexError
, e
:
179 raise e
.__class
__("Token number %d: %s" % (token_number
, e
))
181 def parse_token_definition(self
, token_spec
):
182 if type(token_spec
) != types
.TupleType
:
183 raise Errors
.InvalidToken("Token definition is not a tuple")
184 if len(token_spec
) != 2:
185 raise Errors
.InvalidToken("Wrong number of items in token definition")
186 pattern
, action
= token_spec
187 if not isinstance(pattern
, Regexps
.RE
):
188 raise Errors
.InvalidToken("Pattern is not an RE instance")
189 return (pattern
, action
)
191 def get_initial_state(self
, name
):
192 return self
.machine
.get_initial_state(name
)