1 #=======================================================================
3 # Python Lexical Analyser
6 # Scanning an input stream
8 #=======================================================================
11 cython
.declare(BOL
=object, EOL
=object, EOF
=object, NOT_FOUND
=object)
14 from Regexps
import BOL
, EOL
, EOF
18 class Scanner(object):
20 A Scanner is used to read tokens from a stream of characters
21 using the token set specified by a Plex.Lexicon.
25 Scanner(lexicon, stream, name = '')
27 See the docstring of the __init__ method for details.
31 See the docstrings of the individual methods for more
34 read() --> (value, text)
35 Reads the next lexical token from the stream.
37 position() --> (name, line, col)
38 Returns the position of the last token read using the
42 Causes scanner to change state.
44 produce(value [, text])
45 Causes return of a token value to the caller of the
50 # lexicon = None # Lexicon
51 # stream = None # file-like object
54 # buf_start_pos = 0 # position in input of start of buffer
55 # next_pos = 0 # position in input of next char to read
56 # cur_pos = 0 # position in input of current char
57 # cur_line = 1 # line number of current char
58 # cur_line_start = 0 # position in input of start of current line
59 # start_pos = 0 # position in input of start of token
60 # start_line = 0 # line number of start of token
61 # start_col = 0 # position in line of start of token
62 # text = None # text of last token read
63 # initial_state = None # Node
64 # state_name = '' # Name of initial state
65 # queue = None # list of tokens to be returned
68 def __init__(self
, lexicon
, stream
, name
= '', initial_pos
= None):
70 Scanner(lexicon, stream, name = '')
72 |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
75 |stream| can be a file object or anything which implements a
76 compatible read() method.
78 |name| is optional, and may be the name of the file being
79 scanned or any other identifying string.
84 self
.buf_start_pos
= 0
92 self
.state_name
= None
94 self
.lexicon
= lexicon
98 self
.initial_state
= None
102 self
.cur_line_start
= 0
105 if initial_pos
is not None:
106 self
.cur_line
, self
.cur_line_start
= initial_pos
[1], -initial_pos
[2]
110 Read the next lexical token from the stream and return a
111 tuple (value, text), where |value| is the value associated with
112 the token as specified by the Lexicon, and |text| is the actual
113 string read from the stream. Returns (None, '') on end of file.
117 self
.text
, action
= self
.scan_a_token()
122 value
= action
.perform(self
, self
.text
)
123 if value
is not None:
129 def scan_a_token(self
):
131 Read the next input sequence recognised by the machine
132 and return (text, action). Returns ('', None) on end of
135 self
.start_pos
= self
.cur_pos
136 self
.start_line
= self
.cur_line
137 self
.start_col
= self
.cur_pos
- self
.cur_line_start
138 action
= self
.run_machine_inlined()
139 if action
is not None:
141 print("Scanner: read: Performing %s %d:%d" % (
142 action
, self
.start_pos
, self
.cur_pos
))
143 text
= self
.buffer[self
.start_pos
- self
.buf_start_pos
:
144 self
.cur_pos
- self
.buf_start_pos
]
145 return (text
, action
)
147 if self
.cur_pos
== self
.start_pos
:
148 if self
.cur_char
is EOL
:
150 if self
.cur_char
is None or self
.cur_char
is EOF
:
152 raise Errors
.UnrecognizedInput(self
, self
.state_name
)
154 def run_machine_inlined(self
):
156 Inlined version of run_machine for speed.
158 state
= self
.initial_state
159 cur_pos
= self
.cur_pos
160 cur_line
= self
.cur_line
161 cur_line_start
= self
.cur_line_start
162 cur_char
= self
.cur_char
163 input_state
= self
.input_state
164 next_pos
= self
.next_pos
166 buf_start_pos
= self
.buf_start_pos
167 buf_len
= len(buffer)
168 b_action
, b_cur_pos
, b_cur_line
, b_cur_line_start
, b_cur_char
, b_input_state
, b_next_pos
= \
169 None, 0, 0, 0, u
'', 0, 0
173 print("State %d, %d/%d:%s -->" % ( #TRACE#
174 state
['number'], input_state
, cur_pos
, repr(cur_char
))) #TRACE#
175 # Begin inlined self.save_for_backup()
176 #action = state.action #@slow
177 action
= state
['action'] #@fast
178 if action
is not None:
179 b_action
, b_cur_pos
, b_cur_line
, b_cur_line_start
, b_cur_char
, b_input_state
, b_next_pos
= \
180 action
, cur_pos
, cur_line
, cur_line_start
, cur_char
, input_state
, next_pos
181 # End inlined self.save_for_backup()
183 #new_state = state.new_state(c) #@slow
184 new_state
= state
.get(c
, NOT_FOUND
) #@fast
185 if new_state
is NOT_FOUND
: #@fast
186 new_state
= c
and state
.get('else') #@fast
189 print("State %d" % new_state
['number']) #TRACE#
191 # Begin inlined: self.next_char()
194 # Begin inlined: c = self.read_char()
195 buf_index
= next_pos
- buf_start_pos
196 if buf_index
< buf_len
:
197 c
= buffer[buf_index
]
198 next_pos
= next_pos
+ 1
200 discard
= self
.start_pos
- buf_start_pos
201 data
= self
.stream
.read(0x1000)
202 buffer = self
.buffer[discard
:] + data
204 buf_start_pos
= buf_start_pos
+ discard
205 self
.buf_start_pos
= buf_start_pos
206 buf_len
= len(buffer)
207 buf_index
= buf_index
- discard
209 c
= buffer[buf_index
]
210 next_pos
= next_pos
+ 1
213 # End inlined: c = self.read_char()
222 elif input_state
== 2:
225 elif input_state
== 3:
226 cur_line
= cur_line
+ 1
227 cur_line_start
= cur_pos
= next_pos
230 elif input_state
== 4:
233 else: # input_state = 5
235 # End inlined self.next_char()
236 else: # not new_state
238 print("blocked") #TRACE#
239 # Begin inlined: action = self.back_up()
240 if b_action
is not None:
241 (action
, cur_pos
, cur_line
, cur_line_start
,
242 cur_char
, input_state
, next_pos
) = \
243 (b_action
, b_cur_pos
, b_cur_line
, b_cur_line_start
,
244 b_cur_char
, b_input_state
, b_next_pos
)
248 # End inlined: action = self.back_up()
249 self
.cur_pos
= cur_pos
250 self
.cur_line
= cur_line
251 self
.cur_line_start
= cur_line_start
252 self
.cur_char
= cur_char
253 self
.input_state
= input_state
254 self
.next_pos
= next_pos
256 if action
is not None: #TRACE#
257 print("Doing %s" % action
) #TRACE#
261 input_state
= self
.input_state
263 print("Scanner: next: %s [%d] %d" % (" "*20, input_state
, self
.cur_pos
))
265 self
.cur_pos
= self
.next_pos
275 elif input_state
== 2:
276 self
.cur_char
= u
'\n'
278 elif input_state
== 3:
279 self
.cur_line
= self
.cur_line
+ 1
280 self
.cur_line_start
= self
.cur_pos
= self
.next_pos
283 elif input_state
== 4:
286 else: # input_state = 5
289 print("--> [%d] %d %s" % (input_state
, self
.cur_pos
, repr(self
.cur_char
)))
293 Return a tuple (name, line, col) representing the location of
294 the last token read using the read() method. |name| is the
295 name that was provided to the Scanner constructor; |line|
296 is the line number in the stream (1-based); |col| is the
297 position within the line of the first character of the token
300 return (self
.name
, self
.start_line
, self
.start_col
)
def get_position(self):
    """
    Return whatever position() returns, for use in error reporting.

    This thin wrapper exists so that position() can be reached from
    plain Python code.
    """
    location = self.position()
    return location
def begin(self, state_name):
    """
    Switch the scanner to the lexicon state called |state_name|.

    The state's entry node is looked up through the lexicon's
    get_initial_state() and stored as the node the machine starts
    from on later scans; the name itself is kept in self.state_name.
    """
    # Resolve the node first: if the lookup raises, neither attribute changes.
    start_node = self.lexicon.get_initial_state(state_name)
    self.initial_state = start_node
    self.state_name = state_name
313 def produce(self
, value
, text
= None):
315 Called from an action procedure, causes |value| to be returned
316 as the token value from read(). If |text| is supplied, it is
317 returned in place of the scanned text.
319 produce() can be called more than once during a single call to an action
320 procedure, in which case the tokens are queued up and returned one
321 at a time by subsequent calls to read(), until the queue is empty,
322 whereupon scanning resumes.
326 self
.queue
.append((value
, text
))
330 Override this method if you want something to be done at