1 #=======================================================================
3 # Python Lexical Analyser
6 # Scanning an input stream
8 #=======================================================================
11 from Regexps
import BOL
, EOL
, EOF
15 A Scanner is used to read tokens from a stream of characters
16 using the token set specified by a Plex.Lexicon.
20 Scanner(lexicon, stream, name = '')
22 See the docstring of the __init__ method for details.
26 See the docstrings of the individual methods for more
29 read() --> (value, text)
30 Reads the next lexical token from the stream.
32 position() --> (name, line, col)
33 Returns the position of the last token read using the
37 Causes scanner to change state.
39 produce(value [, text])
40 Causes return of a token value to the caller of the
# Default values for the scanner's state fields. The same names are
# assigned on `self` elsewhere in this file, so these are presumably the
# class-level defaults of Scanner (the class header is not visible here).
lexicon = None          # Lexicon
stream = None           # file-like object
buf_start_pos = 0       # position in input of start of buffer
next_pos = 0            # position in input of next char to read
cur_pos = 0             # position in input of current char
cur_line = 1            # line number of current char
cur_line_start = 0      # position in input of start of current line
start_pos = 0           # position in input of start of token
start_line = 0          # line number of start of token
start_col = 0           # position in line of start of token
text = None             # text of last token read
initial_state = None    # Node
state_name = ''         # Name of initial state
queue = None            # list of tokens to be returned
63 def __init__(self
, lexicon
, stream
, name
= ''):
65 Scanner(lexicon, stream, name = '')
67 |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
70 |stream| can be a file object or anything which implements a
71 compatible read() method.
73 |name| is optional, and may be the name of the file being
74 scanned or any other identifying string.
76 self
.lexicon
= lexicon
80 self
.initial_state
= None
84 self
.cur_line_start
= 0
90 Read the next lexical token from the stream and return a
91 tuple (value, text), where |value| is the value associated with
92 the token as specified by the Lexicon, and |text| is the actual
93 string read from the stream. Returns (None, '') on end of file.
97 self
.text
, action
= self
.scan_a_token()
102 value
= action
.perform(self
, self
.text
)
103 if value
is not None:
109 def scan_a_token(self
):
111 Read the next input sequence recognised by the machine
112 and return (text, action). Returns ('', None) on end of
115 self
.start_pos
= self
.cur_pos
116 self
.start_line
= self
.cur_line
117 self
.start_col
= self
.cur_pos
- self
.cur_line_start
119 # action = self.run_machine()
121 # action = self.run_machine_inlined()
122 action
= self
.run_machine_inlined()
125 print "Scanner: read: Performing", action
, "%d:%d" % (
126 self
.start_pos
, self
.cur_pos
)
127 base
= self
.buf_start_pos
128 text
= self
.buffer[self
.start_pos
- base
: self
.cur_pos
- base
]
129 return (text
, action
)
131 if self
.cur_pos
== self
.start_pos
:
132 if self
.cur_char
== EOL
:
134 if not self
.cur_char
or self
.cur_char
== EOF
:
136 raise Errors
.UnrecognizedInput(self
, self
.state_name
)
def run_machine(self):
    """
    Run the machine until no more transitions are possible.

    Resets self.state to self.initial_state and clears
    self.backup_state, then calls self.transition() repeatedly until
    it returns a false value.  Returns the result of self.back_up().

    NOTE(review): scan_a_token() in this file calls
    run_machine_inlined() instead, and transition()/back_up() only
    appear as commented-out code here, so this method is presumably
    kept for reference -- confirm before relying on it.
    """
    self.state = self.initial_state
    self.backup_state = None
    # transition() performs the step itself; the loop body has
    # nothing left to do.
    while self.transition():
        pass
    return self.back_up()
148 def run_machine_inlined(self
):
150 Inlined version of run_machine for speed.
152 state
= self
.initial_state
153 cur_pos
= self
.cur_pos
154 cur_line
= self
.cur_line
155 cur_line_start
= self
.cur_line_start
156 cur_char
= self
.cur_char
157 input_state
= self
.input_state
158 next_pos
= self
.next_pos
160 buf_start_pos
= self
.buf_start_pos
161 buf_len
= len(buffer)
166 print "State %d, %d/%d:%s -->" % ( #TRACE#
167 state
['number'], input_state
, cur_pos
, repr(cur_char
)), #TRACE#
168 # Begin inlined self.save_for_backup()
169 #action = state.action #@slow
170 action
= state
['action'] #@fast
173 action
, cur_pos
, cur_line
, cur_line_start
, cur_char
, input_state
, next_pos
)
174 # End inlined self.save_for_backup()
176 #new_state = state.new_state(c) #@slow
177 new_state
= state
.get(c
, -1) #@fast
178 if new_state
== -1: #@fast
179 new_state
= c
and state
.get('else') #@fast
182 print "State %d" % new_state
['number'] #TRACE#
184 # Begin inlined: self.next_char()
187 # Begin inlined: c = self.read_char()
188 buf_index
= next_pos
- buf_start_pos
189 if buf_index
< buf_len
:
190 c
= buffer[buf_index
]
191 next_pos
= next_pos
+ 1
193 discard
= self
.start_pos
- buf_start_pos
194 data
= self
.stream
.read(0x1000)
195 buffer = self
.buffer[discard
:] + data
197 buf_start_pos
= buf_start_pos
+ discard
198 self
.buf_start_pos
= buf_start_pos
199 buf_len
= len(buffer)
200 buf_index
= buf_index
- discard
202 c
= buffer[buf_index
]
203 next_pos
= next_pos
+ 1
206 # End inlined: c = self.read_char()
215 elif input_state
== 2:
218 elif input_state
== 3:
219 cur_line
= cur_line
+ 1
220 cur_line_start
= cur_pos
= next_pos
223 elif input_state
== 4:
226 else: # input_state = 5
228 # End inlined self.next_char()
229 else: # not new_state
231 print "blocked" #TRACE#
232 # Begin inlined: action = self.back_up()
234 (action
, cur_pos
, cur_line
, cur_line_start
,
235 cur_char
, input_state
, next_pos
) = backup_state
239 # End inlined: action = self.back_up()
240 self
.cur_pos
= cur_pos
241 self
.cur_line
= cur_line
242 self
.cur_line_start
= cur_line_start
243 self
.cur_char
= cur_char
244 self
.input_state
= input_state
245 self
.next_pos
= next_pos
248 print "Doing", action
#TRACE#
251 # def transition(self):
252 # self.save_for_backup()
254 # new_state = self.state.new_state(c)
257 # print "Scanner: read: State %d: %s --> State %d" % (
258 # self.state.number, repr(c), new_state.number)
259 # self.state = new_state
264 # print "Scanner: read: State %d: %s --> blocked" % (
265 # self.state.number, repr(c))
268 # def save_for_backup(self):
269 # action = self.state.get_action()
272 # print "Scanner: read: Saving backup point at", self.cur_pos
273 # self.backup_state = (
274 # action, self.cur_pos, self.cur_line, self.cur_line_start,
275 # self.cur_char, self.input_state, self.next_pos)
278 # backup_state = self.backup_state
280 # (action, self.cur_pos, self.cur_line, self.cur_line_start,
281 # self.cur_char, self.input_state, self.next_pos) = backup_state
283 # print "Scanner: read: Backing up to", self.cur_pos
289 input_state
= self
.input_state
291 print "Scanner: next:", " "*20, "[%d] %d" % (input_state
, self
.cur_pos
),
293 self
.cur_pos
= self
.next_pos
303 elif input_state
== 2:
306 elif input_state
== 3:
307 self
.cur_line
= self
.cur_line
+ 1
308 self
.cur_line_start
= self
.cur_pos
= self
.next_pos
311 elif input_state
== 4:
314 else: # input_state = 5
317 print "--> [%d] %d %s" % (input_state
, self
.cur_pos
, repr(self
.cur_char
))
319 # def read_char(self):
321 # Get the next input character, filling the buffer if necessary.
322 # Returns '' at end of file.
324 # next_pos = self.next_pos
325 # buf_index = next_pos - self.buf_start_pos
326 # if buf_index == len(self.buffer):
327 # discard = self.start_pos - self.buf_start_pos
328 # data = self.stream.read(0x1000)
329 # self.buffer = self.buffer[discard:] + data
330 # self.buf_start_pos = self.buf_start_pos + discard
331 # buf_index = buf_index - discard
334 # c = self.buffer[buf_index]
335 # self.next_pos = next_pos + 1
def position(self):
    """
    Return a tuple (name, line, col) representing the location of
    the last token read using the read() method. |name| is the
    name that was provided to the Scanner constructor; |line|
    is the line number in the stream (1-based); |col| is the
    position within the line of the first character of the token
    (0-based: scan_a_token() records it as cur_pos - cur_line_start).
    """
    return (self.name, self.start_line, self.start_col)
def begin(self, state_name):
    """
    Set the current state of the scanner to the named state.

    Looks up the initial state for |state_name| via
    self.lexicon.get_initial_state() and stores it in
    self.initial_state; also records the name in self.state_name.
    """
    self.initial_state = self.lexicon.get_initial_state(state_name)
    self.state_name = state_name
def produce(self, value, text=None):
    """
    Called from an action procedure, causes |value| to be returned
    as the token value from read(). If |text| is supplied, it is
    returned in place of the scanned text.

    produce() can be called more than once during a single call to an action
    procedure, in which case the tokens are queued up and returned one
    at a time by subsequent calls to read(), until the queue is empty,
    whereupon scanning resumes.
    """
    # No explicit text given: fall back to the text of the last token
    # read, so the queued pair never carries None as its text.
    if text is None:
        text = self.text
    self.queue.append((value, text))
372 Override this method if you want something to be done at
# For backward compatibility: expose produce() under its old name.
# "yield" is a reserved word, so the alias cannot be written as a
# plain attribute assignment and has to go through setattr().
setattr(Scanner, "yield", Scanner.produce)