Better example and comments.
[rox-lib.git] / python / rox / Plex / Scanners.py
blob6278d88ba4d5d360125fa489a7b4fa552d4d7ad1
1 #=======================================================================
3 # Python Lexical Analyser
6 # Scanning an input stream
8 #=======================================================================
10 import Errors
11 from Regexps import BOL, EOL, EOF
class Scanner:
    """
    A Scanner is used to read tokens from a stream of characters
    using the token set specified by a Plex.Lexicon.

    Constructor:

      Scanner(lexicon, stream, name = '')

        See the docstring of the __init__ method for details.

    Methods:

      See the docstrings of the individual methods for more
      information.

      read() --> (value, text)
        Reads the next lexical token from the stream.

      position() --> (name, line, col)
        Returns the position of the last token read using the
        read() method.

      begin(state_name)
        Causes scanner to change state.

      produce(value [, text])
        Causes return of a token value to the caller of the
        Scanner.

    Input is delivered to the state machine one "character" at a time,
    where the synthetic markers BOL, EOL and EOF are interleaved with
    the real characters.  |input_state| records where we are in that
    interleaving:

      1  reading ordinary characters from the buffer
      2  EOL has been delivered for a newline; the '\\n' itself is next
      3  the '\\n' has been delivered; BOL for the next line is next
      4  EOL has been delivered at end of input; EOF is next
      5  EOF has been delivered; '' is delivered from now on

    Machine states are dictionaries, as used by run_machine_inlined():
    characters map to successor states, 'else' is the optional default
    successor, 'action' is the action recognised in that state (or a
    false value) and 'number' identifies the state in trace output.
    """

    lexicon = None          # Lexicon
    stream = None           # file-like object
    name = ''
    buffer = ''
    buf_start_pos = 0       # position in input of start of buffer
    next_pos = 0            # position in input of next char to read
    cur_pos = 0             # position in input of current char
    cur_line = 1            # line number of current char
    cur_line_start = 0      # position in input of start of current line
    start_pos = 0           # position in input of start of token
    start_line = 0          # line number of start of token
    start_col = 0           # position in line of start of token
    text = None             # text of last token read
    initial_state = None    # Node
    state_name = ''         # Name of initial state
    queue = None            # list of tokens to be returned
    trace = 0

    def __init__(self, lexicon, stream, name = ''):
        """
        Scanner(lexicon, stream, name = '')

          |lexicon| is a Plex.Lexicon instance specifying the lexical tokens
          to be recognised.

          |stream| can be a file object or anything which implements a
          compatible read() method.

          |name| is optional, and may be the name of the file being
          scanned or any other identifying string.
        """
        self.lexicon = lexicon
        self.stream = stream
        self.name = name
        self.queue = []
        self.initial_state = None
        self.begin('')
        self.next_pos = 0
        self.cur_pos = 0
        self.cur_line_start = 0
        # Scanning starts with a synthetic beginning-of-line marker.
        self.cur_char = BOL
        self.input_state = 1

    def read(self):
        """
        Read the next lexical token from the stream and return a
        tuple (value, text), where |value| is the value associated with
        the token as specified by the Lexicon, and |text| is the actual
        string read from the stream. Returns (None, '') on end of file.
        """
        queue = self.queue
        # Keep scanning until some action (or end of file) has queued at
        # least one token; actions returning None produce nothing.
        while not queue:
            self.text, action = self.scan_a_token()
            if action is None:
                self.produce(None)
                self.eof()
            else:
                value = action.perform(self, self.text)
                if value is not None:
                    self.produce(value)
        return queue.pop(0)

    def scan_a_token(self):
        """
        Read the next input sequence recognised by the machine
        and return (text, action). Returns ('', None) on end of
        file.

        Raises Errors.UnrecognizedInput if the machine blocks without
        having recognised anything and the input is not at end of file.
        """
        self.start_pos = self.cur_pos
        self.start_line = self.cur_line
        self.start_col = self.cur_pos - self.cur_line_start
        action = self.run_machine_inlined()
        if action:
            if self.trace:
                print("Scanner: read: Performing %s %d:%d" % (
                    action, self.start_pos, self.cur_pos))
            base = self.buf_start_pos
            text = self.buffer[self.start_pos - base : self.cur_pos - base]
            return (text, action)
        if self.cur_pos == self.start_pos:
            # Nothing was consumed.  Step over a pending EOL marker so
            # that end of input can be detected below.
            if self.cur_char == EOL:
                self.next_char()
            if not self.cur_char or self.cur_char == EOF:
                return ('', None)
        raise Errors.UnrecognizedInput(self, self.state_name)

    def run_machine(self):
        """
        Run the machine until no more transitions are possible.

        Returns the action of the last accepting state passed through
        (after backing the input up to that point), or None.
        This is the readable counterpart of run_machine_inlined().
        """
        self.state = self.initial_state
        self.backup_state = None
        while self.transition():
            pass
        return self.back_up()

    def run_machine_inlined(self):
        """
        Inlined version of run_machine for speed.

        All scanner position state is copied into locals, the machine is
        run, and the (possibly backed-up) state is written back to the
        instance before the recognised action, or None, is returned.
        """
        state = self.initial_state
        cur_pos = self.cur_pos
        cur_line = self.cur_line
        cur_line_start = self.cur_line_start
        cur_char = self.cur_char
        input_state = self.input_state
        next_pos = self.next_pos
        buffer = self.buffer
        buf_start_pos = self.buf_start_pos
        buf_len = len(buffer)
        backup_state = None
        trace = self.trace
        while 1:
            if trace:
                # Printed together with the outcome of this step below.
                trace_head = "State %d, %d/%d:%s -->" % (
                    state['number'], input_state, cur_pos, repr(cur_char))
            # Begin inlined self.save_for_backup()
            action = state['action']
            if action:
                backup_state = (
                    action, cur_pos, cur_line, cur_line_start,
                    cur_char, input_state, next_pos)
            # End inlined self.save_for_backup()
            c = cur_char
            new_state = state.get(c, -1)
            if new_state == -1:
                # No explicit transition: fall back to the 'else' state,
                # except on '' (past end of input), which never matches.
                new_state = c and state.get('else')
            if new_state:
                if trace:
                    print("%s State %d" % (trace_head, new_state['number']))
                state = new_state
                # Begin inlined: self.next_char()
                if input_state == 1:
                    cur_pos = next_pos
                    # Begin inlined: c = self.read_char()
                    buf_index = next_pos - buf_start_pos
                    if buf_index < buf_len:
                        c = buffer[buf_index]
                        next_pos = next_pos + 1
                    else:
                        # Buffer exhausted: drop everything before the
                        # current token and append the next chunk.
                        discard = self.start_pos - buf_start_pos
                        data = self.stream.read(0x1000)
                        buffer = self.buffer[discard:] + data
                        self.buffer = buffer
                        buf_start_pos = buf_start_pos + discard
                        self.buf_start_pos = buf_start_pos
                        buf_len = len(buffer)
                        buf_index = buf_index - discard
                        if data:
                            c = buffer[buf_index]
                            next_pos = next_pos + 1
                        else:
                            c = ''
                    # End inlined: c = self.read_char()
                    if c == '\n':
                        cur_char = EOL
                        input_state = 2
                    elif not c:
                        cur_char = EOL
                        input_state = 4
                    else:
                        cur_char = c
                elif input_state == 2:
                    cur_char = '\n'
                    input_state = 3
                elif input_state == 3:
                    cur_line = cur_line + 1
                    cur_line_start = cur_pos = next_pos
                    cur_char = BOL
                    input_state = 1
                elif input_state == 4:
                    cur_char = EOF
                    input_state = 5
                else: # input_state = 5
                    cur_char = ''
                # End inlined self.next_char()
            else: # not new_state
                if trace:
                    print("%s blocked" % trace_head)
                # Begin inlined: action = self.back_up()
                if backup_state:
                    (action, cur_pos, cur_line, cur_line_start,
                        cur_char, input_state, next_pos) = backup_state
                else:
                    action = None
                break # while 1
                # End inlined: action = self.back_up()
        self.cur_pos = cur_pos
        self.cur_line = cur_line
        self.cur_line_start = cur_line_start
        self.cur_char = cur_char
        self.input_state = input_state
        self.next_pos = next_pos
        if trace:
            if action:
                print("Doing %s" % (action,))
        return action

    def transition(self):
        """
        Perform one step of the machine on the current character.

        Returns 1 if a transition was made (and the input advanced),
        0 if the machine is blocked.  Used by run_machine().
        """
        self.save_for_backup()
        c = self.cur_char
        state = self.state
        new_state = state.get(c, -1)
        if new_state == -1:
            new_state = c and state.get('else')
        if new_state:
            if self.trace:
                print("Scanner: read: State %d: %s --> State %d" % (
                    state['number'], repr(c), new_state['number']))
            self.state = new_state
            self.next_char()
            return 1
        if self.trace:
            print("Scanner: read: State %d: %s --> blocked" % (
                state['number'], repr(c)))
        return 0

    def save_for_backup(self):
        """
        If the current state is accepting, remember its action together
        with the input position so that back_up() can return to it.
        """
        action = self.state['action']
        if action:
            if self.trace:
                print("Scanner: read: Saving backup point at %s" % (
                    self.cur_pos,))
            self.backup_state = (
                action, self.cur_pos, self.cur_line, self.cur_line_start,
                self.cur_char, self.input_state, self.next_pos)

    def back_up(self):
        """
        Restore the input position saved by the last save_for_backup()
        and return the associated action, or None if nothing was saved.
        """
        backup_state = self.backup_state
        if not backup_state:
            return None
        (action, self.cur_pos, self.cur_line, self.cur_line_start,
            self.cur_char, self.input_state, self.next_pos) = backup_state
        if self.trace:
            print("Scanner: read: Backing up to %s" % (self.cur_pos,))
        return action

    def next_char(self):
        """
        Advance to the next input "character", updating cur_char,
        cur_pos, the line bookkeeping and input_state.  See the class
        docstring for the meaning of the input_state values.
        """
        input_state = self.input_state
        if self.trace:
            # Printed together with the resulting state at the end.
            trace_head = "Scanner: next: %s [%d] %d" % (
                " " * 20, input_state, self.cur_pos)
        if input_state == 1:
            self.cur_pos = self.next_pos
            c = self.read_char()
            if c == '\n':
                self.cur_char = EOL
                self.input_state = 2
            elif not c:
                self.cur_char = EOL
                self.input_state = 4
            else:
                self.cur_char = c
        elif input_state == 2:
            self.cur_char = '\n'
            self.input_state = 3
        elif input_state == 3:
            self.cur_line = self.cur_line + 1
            self.cur_line_start = self.cur_pos = self.next_pos
            self.cur_char = BOL
            self.input_state = 1
        elif input_state == 4:
            self.cur_char = EOF
            self.input_state = 5
        else: # input_state = 5
            self.cur_char = ''
        if self.trace:
            print("%s --> [%d] %d %s" % (
                trace_head, input_state, self.cur_pos, repr(self.cur_char)))

    def read_char(self):
        """
        Get the next input character, filling the buffer if necessary.
        Returns '' at end of file.
        """
        next_pos = self.next_pos
        buf_index = next_pos - self.buf_start_pos
        if buf_index >= len(self.buffer):
            # Drop everything before the current token, then append the
            # next chunk from the stream.
            discard = self.start_pos - self.buf_start_pos
            data = self.stream.read(0x1000)
            self.buffer = self.buffer[discard:] + data
            self.buf_start_pos = self.buf_start_pos + discard
            buf_index = buf_index - discard
            if not data:
                return ''
        c = self.buffer[buf_index]
        self.next_pos = next_pos + 1
        return c

    def position(self):
        """
        Return a tuple (name, line, col) representing the location of
        the last token read using the read() method. |name| is the
        name that was provided to the Scanner constructor; |line|
        is the line number in the stream (1-based); |col| is the
        position within the line of the first character of the token
        (0-based).
        """
        return (self.name, self.start_line, self.start_col)

    def begin(self, state_name):
        """Set the current state of the scanner to the named state."""
        self.initial_state = (
            self.lexicon.get_initial_state(state_name))
        self.state_name = state_name

    def produce(self, value, text = None):
        """
        Called from an action procedure, causes |value| to be returned
        as the token value from read(). If |text| is supplied, it is
        returned in place of the scanned text.

        produce() can be called more than once during a single call to an action
        procedure, in which case the tokens are queued up and returned one
        at a time by subsequent calls to read(), until the queue is empty,
        whereupon scanning resumes.
        """
        if text is None:
            text = self.text
        self.queue.append((value, text))

    def eof(self):
        """
        Override this method if you want something to be done at
        end of file.
        """
        pass

# For backward compatibility: the method used to be called "yield",
# which is a reserved word and so can only be installed via setattr.
setattr(Scanner, "yield", Scanner.produce)