1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
16 operators
18 Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
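
# A minimal usage sketch (an illustration, not part of the module itself;
# it assumes Python 2 semantics and a hypothetical file named 'example.py'):
#
#     import tokenize
#     f = open('example.py')
#     for tok_type, tok_string, start, end, line in \
#             tokenize.generate_tokens(f.readline):
#         print tokenize.tok_name[tok_type], repr(tok_string)
#     f.close()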

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]
del x
del token
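
# tokenize extends the numbering in token.py with two pseudo-token types of
# its own: COMMENT, and NL for a "non-logical" newline (one inside brackets,
# or ending a blank or comment-only line) that the real tokenizer discards.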
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
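
# Helpers for composing the regular expressions below: group() joins its
# arguments into (a|b|...), while any() and maybe() append * and ?
# respectively to such a group.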
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
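# A few illustrative matches (examples, not an exhaustive list): Number
# accepts '3j', '1.5e-3', '0xFF', '0777L' and '42'.  Imagnumber is tried
# first so that '3j' is not cut short by Intnumber matching just '3'.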

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
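
# PseudoToken is what the scanner below actually uses on each physical line:
# unlike Token, it also matches constructs that merely *begin* a token here
# (an opening triple quote, a backslash continuation, the first line of a
# continued single-quoted string), letting the tokenizer resume on the next
# input line.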

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
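# The None entries let generate_tokens() index endprogs by a one-character
# string prefix ('r', 'u', ...) and fall through, via `or`, to the entry
# keyed by the character that follows it.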

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t
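
# Both dicts above are used only for fast membership tests of the legal
# string prefixes; this code predates the built-in set type.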

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
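
# Note: a tokeneater callback may raise StopTokenizing to stop the scan
# early; tokenize() above catches it and returns quietly.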

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        apply(tokeneater, token_info)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]
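
    # State carried across physical lines: contstr/contline accumulate a
    # string literal spanning several lines; needcont means each such line
    # must end in a backslash continuation (single-quoted strings only);
    # indents is the stack of indentation columns currently open.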

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
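
# A concrete sketch of the stream produced (assumes Python 2 and the
# standard StringIO module):
#
#     >>> from StringIO import StringIO
#     >>> from tokenize import generate_tokens, tok_name
#     >>> for tok in generate_tokens(StringIO('x = 1\n').readline):
#     ...     print tok_name[tok[0]], repr(tok[1])
#     NAME 'x'
#     OP '='
#     NUMBER '1'
#     NEWLINE '\n'
#     ENDMARKER ''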

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)