This commit was manufactured by cvs2svn to create tag 'r222'.
[python/dscho.git] / Lib / tokenize.py
blobb64be8083202c39f75fd7c6c815a25b8c7e59040
1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
16 operators
18 Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
25 from __future__ import generators
27 __author__ = 'Ka-Ping Yee <ping@lfw.org>'
28 __credits__ = \
29 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
31 import string, re
32 from token import *
34 import token
35 __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]
36 del token
# Extend token.py's numbering with two pseudo-token types that the C
# tokenizer does not report: COMMENT for comment text, and NL for
# newlines that do not end a logical line (the tokenizer proper folds
# both into NEWLINE or drops them).
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2
def group(*choices):
    """Return a regex source string matching any one of *choices*,
    wrapped in a single parenthesized group."""
    return '(%s)' % '|'.join(choices)
def any(*choices):
    """Return a regex matching zero or more repetitions of any of *choices*.

    (Predates — and at module scope shadows — the ``any`` builtin.)
    """
    # group(*choices) replaces the deprecated apply(group, choices);
    # extended call syntax has been available since Python 2.0.
    return group(*choices) + '*'
def maybe(*choices):
    """Return a regex matching zero or one occurrence of any of *choices*."""
    # group(*choices) replaces the deprecated apply(group, choices).
    return group(*choices) + '?'
# Basic lexical vocabulary, built up with the group/any/maybe helpers.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Whitespace, any number of backslash line-continuations, optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals; the optional l/L suffix is the pre-3.0 long marker.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Float and imaginary literals.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
# Alternatives are tried left to right, so the more specific forms
# (imaginary, then float) must precede plain integers.
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening delimiter of a triple-quoted string, with optional u/r prefixes.
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
# A (possibly \r-prefixed) newline, or single-character punctuation.
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
# A token with any leading ignorable material attached.
Token = Ignore + PlainToken
# First (or only) line of ' or " string: the body up to either the
# closing quote or a backslash-newline continuation.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things that can start a pseudo-token besides an ordinary token:
# an explicit line continuation, a comment, or a triple-quote opener.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
# Compiled scanners: tokenprog/pseudoprog find the next token on a line;
# single3prog/double3prog match the tail of a triple-quoted string.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map each spelling of a string opener (every u/r prefix combination) to
# the compiled pattern matching the rest of that string.  The bare
# prefix keys ('r', 'R', 'u', 'U') are deliberately None so that the
# or-chain in generate_tokens falls through to the quote character.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

tabsize = 8                 # columns per tab stop when measuring indentation
class TokenError(Exception):
    """Raised when EOF is reached inside an unterminated multi-line
    string or statement."""
class StopTokenizing(Exception):
    """Raised by a tokeneater callback to halt tokenize() early."""
def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    """Default tokeneater: print one token per line in a readable form.

    Uses Python 2 tuple-parameter unpacking for the position pairs;
    `type` intentionally shadows the builtin to match the 5-tuple layout.
    """
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        # StopTokenizing is the callback's way of ending early; not an error.
        pass
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every 5-tuple from generate_tokens(readline) to tokeneater."""
    for token_info in generate_tokens(readline):
        # Extended call syntax replaces the deprecated apply() builtin.
        tokeneater(*token_info)
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0          # pending multi-line string, if any
    contline = None                    # accumulated source of that string
    indents = [0]                      # stack of indentation columns

    while 1:                                   # loop over lines in stream
        line = readline()                      # '' signals EOF
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # Single-quoted string whose line did not end in a
                # backslash continuation: report it as an error token.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break               # whitespace-only final line

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline is NL, not a logical NEWLINE.
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"' ):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # The None entries in endprogs make this or-chain
                        # skip past a bare prefix letter to the quote char.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                # No token matched here: emit one character as an error
                # token and resume scanning right after it.
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
if __name__ == '__main__':                     # testing
    import sys
    # Tokenize the file named on the command line, or stdin by default.
    if len(sys.argv) > 1:
        source = open(sys.argv[1]).readline
    else:
        source = sys.stdin.readline
    tokenize(source)