1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
9 the token (a string)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
16 operators
18 Older entry points
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
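
# A minimal usage sketch (illustrative only; StringIO is just one convenient
# way to feed a string through the readline-based interface):
#
#     from StringIO import StringIO
#     for tok_type, tok_string, start, end, line in \
#             generate_tokens(StringIO('x = 1\n').readline):
#         print tok_name[tok_type], repr(tok_string), start, end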

from __future__ import generators

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'
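# For example (illustrative): group('a', 'b') produces '(a|b)',
# any('a', 'b') produces '(a|b)*', and maybe('a', 'b') produces '(a|b)?'.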

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
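# Examples of literals matched (illustrative): Intnumber matches '0x1fL',
# '0777' and '42'; Floatnumber matches '3.14', '.5e-2' and '1e10';
# Imagnumber matches '3j' and '1.5e2J'.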

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
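# The None entries for bare prefix characters let the lookup chain in
# generate_tokens (endprogs[initial] or endprogs[token[1]] or
# endprogs[token[2]]) fall through to the entry keyed by the quote itself.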

tabsize = 8
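# Tabs advance the indentation column to the next multiple of tabsize (see
# the whitespace measurement in generate_tokens); e.g. (illustrative) a tab
# seen at column 3 moves the count to column 8.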

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        apply(tokeneater, token_info)
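
# A sketch of the older callback interface (illustrative; 'eater' and
# 'somefile.py' are hypothetical names):
#
#     def eater(type, token, start, end, line):
#         if type == COMMENT:
#             print "comment at line %d: %s" % (start[0], token)
#     tokenize(open('somefile.py').readline, eater)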

def generate_tokens(readline):
    lnum = parenlev = continued = 0     # line no., bracket depth, \-continuation flag
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0           # accumulator for a string spanning lines
    contline = None
    indents = [0]                       # stack of indentation columns

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)