1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
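
# A minimal usage sketch (illustrative, not part of the module itself):
#
#     >>> from StringIO import StringIO
#     >>> for tok in generate_tokens(StringIO('1 + 2\n').readline):
#     ...     print tok
#
# The first tuple produced is (NUMBER, '1', (1, 0), (1, 1), '1 + 2\n'),
# with the token type printed as its integer code.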

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
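
# For illustration: group('a', 'b') returns '(a|b)', any('a') returns
# '(a)*', and maybe('a') returns '(a)?'; the token patterns below are built
# by concatenating such pieces.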

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
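
# Illustrative matches: Intnumber matches '0x1fL' and '077'; Floatnumber
# matches '3.14e-2' and '1e10'; Imagnumber matches '1j' and '3.5J'.
# Imagnumber comes first in Number so that a trailing 'j' or 'J' is
# consumed rather than left behind by the Floatnumber alternative.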

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
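
# A sketch of how pseudoprog is used below: matching at a given position
# skips leading whitespace and captures the next token as group 1, e.g.
# pseudoprog.match('  x = 1\n', 0).span(1) == (2, 3), the extent of the
# name 'x'.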

endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
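
# For illustration: endprogs["ur'''"] is single3prog, while the bare
# prefixes 'r', 'R', 'u', and 'U' map to None, signalling that the right
# pattern must be chosen from a later character of the token (see the
# single-quoted branch in generate_tokens below).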

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
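
# Sample printtoken output (illustrative):
#     1,0-1,5:    NAME    'print'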

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
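
# A sketch of a custom tokeneater; `report_names` is illustrative and not
# part of this module:
#
#     def report_names(type, token, start, end, line):
#         if type == NAME:
#             print "%d: %s" % (start[0], token)
#
#     tokenize(open('example.py').readline, report_names)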

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)