1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
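
# A minimal usage sketch (an illustration, not part of the module itself):
# wrap source text in a readline-style callable and drive the generator.
#
#     from StringIO import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         printtoken(*tok)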

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
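
# For example, group('a', 'b') yields the pattern '(a|b)', any('a', 'b')
# yields '(a|b)*', and maybe('a', 'b') yields '(a|b)?'.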

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
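
# endprogs maps a string opener (an optional u/r prefix plus the opening
# quote) to the compiled pattern for the tail of such a string; the bare
# prefix letters map to None so that lookups on them fall through to the
# quote character itself (see generate_tokens below).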

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
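
# For the source line "x = 1\n", for example, the first token prints as
# "1,0-1,1:\tNAME\t'x'".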

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
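
# A tokeneater callback may raise StopTokenizing to stop tokenization early;
# tokenize() above catches that exception and returns normally.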

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                break
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0
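
        # Scan the rest of the line for tokens: group 1 of each pseudoprog
        # match is one token, with any leading whitespace already skipped.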
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)