1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
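# A minimal usage sketch, assuming the source text is wrapped in a StringIO
# (illustrative only; any readline-style callable works):
#
#     from StringIO import StringIO
#     for tok_type, tok_str, start, end, line in \
#             generate_tokens(StringIO("x = 1\n").readline):
#         print tok_name[tok_type], repr(tok_str)
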
from __future__ import generators

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'

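# For example: group('a', 'b') yields '(a|b)', any(r'\d') yields r'(\d)*',
# and maybe(r'\d') yields r'(\d)?'.
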
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

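# Note the ordering: Imagnumber and Floatnumber must precede Intnumber, or
# the leftmost alternative would match only the integer part of e.g. '1.5j'.
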
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

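# Leftmost-alternative semantics in action (an illustrative check):
#
#     >>> re.match(group('=', '=='), '==').group()
#     '='
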
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

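# endprogs maps an opening quote (with any string prefix) to the compiled
# pattern that matches the rest of that string.  The single-letter entries
# are deliberately None so that the endprogs[initial] or endprogs[token[1]]
# or ... lookup in generate_tokens() falls through past a bare prefix letter.
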
tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

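# For instance, printtoken(NAME, 'x', (1, 0), (1, 1), "x = 1\n") prints
# (roughly):  1,0-1,1:    NAME    'x'
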
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        apply(tokeneater, token_info)

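# A tokeneater callback sketch (illustrative; 'example.py' is hypothetical):
#
#     def print_names(type, token, start, end, line):
#         if type == NAME: print token
#
#     tokenize(open('example.py').readline, print_names)
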
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0    # state for an unfinished multi-line string
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)