1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
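
# A minimal usage sketch of the generator interface described above: any
# object with a readline() method works, e.g. a StringIO wrapping an
# in-memory source string (the sample statement here is arbitrary).
#
#   from StringIO import StringIO
#   for toktype, tokstring, start, end, srcline in \
#           generate_tokens(StringIO("x = 1 + 2\n").readline):
#       print tok_name[toktype], repr(tokstring), start, end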

from __future__ import generators

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        apply(tokeneater, token_info)

def generate_tokens(readline):
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"' ):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)