1 """Tokenization help for Python programs.
3 This module exports a function called 'tokenize()' that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
6 function which is called once for each token found. The latter function is
7 passed the token type, a string containing the token, the starting and
8 ending (row, column) coordinates of the token, and the original line. It is
9 designed to match the working of the Python tokenizer exactly, except that
10 it produces COMMENT tokens for comments and gives type OP for all operators."""
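
# A minimal usage sketch: drive tokenize() with the readline method of any
# file-like object and an optional token-eater callback (the callback name
# 'show' here is illustrative, not part of the module):
#
#     import StringIO
#     def show(type, token, start, end, line):
#         print tok_name[type], repr(token)
#     tokenize(StringIO.StringIO('x = 1\n').readline, show)
#
# With the default token-eater, each token is printed via printtoken() below.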

__version__ = "Ka-Ping Yee, 26 October 1997; patched, GvR 3/30/98"

import string, re
from token import *

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'

# Ignore now accepts \f as whitespace.  Operator now includes '**'.
# Ignore and Special now accept \n or \r\n at the end of a line.
# Imagnumber is new.  Expfloat is corrected to reject '0e4'.
# Note: to quote a backslash in a regex, it must be doubled in a r'aw' string.

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'
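# For instance, group('a', 'b') yields '(a|b)', any('x') yields '(x)*', and
# maybe('x') yields '(x)?' -- small builders for the patterns below.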

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'[1-9]\d*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
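# Illustrative matches (not exhaustive): Intnumber covers '0xFFL', '017'
# and '42'; Floatnumber covers '3.14', '.5e-2' and '1e10'; Imagnumber
# covers '1j' and '3.5J'.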

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
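# The None entries let the lookup chain in tokenize() fall through: for a
# prefixed string like ur'...', endprogs[initial] and endprogs[token[1]]
# are None, so endprogs[token[2]] supplies the end-of-string pattern.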

tabsize = 8

class TokenError(Exception):
    pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
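# For example, the first token of 'x = 1' prints as: 1,0-1,1:\tNAME\t'x'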

def tokenize(readline, tokeneater=printtoken):
    lnum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                tokeneater(STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                tokeneater(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                tokeneater((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars \
                        or (initial == '.' and token != '.'):  # ordinary number
                    tokeneater(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    tokeneater(parenlev > 0 and NL or NEWLINE,
                               token, spos, epos, line)
                elif initial == '#':
                    tokeneater(COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        tokeneater(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                        token[:2] in ("r'", 'r"', "R'", 'R"',
                                      "u'", 'u"', "U'", 'U"') or \
                        token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                      "uR'", 'uR"', "UR'", 'UR"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        tokeneater(STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    tokeneater(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, spos, epos, line)
            else:
                tokeneater(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
    tokeneater(ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)
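
# Run from the command line, the module tokenizes the named file (or stdin),
# printing one line per token.  A quick self-test, assuming the module is
# saved as tokenize.py in the current directory:
#
#     python tokenize.py tokenize.py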