1 """Tokenization help for Python programs.
3 This module exports a function called 'tokenize()' that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
6 function which is called once for each token found. The latter function is
7 passed the token type, a string containing the token, the starting and
8 ending (row, column) coordinates of the token, and the original line. It is
9 designed to match the working of the Python tokenizer exactly, except that
10 it produces COMMENT tokens for comments and gives type OP for all operators."""

__version__ = "Ka-Ping Yee, 26 October 1997; patched, GvR 3/30/98"

import string, re
from token import *

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'

# Ignore now accepts \f as whitespace. Operator now includes '**'.
# Ignore and Special now accept \n or \r\n at the end of a line.
# Imagnumber is new. Expfloat is corrected to reject '0e4'.
# Note: to quote a backslash in a regex, it must be doubled in a r'aw' string.

def group(*choices): return '(' + string.join(choices, '|') + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'
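
# For reference (illustrative only), the helpers just build alternations:
#     group('a', 'b')  ->  '(a|b)'
#     any('a', 'b')    ->  '(a|b)*'
#     maybe('a', 'b')  ->  '(a|b)?'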

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'[1-9]\d*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
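
# Quick sanity check of the Expfloat correction noted above (illustrative
# only, not part of the module):
#     import re
#     re.match(Expfloat + '$', '1e4')   # matches
#     re.match(Expfloat + '$', '0e4')   # None -- the leading zero is rejected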

Single = any(r"[^'\\]", r'\\.') + "'"
Double = any(r'[^"\\]', r'\\.') + '"'
Single3 = any(r"[^'\\]", r'\\.', r"'[^'\\]", r"'\\.", r"''[^'\\]", r"''\\.") + "'''"
Double3 = any(r'[^"\\]', r'\\.', r'"[^"\\]', r'"\\.', r'""[^"\\]', r'""\\.') + '"""'
Triple = group("[rR]?'''", '[rR]?"""')
String = group("[rR]?'" + any(r"[^\n'\\]", r'\\.') + "'",
               '[rR]?"' + any(r'[^\n"\\]', r'\\.') + '"')

Operator = group('\+=', '\-=', '\*=', '%=', '/=', '\*\*=', '&=', '\|=',
                 '\^=', '>>=', '<<=', '\+', '\-', '\*\*', '\*', '\^', '~',
                 '/', '%', '&', '\|', '<<', '>>', '==', '<=', '<>', '!=',
                 '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

ContStr = group("[rR]?'" + any(r'\\.', r"[^\n'\\]") + group("'", r'\\\r?\n'),
                '[rR]?"' + any(r'\\.', r'[^\n"\\]') + group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
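
# Note on the two top-level patterns (added commentary): Token matches one
# complete token, strings included in full; PseudoToken is what the scanner
# below actually uses, line by line.  In it, ContStr matches only the first
# line of a ' or " string that is continued with a backslash, and Triple
# matches just the opening quotes of a triple-quoted string -- the rest is
# picked up by the contstr machinery in tokenize().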

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog, 'r': None, 'R': None}
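
# endprogs maps an opening quote (with optional r/R prefix) to the pattern
# that scans for the matching closing quote; the 'r' and 'R' entries are
# None on purpose, signalling that the second character of the token must
# be consulted instead (see "endprogs[initial] or endprogs[token[1]]" below).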

tabsize = 8

class TokenError(Exception):
    pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
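
# printtoken output looks roughly like this (illustrative), for 'x = 1':
#     1,0-1,1:    NAME    'x'
#     1,2-1,3:    OP      '='
#     1,4-1,5:    NUMBER  '1'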

def tokenize(readline, tokeneater=printtoken):
    lnum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                tokeneater(STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                tokeneater(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                break
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                tokeneater((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0
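
        # From here the line is scanned token by token: pseudoprog finds the
        # next pseudo-token starting at pos, and the chain of tests below
        # classifies it (added commentary).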
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars \
                    or (initial == '.' and token != '.'):  # ordinary number
                    tokeneater(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    tokeneater(parenlev > 0 and NL or NEWLINE,
                               token, spos, epos, line)
                elif initial == '#':
                    tokeneater(COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        tokeneater(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = endprogs[initial] or endprogs[token[1]]
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        tokeneater(STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    tokeneater(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, spos, epos, line)
            else:
                tokeneater(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
    tokeneater(ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)