1 """Tokenization help for Python programs.
3 This module exports a function called 'tokenize()' that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
6 function which is called once for each token found. The latter function is
7 passed the token type, a string containing the token, the starting and
8 ending (row, column) coordinates of the token, and the original line. It is
9 designed to match the working of the Python tokenizer exactly, except that
10 it produces COMMENT tokens for comments and gives type OP for all operators."""
12 __version__
= "Ka-Ping Yee, 26 October 1997; patched, GvR 3/30/98"
18 tok_name
[COMMENT
] = 'COMMENT'
24 # Ignore now accepts \f as whitespace. Operator now includes '**'.
25 # Ignore and Special now accept \n or \r\n at the end of a line.
26 # Imagnumber is new. Expfloat is corrected to reject '0e4'.
27 # Note: to quote a backslash in a regex, it must be doubled in a r'aw' string.
# Helpers for composing the big regular expressions below.
# Originally written as `string.join(choices, '|')` and `apply(group,
# choices)`; both were removed from the language, and `'|'.join(...)` /
# extended call syntax produce byte-identical results.
def group(*choices):
    """Return a regex alternation of *choices*: group('a', 'b') -> '(a|b)'."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex matching zero or more of *choices* ('(a|b)*').

    NOTE: intentionally shadows the `any` builtin — the historical name is
    part of this module's interface.
    """
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching zero or one of *choices* ('(a|b)?')."""
    return group(*choices) + '?'
# Basic building blocks for the master patterns below.
Whitespace = r'[ \f\t]*'        # horizontal whitespace (no newlines)
Comment = r'#[^\r\n]*'          # comment runs to end of line
# Ignorable text between tokens: whitespace, backslash-newline
# continuations (each followed by more whitespace), optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'          # identifier
# Integer literals; the optional [lL] suffix is the Python 1.x/2.x long.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'     # also matches a plain '0'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Floating-point literals.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'[1-9]\d*' + Exponent   # leading nonzero digit, so '0e4' is rejected
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals ([jJ] suffix).
Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
# Order matters: longest/most specific alternatives first.
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail of a ' string, once the opening quote has been consumed.
Single = any(r"[^'\\]", r'\\.') + "'"
# Tail of a " string.
Double = any(r'[^"\\]', r'\\.') + '"'
# Tail of a ''' string: unquoted chars, escapes, or one/two quotes not
# followed by a third.
Single3 = any(r"[^'\\]",r'\\.',r"'[^'\\]",r"'\\.",r"''[^'\\]",r"''\\.") + "'''"
# Tail of a """ string.
Double3 = any(r'[^"\\]',r'\\.',r'"[^"\\]',r'"\\.',r'""[^"\\]',r'""\\.') + '"""'
Triple = group("[rR]?'''", '[rR]?"""')
# Complete single-line string literal, optionally raw ('r'/'R' prefix);
# newlines are not allowed inside.
String = group("[rR]?'" + any(r"[^\n'\\]", r'\\.') + "'",
               '[rR]?"' + any(r'[^\n"\\]', r'\\.') + '"')
# All operators lumped together; tokenize() reports every one as OP.
# The escaped alternatives are raw strings now: '\+' etc. in an ordinary
# string are invalid escape sequences (a warning, and eventually an error,
# in modern Python).  The resulting pattern text is unchanged.
Operator = group(r'\+', r'\-', r'\*\*', r'\*', r'\^', '~', '/', '%', '&', r'\|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
# Line terminators and miscellaneous punctuation.
Special = group(r'\r?\n', r'[:;.,`]')
# NOTE(review): Bracket is referenced here but its definition is on a line
# not visible in this excerpt — confirm it exists at module level
# (presumably a '[](){}'-style character class).
Funny = group(Operator, Bracket, Special)

# A complete token with no leading whitespace.
PlainToken = group(Number, Funny, String, Name)
# A full token: ignorable prefix plus the token proper.
Token = Ignore + PlainToken

# First (or only) line of a single-quoted string: either closes the string
# on this line or ends with a backslash-newline continuation.
ContStr = group("[rR]?'" + any(r'\\.', r"[^\n'\\]") + group("'", r'\\\r?\n'),
                '[rR]?"' + any(r'\\.', r'[^\n"\\]') + group('"', r'\\\r?\n'))
# Things the line scanner must also recognize: explicit line joins,
# comments, and triple-quote openers.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Pattern used by tokenize()'s inner loop; group 1 is the matched token.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
# Compile the master patterns once, at import time.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map a string opener to the pattern that finds its closing quote.  The
# bare 'r'/'R' entries are None on purpose: tokenize() falls back with
# `endprogs[initial] or endprogs[token[1]]` for raw strings.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog, 'r': None, 'R': None}
class TokenError(Exception):
    """Raised by tokenize() on EOF inside a multi-line string or statement.

    Originally the string exception `TokenError = 'TokenError'`; string
    exceptions were removed from the language.  The Python 2 raise form
    `raise TokenError, (msg, (row, col))` works unchanged with this class.
    """
def printtoken(type, token, start, end, line): # for testing
    """Default token-eater: print one token per line for debugging.

    type  -- numeric token type (index into tok_name)
    token -- the token text
    start -- (row, col) where the token begins
    end   -- (row, col) where the token ends
    line  -- the full source line (unused; part of the tokeneater interface)

    `type` intentionally keeps its builtin-shadowing historical name so the
    tokeneater call signature is unchanged.  The original spelled the
    coordinate pairs as tuple parameters (`(srow, scol)`), a form removed
    from the language (PEP 3113); they are unpacked in the body instead.
    """
    (srow, scol), (erow, ecol) = start, end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
84 def tokenize(readline
, tokeneater
=printtoken
):
85 lnum
= parenlev
= continued
= 0
86 namechars
, numchars
= string
.letters
+ '_', string
.digits
87 contstr
, needcont
= '', 0
91 while 1: # loop over lines in stream
94 pos
, max = 0, len(line
)
96 if contstr
: # continued string
98 raise TokenError
, ("EOF in multi-line string", strstart
)
99 endmatch
= endprog
.match(line
)
101 pos
= end
= endmatch
.end(0)
102 tokeneater(STRING
, contstr
+ line
[:end
],
103 strstart
, (lnum
, end
), contline
+ line
)
104 contstr
, needcont
= '', 0
106 elif needcont
and line
[-2:] != '\\\n' and line
[-3:] != '\\\r\n':
107 tokeneater(ERRORTOKEN
, contstr
+ line
,
108 strstart
, (lnum
, len(line
)), contline
)
113 contstr
= contstr
+ line
114 contline
= contline
+ line
117 elif parenlev
== 0 and not continued
: # new statement
120 while pos
< max: # measure leading whitespace
121 if line
[pos
] == ' ': column
= column
+ 1
122 elif line
[pos
] == '\t': column
= (column
/tabsize
+ 1)*tabsize
123 elif line
[pos
] == '\f': column
= 0
128 if line
[pos
] in '#\r\n': # skip comments or blank lines
129 tokeneater((NL
, COMMENT
)[line
[pos
] == '#'], line
[pos
:],
130 (lnum
, pos
), (lnum
, len(line
)), line
)
133 if column
> indents
[-1]: # count indents or dedents
134 indents
.append(column
)
135 tokeneater(INDENT
, line
[:pos
], (lnum
, 0), (lnum
, pos
), line
)
136 while column
< indents
[-1]:
137 indents
= indents
[:-1]
138 tokeneater(DEDENT
, '', (lnum
, pos
), (lnum
, pos
), line
)
140 else: # continued statement
142 raise TokenError
, ("EOF in multi-line statement", (lnum
, 0))
146 pseudomatch
= pseudoprog
.match(line
, pos
)
147 if pseudomatch
: # scan for tokens
148 start
, end
= pseudomatch
.span(1)
149 spos
, epos
, pos
= (lnum
, start
), (lnum
, end
), end
150 token
, initial
= line
[start
:end
], line
[start
]
152 if initial
in numchars \
153 or (initial
== '.' and token
!= '.'): # ordinary number
154 tokeneater(NUMBER
, token
, spos
, epos
, line
)
155 elif initial
in '\r\n':
156 tokeneater(parenlev
> 0 and NL
or NEWLINE
,
157 token
, spos
, epos
, line
)
159 tokeneater(COMMENT
, token
, spos
, epos
, line
)
160 elif token
in ("'''", '"""', # triple-quoted
161 "r'''", 'r"""', "R'''", 'R"""'):
162 endprog
= endprogs
[token
]
163 endmatch
= endprog
.match(line
, pos
)
164 if endmatch
: # all on one line
165 pos
= endmatch
.end(0)
166 token
= line
[start
:pos
]
167 tokeneater(STRING
, token
, spos
, (lnum
, pos
), line
)
169 strstart
= (lnum
, start
) # multiple lines
170 contstr
= line
[start
:]
173 elif initial
in ("'", '"') or \
174 token
[:2] in ("r'", 'r"', "R'", 'R"'):
175 if token
[-1] == '\n': # continued string
176 strstart
= (lnum
, start
)
177 endprog
= endprogs
[initial
] or endprogs
[token
[1]]
178 contstr
, needcont
= line
[start
:], 1
181 else: # ordinary string
182 tokeneater(STRING
, token
, spos
, epos
, line
)
183 elif initial
in namechars
: # ordinary name
184 tokeneater(NAME
, token
, spos
, epos
, line
)
185 elif initial
== '\\': # continued stmt
188 if initial
in '([{': parenlev
= parenlev
+ 1
189 elif initial
in ')]}': parenlev
= parenlev
- 1
190 tokeneater(OP
, token
, spos
, epos
, line
)
192 tokeneater(ERRORTOKEN
, line
[pos
],
193 (lnum
, pos
), (lnum
, pos
+1), line
)
196 for indent
in indents
[1:]: # pop remaining indent levels
197 tokeneater(DEDENT
, '', (lnum
, 0), (lnum
, 0), '')
198 tokeneater(ENDMARKER
, '', (lnum
, 0), (lnum
, 0), '')
if __name__ == '__main__':                     # testing
    # Tokenize the named file, or stdin if no argument is given, printing
    # each token via the default printtoken eater.  `import sys` restored:
    # sys is not imported at module level (its use is confined to here).
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)