# This module compiles a regular expression that recognizes Python tokens.
# It is designed to match the workings of the Python tokenizer exactly.
# It takes care of everything except indentation;
# note that unescaped newlines are tokens, too.
# tokenprog.regs[3] gives the location of the token without whitespace.
# It also defines various subexpressions, but doesn't compile them.
# See the function test() below for an example of how to use it.
# Note: to get a quoted backslash in a regexp, it must be quadrupled.

# Text that may precede a token: optional blanks/tabs, any number of
# backslash-newline continuations (each followed by optional blanks/tabs),
# then an optional '#' comment.  The pattern contains two \( \) groups.
Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?'

# Identifier: a letter or underscore, then letters, digits or underscores.
Name = '[a-zA-Z_][a-zA-Z0-9_]*'
# Integer literals; each form accepts an optional trailing 'l' or 'L'.
Hexnumber = '0[xX][0-9a-fA-F]*[lL]?'
Octnumber = '0[0-7]*[lL]?'
Decnumber = '[1-9][0-9]*[lL]?'
Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber

# Floating point literals: either a form containing a decimal point with an
# optional exponent, or bare digits followed by a mandatory exponent.
Exponent = '[eE][-+]?[0-9]+'
Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?'
Expfloat = '[0-9]+' + Exponent
Floatnumber = Pointfloat + '\|' + Expfloat

# Any numeric literal; the float alternatives are listed before the
# integer ones.
Number = Floatnumber + '\|' + Intnumber
# String literal in single or double quotes.  Inside the quotes, each item
# is either a backslash followed by any one character, or a single character
# that is not a backslash, a newline, or the closing quote character.
String = '\'\(\\\\.\|[^\\\n\']\)*\'' + '\|' + '"\(\\\\.\|[^\\\n"]\)*"'
# Note: this module *recognizes* double quotes, but for backward
# compatibility, it doesn't *use* them!
# Operator tokens, joined with \| (alternation).  The characters + * ^ are
# backslash-escaped in the pattern, while the bitwise-or operator appears
# as a bare '|'.
Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>'
# All non-alphanumeric tokens: operators, brackets and special punctuation.
# NOTE(review): Bracket and Special are not defined in the visible part of
# this file -- presumably they are defined just above; verify.
Funny = Operator + '\|' + Bracket + '\|' + Special
# Any single token, without surrounding whitespace or comments.
PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny

# A token preceded by ignorable text.  Ignore contributes groups 1 and 2,
# so the \( \) group wrapped around PlainToken here is group 3 -- the one
# whose location the header comment says to read from tokenprog.regs[3].
Token = Ignore + '\(' + PlainToken + '\)'
# Compile Token under the default regex syntax.  set_syntax() changes a
# module-wide setting and returns the previous one, so the old value is
# saved first and restored afterwards.  The restore sits in a finally
# clause so that a failing compile cannot leave the syntax changed.
save_syntax = regex.set_syntax(0) # Use default syntax
try:
    tokenprog = regex.compile(Token)
finally:
    dummy = regex.set_syntax(save_syntax) # Restore original syntax
55 j
= tokenprog
.match(line
, i
)
57 print 'No token at', `line
[i
:i
+20]`
+ '...'
61 a
, b
= tokenprog
.regs
[3]
63 print 'Token:', `line
[a
:b
]`