Lib/tokenize.py

   1 # This module compiles a regular expression that recognizes Python tokens.
   2 # It is designed to match the working of the Python tokenizer exactly.
   3 # It takes care of everything except indentation;
   4 # note that un-escaped newlines are tokens, too.
   5 # tokenprog.regs[3] gives the location of the token without whitespace.
   6 # The module also defines various subexpressions, but doesn't compile them.
   7 # See the function test() below for an example of how to use it.
   8
   9 import regex
  10
  11 # Note: to get a quoted backslash in a regexp, it must be quadrupled.
  12
  13 Ignore = '[ \t]*\(\\\\\n[ \t]*\)*\(#.*\)?' # Text skipped before a token: blanks/tabs, backslash-newline continuations, then an optional '#' comment
  14 # The names below are plain pattern strings built up by concatenation; Token is the complete pattern.
  15 Name = '[a-zA-Z_][a-zA-Z0-9_]*' # Letter or underscore, then letters, digits or underscores
  16
  17 Hexnumber = '0[xX][0-9a-fA-F]*[lL]?' # 0x/0X, hex digits (possibly none), optional trailing l/L
  18 Octnumber = '0[0-7]*[lL]?' # Leading 0 plus octal digits, so a lone 0 matches here
  19 Decnumber = '[1-9][0-9]*[lL]?' # Nonzero first digit, more digits, optional trailing l/L
  20 Intnumber = Hexnumber + '\|' + Octnumber + '\|' + Decnumber # Any of the three integer forms
  21 Exponent = '[eE][-+]?[0-9]+' # e/E, optional sign, at least one digit
  22 Pointfloat = '\([0-9]+\.[0-9]*\|\.[0-9]+\)\(' + Exponent + '\)?' # Digits with a '.' on either side, optional exponent
  23 Expfloat = '[0-9]+' + Exponent # Digits with a mandatory exponent and no '.'
  24 Floatnumber = Pointfloat + '\|' + Expfloat
  25 Number = Floatnumber + '\|' + Intnumber # Float alternatives are listed ahead of the integer ones
  26
  27 String = '\'\(\\\\.\|[^\\\n\']\)*\'' + '\|' + '"\(\\\\.\|[^\\\n"]\)*"' # One-line '...' or "..." literal; a backslash pairs with the character after it
  28 # Note: this module *recognizes* double quotes, but for backward
  29 # compatibility, it doesn't *use* them!
  30
  31 Operator = '~\|\+\|-\|\*\|/\|%\|\^\|&\||\|<<\|>>\|==\|<=\|<>\|!=\|>=\|=\|<\|>' # Two-character operators come before the one-character =, < and >
  32 Bracket = '[][(){}]' # Any one of ] [ ( ) { }
  33 Special = '[:;.,`\n]' # Colon, semicolon, dot, comma, backquote or newline
  34 Funny = Operator + '\|' + Bracket + '\|' + Special # Every punctuation-like token
  35
  36 PlainToken = Name + '\|' + Number + '\|' + String + '\|' + Funny # One token, without any leading Ignore text
  37
  38 Token = Ignore + '\(' + PlainToken + '\)' # Ignore, then the token in its own group: group 3, after the two groups inside Ignore
  39
  40 try:
  41         save_syntax = regex.set_syntax(0) # Use default syntax
  42         tokenprog = regex.compile(Token)
  43 finally:
  44         if save_syntax != 0:
  45                 dummy = regex.set_syntax(save_syntax) # Restore original syntax
  46
  47
  48 def test(file):
  49         f = open(file, 'r')
  50         while 1:
  51                 line = f.readline()
  52                 if not line: break
  53                 i, n = 0, len(line)
  54                 while i < n:
  55                         j = tokenprog.match(line, i)
  56                         if j < 0:
  57                                 print 'No token at', `line[i:i+20]` + '...'
  58                                 i = i+1
  59                         else:
  60                                 i = i+j
  61                                 a, b = tokenprog.regs[3]
  62                                 if a < b:
  63                                         print 'Token:', `line[a:b]`