1 """Tokenization help for Python programs.
3 This module exports a function called 'tokenize()' that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
6 function which is called once for each token found. The latter function is
7 passed the token type, a string containing the token, the starting and
8 ending (row, column) coordinates of the token, and the original line. It is
9 designed to match the working of the Python tokenizer exactly, except that
10 it produces COMMENT tokens for comments and gives type OP for all operators."""

__version__ = "Ka-Ping Yee, 26 October 1997; patched, GvR 3/30/98"

import string, re
from token import *

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'

# Ignore now accepts \f as whitespace. Operator now includes '**'.
# Ignore and Special now accept \n or \r\n at the end of a line.
# Imagnumber is new. Expfloat is corrected to reject '0e4'.
# Note: to quote a backslash in a regex, it must be doubled in a r'aw' string.

def group(*choices): return '(' + string.join(choices, '|') + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'
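
# For reference (illustrative only), the helpers just build alternations:
#     group('a', 'b')  ->  '(a|b)'
#     any('a', 'b')    ->  '(a|b)*'
#     maybe('a', 'b')  ->  '(a|b)?'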

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'[1-9]\d*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
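
# Quick sanity check of the Expfloat correction noted above (illustrative
# only, not part of the module):
#     import re
#     re.match(Expfloat + '$', '1e4')   # matches
#     re.match(Expfloat + '$', '0e4')   # None -- the leading zero is rejected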

Single = any(r"[^'\\]", r'\\.') + "'"
Double = any(r'[^"\\]', r'\\.') + '"'
Single3 = any(r"[^'\\]", r'\\.', r"'[^'\\]", r"'\\.", r"''[^'\\]", r"''\\.") + "'''"
Double3 = any(r'[^"\\]', r'\\.', r'"[^"\\]', r'"\\.', r'""[^"\\]', r'""\\.') + '"""'
Triple = group("[rR]?'''", '[rR]?"""')
String = group("[rR]?'" + any(r"[^\n'\\]", r'\\.') + "'",
               '[rR]?"' + any(r'[^\n"\\]', r'\\.') + '"')

Operator = group('\+=', '\-=', '\*=', '%=', '/=', '\*\*=', '&=', '\|=',
                 '\^=', '>>=', '<<=', '\+', '\-', '\*\*', '\*', '\^', '~',
                 '/', '%', '&', '\|', '<<', '>>', '==', '<=', '<>', '!=',
                 '>=', '=', '<', '>')
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

ContStr = group("[rR]?'" + any(r'\\.', r"[^\n'\\]") + group("'", r'\\\r?\n'),
                '[rR]?"' + any(r'\\.', r'[^\n"\\]') + group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
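
# Note on the two top-level patterns (added commentary): Token matches one
# complete token, strings included in full; PseudoToken is what the scanner
# below actually uses, line by line.  In it, ContStr matches only the first
# line of a ' or " string that is continued with a backslash, and Triple
# matches just the opening quotes of a triple-quoted string -- the rest is
# picked up by the contstr machinery in tokenize().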

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog, 'r': None, 'R': None}
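
# endprogs maps an opening quote (with optional r/R prefix) to the pattern
# that scans for the matching closing quote; the 'r' and 'R' entries are
# None on purpose, signalling that the second character of the token must
# be consulted instead (see "endprogs[initial] or endprogs[token[1]]" below).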

tabsize = 8

class TokenError(Exception):
    pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
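
# printtoken output looks roughly like this (illustrative), for 'x = 1':
#     1,0-1,1:    NAME    'x'
#     1,2-1,3:    OP      '='
#     1,4-1,5:    NUMBER  '1'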

def tokenize(readline, tokeneater=printtoken):
    lnum = parenlev = continued = 0
    namechars, numchars = string.letters + '_', string.digits
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                tokeneater(STRING, contstr + line[:end],
                           strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                tokeneater(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                break
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                tokeneater((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                tokeneater(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                tokeneater(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0
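
        # From here the line is scanned token by token: pseudoprog finds the
        # next pseudo-token starting at pos, and the chain of tests below
        # classifies it (added commentary).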
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                    # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars \
                    or (initial == '.' and token != '.'):  # ordinary number
                    tokeneater(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    tokeneater(parenlev > 0 and NL or NEWLINE,
                               token, spos, epos, line)
                elif initial == '#':
                    tokeneater(COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        tokeneater(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = endprogs[initial] or endprogs[token[1]]
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        tokeneater(STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    tokeneater(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    tokeneater(OP, token, spos, epos, line)
            else:
                tokeneater(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        tokeneater(DEDENT, '', (lnum, 0), (lnum, 0), '')
    tokeneater(ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)