1 """Tokenization help for Python programs.
3 This module exports a function called 'tokenize()' that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF) and a "token-eater"
6 function which is called once for each token found. The latter function is
7 passed the token type, a string containing the token, the starting and
8 ending (row, column) coordinates of the token, and the original line. It is
9 designed to match the working of the Python tokenizer exactly, except that
10 it produces COMMENT tokens for comments and gives type OP for all operators."""
12 __version__
= "Ka-Ping Yee, 26 October 1997; patched, GvR 3/30/98"
18 tok_name
[COMMENT
] = 'COMMENT'
24 # Ignore now accepts \f as whitespace. Operator now includes '**'.
25 # Ignore and Special now accept \n or \r\n at the end of a line.
26 # Imagnumber is new. Expfloat is corrected to reject '0e4'.
27 # Note: to quote a backslash in a regex, it must be doubled in a r'aw' string.
# Helpers for composing the big regular expressions below.
# Originally written as `string.join(choices, '|')` and `apply(group,
# choices)`; both were removed from the language, and `'|'.join(...)` /
# extended call syntax produce byte-identical results.
def group(*choices):
    """Return a regex alternation of *choices*: group('a', 'b') -> '(a|b)'."""
    return '(' + '|'.join(choices) + ')'

def any(*choices):
    """Return a regex matching zero or more of *choices* ('(a|b)*').

    NOTE: intentionally shadows the `any` builtin — the historical name is
    part of this module's interface.
    """
    return group(*choices) + '*'

def maybe(*choices):
    """Return a regex matching zero or one of *choices* ('(a|b)?')."""
    return group(*choices) + '?'
# Basic building blocks for the master patterns below.
Whitespace = r'[ \f\t]*'        # horizontal whitespace (no newlines)
Comment = r'#[^\r\n]*'          # comment runs to end of line
# Ignorable text between tokens: whitespace, backslash-newline
# continuations (each followed by more whitespace), optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'          # identifier
# Integer literals; the optional [lL] suffix is the Python 1.x/2.x long.
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'     # also matches a plain '0'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
# Floating-point literals.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'[1-9]\d*' + Exponent   # leading nonzero digit, so '0e4' is rejected
Floatnumber = group(Pointfloat, Expfloat)
# Imaginary literals ([jJ] suffix).
Imagnumber = group(r'0[jJ]', r'[1-9]\d*[jJ]', Floatnumber + r'[jJ]')
# Order matters: longest/most specific alternatives first.
Number = group(Imagnumber, Floatnumber, Intnumber)
# Tail of a ' string, once the opening quote has been consumed.
Single = any(r"[^'\\]", r'\\.') + "'"
# Tail of a " string.
Double = any(r'[^"\\]', r'\\.') + '"'
# Tail of a ''' string: unquoted chars, escapes, or one/two quotes not
# followed by a third.
Single3 = any(r"[^'\\]",r'\\.',r"'[^'\\]",r"'\\.",r"''[^'\\]",r"''\\.") + "'''"
# Tail of a """ string.
Double3 = any(r'[^"\\]',r'\\.',r'"[^"\\]',r'"\\.',r'""[^"\\]',r'""\\.') + '"""'
Triple = group("[rR]?'''", '[rR]?"""')
# Complete single-line string literal, optionally raw ('r'/'R' prefix);
# newlines are not allowed inside.
String = group("[rR]?'" + any(r"[^\n'\\]", r'\\.') + "'",
               '[rR]?"' + any(r'[^\n"\\]', r'\\.') + '"')
# All operators lumped together; tokenize() reports every one as OP.
# The escaped alternatives are raw strings now: '\+' etc. in an ordinary
# string are invalid escape sequences (a warning, and eventually an error,
# in modern Python).  The resulting pattern text is unchanged.
Operator = group(r'\+', r'\-', r'\*\*', r'\*', r'\^', '~', '/', '%', '&', r'\|',
                 '<<', '>>', '==', '<=', '<>', '!=', '>=', '=', '<', '>')
# Line terminators and miscellaneous punctuation.
Special = group(r'\r?\n', r'[:;.,`]')
# NOTE(review): Bracket is referenced here but its definition is on a line
# not visible in this excerpt — confirm it exists at module level
# (presumably a '[](){}'-style character class).
Funny = group(Operator, Bracket, Special)

# A complete token with no leading whitespace.
PlainToken = group(Number, Funny, String, Name)
# A full token: ignorable prefix plus the token proper.
Token = Ignore + PlainToken

# First (or only) line of a single-quoted string: either closes the string
# on this line or ends with a backslash-newline continuation.
ContStr = group("[rR]?'" + any(r'\\.', r"[^\n'\\]") + group("'", r'\\\r?\n'),
                '[rR]?"' + any(r'\\.', r'[^\n"\\]') + group('"', r'\\\r?\n'))
# Things the line scanner must also recognize: explicit line joins,
# comments, and triple-quote openers.
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
# Pattern used by tokenize()'s inner loop; group 1 is the matched token.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
# Compile the master patterns once, at import time.
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map a string opener to the pattern that finds its closing quote.  The
# bare 'r'/'R' entries are None on purpose: tokenize() falls back with
# `endprogs[initial] or endprogs[token[1]]` for raw strings.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog, 'r': None, 'R': None}
class TokenError(Exception):
    """Raised by tokenize() on EOF inside a multi-line string or statement.

    Originally the string exception `TokenError = 'TokenError'`; string
    exceptions were removed from the language.  The Python 2 raise form
    `raise TokenError, (msg, (row, col))` works unchanged with this class.
    """
def printtoken(type, token, start, end, line): # for testing
    """Default token-eater: print one token per line for debugging.

    type  -- numeric token type (index into tok_name)
    token -- the token text
    start -- (row, col) where the token begins
    end   -- (row, col) where the token ends
    line  -- the full source line (unused; part of the tokeneater interface)

    `type` intentionally keeps its builtin-shadowing historical name so the
    tokeneater call signature is unchanged.  The original spelled the
    coordinate pairs as tuple parameters (`(srow, scol)`), a form removed
    from the language (PEP 3113); they are unpacked in the body instead.
    """
    (srow, scol), (erow, ecol) = start, end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))
84 def tokenize(readline
, tokeneater
=printtoken
):
85 lnum
= parenlev
= continued
= 0
86 namechars
, numchars
= string
.letters
+ '_', string
.digits
87 contstr
, needcont
= '', 0
91 while 1: # loop over lines in stream
94 pos
, max = 0, len(line
)
96 if contstr
: # continued string
98 raise TokenError
, ("EOF in multi-line string", strstart
)
99 endmatch
= endprog
.match(line
)
101 pos
= end
= endmatch
.end(0)
102 tokeneater(STRING
, contstr
+ line
[:end
],
103 strstart
, (lnum
, end
), contline
+ line
)
104 contstr
, needcont
= '', 0
106 elif needcont
and line
[-2:] != '\\\n' and line
[-3:] != '\\\r\n':
107 tokeneater(ERRORTOKEN
, contstr
+ line
,
108 strstart
, (lnum
, len(line
)), contline
)
113 contstr
= contstr
+ line
114 contline
= contline
+ line
117 elif parenlev
== 0 and not continued
: # new statement
120 while pos
< max: # measure leading whitespace
121 if line
[pos
] == ' ': column
= column
+ 1
122 elif line
[pos
] == '\t': column
= (column
/tabsize
+ 1)*tabsize
123 elif line
[pos
] == '\f': column
= 0
128 if line
[pos
] in '#\r\n': # skip comments or blank lines
129 tokeneater((NL
, COMMENT
)[line
[pos
] == '#'], line
[pos
:],
130 (lnum
, pos
), (lnum
, len(line
)), line
)
133 if column
> indents
[-1]: # count indents or dedents
134 indents
.append(column
)
135 tokeneater(INDENT
, line
[:pos
], (lnum
, 0), (lnum
, pos
), line
)
136 while column
< indents
[-1]:
137 indents
= indents
[:-1]
138 tokeneater(DEDENT
, '', (lnum
, pos
), (lnum
, pos
), line
)
140 else: # continued statement
142 raise TokenError
, ("EOF in multi-line statement", (lnum
, 0))
146 pseudomatch
= pseudoprog
.match(line
, pos
)
147 if pseudomatch
: # scan for tokens
148 start
, end
= pseudomatch
.span(1)
149 spos
, epos
, pos
= (lnum
, start
), (lnum
, end
), end
150 token
, initial
= line
[start
:end
], line
[start
]
152 if initial
in numchars \
153 or (initial
== '.' and token
!= '.'): # ordinary number
154 tokeneater(NUMBER
, token
, spos
, epos
, line
)
155 elif initial
in '\r\n':
156 tokeneater(parenlev
> 0 and NL
or NEWLINE
,
157 token
, spos
, epos
, line
)
159 tokeneater(COMMENT
, token
, spos
, epos
, line
)
160 elif token
in ("'''", '"""', # triple-quoted
161 "r'''", 'r"""', "R'''", 'R"""'):
162 endprog
= endprogs
[token
]
163 endmatch
= endprog
.match(line
, pos
)
164 if endmatch
: # all on one line
165 pos
= endmatch
.end(0)
166 token
= line
[start
:pos
]
167 tokeneater(STRING
, token
, spos
, (lnum
, pos
), line
)
169 strstart
= (lnum
, start
) # multiple lines
170 contstr
= line
[start
:]
173 elif initial
in ("'", '"') or \
174 token
[:2] in ("r'", 'r"', "R'", 'R"'):
175 if token
[-1] == '\n': # continued string
176 strstart
= (lnum
, start
)
177 endprog
= endprogs
[initial
] or endprogs
[token
[1]]
178 contstr
, needcont
= line
[start
:], 1
181 else: # ordinary string
182 tokeneater(STRING
, token
, spos
, epos
, line
)
183 elif initial
in namechars
: # ordinary name
184 tokeneater(NAME
, token
, spos
, epos
, line
)
185 elif initial
== '\\': # continued stmt
188 if initial
in '([{': parenlev
= parenlev
+ 1
189 elif initial
in ')]}': parenlev
= parenlev
- 1
190 tokeneater(OP
, token
, spos
, epos
, line
)
192 tokeneater(ERRORTOKEN
, line
[pos
],
193 (lnum
, pos
), (lnum
, pos
+1), line
)
196 for indent
in indents
[1:]: # pop remaining indent levels
197 tokeneater(DEDENT
, '', (lnum
, 0), (lnum
, 0), '')
198 tokeneater(ENDMARKER
, '', (lnum
, 0), (lnum
, 0), '')
if __name__ == '__main__':                     # testing
    # Tokenize the named file, or stdin if no argument is given, printing
    # each token via the default printtoken eater.  `import sys` restored:
    # sys is not imported at module level (its use is confined to here).
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)