1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
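
# A minimal usage sketch (illustrative, not part of the module itself):
#
#     >>> from StringIO import StringIO
#     >>> for tok in generate_tokens(StringIO('1 + 2\n').readline):
#     ...     print tok
#
# The first tuple produced is (NUMBER, '1', (1, 0), (1, 1), '1 + 2\n'),
# with the token type printed as its integer code.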

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
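
# For illustration: group('a', 'b') returns '(a|b)', any('a') returns
# '(a)*', and maybe('a') returns '(a)?'; the token patterns below are built
# by concatenating such pieces.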

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
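
# Illustrative matches: Intnumber matches '0x1fL' and '077'; Floatnumber
# matches '3.14e-2' and '1e10'; Imagnumber matches '1j' and '3.5J'.
# Imagnumber comes first in Number so that a trailing 'j' or 'J' is
# consumed rather than left behind by the Floatnumber alternative.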

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
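
# A sketch of how pseudoprog is used below: matching at a given position
# skips leading whitespace and captures the next token as group 1, e.g.
# pseudoprog.match('  x = 1\n', 0).span(1) == (2, 3), the extent of the
# name 'x'.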

endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
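
# For illustration: endprogs["ur'''"] is single3prog, while the bare
# prefixes 'r', 'R', 'u', and 'U' map to None, signalling that the right
# pattern must be chosen from a later character of the token (see the
# single-quoted branch in generate_tokens below).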

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
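
# Sample printtoken output (illustrative):
#     1,0-1,5:    NAME    'print'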

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
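
# A sketch of a custom tokeneater; `report_names` is illustrative and not
# part of this module:
#
#     def report_names(type, token, start, end, line):
#         if type == NAME:
#             print "%d: %s" % (start[0], token)
#
#     tokenize(open('example.py').readline, report_names)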

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)