1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
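
# A minimal usage sketch (an illustration, not part of the module itself):
# wrap source text in a readline-style callable and drive the generator.
#
#     from StringIO import StringIO
#     for tok in generate_tokens(StringIO("x = 1\n").readline):
#         printtoken(*tok)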

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
           "generate_tokens", "NL"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
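
# For example, group('a', 'b') yields the pattern '(a|b)', any('a', 'b')
# yields '(a|b)*', and maybe('a', 'b') yields '(a|b)?'.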

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}
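
# endprogs maps a string opener (an optional u/r prefix plus the opening
# quote) to the compiled pattern for the tail of such a string; the bare
# prefix letters map to None so that lookups on them fall through to the
# quote character itself (see generate_tokens below).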

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"'):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
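
# For the source line "x = 1\n", for example, the first token prints as
# "1,0-1,1:\tNAME\t'x'".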

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
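
# A tokeneater callback may raise StopTokenizing to stop tokenization early;
# tokenize() above catches that exception and returns normally.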

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                break
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0
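
        # Scan the rest of the line for tokens: group 1 of each pseudoprog
        # match is one token, with any leading whitespace already skipped.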
        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                     token[:2] in single_quoted or \
                     token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)