1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
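# A minimal usage sketch, assuming the source text is wrapped in a StringIO
# (illustrative only; any readline-style callable works):
#
#     from StringIO import StringIO
#     for tok_type, tok_str, start, end, line in \
#             generate_tokens(StringIO("x = 1\n").readline):
#         print tok_name[tok_type], repr(tok_str)
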
from __future__ import generators

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'

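# For example: group('a', 'b') yields '(a|b)', any(r'\d') yields r'(\d)*',
# and maybe(r'\d') yields r'(\d)?'.
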
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

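# Note the ordering: Imagnumber and Floatnumber must precede Intnumber, or
# the leftmost alternative would match only the integer part of e.g. '1.5j'.
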
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

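# Leftmost-alternative semantics in action (an illustrative check):
#
#     >>> re.match(group('=', '=='), '==').group()
#     '='
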
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

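# endprogs maps an opening quote (with any string prefix) to the compiled
# pattern that matches the rest of that string.  The single-letter entries
# are deliberately None so that the endprogs[initial] or endprogs[token[1]]
# or ... lookup in generate_tokens() falls through past a bare prefix letter.
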
tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

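# For instance, printtoken(NAME, 'x', (1, 0), (1, 1), "x = 1\n") prints
# (roughly):  1,0-1,1:    NAME    'x'
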
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        apply(tokeneater, token_info)

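# A tokeneater callback sketch (illustrative; 'example.py' is hypothetical):
#
#     def print_names(type, token, start, end, line):
#         if type == NAME: print token
#
#     tokenize(open('example.py').readline, print_names)
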
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0    # state for an unfinished multi-line string
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"'):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)