1 """Tokenization help for Python programs.
3 generate_tokens(readline) is a generator that breaks a stream of
4 text into Python tokens. It accepts a readline-like method which is called
5 repeatedly to get the next line of input (or "" for EOF). It generates
6 5-tuples with these members:
8 the token type (see token.py)
10 the starting (row, column) indices of the token (a 2-tuple of ints)
11 the ending (row, column) indices of the token (a 2-tuple of ints)
12 the original line (string)
14 It is designed to match the working of the Python tokenizer exactly, except
15 that it produces COMMENT tokens for comments and gives type OP for all
19 tokenize_loop(readline, tokeneater)
20 tokenize(readline, tokeneater=printtoken)
21 are the same, except instead of generating tokens, tokeneater is a callback
22 function to which the 5 fields described above are passed as 5 arguments,
23 each time a new token is found."""
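
# A minimal usage sketch of the generator interface described above: any
# object with a readline() method works, e.g. a StringIO wrapping an
# in-memory source string (the sample statement here is arbitrary).
#
#   from StringIO import StringIO
#   for toktype, tokstring, start, end, srcline in \
#           generate_tokens(StringIO("x = 1 + 2\n").readline):
#       print tok_name[toktype], repr(tokstring), start, end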

from __future__ import generators

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from token import *

import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize", "NL"]

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return apply(group, choices) + '*'
def maybe(*choices): return apply(group, choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uU]?[rR]?'''", '[uU]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None}

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, (srow, scol), (erow, ecol), line): # for testing
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        apply(tokeneater, token_info)

def generate_tokens(readline):
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        line = readline()
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                       (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (parenlev > 0 and NL or NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    yield (COMMENT, token, spos, epos, line)
                elif token in ("'''", '"""',               # triple-quoted
                               "r'''", 'r"""', "R'''", 'R"""',
                               "u'''", 'u"""', "U'''", 'U"""',
                               "ur'''", 'ur"""', "Ur'''", 'Ur"""',
                               "uR'''", 'uR"""', "UR'''", 'UR"""'):
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in ("'", '"') or \
                    token[:2] in ("r'", 'r"', "R'", 'R"',
                                  "u'", 'u"', "U'", 'U"') or \
                    token[:3] in ("ur'", 'ur"', "Ur'", 'Ur"',
                                  "uR'", 'uR"', "UR'", 'UR"' ):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)