Lib/tabnanny.py

   1 #! /usr/bin/env python
   2
   3 """The Tab Nanny despises ambiguous indentation.  She knows no mercy.
   4
   5 tabnanny -- Detection of ambiguous indentation
   6
   7 For the time being this module is intended to be called as a script.
   8 However it is possible to import it into an IDE and use the function
   9 check() described below.
  10
  11 Warning: The API provided by this module is likely to change in future
  12 releases; such changes may not be backward compatible.
  13 """
  14
  15 # Released to the public domain, by Tim Peters, 15 April 1998.
  16
  17 # XXX Note: this is now a standard library module.
  18 # XXX The API needs to undergo changes however; the current code is too
  19 # XXX script-like.  This will be addressed later.
  20
  21 __version__ = "6"
  22
  23 import os
  24 import sys
  25 import getopt
  26 import tokenize
  27 if not hasattr(tokenize, 'NL'):
  28     raise ValueError("tokenize.NL doesn't exist -- tokenize module too old")
  29
  30 __all__ = ["check", "NannyNag", "process_tokens"]
  31
  32 verbose = 0
  33 filename_only = 0
  34
  35 def errprint(*args):
  36     sep = ""
  37     for arg in args:
  38         sys.stderr.write(sep + str(arg))
  39         sep = " "
  40     sys.stderr.write("\n")
  41
  42 def main():
  43     global verbose, filename_only
  44     try:
  45         opts, args = getopt.getopt(sys.argv[1:], "qv")
  46     except getopt.error, msg:
  47         errprint(msg)
  48         return
  49     for o, a in opts:
  50         if o == '-q':
  51             filename_only = filename_only + 1
  52         if o == '-v':
  53             verbose = verbose + 1
  54     if not args:
  55         errprint("Usage:", sys.argv[0], "[-v] file_or_directory ...")
  56         return
  57     for arg in args:
  58         check(arg)
  59
  60 class NannyNag(Exception):
  61     """
  62     Raised by tokeneater() if detecting an ambiguous indent.
  63     Captured and handled in check().
  64     """
  65     def __init__(self, lineno, msg, line):
  66         self.lineno, self.msg, self.line = lineno, msg, line
  67     def get_lineno(self):
  68         return self.lineno
  69     def get_msg(self):
  70         return self.msg
  71     def get_line(self):
  72         return self.line
  73
  74 def check(file):
  75     """check(file_or_dir)
  76
  77     If file_or_dir is a directory and not a symbolic link, then recursively
  78     descend the directory tree named by file_or_dir, checking all .py files
  79     along the way. If file_or_dir is an ordinary Python source file, it is
  80     checked for whitespace related problems. The diagnostic messages are
  81     written to standard output using the print statement.
  82     """
  83
  84     if os.path.isdir(file) and not os.path.islink(file):
  85         if verbose:
  86             print "%s: listing directory" % `file`
  87         names = os.listdir(file)
  88         for name in names:
  89             fullname = os.path.join(file, name)
  90             if (os.path.isdir(fullname) and
  91                 not os.path.islink(fullname) or
  92                 os.path.normcase(name[-3:]) == ".py"):
  93                 check(fullname)
  94         return
  95
  96     try:
  97         f = open(file)
  98     except IOError, msg:
  99         errprint("%s: I/O Error: %s" % (`file`, str(msg)))
 100         return
 101
 102     if verbose > 1:
 103         print "checking", `file`, "..."
 104
 105     try:
 106         process_tokens(tokenize.generate_tokens(f.readline))
 107
 108     except tokenize.TokenError, msg:
 109         errprint("%s: Token Error: %s" % (`file`, str(msg)))
 110         return
 111
 112     except NannyNag, nag:
 113         badline = nag.get_lineno()
 114         line = nag.get_line()
 115         if verbose:
 116             print "%s: *** Line %d: trouble in tab city! ***" % (
 117                 `file`, badline)
 118             print "offending line:", `line`
 119             print nag.get_msg()
 120         else:
 121             if ' ' in file: file = '"' + file + '"'
 122             if filename_only: print file
 123             else: print file, badline, `line`
 124         return
 125
 126     if verbose:
 127         print "%s: Clean bill of health." % `file`
 128
 129 class Whitespace:
 130     # the characters used for space and tab
 131     S, T = ' \t'
 132
 133     # members:
 134     #   raw
 135     #       the original string
 136     #   n
 137     #       the number of leading whitespace characters in raw
 138     #   nt
 139     #       the number of tabs in raw[:n]
 140     #   norm
 141     #       the normal form as a pair (count, trailing), where:
 142     #       count
 143     #           a tuple such that raw[:n] contains count[i]
 144     #           instances of S * i + T
 145     #       trailing
 146     #           the number of trailing spaces in raw[:n]
 147     #       It's A Theorem that m.indent_level(t) ==
 148     #       n.indent_level(t) for all t >= 1 iff m.norm == n.norm.
 149     #   is_simple
 150     #       true iff raw[:n] is of the form (T*)(S*)
 151
 152     def __init__(self, ws):
 153         self.raw  = ws
 154         S, T = Whitespace.S, Whitespace.T
 155         count = []
 156         b = n = nt = 0
 157         for ch in self.raw:
 158             if ch == S:
 159                 n = n + 1
 160                 b = b + 1
 161             elif ch == T:
 162                 n = n + 1
 163                 nt = nt + 1
 164                 if b >= len(count):
 165                     count = count + [0] * (b - len(count) + 1)
 166                 count[b] = count[b] + 1
 167                 b = 0
 168             else:
 169                 break
 170         self.n    = n
 171         self.nt   = nt
 172         self.norm = tuple(count), b
 173         self.is_simple = len(count) <= 1
 174
 175     # return length of longest contiguous run of spaces (whether or not
 176     # preceding a tab)
 177     def longest_run_of_spaces(self):
 178         count, trailing = self.norm
 179         return max(len(count)-1, trailing)
 180
 181     def indent_level(self, tabsize):
 182         # count, il = self.norm
 183         # for i in range(len(count)):
 184         #    if count[i]:
 185         #        il = il + (i/tabsize + 1)*tabsize * count[i]
 186         # return il
 187
 188         # quicker:
 189         # il = trailing + sum (i/ts + 1)*ts*count[i] =
 190         # trailing + ts * sum (i/ts + 1)*count[i] =
 191         # trailing + ts * sum i/ts*count[i] + count[i] =
 192         # trailing + ts * [(sum i/ts*count[i]) + (sum count[i])] =
 193         # trailing + ts * [(sum i/ts*count[i]) + num_tabs]
 194         # and note that i/ts*count[i] is 0 when i < ts
 195
 196         count, trailing = self.norm
 197         il = 0
 198         for i in range(tabsize, len(count)):
 199             il = il + i/tabsize * count[i]
 200         return trailing + tabsize * (il + self.nt)
 201
 202     # return true iff self.indent_level(t) == other.indent_level(t)
 203     # for all t >= 1
 204     def equal(self, other):
 205         return self.norm == other.norm
 206
 207     # return a list of tuples (ts, i1, i2) such that
 208     # i1 == self.indent_level(ts) != other.indent_level(ts) == i2.
 209     # Intended to be used after not self.equal(other) is known, in which
 210     # case it will return at least one witnessing tab size.
 211     def not_equal_witness(self, other):
 212         n = max(self.longest_run_of_spaces(),
 213                 other.longest_run_of_spaces()) + 1
 214         a = []
 215         for ts in range(1, n+1):
 216             if self.indent_level(ts) != other.indent_level(ts):
 217                 a.append( (ts,
 218                            self.indent_level(ts),
 219                            other.indent_level(ts)) )
 220         return a
 221
 222     # Return True iff self.indent_level(t) < other.indent_level(t)
 223     # for all t >= 1.
 224     # The algorithm is due to Vincent Broman.
 225     # Easy to prove it's correct.
 226     # XXXpost that.
 227     # Trivial to prove n is sharp (consider T vs ST).
 228     # Unknown whether there's a faster general way.  I suspected so at
 229     # first, but no longer.
 230     # For the special (but common!) case where M and N are both of the
 231     # form (T*)(S*), M.less(N) iff M.len() < N.len() and
 232     # M.num_tabs() <= N.num_tabs(). Proof is easy but kinda long-winded.
 233     # XXXwrite that up.
 234     # Note that M is of the form (T*)(S*) iff len(M.norm[0]) <= 1.
 235     def less(self, other):
 236         if self.n >= other.n:
 237             return False
 238         if self.is_simple and other.is_simple:
 239             return self.nt <= other.nt
 240         n = max(self.longest_run_of_spaces(),
 241                 other.longest_run_of_spaces()) + 1
 242         # the self.n >= other.n test already did it for ts=1
 243         for ts in range(2, n+1):
 244             if self.indent_level(ts) >= other.indent_level(ts):
 245                 return False
 246         return True
 247
 248     # return a list of tuples (ts, i1, i2) such that
 249     # i1 == self.indent_level(ts) >= other.indent_level(ts) == i2.
 250     # Intended to be used after not self.less(other) is known, in which
 251     # case it will return at least one witnessing tab size.
 252     def not_less_witness(self, other):
 253         n = max(self.longest_run_of_spaces(),
 254                 other.longest_run_of_spaces()) + 1
 255         a = []
 256         for ts in range(1, n+1):
 257             if self.indent_level(ts) >= other.indent_level(ts):
 258                 a.append( (ts,
 259                            self.indent_level(ts),
 260                            other.indent_level(ts)) )
 261         return a
 262
 263 def format_witnesses(w):
 264     firsts = map(lambda tup: str(tup[0]), w)
 265     prefix = "at tab size"
 266     if len(w) > 1:
 267         prefix = prefix + "s"
 268     return prefix + " " + ', '.join(firsts)
 269
 270 def process_tokens(tokens):
 271     INDENT = tokenize.INDENT
 272     DEDENT = tokenize.DEDENT
 273     NEWLINE = tokenize.NEWLINE
 274     JUNK = tokenize.COMMENT, tokenize.NL
 275     indents = [Whitespace("")]
 276     check_equal = 0
 277
 278     for (type, token, start, end, line) in tokens:
 279         if type == NEWLINE:
 280             # a program statement, or ENDMARKER, will eventually follow,
 281             # after some (possibly empty) run of tokens of the form
 282             #     (NL | COMMENT)* (INDENT | DEDENT+)?
 283             # If an INDENT appears, setting check_equal is wrong, and will
 284             # be undone when we see the INDENT.
 285             check_equal = 1
 286
 287         elif type == INDENT:
 288             check_equal = 0
 289             thisguy = Whitespace(token)
 290             if not indents[-1].less(thisguy):
 291                 witness = indents[-1].not_less_witness(thisguy)
 292                 msg = "indent not greater e.g. " + format_witnesses(witness)
 293                 raise NannyNag(start[0], msg, line)
 294             indents.append(thisguy)
 295
 296         elif type == DEDENT:
 297             # there's nothing we need to check here!  what's important is
 298             # that when the run of DEDENTs ends, the indentation of the
 299             # program statement (or ENDMARKER) that triggered the run is
 300             # equal to what's left at the top of the indents stack
 301
 302             # Ouch!  This assert triggers if the last line of the source
 303             # is indented *and* lacks a newline -- then DEDENTs pop out
 304             # of thin air.
 305             # assert check_equal  # else no earlier NEWLINE, or an earlier INDENT
 306             check_equal = 1
 307
 308             del indents[-1]
 309
 310         elif check_equal and type not in JUNK:
 311             # this is the first "real token" following a NEWLINE, so it
 312             # must be the first token of the next program statement, or an
 313             # ENDMARKER; the "line" argument exposes the leading whitespace
 314             # for this statement; in the case of ENDMARKER, line is an empty
 315             # string, so will properly match the empty string with which the
 316             # "indents" stack was seeded
 317             check_equal = 0
 318             thisguy = Whitespace(line)
 319             if not indents[-1].equal(thisguy):
 320                 witness = indents[-1].not_equal_witness(thisguy)
 321                 msg = "indent not equal e.g. " + format_witnesses(witness)
 322                 raise NannyNag(start[0], msg, line)
 323
 324
 325 if __name__ == '__main__':
 326     main()