third_party/pycoverage/coverage/phystokens.py

   1 """Better tokenizing for coverage.py."""
   2
   3 import codecs, keyword, re, sys, token, tokenize
   4 from coverage.backward import set                       # pylint: disable=W0622
   5 from coverage.parser import generate_tokens
   6
   7
   8 def phys_tokens(toks):
   9     """Return all physical tokens, even line continuations.
  10
  11     tokenize.generate_tokens() doesn't return a token for the backslash that
  12     continues lines.  This wrapper provides those tokens so that we can
  13     re-create a faithful representation of the original source.
  14
  15     Returns the same values as generate_tokens()
  16
  17     """
  18     last_line = None
  19     last_lineno = -1
  20     last_ttype = None
  21     for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
  22         if last_lineno != elineno:
  23             if last_line and last_line.endswith("\\\n"):
  24                 # We are at the beginning of a new line, and the last line
  25                 # ended with a backslash.  We probably have to inject a
  26                 # backslash token into the stream. Unfortunately, there's more
  27                 # to figure out.  This code::
  28                 #
  29                 #   usage = """\
  30                 #   HEY THERE
  31                 #   """
  32                 #
  33                 # triggers this condition, but the token text is::
  34                 #
  35                 #   '"""\\\nHEY THERE\n"""'
  36                 #
  37                 # so we need to figure out if the backslash is already in the
  38                 # string token or not.
  39                 inject_backslash = True
  40                 if last_ttype == tokenize.COMMENT:
  41                     # Comments like this \
  42                     # should never result in a new token.
  43                     inject_backslash = False
  44                 elif ttype == token.STRING:
  45                     if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
  46                         # It's a multiline string and the first line ends with
  47                         # a backslash, so we don't need to inject another.
  48                         inject_backslash = False
  49                 if inject_backslash:
  50                     # Figure out what column the backslash is in.
  51                     ccol = len(last_line.split("\n")[-2]) - 1
  52                     # Yield the token, with a fake token type.
  53                     yield (
  54                         99999, "\\\n",
  55                         (slineno, ccol), (slineno, ccol+2),
  56                         last_line
  57                         )
  58             last_line = ltext
  59             last_ttype = ttype
  60         yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
  61         last_lineno = elineno
  62
  63
  64 def source_token_lines(source):
  65     """Generate a series of lines, one for each line in `source`.
  66
  67     Each line is a list of pairs, each pair is a token::
  68
  69         [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]
  70
  71     Each pair has a token class, and the token text.
  72
  73     If you concatenate all the token texts, and then join them with newlines,
  74     you should have your original `source` back, with two differences:
  75     trailing whitespace is not preserved, and a final line with no newline
  76     is indistinguishable from a final line with a newline.
  77
  78     """
  79     ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
  80     line = []
  81     col = 0
  82     source = source.expandtabs(8).replace('\r\n', '\n')
  83     tokgen = generate_tokens(source)
  84     for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
  85         mark_start = True
  86         for part in re.split('(\n)', ttext):
  87             if part == '\n':
  88                 yield line
  89                 line = []
  90                 col = 0
  91                 mark_end = False
  92             elif part == '':
  93                 mark_end = False
  94             elif ttype in ws_tokens:
  95                 mark_end = False
  96             else:
  97                 if mark_start and scol > col:
  98                     line.append(("ws", " " * (scol - col)))
  99                     mark_start = False
 100                 tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
 101                 if ttype == token.NAME and keyword.iskeyword(ttext):
 102                     tok_class = "key"
 103                 line.append((tok_class, part))
 104                 mark_end = True
 105             scol = 0
 106         if mark_end:
 107             col = ecol
 108
 109     if line:
 110         yield line
 111
 112 def source_encoding(source):
 113     """Determine the encoding for `source` (a string), according to PEP 263.
 114
 115     Returns a string, the name of the encoding.
 116
 117     """
 118     # Note: this function should never be called on Python 3, since py3 has
 119     # built-in tools to do this.
 120     assert sys.version_info < (3, 0)
 121
 122     # This is mostly code adapted from Py3.2's tokenize module.
 123
 124     cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
 125
 126     # Do this so the detect_encode code we copied will work.
 127     readline = iter(source.splitlines(True)).next
 128
 129     def _get_normal_name(orig_enc):
 130         """Imitates get_normal_name in tokenizer.c."""
 131         # Only care about the first 12 characters.
 132         enc = orig_enc[:12].lower().replace("_", "-")
 133         if re.match(r"^utf-8($|-)", enc):
 134             return "utf-8"
 135         if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
 136             return "iso-8859-1"
 137         return orig_enc
 138
 139     # From detect_encode():
 140     # It detects the encoding from the presence of a utf-8 bom or an encoding
 141     # cookie as specified in pep-0263.  If both a bom and a cookie are present,
 142     # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
 143     # invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
 144     # 'utf-8-sig' is returned.
 145
 146     # If no encoding is specified, then the default will be returned.  The
 147     # default varied with version.
 148
 149     if sys.version_info <= (2, 4):
 150         default = 'iso-8859-1'
 151     else:
 152         default = 'ascii'
 153
 154     bom_found = False
 155     encoding = None
 156
 157     def read_or_stop():
 158         """Get the next source line, or ''."""
 159         try:
 160             return readline()
 161         except StopIteration:
 162             return ''
 163
 164     def find_cookie(line):
 165         """Find an encoding cookie in `line`."""
 166         try:
 167             line_string = line.decode('ascii')
 168         except UnicodeDecodeError:
 169             return None
 170
 171         matches = cookie_re.findall(line_string)
 172         if not matches:
 173             return None
 174         encoding = _get_normal_name(matches[0])
 175         try:
 176             codec = codecs.lookup(encoding)
 177         except LookupError:
 178             # This behaviour mimics the Python interpreter
 179             raise SyntaxError("unknown encoding: " + encoding)
 180
 181         if bom_found:
 182             # codecs in 2.3 were raw tuples of functions, assume the best.
 183             codec_name = getattr(codec, 'name', encoding)
 184             if codec_name != 'utf-8':
 185                 # This behaviour mimics the Python interpreter
 186                 raise SyntaxError('encoding problem: utf-8')
 187             encoding += '-sig'
 188         return encoding
 189
 190     first = read_or_stop()
 191     if first.startswith(codecs.BOM_UTF8):
 192         bom_found = True
 193         first = first[3:]
 194         default = 'utf-8-sig'
 195     if not first:
 196         return default
 197
 198     encoding = find_cookie(first)
 199     if encoding:
 200         return encoding
 201
 202     second = read_or_stop()
 203     if not second:
 204         return default
 205
 206     encoding = find_cookie(second)
 207     if encoding:
 208         return encoding
 209
 210     return default