1 """Better tokenizing for coverage.py."""
3 import codecs
, keyword
, re
, sys
, token
, tokenize
4 from coverage
.backward
import set # pylint: disable=W0622
5 from coverage
.parser
import generate_tokens
9 """Return all physical tokens, even line continuations.
11 tokenize.generate_tokens() doesn't return a token for the backslash that
12 continues lines. This wrapper provides those tokens so that we can
13 re-create a faithful representation of the original source.
15 Returns the same values as generate_tokens()
21 for ttype
, ttext
, (slineno
, scol
), (elineno
, ecol
), ltext
in toks
:
22 if last_lineno
!= elineno
:
23 if last_line
and last_line
.endswith("\\\n"):
24 # We are at the beginning of a new line, and the last line
25 # ended with a backslash. We probably have to inject a
26 # backslash token into the stream. Unfortunately, there's more
27 # to figure out. This code::
33 # triggers this condition, but the token text is::
35 # '"""\\\nHEY THERE\n"""'
37 # so we need to figure out if the backslash is already in the
38 # string token or not.
39 inject_backslash
= True
40 if last_ttype
== tokenize
.COMMENT
:
41 # Comments like this \
42 # should never result in a new token.
43 inject_backslash
= False
44 elif ttype
== token
.STRING
:
45 if "\n" in ttext
and ttext
.split('\n', 1)[0][-1] == '\\':
46 # It's a multiline string and the first line ends with
47 # a backslash, so we don't need to inject another.
48 inject_backslash
= False
50 # Figure out what column the backslash is in.
51 ccol
= len(last_line
.split("\n")[-2]) - 1
52 # Yield the token, with a fake token type.
55 (slineno
, ccol
), (slineno
, ccol
+2),
60 yield ttype
, ttext
, (slineno
, scol
), (elineno
, ecol
), ltext
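
# A minimal usage sketch (illustrative only, not part of the module's API):
# wrap the stream from generate_tokens() to see the injected continuation
# token.  The sample source string is an assumption chosen to trigger the
# backslash case; the continuation comes back as an extra tuple carrying the
# fake token type mentioned above.
#
#     toks = generate_tokens("a = 1 + \\\n    2\n")
#     for tok in phys_tokens(toks):
#         print(tok)
#
# Every physical line, including the trailing backslash, can be rebuilt from
# the token texts this yields.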

def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
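
# An illustrative sketch of the output (the exact whitespace pairs depend on
# the tokenizer, so treat this as approximate rather than definitive):
#
#     for line in source_token_lines("def hello():\n    return 1\n"):
#         print(line)
#
# would print something like:
#
#     [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ('op', ')'), ('op', ':')]
#     [('ws', '    '), ('key', 'return'), ('ws', ' '), ('num', '1')]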

def source_encoding(source):
    """Determine the encoding for `source` (a string), according to PEP 263.

    Returns a string, the name of the encoding.

    """
    # Note: this function should never be called on Python 3, since py3 has
    # built-in tools to do this.
    assert sys.version_info < (3, 0)

    # This is mostly code adapted from Py3.2's tokenize module.

    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

    # Do this so the detect_encoding code we copied will work.
    readline = iter(source.splitlines(True)).next

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a utf-8 bom or an encoding
    # cookie as specified in pep-0263.  If both a bom and a cookie are present,
    # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    # invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.  The
    # default varied with version.

    if sys.version_info <= (2, 4):
        default = 'iso-8859-1'
    else:
        default = 'ascii'

    bom_found = False

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
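
# A small sketch of the intended behavior (Python 2 only, per the assert
# above): a PEP 263 cookie on the first or second line wins, otherwise the
# version-dependent default comes back.  The inputs are made-up examples.
#
#     source_encoding("# -*- coding: utf-8 -*-\nx = 1\n")   # -> 'utf-8'
#     source_encoding("x = 1\n")                            # -> 'ascii' on 2.5+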