Adding Peter Thatcher to the owners file.
[chromium-blink-merge.git] / third_party / pycoverage / coverage / parser.py
blob7a145a2a5346cc806848813566998bdee854d370
1 """Code parsing for Coverage."""
3 import dis, re, sys, token, tokenize
5 from coverage.backward import set, sorted, StringIO # pylint: disable=W0622
6 from coverage.backward import open_source, range # pylint: disable=W0622
7 from coverage.backward import reversed # pylint: disable=W0622
8 from coverage.backward import bytes_to_ints
9 from coverage.bytecode import ByteCodes, CodeObjects
10 from coverage.misc import nice_pair, expensive, join_regex
11 from coverage.misc import CoverageException, NoSource, NotPython
14 class CodeParser(object):
15 """Parse code to find executable lines, excluded lines, etc."""
17 def __init__(self, text=None, filename=None, exclude=None):
18 """
19 Source can be provided as `text`, the text itself, or `filename`, from
20 which the text will be read. Excluded lines are those that match
21 `exclude`, a regex.
23 """
24 assert text or filename, "CodeParser needs either text or filename"
25 self.filename = filename or "<code>"
26 self.text = text
27 if not self.text:
28 try:
29 sourcef = open_source(self.filename)
30 try:
31 self.text = sourcef.read()
32 finally:
33 sourcef.close()
34 except IOError:
35 _, err, _ = sys.exc_info()
36 raise NoSource(
37 "No source for code: '%s': %s" % (self.filename, err)
40 # Scrap the BOM if it exists.
41 if self.text and ord(self.text[0]) == 0xfeff:
42 self.text = self.text[1:]
44 self.exclude = exclude
46 self.show_tokens = False
48 # The text lines of the parsed code.
49 self.lines = self.text.split('\n')
51 # The line numbers of excluded lines of code.
52 self.excluded = set()
54 # The line numbers of docstring lines.
55 self.docstrings = set()
57 # The line numbers of class definitions.
58 self.classdefs = set()
60 # A dict mapping line numbers to (lo,hi) for multi-line statements.
61 self.multiline = {}
63 # The line numbers that start statements.
64 self.statement_starts = set()
66 # Lazily-created ByteParser
67 self._byte_parser = None
69 def _get_byte_parser(self):
70 """Create a ByteParser on demand."""
71 if not self._byte_parser:
72 self._byte_parser = \
73 ByteParser(text=self.text, filename=self.filename)
74 return self._byte_parser
75 byte_parser = property(_get_byte_parser)
77 def lines_matching(self, *regexes):
78 """Find the lines matching one of a list of regexes.
80 Returns a set of line numbers, the lines that contain a match for one
81 of the regexes in `regexes`. The entire line needn't match, just a
82 part of it.
84 """
85 regex_c = re.compile(join_regex(regexes))
86 matches = set()
87 for i, ltext in enumerate(self.lines):
88 if regex_c.search(ltext):
89 matches.add(i+1)
90 return matches
92 def _raw_parse(self):
93 """Parse the source to find the interesting facts about its lines.
95 A handful of member fields are updated.
97 """
98 # Find lines which match an exclusion pattern.
99 if self.exclude:
100 self.excluded = self.lines_matching(self.exclude)
102 # Tokenize, to find excluded suites, to find docstrings, and to find
103 # multi-line statements.
104 indent = 0
105 exclude_indent = 0
106 excluding = False
107 prev_toktype = token.INDENT
108 first_line = None
109 empty = True
111 tokgen = generate_tokens(self.text)
112 for toktype, ttext, (slineno, _), (elineno, _), ltext in tokgen:
113 if self.show_tokens: # pragma: not covered
114 print("%10s %5s %-20r %r" % (
115 tokenize.tok_name.get(toktype, toktype),
116 nice_pair((slineno, elineno)), ttext, ltext
118 if toktype == token.INDENT:
119 indent += 1
120 elif toktype == token.DEDENT:
121 indent -= 1
122 elif toktype == token.NAME and ttext == 'class':
123 # Class definitions look like branches in the byte code, so
124 # we need to exclude them. The simplest way is to note the
125 # lines with the 'class' keyword.
126 self.classdefs.add(slineno)
127 elif toktype == token.OP and ttext == ':':
128 if not excluding and elineno in self.excluded:
129 # Start excluding a suite. We trigger off of the colon
130 # token so that the #pragma comment will be recognized on
131 # the same line as the colon.
132 exclude_indent = indent
133 excluding = True
134 elif toktype == token.STRING and prev_toktype == token.INDENT:
135 # Strings that are first on an indented line are docstrings.
136 # (a trick from trace.py in the stdlib.) This works for
137 # 99.9999% of cases. For the rest (!) see:
138 # http://stackoverflow.com/questions/1769332/x/1769794#1769794
139 self.docstrings.update(range(slineno, elineno+1))
140 elif toktype == token.NEWLINE:
141 if first_line is not None and elineno != first_line:
142 # We're at the end of a line, and we've ended on a
143 # different line than the first line of the statement,
144 # so record a multi-line range.
145 rng = (first_line, elineno)
146 for l in range(first_line, elineno+1):
147 self.multiline[l] = rng
148 first_line = None
150 if ttext.strip() and toktype != tokenize.COMMENT:
151 # A non-whitespace token.
152 empty = False
153 if first_line is None:
154 # The token is not whitespace, and is the first in a
155 # statement.
156 first_line = slineno
157 # Check whether to end an excluded suite.
158 if excluding and indent <= exclude_indent:
159 excluding = False
160 if excluding:
161 self.excluded.add(elineno)
163 prev_toktype = toktype
165 # Find the starts of the executable statements.
166 if not empty:
167 self.statement_starts.update(self.byte_parser._find_statements())
169 def first_line(self, line):
170 """Return the first line number of the statement including `line`."""
171 rng = self.multiline.get(line)
172 if rng:
173 first_line = rng[0]
174 else:
175 first_line = line
176 return first_line
178 def first_lines(self, lines, *ignores):
179 """Map the line numbers in `lines` to the correct first line of the
180 statement.
182 Skip any line mentioned in any of the sequences in `ignores`.
184 Returns a set of the first lines.
187 ignore = set()
188 for ign in ignores:
189 ignore.update(ign)
190 lset = set()
191 for l in lines:
192 if l in ignore:
193 continue
194 new_l = self.first_line(l)
195 if new_l not in ignore:
196 lset.add(new_l)
197 return lset
199 def parse_source(self):
200 """Parse source text to find executable lines, excluded lines, etc.
202 Return values are 1) a set of executable line numbers, and 2) a set of
203 excluded line numbers.
205 Reported line numbers are normalized to the first line of multi-line
206 statements.
209 try:
210 self._raw_parse()
211 except (tokenize.TokenError, IndentationError):
212 _, tokerr, _ = sys.exc_info()
213 msg, lineno = tokerr.args
214 raise NotPython(
215 "Couldn't parse '%s' as Python source: '%s' at %s" %
216 (self.filename, msg, lineno)
219 excluded_lines = self.first_lines(self.excluded)
220 lines = self.first_lines(
221 self.statement_starts,
222 excluded_lines,
223 self.docstrings
226 return lines, excluded_lines
228 def arcs(self):
229 """Get information about the arcs available in the code.
231 Returns a sorted list of line number pairs. Line numbers have been
232 normalized to the first line of multiline statements.
235 all_arcs = []
236 for l1, l2 in self.byte_parser._all_arcs():
237 fl1 = self.first_line(l1)
238 fl2 = self.first_line(l2)
239 if fl1 != fl2:
240 all_arcs.append((fl1, fl2))
241 return sorted(all_arcs)
242 arcs = expensive(arcs)
244 def exit_counts(self):
245 """Get a mapping from line numbers to count of exits from that line.
247 Excluded lines are excluded.
250 excluded_lines = self.first_lines(self.excluded)
251 exit_counts = {}
252 for l1, l2 in self.arcs():
253 if l1 < 0:
254 # Don't ever report -1 as a line number
255 continue
256 if l1 in excluded_lines:
257 # Don't report excluded lines as line numbers.
258 continue
259 if l2 in excluded_lines:
260 # Arcs to excluded lines shouldn't count.
261 continue
262 if l1 not in exit_counts:
263 exit_counts[l1] = 0
264 exit_counts[l1] += 1
266 # Class definitions have one extra exit, so remove one for each:
267 for l in self.classdefs:
268 # Ensure key is there: classdefs can include excluded lines.
269 if l in exit_counts:
270 exit_counts[l] -= 1
272 return exit_counts
273 exit_counts = expensive(exit_counts)
276 ## Opcodes that guide the ByteParser.
278 def _opcode(name):
279 """Return the opcode by name from the dis module."""
280 return dis.opmap[name]
282 def _opcode_set(*names):
283 """Return a set of opcodes by the names in `names`."""
284 s = set()
285 for name in names:
286 try:
287 s.add(_opcode(name))
288 except KeyError:
289 pass
290 return s
292 # Opcodes that leave the code object.
293 OPS_CODE_END = _opcode_set('RETURN_VALUE')
295 # Opcodes that unconditionally end the code chunk.
296 OPS_CHUNK_END = _opcode_set(
297 'JUMP_ABSOLUTE', 'JUMP_FORWARD', 'RETURN_VALUE', 'RAISE_VARARGS',
298 'BREAK_LOOP', 'CONTINUE_LOOP',
301 # Opcodes that unconditionally begin a new code chunk. By starting new chunks
302 # with unconditional jump instructions, we neatly deal with jumps to jumps
303 # properly.
304 OPS_CHUNK_BEGIN = _opcode_set('JUMP_ABSOLUTE', 'JUMP_FORWARD')
306 # Opcodes that push a block on the block stack.
307 OPS_PUSH_BLOCK = _opcode_set(
308 'SETUP_LOOP', 'SETUP_EXCEPT', 'SETUP_FINALLY', 'SETUP_WITH'
311 # Block types for exception handling.
312 OPS_EXCEPT_BLOCKS = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY')
314 # Opcodes that pop a block from the block stack.
315 OPS_POP_BLOCK = _opcode_set('POP_BLOCK')
317 # Opcodes that have a jump destination, but aren't really a jump.
318 OPS_NO_JUMP = OPS_PUSH_BLOCK
320 # Individual opcodes we need below.
321 OP_BREAK_LOOP = _opcode('BREAK_LOOP')
322 OP_END_FINALLY = _opcode('END_FINALLY')
323 OP_COMPARE_OP = _opcode('COMPARE_OP')
324 COMPARE_EXCEPTION = 10 # just have to get this const from the code.
325 OP_LOAD_CONST = _opcode('LOAD_CONST')
326 OP_RETURN_VALUE = _opcode('RETURN_VALUE')
329 class ByteParser(object):
330 """Parse byte codes to understand the structure of code."""
332 def __init__(self, code=None, text=None, filename=None):
333 if code:
334 self.code = code
335 self.text = text
336 else:
337 if not text:
338 assert filename, "If no code or text, need a filename"
339 sourcef = open_source(filename)
340 try:
341 text = sourcef.read()
342 finally:
343 sourcef.close()
344 self.text = text
346 try:
347 # Python 2.3 and 2.4 don't like partial last lines, so be sure
348 # the text ends nicely for them.
349 self.code = compile(text + '\n', filename, "exec")
350 except SyntaxError:
351 _, synerr, _ = sys.exc_info()
352 raise NotPython(
353 "Couldn't parse '%s' as Python source: '%s' at line %d" %
354 (filename, synerr.msg, synerr.lineno)
357 # Alternative Python implementations don't always provide all the
358 # attributes on code objects that we need to do the analysis.
359 for attr in ['co_lnotab', 'co_firstlineno', 'co_consts', 'co_code']:
360 if not hasattr(self.code, attr):
361 raise CoverageException(
362 "This implementation of Python doesn't support code "
363 "analysis.\n"
364 "Run coverage.py under CPython for this command."
367 def child_parsers(self):
368 """Iterate over all the code objects nested within this one.
370 The iteration includes `self` as its first value.
373 children = CodeObjects(self.code)
374 return [ByteParser(code=c, text=self.text) for c in children]
376 def _bytes_lines(self):
377 """Map byte offsets to line numbers in `code`.
379 Uses co_lnotab described in Python/compile.c to map byte offsets to
380 line numbers. Produces a sequence: (b0, l0), (b1, l1), ...
382 Only byte offsets that correspond to line numbers are included in the
383 results.
386 # Adapted from dis.py in the standard library.
387 byte_increments = bytes_to_ints(self.code.co_lnotab[0::2])
388 line_increments = bytes_to_ints(self.code.co_lnotab[1::2])
390 last_line_num = None
391 line_num = self.code.co_firstlineno
392 byte_num = 0
393 for byte_incr, line_incr in zip(byte_increments, line_increments):
394 if byte_incr:
395 if line_num != last_line_num:
396 yield (byte_num, line_num)
397 last_line_num = line_num
398 byte_num += byte_incr
399 line_num += line_incr
400 if line_num != last_line_num:
401 yield (byte_num, line_num)
403 def _find_statements(self):
404 """Find the statements in `self.code`.
406 Produce a sequence of line numbers that start statements. Recurses
407 into all code objects reachable from `self.code`.
410 for bp in self.child_parsers():
411 # Get all of the lineno information from this code.
412 for _, l in bp._bytes_lines():
413 yield l
415 def _block_stack_repr(self, block_stack):
416 """Get a string version of `block_stack`, for debugging."""
417 blocks = ", ".join(
418 ["(%s, %r)" % (dis.opname[b[0]], b[1]) for b in block_stack]
420 return "[" + blocks + "]"
422 def _split_into_chunks(self):
423 """Split the code object into a list of `Chunk` objects.
425 Each chunk is only entered at its first instruction, though there can
426 be many exits from a chunk.
428 Returns a list of `Chunk` objects.
431 # The list of chunks so far, and the one we're working on.
432 chunks = []
433 chunk = None
435 # A dict mapping byte offsets of line starts to the line numbers.
436 bytes_lines_map = dict(self._bytes_lines())
438 # The block stack: loops and try blocks get pushed here for the
439 # implicit jumps that can occur.
440 # Each entry is a tuple: (block type, destination)
441 block_stack = []
443 # Some op codes are followed by branches that should be ignored. This
444 # is a count of how many ignores are left.
445 ignore_branch = 0
447 # We have to handle the last two bytecodes specially.
448 ult = penult = None
450 # Get a set of all of the jump-to points.
451 jump_to = set()
452 bytecodes = list(ByteCodes(self.code.co_code))
453 for bc in bytecodes:
454 if bc.jump_to >= 0:
455 jump_to.add(bc.jump_to)
457 chunk_lineno = 0
459 # Walk the byte codes building chunks.
460 for bc in bytecodes:
461 # Maybe have to start a new chunk
462 start_new_chunk = False
463 first_chunk = False
464 if bc.offset in bytes_lines_map:
465 # Start a new chunk for each source line number.
466 start_new_chunk = True
467 chunk_lineno = bytes_lines_map[bc.offset]
468 first_chunk = True
469 elif bc.offset in jump_to:
470 # To make chunks have a single entrance, we have to make a new
471 # chunk when we get to a place some bytecode jumps to.
472 start_new_chunk = True
473 elif bc.op in OPS_CHUNK_BEGIN:
474 # Jumps deserve their own unnumbered chunk. This fixes
475 # problems with jumps to jumps getting confused.
476 start_new_chunk = True
478 if not chunk or start_new_chunk:
479 if chunk:
480 chunk.exits.add(bc.offset)
481 chunk = Chunk(bc.offset, chunk_lineno, first_chunk)
482 chunks.append(chunk)
484 # Look at the opcode
485 if bc.jump_to >= 0 and bc.op not in OPS_NO_JUMP:
486 if ignore_branch:
487 # Someone earlier wanted us to ignore this branch.
488 ignore_branch -= 1
489 else:
490 # The opcode has a jump, it's an exit for this chunk.
491 chunk.exits.add(bc.jump_to)
493 if bc.op in OPS_CODE_END:
494 # The opcode can exit the code object.
495 chunk.exits.add(-self.code.co_firstlineno)
496 if bc.op in OPS_PUSH_BLOCK:
497 # The opcode adds a block to the block_stack.
498 block_stack.append((bc.op, bc.jump_to))
499 if bc.op in OPS_POP_BLOCK:
500 # The opcode pops a block from the block stack.
501 block_stack.pop()
502 if bc.op in OPS_CHUNK_END:
503 # This opcode forces the end of the chunk.
504 if bc.op == OP_BREAK_LOOP:
505 # A break is implicit: jump where the top of the
506 # block_stack points.
507 chunk.exits.add(block_stack[-1][1])
508 chunk = None
509 if bc.op == OP_END_FINALLY:
510 # For the finally clause we need to find the closest exception
511 # block, and use its jump target as an exit.
512 for block in reversed(block_stack):
513 if block[0] in OPS_EXCEPT_BLOCKS:
514 chunk.exits.add(block[1])
515 break
516 if bc.op == OP_COMPARE_OP and bc.arg == COMPARE_EXCEPTION:
517 # This is an except clause. We want to overlook the next
518 # branch, so that except's don't count as branches.
519 ignore_branch += 1
521 penult = ult
522 ult = bc
524 if chunks:
525 # The last two bytecodes could be a dummy "return None" that
526 # shouldn't be counted as real code. Every Python code object seems
527 # to end with a return, and a "return None" is inserted if there
528 # isn't an explicit return in the source.
529 if ult and penult:
530 if penult.op == OP_LOAD_CONST and ult.op == OP_RETURN_VALUE:
531 if self.code.co_consts[penult.arg] is None:
532 # This is "return None", but is it dummy? A real line
533 # would be a last chunk all by itself.
534 if chunks[-1].byte != penult.offset:
535 ex = -self.code.co_firstlineno
536 # Split the last chunk
537 last_chunk = chunks[-1]
538 last_chunk.exits.remove(ex)
539 last_chunk.exits.add(penult.offset)
540 chunk = Chunk(
541 penult.offset, last_chunk.line, False
543 chunk.exits.add(ex)
544 chunks.append(chunk)
546 # Give all the chunks a length.
547 chunks[-1].length = bc.next_offset - chunks[-1].byte # pylint: disable=W0631,C0301
548 for i in range(len(chunks)-1):
549 chunks[i].length = chunks[i+1].byte - chunks[i].byte
551 #self.validate_chunks(chunks)
552 return chunks
554 def validate_chunks(self, chunks):
555 """Validate the rule that chunks have a single entrance."""
556 # starts is the entrances to the chunks
557 starts = set([ch.byte for ch in chunks])
558 for ch in chunks:
559 assert all([(ex in starts or ex < 0) for ex in ch.exits])
561 def _arcs(self):
562 """Find the executable arcs in the code.
564 Yields pairs: (from,to). From and to are integer line numbers. If
565 from is < 0, then the arc is an entrance into the code object. If to
566 is < 0, the arc is an exit from the code object.
569 chunks = self._split_into_chunks()
571 # A map from byte offsets to chunks jumped into.
572 byte_chunks = dict([(c.byte, c) for c in chunks])
574 # There's always an entrance at the first chunk.
575 yield (-1, byte_chunks[0].line)
577 # Traverse from the first chunk in each line, and yield arcs where
578 # the trace function will be invoked.
579 for chunk in chunks:
580 if not chunk.first:
581 continue
583 chunks_considered = set()
584 chunks_to_consider = [chunk]
585 while chunks_to_consider:
586 # Get the chunk we're considering, and make sure we don't
587 # consider it again
588 this_chunk = chunks_to_consider.pop()
589 chunks_considered.add(this_chunk)
591 # For each exit, add the line number if the trace function
592 # would be triggered, or add the chunk to those being
593 # considered if not.
594 for ex in this_chunk.exits:
595 if ex < 0:
596 yield (chunk.line, ex)
597 else:
598 next_chunk = byte_chunks[ex]
599 if next_chunk in chunks_considered:
600 continue
602 # The trace function is invoked if visiting the first
603 # bytecode in a line, or if the transition is a
604 # backward jump.
605 backward_jump = next_chunk.byte < this_chunk.byte
606 if next_chunk.first or backward_jump:
607 if next_chunk.line != chunk.line:
608 yield (chunk.line, next_chunk.line)
609 else:
610 chunks_to_consider.append(next_chunk)
612 def _all_chunks(self):
613 """Returns a list of `Chunk` objects for this code and its children.
615 See `_split_into_chunks` for details.
618 chunks = []
619 for bp in self.child_parsers():
620 chunks.extend(bp._split_into_chunks())
622 return chunks
624 def _all_arcs(self):
625 """Get the set of all arcs in this code object and its children.
627 See `_arcs` for details.
630 arcs = set()
631 for bp in self.child_parsers():
632 arcs.update(bp._arcs())
634 return arcs
637 class Chunk(object):
638 """A sequence of byte codes with a single entrance.
640 To analyze byte code, we have to divide it into chunks, sequences of byte
641 codes such that each chunk has only one entrance, the first instruction in
642 the block.
644 This is almost the CS concept of `basic block`_, except that we're willing
645 to have many exits from a chunk, and "basic block" is a more cumbersome
646 term.
648 .. _basic block: http://en.wikipedia.org/wiki/Basic_block
650 `line` is the source line number containing this chunk.
652 `first` is true if this is the first chunk in the source line.
654 An exit < 0 means the chunk can leave the code (return). The exit is
655 the negative of the starting line number of the code block.
658 def __init__(self, byte, line, first):
659 self.byte = byte
660 self.line = line
661 self.first = first
662 self.length = 0
663 self.exits = set()
665 def __repr__(self):
666 if self.first:
667 bang = "!"
668 else:
669 bang = ""
670 return "<%d+%d @%d%s %r>" % (
671 self.byte, self.length, self.line, bang, list(self.exits)
675 class CachedTokenizer(object):
676 """A one-element cache around tokenize.generate_tokens.
678 When reporting, coverage.py tokenizes files twice, once to find the
679 structure of the file, and once to syntax-color it. Tokenizing is
680 expensive, and easily cached.
682 This is a one-element cache so that our twice-in-a-row tokenizing doesn't
683 actually tokenize twice.
686 def __init__(self):
687 self.last_text = None
688 self.last_tokens = None
690 def generate_tokens(self, text):
691 """A stand-in for `tokenize.generate_tokens`."""
692 if text != self.last_text:
693 self.last_text = text
694 self.last_tokens = list(
695 tokenize.generate_tokens(StringIO(text).readline)
697 return self.last_tokens
699 # Create our generate_tokens cache as a callable replacement function.
700 generate_tokens = CachedTokenizer().generate_tokens