1 """Code parsing for Coverage."""
3 import dis
, re
, sys
, token
, tokenize
5 from coverage
.backward
import set, sorted, StringIO
# pylint: disable=W0622
6 from coverage
.backward
import open_source
, range # pylint: disable=W0622
7 from coverage
.backward
import reversed # pylint: disable=W0622
8 from coverage
.backward
import bytes_to_ints
9 from coverage
.bytecode
import ByteCodes
, CodeObjects
10 from coverage
.misc
import nice_pair
, expensive
, join_regex
11 from coverage
.misc
import CoverageException
, NoSource
, NotPython
class CodeParser(object):
    """Parse code to find executable lines, excluded lines, etc."""

    def __init__(self, text=None, filename=None, exclude=None):
        """
        Source can be provided as `text`, the text itself, or `filename`, from
        which the text will be read.  Excluded lines are those that match
        `exclude`, a regex.

        """
        assert text or filename, "CodeParser needs either text or filename"
        self.filename = filename or "<code>"
        self.text = text
        if not self.text:
            # No text given: read it from the file instead.
            try:
                source_file = open_source(self.filename)
                try:
                    self.text = source_file.read()
                finally:
                    source_file.close()
            except IOError:
                _, io_err, _ = sys.exc_info()
                raise NoSource(
                    "No source for code: '%s': %s" % (self.filename, io_err)
                    )

        # Scrap the BOM if it exists.
        if self.text and ord(self.text[0]) == 0xfeff:
            self.text = self.text[1:]

        self.exclude = exclude

        # When True, token-by-token debug output is printed during parsing.
        self.show_tokens = False

        # The text lines of the parsed code.
        self.lines = self.text.split('\n')

        # The line numbers of excluded lines of code.
        self.excluded = set()

        # The line numbers of docstring lines.
        self.docstrings = set()

        # The line numbers of class definitions.
        self.classdefs = set()

        # A dict mapping line numbers to (lo,hi) for multi-line statements.
        self.multiline = {}

        # The line numbers that start statements.
        self.statement_starts = set()

        # Lazily-created ByteParser
        self._byte_parser = None
69 def _get_byte_parser(self
):
70 """Create a ByteParser on demand."""
71 if not self
._byte
_parser
:
73 ByteParser(text
=self
.text
, filename
=self
.filename
)
74 return self
._byte
_parser
75 byte_parser
= property(_get_byte_parser
)
77 def lines_matching(self
, *regexes
):
78 """Find the lines matching one of a list of regexes.
80 Returns a set of line numbers, the lines that contain a match for one
81 of the regexes in `regexes`. The entire line needn't match, just a
85 regex_c
= re
.compile(join_regex(regexes
))
87 for i
, ltext
in enumerate(self
.lines
):
88 if regex_c
.search(ltext
):
93 """Parse the source to find the interesting facts about its lines.
95 A handful of member fields are updated.
98 # Find lines which match an exclusion pattern.
100 self
.excluded
= self
.lines_matching(self
.exclude
)
102 # Tokenize, to find excluded suites, to find docstrings, and to find
103 # multi-line statements.
107 prev_toktype
= token
.INDENT
111 tokgen
= generate_tokens(self
.text
)
112 for toktype
, ttext
, (slineno
, _
), (elineno
, _
), ltext
in tokgen
:
113 if self
.show_tokens
: # pragma: not covered
114 print("%10s %5s %-20r %r" % (
115 tokenize
.tok_name
.get(toktype
, toktype
),
116 nice_pair((slineno
, elineno
)), ttext
, ltext
118 if toktype
== token
.INDENT
:
120 elif toktype
== token
.DEDENT
:
122 elif toktype
== token
.NAME
and ttext
== 'class':
123 # Class definitions look like branches in the byte code, so
124 # we need to exclude them. The simplest way is to note the
125 # lines with the 'class' keyword.
126 self
.classdefs
.add(slineno
)
127 elif toktype
== token
.OP
and ttext
== ':':
128 if not excluding
and elineno
in self
.excluded
:
129 # Start excluding a suite. We trigger off of the colon
130 # token so that the #pragma comment will be recognized on
131 # the same line as the colon.
132 exclude_indent
= indent
134 elif toktype
== token
.STRING
and prev_toktype
== token
.INDENT
:
135 # Strings that are first on an indented line are docstrings.
136 # (a trick from trace.py in the stdlib.) This works for
137 # 99.9999% of cases. For the rest (!) see:
138 # http://stackoverflow.com/questions/1769332/x/1769794#1769794
139 self
.docstrings
.update(range(slineno
, elineno
+1))
140 elif toktype
== token
.NEWLINE
:
141 if first_line
is not None and elineno
!= first_line
:
142 # We're at the end of a line, and we've ended on a
143 # different line than the first line of the statement,
144 # so record a multi-line range.
145 rng
= (first_line
, elineno
)
146 for l
in range(first_line
, elineno
+1):
147 self
.multiline
[l
] = rng
150 if ttext
.strip() and toktype
!= tokenize
.COMMENT
:
151 # A non-whitespace token.
153 if first_line
is None:
154 # The token is not whitespace, and is the first in a
157 # Check whether to end an excluded suite.
158 if excluding
and indent
<= exclude_indent
:
161 self
.excluded
.add(elineno
)
163 prev_toktype
= toktype
165 # Find the starts of the executable statements.
167 self
.statement_starts
.update(self
.byte_parser
._find
_statements
())
169 def first_line(self
, line
):
170 """Return the first line number of the statement including `line`."""
171 rng
= self
.multiline
.get(line
)
178 def first_lines(self
, lines
, *ignores
):
179 """Map the line numbers in `lines` to the correct first line of the
182 Skip any line mentioned in any of the sequences in `ignores`.
184 Returns a set of the first lines.
194 new_l
= self
.first_line(l
)
195 if new_l
not in ignore
:
199 def parse_source(self
):
200 """Parse source text to find executable lines, excluded lines, etc.
202 Return values are 1) a set of executable line numbers, and 2) a set of
203 excluded line numbers.
205 Reported line numbers are normalized to the first line of multi-line
211 except (tokenize
.TokenError
, IndentationError):
212 _
, tokerr
, _
= sys
.exc_info()
213 msg
, lineno
= tokerr
.args
215 "Couldn't parse '%s' as Python source: '%s' at %s" %
216 (self
.filename
, msg
, lineno
)
219 excluded_lines
= self
.first_lines(self
.excluded
)
220 lines
= self
.first_lines(
221 self
.statement_starts
,
226 return lines
, excluded_lines
229 """Get information about the arcs available in the code.
231 Returns a sorted list of line number pairs. Line numbers have been
232 normalized to the first line of multiline statements.
236 for l1
, l2
in self
.byte_parser
._all
_arcs
():
237 fl1
= self
.first_line(l1
)
238 fl2
= self
.first_line(l2
)
240 all_arcs
.append((fl1
, fl2
))
241 return sorted(all_arcs
)
242 arcs
= expensive(arcs
)
244 def exit_counts(self
):
245 """Get a mapping from line numbers to count of exits from that line.
247 Excluded lines are excluded.
250 excluded_lines
= self
.first_lines(self
.excluded
)
252 for l1
, l2
in self
.arcs():
254 # Don't ever report -1 as a line number
256 if l1
in excluded_lines
:
257 # Don't report excluded lines as line numbers.
259 if l2
in excluded_lines
:
260 # Arcs to excluded lines shouldn't count.
262 if l1
not in exit_counts
:
266 # Class definitions have one extra exit, so remove one for each:
267 for l
in self
.classdefs
:
268 # Ensure key is there: classdefs can include excluded lines.
273 exit_counts
= expensive(exit_counts
)
276 ## Opcodes that guide the ByteParser.
279 """Return the opcode by name from the dis module."""
280 return dis
.opmap
[name
]
282 def _opcode_set(*names
):
283 """Return a set of opcodes by the names in `names`."""
# Opcodes that leave the code object.
OPS_CODE_END = _opcode_set('RETURN_VALUE')

# Opcodes that unconditionally end the code chunk.
OPS_CHUNK_END = _opcode_set(
    'JUMP_ABSOLUTE', 'JUMP_FORWARD', 'RETURN_VALUE', 'RAISE_VARARGS',
    'BREAK_LOOP', 'CONTINUE_LOOP',
    )

# Opcodes that unconditionally begin a new code chunk.  By starting new chunks
# with unconditional jump instructions, we neatly deal with jumps to jumps
# properly.
OPS_CHUNK_BEGIN = _opcode_set('JUMP_ABSOLUTE', 'JUMP_FORWARD')

# Opcodes that push a block on the block stack.
OPS_PUSH_BLOCK = _opcode_set(
    'SETUP_LOOP', 'SETUP_EXCEPT', 'SETUP_FINALLY', 'SETUP_WITH'
    )

# Block types for exception handling.
OPS_EXCEPT_BLOCKS = _opcode_set('SETUP_EXCEPT', 'SETUP_FINALLY')

# Opcodes that pop a block from the block stack.
OPS_POP_BLOCK = _opcode_set('POP_BLOCK')

# Opcodes that have a jump destination, but aren't really a jump.
OPS_NO_JUMP = OPS_PUSH_BLOCK

# Individual opcodes we need below.
# NOTE(review): _opcode raises KeyError for names missing from this Python's
# dis.opmap; these lookups assume the interpreters this code targets provide
# all of them.
OP_BREAK_LOOP = _opcode('BREAK_LOOP')
OP_END_FINALLY = _opcode('END_FINALLY')
OP_COMPARE_OP = _opcode('COMPARE_OP')
COMPARE_EXCEPTION = 10  # just have to get this const from the code.
OP_LOAD_CONST = _opcode('LOAD_CONST')
OP_RETURN_VALUE = _opcode('RETURN_VALUE')
class ByteParser(object):
    """Parse byte codes to understand the structure of code."""

    def __init__(self, code=None, text=None, filename=None):
        if code:
            # A compiled code object was handed to us directly.
            self.code = code
            self.text = text
        else:
            if not text:
                assert filename, "If no code or text, need a filename"
                source_file = open_source(filename)
                try:
                    text = source_file.read()
                finally:
                    source_file.close()
            self.text = text

            try:
                # Python 2.3 and 2.4 don't like partial last lines, so be sure
                # the text ends nicely for them.
                self.code = compile(text + '\n', filename, "exec")
            except SyntaxError:
                _, syn_err, _ = sys.exc_info()
                raise NotPython(
                    "Couldn't parse '%s' as Python source: '%s' at line %d" %
                        (filename, syn_err.msg, syn_err.lineno)
                    )

        # Alternative Python implementations don't always provide all the
        # attributes on code objects that we need to do the analysis.
        for attr in ['co_lnotab', 'co_firstlineno', 'co_consts', 'co_code']:
            if not hasattr(self.code, attr):
                raise CoverageException(
                    "This implementation of Python doesn't support code "
                    "analysis.\n"
                    "Run coverage.py under CPython for this command."
                    )
367 def child_parsers(self
):
368 """Iterate over all the code objects nested within this one.
370 The iteration includes `self` as its first value.
373 children
= CodeObjects(self
.code
)
374 return [ByteParser(code
=c
, text
=self
.text
) for c
in children
]
376 def _bytes_lines(self
):
377 """Map byte offsets to line numbers in `code`.
379 Uses co_lnotab described in Python/compile.c to map byte offsets to
380 line numbers. Produces a sequence: (b0, l0), (b1, l1), ...
382 Only byte offsets that correspond to line numbers are included in the
386 # Adapted from dis.py in the standard library.
387 byte_increments
= bytes_to_ints(self
.code
.co_lnotab
[0::2])
388 line_increments
= bytes_to_ints(self
.code
.co_lnotab
[1::2])
391 line_num
= self
.code
.co_firstlineno
393 for byte_incr
, line_incr
in zip(byte_increments
, line_increments
):
395 if line_num
!= last_line_num
:
396 yield (byte_num
, line_num
)
397 last_line_num
= line_num
398 byte_num
+= byte_incr
399 line_num
+= line_incr
400 if line_num
!= last_line_num
:
401 yield (byte_num
, line_num
)
403 def _find_statements(self
):
404 """Find the statements in `self.code`.
406 Produce a sequence of line numbers that start statements. Recurses
407 into all code objects reachable from `self.code`.
410 for bp
in self
.child_parsers():
411 # Get all of the lineno information from this code.
412 for _
, l
in bp
._bytes
_lines
():
415 def _block_stack_repr(self
, block_stack
):
416 """Get a string version of `block_stack`, for debugging."""
418 ["(%s, %r)" % (dis
.opname
[b
[0]], b
[1]) for b
in block_stack
]
420 return "[" + blocks
+ "]"
    def _split_into_chunks(self):
        """Split the code object into a list of `Chunk` objects.

        Each chunk is only entered at its first instruction, though there can
        be many exits from a chunk.

        Returns a list of `Chunk` objects.

        """
        # The list of chunks so far, and the one we're working on.
        chunks = []
        chunk = None

        # A dict mapping byte offsets of line starts to the line numbers.
        bytes_lines_map = dict(self._bytes_lines())

        # The block stack: loops and try blocks get pushed here for the
        # implicit jumps that can occur.
        # Each entry is a tuple: (block type, destination)
        block_stack = []

        # Some op codes are followed by branches that should be ignored.  This
        # is a count of how many ignores are left.
        ignore_branch = 0

        # We have to handle the last two bytecodes specially.
        ult = penult = None

        # Get a set of all of the jump-to points.
        jump_to = set()
        bytecodes = list(ByteCodes(self.code.co_code))
        for bc in bytecodes:
            if bc.jump_to >= 0:
                jump_to.add(bc.jump_to)

        chunk_lineno = 0

        # Walk the byte codes building chunks.
        for bc in bytecodes:
            # Maybe have to start a new chunk
            start_new_chunk = False
            first_chunk = False
            if bc.offset in bytes_lines_map:
                # Start a new chunk for each source line number.
                start_new_chunk = True
                chunk_lineno = bytes_lines_map[bc.offset]
                first_chunk = True
            elif bc.offset in jump_to:
                # To make chunks have a single entrance, we have to make a new
                # chunk when we get to a place some bytecode jumps to.
                start_new_chunk = True
            elif bc.op in OPS_CHUNK_BEGIN:
                # Jumps deserve their own unnumbered chunk.  This fixes
                # problems with jumps to jumps getting confused.
                start_new_chunk = True

            if not chunk or start_new_chunk:
                if chunk:
                    # Falling through into the new chunk is an exit.
                    chunk.exits.add(bc.offset)
                chunk = Chunk(bc.offset, chunk_lineno, first_chunk)
                chunks.append(chunk)

            # Look at the opcode.
            if bc.jump_to >= 0 and bc.op not in OPS_NO_JUMP:
                if ignore_branch:
                    # Someone earlier wanted us to ignore this branch.
                    ignore_branch -= 1
                else:
                    # The opcode has a jump, it's an exit for this chunk.
                    chunk.exits.add(bc.jump_to)

            if bc.op in OPS_CODE_END:
                # The opcode can exit the code object.
                chunk.exits.add(-self.code.co_firstlineno)
            if bc.op in OPS_PUSH_BLOCK:
                # The opcode adds a block to the block_stack.
                block_stack.append((bc.op, bc.jump_to))
            if bc.op in OPS_POP_BLOCK:
                # The opcode pops a block from the block stack.
                block_stack.pop()
            if bc.op in OPS_CHUNK_END:
                # This opcode forces the end of the chunk.
                if bc.op == OP_BREAK_LOOP:
                    # A break is implicit: jump where the top of the
                    # block_stack points.
                    chunk.exits.add(block_stack[-1][1])
                chunk = None
            if bc.op == OP_END_FINALLY:
                # For the finally clause we need to find the closest exception
                # block, and use its jump target as an exit.
                for block in reversed(block_stack):
                    if block[0] in OPS_EXCEPT_BLOCKS:
                        chunk.exits.add(block[1])
                        break
            if bc.op == OP_COMPARE_OP and bc.arg == COMPARE_EXCEPTION:
                # This is an except clause.  We want to overlook the next
                # branch, so that except's don't count as branches.
                ignore_branch += 1

            penult = ult
            ult = bc

        if chunks:
            # The last two bytecodes could be a dummy "return None" that
            # shouldn't be counted as real code. Every Python code object seems
            # to end with a return, and a "return None" is inserted if there
            # isn't an explicit return in the source.
            if ult and penult:
                if penult.op == OP_LOAD_CONST and ult.op == OP_RETURN_VALUE:
                    if self.code.co_consts[penult.arg] is None:
                        # This is "return None", but is it dummy?  A real line
                        # would be a last chunk all by itself.
                        if chunks[-1].byte != penult.offset:
                            ex = -self.code.co_firstlineno
                            # Split the last chunk
                            last_chunk = chunks[-1]
                            last_chunk.exits.remove(ex)
                            last_chunk.exits.add(penult.offset)
                            chunk = Chunk(
                                penult.offset, last_chunk.line, False
                                )
                            chunk.exits.add(ex)
                            chunks.append(chunk)

            # Give all the chunks a length.
            chunks[-1].length = bc.next_offset - chunks[-1].byte # pylint: disable=W0631,C0301
            for i in range(len(chunks)-1):
                chunks[i].length = chunks[i+1].byte - chunks[i].byte

        #self.validate_chunks(chunks)
        return chunks
554 def validate_chunks(self
, chunks
):
555 """Validate the rule that chunks have a single entrance."""
556 # starts is the entrances to the chunks
557 starts
= set([ch
.byte
for ch
in chunks
])
559 assert all([(ex
in starts
or ex
< 0) for ex
in ch
.exits
])
562 """Find the executable arcs in the code.
564 Yields pairs: (from,to). From and to are integer line numbers. If
565 from is < 0, then the arc is an entrance into the code object. If to
566 is < 0, the arc is an exit from the code object.
569 chunks
= self
._split
_into
_chunks
()
571 # A map from byte offsets to chunks jumped into.
572 byte_chunks
= dict([(c
.byte
, c
) for c
in chunks
])
574 # There's always an entrance at the first chunk.
575 yield (-1, byte_chunks
[0].line
)
577 # Traverse from the first chunk in each line, and yield arcs where
578 # the trace function will be invoked.
583 chunks_considered
= set()
584 chunks_to_consider
= [chunk
]
585 while chunks_to_consider
:
586 # Get the chunk we're considering, and make sure we don't
588 this_chunk
= chunks_to_consider
.pop()
589 chunks_considered
.add(this_chunk
)
591 # For each exit, add the line number if the trace function
592 # would be triggered, or add the chunk to those being
594 for ex
in this_chunk
.exits
:
596 yield (chunk
.line
, ex
)
598 next_chunk
= byte_chunks
[ex
]
599 if next_chunk
in chunks_considered
:
602 # The trace function is invoked if visiting the first
603 # bytecode in a line, or if the transition is a
605 backward_jump
= next_chunk
.byte
< this_chunk
.byte
606 if next_chunk
.first
or backward_jump
:
607 if next_chunk
.line
!= chunk
.line
:
608 yield (chunk
.line
, next_chunk
.line
)
610 chunks_to_consider
.append(next_chunk
)
612 def _all_chunks(self
):
613 """Returns a list of `Chunk` objects for this code and its children.
615 See `_split_into_chunks` for details.
619 for bp
in self
.child_parsers():
620 chunks
.extend(bp
._split
_into
_chunks
())
625 """Get the set of all arcs in this code object and its children.
627 See `_arcs` for details.
631 for bp
in self
.child_parsers():
632 arcs
.update(bp
._arcs
())
638 """A sequence of byte codes with a single entrance.
640 To analyze byte code, we have to divide it into chunks, sequences of byte
641 codes such that each chunk has only one entrance, the first instruction in
644 This is almost the CS concept of `basic block`_, except that we're willing
645 to have many exits from a chunk, and "basic block" is a more cumbersome
648 .. _basic block: http://en.wikipedia.org/wiki/Basic_block
650 `line` is the source line number containing this chunk.
652 `first` is true if this is the first chunk in the source line.
654 An exit < 0 means the chunk can leave the code (return). The exit is
655 the negative of the starting line number of the code block.
658 def __init__(self
, byte
, line
, first
):
670 return "<%d+%d @%d%s %r>" % (
671 self
.byte
, self
.length
, self
.line
, bang
, list(self
.exits
)
class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it.  Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        # The most recently tokenized text, and its token list.
        self.last_text = None
        self.last_tokens = None

    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text == self.last_text:
            # Cache hit: hand back the same token list again.
            return self.last_tokens
        self.last_text = text
        readline = StringIO(text).readline
        self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens