#!/usr/bin/env python

"""
CmpRuns - A simple tool for comparing two static analyzer runs to determine
which reports have been added, removed, or changed.

This is designed to support automated testing using the static analyzer, from
two perspectives:
  1. To monitor changes in the static analyzer's reports on real code bases,
     for regression testing.
  2. For use by end users who want to integrate regular static analyzer testing
     into a buildbot-like environment.

Usage:

    # Load the results of both runs, to obtain lists of the corresponding
    # AnalysisDiagnostic objects.
    resultsA = load_results_from_single_run(singleRunInfoA, delete_empty)
    resultsB = load_results_from_single_run(singleRunInfoB, delete_empty)

    # Generate a relation from diagnostics in run A to diagnostics in run B
    # to obtain a ComparisonResult describing the added, removed, and changed
    # reports.
    diff = compare_results(resultsA, resultsB)

"""
import json
import os
import plistlib
import re
import sys

from math import log
from collections import defaultdict
from copy import copy
from enum import Enum
from typing import (Any, DefaultDict, Dict, List, NamedTuple, Optional,
                    Sequence, Set, TextIO, TypeVar, Tuple, Union)


Number = Union[int, float]
Stats = Dict[str, Dict[str, Number]]
Plist = Dict[str, Any]
JSON = Dict[str, Any]
# Diff in a form: field -> (before, after)
JSONDiff = Dict[str, Tuple[str, str]]
# Type for generics
T = TypeVar('T')

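# Matches a serialized "Statistics: {...}" JSON blob in textual analyzer
# output.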
STATS_REGEXP = re.compile(r"Statistics: (\{.+\})", re.MULTILINE | re.DOTALL)


class Colors:
    """
    Color for terminal highlight.
    """
    RED = '\x1b[2;30;41m'
    GREEN = '\x1b[6;30;42m'
    CLEAR = '\x1b[0m'

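# How the difference in path lengths between two matching reports is recorded
# when plotting the optional histogram (see compare_results).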
class HistogramType(str, Enum):
    RELATIVE = "relative"
    LOG_RELATIVE = "log-relative"
    ABSOLUTE = "absolute"


class ResultsDirectory(NamedTuple):
    path: str
    root: str = ""

class SingleRunInfo:
    """
    Information about analysis run:
    path - the analysis output directory
    root - the name of the root directory, which will be disregarded when
    determining the source file name
    """
    def __init__(self, results: ResultsDirectory,
                 verbose_log: Optional[str] = None):
        self.path = results.path
        self.root = results.root.rstrip("/\\")
        self.verbose_log = verbose_log

class AnalysisDiagnostic:
    def __init__(self, data: Plist, report: "AnalysisReport",
                 html_report: Optional[str]):
        self._data = data
        self._loc = self._data['location']
        self._report = report
        self._html_report = html_report
        self._report_size = len(self._data['path'])

    def get_file_name(self) -> str:
        root = self._report.run.root
        file_name = self._report.files[self._loc['file']]

        if file_name.startswith(root) and len(root) > 0:
            return file_name[len(root) + 1:]

        return file_name

    def get_root_file_name(self) -> str:
        path = self._data['path']

        if not path:
            return self.get_file_name()

        p = path[0]
        if 'location' in p:
            file_index = p['location']['file']
        else:  # control edge
            file_index = path[0]['edges'][0]['start'][0]['file']

        out = self._report.files[file_index]
        root = self._report.run.root

        if out.startswith(root):
            return out[len(root):]

        return out

    def get_line(self) -> int:
        return self._loc['line']

    def get_column(self) -> int:
        return self._loc['col']

    def get_path_length(self) -> int:
        return self._report_size

    def get_category(self) -> str:
        return self._data['category']

    def get_description(self) -> str:
        return self._data['description']

    def get_location(self) -> str:
        return f"{self.get_file_name()}:{self.get_line()}:{self.get_column()}"

    def get_issue_identifier(self) -> str:
        id = self.get_file_name() + "+"

        if "issue_context" in self._data:
            id += self._data["issue_context"] + "+"

        if "issue_hash_content_of_line_in_context" in self._data:
            id += str(self._data["issue_hash_content_of_line_in_context"])

        return id

    def get_html_report(self) -> str:
        if self._html_report is None:
            return " "

        return os.path.join(self._report.run.path, self._html_report)

    def get_readable_name(self) -> str:
        if "issue_context" in self._data:
            funcname_postfix = "#" + self._data["issue_context"]
        else:
            funcname_postfix = ""

        root_filename = self.get_root_file_name()
        file_name = self.get_file_name()

        if root_filename != file_name:
            file_prefix = f"[{root_filename}] {file_name}"
        else:
            file_prefix = root_filename

        line = self.get_line()
        col = self.get_column()
        return f"{file_prefix}{funcname_postfix}:{line}:{col}" \
               f", {self.get_category()}: {self.get_description()}"

    KEY_FIELDS = ["check_name", "category", "description"]

    def is_similar_to(self, other: "AnalysisDiagnostic") -> bool:
        # We consider two diagnostics similar only if at least one
        # of the key fields is the same in both diagnostics.
        return len(self.get_diffs(other)) != len(self.KEY_FIELDS)

    def get_diffs(self, other: "AnalysisDiagnostic") -> JSONDiff:
        return {field: (self._data[field], other._data[field])
                for field in self.KEY_FIELDS
                if self._data[field] != other._data[field]}

    # Note, the data format is not an API and may change from one analyzer
    # version to another.
    def get_raw_data(self) -> Plist:
        return self._data

    def __eq__(self, other: object) -> bool:
        return hash(self) == hash(other)

    def __ne__(self, other: object) -> bool:
        return hash(self) != hash(other)

    def __hash__(self) -> int:
        return hash(self.get_issue_identifier())

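# An AnalysisRun aggregates every report (plist file) found in one analysis
# output directory, together with the clang version and the raw statistics.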
class AnalysisRun:
    def __init__(self, info: SingleRunInfo):
        self.path = info.path
        self.root = info.root
        self.info = info
        self.reports: List[AnalysisReport] = []
        # Cumulative list of all diagnostics from all the reports.
        self.diagnostics: List[AnalysisDiagnostic] = []
        self.clang_version: Optional[str] = None
        self.raw_stats: List[JSON] = []

    def get_clang_version(self) -> Optional[str]:
        return self.clang_version

    def read_single_file(self, path: str, delete_empty: bool):
        with open(path, "rb") as plist_file:
            data = plistlib.load(plist_file)

        if 'statistics' in data:
            self.raw_stats.append(json.loads(data['statistics']))
            data.pop('statistics')

        # We want to retrieve the clang version even if there are no
        # reports. Assume that all reports were created using the same
        # clang version (this is always true and is more efficient).
        if 'clang_version' in data:
            if self.clang_version is None:
                self.clang_version = data.pop('clang_version')
            else:
                data.pop('clang_version')

        # Ignore/delete empty reports.
        if not data['files']:
            if delete_empty:
                os.remove(path)
            return

        # Extract the HTML reports, if they exist.
        if 'HTMLDiagnostics_files' in data['diagnostics'][0]:
            htmlFiles = []
            for d in data['diagnostics']:
                # FIXME: Why is this named files, when does it have multiple
                # files?
                assert len(d['HTMLDiagnostics_files']) == 1
                htmlFiles.append(d.pop('HTMLDiagnostics_files')[0])
        else:
            htmlFiles = [None] * len(data['diagnostics'])

        report = AnalysisReport(self, data.pop('files'))
        diagnostics = [AnalysisDiagnostic(d, report, h)
                       for d, h in zip(data.pop('diagnostics'), htmlFiles)]

        assert not data

        report.diagnostics.extend(diagnostics)
        self.reports.append(report)
        self.diagnostics.extend(diagnostics)

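# A single plist report file: the source files it references plus the
# diagnostics emitted for them.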
class AnalysisReport:
    def __init__(self, run: AnalysisRun, files: List[str]):
        self.run = run
        self.files = files
        self.diagnostics: List[AnalysisDiagnostic] = []

def load_results(results: ResultsDirectory, delete_empty: bool = True,
                 verbose_log: Optional[str] = None) -> AnalysisRun:
    """
    Backwards compatibility API.
    """
    return load_results_from_single_run(SingleRunInfo(results,
                                                      verbose_log),
                                        delete_empty)


def load_results_from_single_run(info: SingleRunInfo,
                                 delete_empty: bool = True) -> AnalysisRun:
    """
    Load results of the analyses from a given output folder.
    - info is the SingleRunInfo object
    - delete_empty specifies if the empty plist files should be deleted
    """
    path = info.path
    run = AnalysisRun(info)

    if os.path.isfile(path):
        run.read_single_file(path, delete_empty)
    else:
        for dirpath, dirnames, filenames in os.walk(path):
            for f in filenames:
                if not f.endswith('plist'):
                    continue

                p = os.path.join(dirpath, f)
                run.read_single_file(p, delete_empty)

    return run

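# Comparison/sort key for diagnostics: their stable issue identifier.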
def cmp_analysis_diagnostic(d):
    return d.get_issue_identifier()


AnalysisDiagnosticPair = Tuple[AnalysisDiagnostic, AnalysisDiagnostic]


class ComparisonResult:
    def __init__(self):
        self.present_in_both: List[AnalysisDiagnostic] = []
        self.present_only_in_old: List[AnalysisDiagnostic] = []
        self.present_only_in_new: List[AnalysisDiagnostic] = []
        self.changed_between_new_and_old: List[AnalysisDiagnosticPair] = []

    def add_common(self, issue: AnalysisDiagnostic):
        self.present_in_both.append(issue)

    def add_removed(self, issue: AnalysisDiagnostic):
        self.present_only_in_old.append(issue)

    def add_added(self, issue: AnalysisDiagnostic):
        self.present_only_in_new.append(issue)

    def add_changed(self, old_issue: AnalysisDiagnostic,
                    new_issue: AnalysisDiagnostic):
        self.changed_between_new_and_old.append((old_issue, new_issue))

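# Diagnostics grouped by their "file:line:column" location string.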
GroupedDiagnostics = DefaultDict[str, List[AnalysisDiagnostic]]


def get_grouped_diagnostics(diagnostics: List[AnalysisDiagnostic]
                            ) -> GroupedDiagnostics:
    result: GroupedDiagnostics = defaultdict(list)
    for diagnostic in diagnostics:
        result[diagnostic.get_location()].append(diagnostic)
    return result

def compare_results(results_old: AnalysisRun, results_new: AnalysisRun,
                    histogram: Optional[HistogramType] = None
                    ) -> ComparisonResult:
    """
    compare_results - Generate a relation from diagnostics in run A to
    diagnostics in run B.

    The result is a ComparisonResult describing which diagnostics are common
    to both runs, which appear only in the old or only in the new run, and
    which have changed between the two.
    """
    res = ComparisonResult()

    # Map size_before -> size_after
    path_difference_data: List[float] = []

    diags_old = get_grouped_diagnostics(results_old.diagnostics)
    diags_new = get_grouped_diagnostics(results_new.diagnostics)

    locations_old = set(diags_old.keys())
    locations_new = set(diags_new.keys())

    common_locations = locations_old & locations_new

    for location in common_locations:
        old = diags_old[location]
        new = diags_new[location]

        # Quadratic algorithms in this part are fine because 'old' and 'new'
        # are most commonly of size 1.
        common: Set[AnalysisDiagnostic] = set()
        for a in old:
            for b in new:
                if a.get_issue_identifier() == b.get_issue_identifier():
                    a_path_len = a.get_path_length()
                    b_path_len = b.get_path_length()

                    if a_path_len != b_path_len:

                        if histogram == HistogramType.RELATIVE:
                            path_difference_data.append(
                                float(a_path_len) / b_path_len)

                        elif histogram == HistogramType.LOG_RELATIVE:
                            path_difference_data.append(
                                log(float(a_path_len) / b_path_len))

                        elif histogram == HistogramType.ABSOLUTE:
                            path_difference_data.append(
                                a_path_len - b_path_len)

                    res.add_common(b)
                    common.add(a)

        old = filter_issues(old, common)
        new = filter_issues(new, common)
        common = set()
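        # Second pass: among the remaining diagnostics at this location, pair
        # up those that still share at least one key field (check name,
        # category, or description) and report them as changed.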
        for a in old:
            for b in new:
                if a.is_similar_to(b):
                    res.add_changed(a, b)
                    common.add(a)
                    common.add(b)

        old = filter_issues(old, common)
        new = filter_issues(new, common)

        # Whatever is left in 'old' doesn't have a corresponding diagnostic
        # in 'new', so we need to mark it as 'removed'.
        for a in old:
            res.add_removed(a)

        # Whatever is left in 'new' doesn't have a corresponding diagnostic
        # in 'old', so we need to mark it as 'added'.
        for b in new:
            res.add_added(b)

    only_old_locations = locations_old - common_locations
    for location in only_old_locations:
        for a in diags_old[location]:
            # These locations have been found only in the old build, so we
            # need to mark all of them as 'removed'.
            res.add_removed(a)

    only_new_locations = locations_new - common_locations
    for location in only_new_locations:
        for b in diags_new[location]:
            # These locations have been found only in the new build, so we
            # need to mark all of them as 'added'.
            res.add_added(b)

    # FIXME: Add fuzzy matching. One simple and possibly effective idea would
    # be to bin the diagnostics, print them in a normalized form (based solely
    # on the structure of the diagnostic), compute the diff, then use that as
    # the basis for matching. This has the nice property that we don't depend
    # in any way on the diagnostic format.

    if histogram:
        from matplotlib import pyplot
        pyplot.hist(path_difference_data, bins=100)
        pyplot.show()

    return res

def filter_issues(origin: List[AnalysisDiagnostic],
                  to_remove: Set[AnalysisDiagnostic]) \
        -> List[AnalysisDiagnostic]:
    return [diag for diag in origin if diag not in to_remove]

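# Nearest-rank percentile over the sorted values (no interpolation).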
def compute_percentile(values: Sequence[T], percentile: float) -> T:
    """
    Return computed percentile.
    """
    return sorted(values)[int(round(percentile * len(values) + 0.5)) - 1]

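# Aggregate per-diagnostic path lengths and the analyzer's raw statistics into
# min/max/mean/median/percentile summaries, keyed by statistic name.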
def derive_stats(results: AnalysisRun) -> Stats:
    # Assume all keys are the same in each statistics bucket.
    combined_data = defaultdict(list)

    # Collect data on path lengths.
    for report in results.reports:
        for diagnostic in report.diagnostics:
            combined_data['PathsLength'].append(diagnostic.get_path_length())

    for stat in results.raw_stats:
        for key, value in stat.items():
            combined_data[str(key)].append(value)

    combined_stats: Stats = {}

    for key, values in combined_data.items():
        combined_stats[key] = {
            "max": max(values),
            "min": min(values),
            "mean": sum(values) / len(values),
            "90th %tile": compute_percentile(values, 0.9),
            "95th %tile": compute_percentile(values, 0.95),
            "median": sorted(values)[len(values) // 2],
            "total": sum(values)
        }

    return combined_stats

# TODO: compare_results decouples comparison from the output, we should
#       do it here as well
def compare_stats(results_old: AnalysisRun, results_new: AnalysisRun,
                  out: TextIO = sys.stdout):
    stats_old = derive_stats(results_old)
    stats_new = derive_stats(results_new)

    old_keys = set(stats_old.keys())
    new_keys = set(stats_new.keys())
    keys = sorted(old_keys & new_keys)

    for key in keys:
        out.write(f"{key}\n")

        nested_keys = sorted(set(stats_old[key]) & set(stats_new[key]))

        for nested_key in nested_keys:
            val_old = float(stats_old[key][nested_key])
            val_new = float(stats_new[key][nested_key])

            report = f"{val_old:.3f} -> {val_new:.3f}"

            # Only apply highlighting when writing to TTY and it's not Windows
            if out.isatty() and os.name != 'nt':
                if val_new != 0:
                    ratio = (val_new - val_old) / val_new
                    if ratio < -0.2:
                        report = Colors.GREEN + report + Colors.CLEAR
                    elif ratio > 0.2:
                        report = Colors.RED + report + Colors.CLEAR

            out.write(f"\t {nested_key} {report}\n")

    removed_keys = old_keys - new_keys
    if removed_keys:
        out.write(f"REMOVED statistics: {removed_keys}\n")

    added_keys = new_keys - old_keys
    if added_keys:
        out.write(f"ADDED statistics: {added_keys}\n")

    out.write("\n")

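# Compare two result directories and print the ADDED/REMOVED/MODIFIED reports.
# Unless stats_only is set, returns the number of differences together with
# the report counts of the old and new runs.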
def dump_scan_build_results_diff(dir_old: ResultsDirectory,
                                 dir_new: ResultsDirectory,
                                 delete_empty: bool = True,
                                 out: TextIO = sys.stdout,
                                 show_stats: bool = False,
                                 stats_only: bool = False,
                                 histogram: Optional[HistogramType] = None,
                                 verbose_log: Optional[str] = None):
    """
    Compare directories with analysis results and dump results.

    :param delete_empty: delete empty plist files
    :param out: buffer to dump comparison results to.
    :param show_stats: compare execution stats as well.
    :param stats_only: compare ONLY execution stats.
    :param histogram: optional histogram type to plot path differences.
    :param verbose_log: optional path to an additional log file.
    """
    results_old = load_results(dir_old, delete_empty, verbose_log)
    results_new = load_results(dir_new, delete_empty, verbose_log)

    if show_stats or stats_only:
        compare_stats(results_old, results_new)
    if stats_only:
        return

    # Open the verbose log, if given.
    if verbose_log:
        aux_log: Optional[TextIO] = open(verbose_log, "w")
    else:
        aux_log = None

    diff = compare_results(results_old, results_new, histogram)
    found_diffs = 0
    total_added = 0
    total_removed = 0
    total_modified = 0

    for new in diff.present_only_in_new:
        out.write(f"ADDED: {new.get_readable_name()}\n\n")
        found_diffs += 1
        total_added += 1
        if aux_log:
            aux_log.write(f"('ADDED', {new.get_readable_name()}, "
                          f"{new.get_html_report()})\n")

    for old in diff.present_only_in_old:
        out.write(f"REMOVED: {old.get_readable_name()}\n\n")
        found_diffs += 1
        total_removed += 1
        if aux_log:
            aux_log.write(f"('REMOVED', {old.get_readable_name()}, "
                          f"{old.get_html_report()})\n")

    for old, new in diff.changed_between_new_and_old:
        out.write(f"MODIFIED: {old.get_readable_name()}\n")
        found_diffs += 1
        total_modified += 1
        diffs = old.get_diffs(new)
        str_diffs = [f"          '{key}' changed: "
                     f"'{old_value}' -> '{new_value}'"
                     for key, (old_value, new_value) in diffs.items()]
        out.write(",\n".join(str_diffs) + "\n\n")
        if aux_log:
            aux_log.write(f"('MODIFIED', {old.get_readable_name()}, "
                          f"{old.get_html_report()})\n")

    total_reports = len(results_new.diagnostics)
    out.write(f"TOTAL REPORTS: {total_reports}\n")
    out.write(f"TOTAL ADDED: {total_added}\n")
    out.write(f"TOTAL REMOVED: {total_removed}\n")
    out.write(f"TOTAL MODIFIED: {total_modified}\n")

    if aux_log:
        aux_log.write(f"('TOTAL NEW REPORTS', {total_reports})\n")
        aux_log.write(f"('TOTAL DIFFERENCES', {found_diffs})\n")
        aux_log.close()

    # TODO: change to NamedTuple
    return found_diffs, len(results_old.diagnostics), \
        len(results_new.diagnostics)

if __name__ == "__main__":
    print("CmpRuns.py should not be used on its own.")
    print("Please use 'SATest.py compare' instead")
    sys.exit(1)