#!/usr/bin/env python

import unittest

"""
compare.py - versatile benchmark output compare tool
"""

import argparse
from argparse import ArgumentParser
import json
import os  # os.path is used by the unit tests below
import sys
import gbench
from gbench import util, report
from gbench.util import *


def check_inputs(in1, in2, flags):
    """
    Perform checking on the user provided inputs and diagnose any abnormalities
    """
    in1_kind, in1_err = classify_input_file(in1)
    in2_kind, in2_err = classify_input_file(in2)
    output_file = find_benchmark_flag("--benchmark_out=", flags)
    output_type = find_benchmark_flag("--benchmark_out_format=", flags)
    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
        print(
            "WARNING: '--benchmark_out=%s' will be passed to both "
            "benchmarks, causing it to be overwritten" % output_file
        )
    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
        print(
            "WARNING: passing optional flags has no effect since both "
            "inputs are JSON"
        )
    if output_type is not None and output_type != "json":
        print(
            "ERROR: passing '--benchmark_out_format=%s' to 'compare.py'"
            " is not supported." % output_type
        )
        sys.exit(1)
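
# Example (illustrative): 'compare.py benchmarks ./bench_a ./bench_b
# --benchmark_out=run.json' would hit the overwrite warning in check_inputs()
# above, because the same output file is forwarded to both executables.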


def create_parser():
    parser = ArgumentParser(description="versatile benchmark output compare tool")

    parser.add_argument(
        "-a",
        "--display_aggregates_only",
        dest="display_aggregates_only",
        action="store_true",
        help="If there are repetitions, by default, we display everything - the"
        " actual runs, and the aggregates computed. Sometimes, it is "
        "desirable to only view the aggregates, e.g. when there are a lot "
        "of repetitions. Do note that only the display is affected. "
        "Internally, all the actual runs are still used, e.g. for the U test.",
    )

    parser.add_argument(
        "--no-color",
        dest="color",
        default=True,
        action="store_false",
        help="Do not use colors in the terminal output",
    )

    parser.add_argument(
        "-d",
        "--dump_to_json",
        dest="dump_to_json",
        help="Additionally, dump benchmark comparison output to this file in JSON format.",
    )

    utest = parser.add_argument_group()
    utest.add_argument(
        "--no-utest",
        dest="utest",
        default=True,
        action="store_false",
        help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires a **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is run by default if at least {} repetitions were done.\nThis option can disable the U test.".format(
            report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS
        ),
    )
    alpha_default = 0.05
    utest.add_argument(
        "--alpha",
        dest="utest_alpha",
        default=alpha_default,
        type=float,
        help=(
            "Significance level alpha. If the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)"
        )
        % alpha_default,
    )

    subparsers = parser.add_subparsers(
        help="This tool has multiple modes of operation:", dest="mode"
    )

    parser_a = subparsers.add_parser(
        "benchmarks",
        help="The simplest use-case: compare all the output of these two benchmarks",
    )
    baseline = parser_a.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test_baseline",
        metavar="test_baseline",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    contender = parser_a.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "test_contender",
        metavar="test_contender",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    parser_a.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

    parser_b = subparsers.add_parser(
        "filters", help="Compare one filter with another filter of the same benchmark"
    )
    baseline = parser_b.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test",
        metavar="test",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    baseline.add_argument(
        "filter_baseline",
        metavar="filter_baseline",
        type=str,
        nargs=1,
        help="The first filter, which will be used as the baseline",
    )
    contender = parser_b.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "filter_contender",
        metavar="filter_contender",
        type=str,
        nargs=1,
        help="The second filter, which will be compared against the baseline",
    )
    parser_b.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

    parser_c = subparsers.add_parser(
        "benchmarksfiltered",
        help="Compare one filter of the first benchmark with another filter of the second benchmark",
    )
    baseline = parser_c.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test_baseline",
        metavar="test_baseline",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    baseline.add_argument(
        "filter_baseline",
        metavar="filter_baseline",
        type=str,
        nargs=1,
        help="The first filter, which will be used as the baseline",
    )
    contender = parser_c.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "test_contender",
        metavar="test_contender",
        type=argparse.FileType("r"),
        nargs=1,
        help="The second benchmark executable or JSON output file, which will be compared against the baseline",
    )
    contender.add_argument(
        "filter_contender",
        metavar="filter_contender",
        type=str,
        nargs=1,
        help="The second filter, which will be compared against the baseline",
    )
    parser_c.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

    return parser


def main():
    # Parse the command line flags
    parser = create_parser()
    args, unknown_args = parser.parse_known_args()
    if args.mode is None:
        parser.print_help()
        exit(1)
    assert not unknown_args
    benchmark_options = args.benchmark_options

    if args.mode == "benchmarks":
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = ""
        filter_contender = ""

        # NOTE: if test_baseline == test_contender, you are analyzing the stdev

        description = "Comparing %s to %s" % (test_baseline, test_contender)
    elif args.mode == "filters":
        test_baseline = args.test[0].name
        test_contender = args.test[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if filter_baseline == filter_contender, you are analyzing the
        # stdev

        description = "Comparing %s to %s (from %s)" % (
            filter_baseline,
            filter_contender,
            args.test[0].name,
        )
    elif args.mode == "benchmarksfiltered":
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if test_baseline == test_contender and
        # filter_baseline == filter_contender, you are analyzing the stdev

        description = "Comparing %s (from %s) to %s (from %s)" % (
            filter_baseline,
            test_baseline,
            filter_contender,
            test_contender,
        )
    else:
        # should never happen
        print("Unrecognized mode of operation: '%s'" % args.mode)
        parser.print_help()
        exit(1)

    check_inputs(test_baseline, test_contender, benchmark_options)

    if args.display_aggregates_only:
        benchmark_options += ["--benchmark_display_aggregates_only=true"]

    options_baseline = []
    options_contender = []

    if filter_baseline and filter_contender:
        options_baseline = ["--benchmark_filter=%s" % filter_baseline]
        options_contender = ["--benchmark_filter=%s" % filter_contender]

    # Run the benchmarks and report the results
    json1 = json1_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_baseline, benchmark_options + options_baseline
        )
    )
    json2 = json2_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_contender, benchmark_options + options_contender
        )
    )

    # Now, filter the benchmarks so that the difference report can work
    if filter_baseline and filter_contender:
        replacement = "[%s vs. %s]" % (filter_baseline, filter_contender)
        json1 = gbench.report.filter_benchmark(json1_orig, filter_baseline, replacement)
        json2 = gbench.report.filter_benchmark(
            json2_orig, filter_contender, replacement
        )

    diff_report = gbench.report.get_difference_report(json1, json2, args.utest)
    output_lines = gbench.report.print_difference_report(
        diff_report,
        args.display_aggregates_only,
        args.utest,
        args.utest_alpha,
        args.color,
    )
    print(description)
    for ln in output_lines:
        print(ln)

    # Optionally, diff and output to JSON
    if args.dump_to_json is not None:
        with open(args.dump_to_json, "w") as f_json:
            json.dump(diff_report, f_json)
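

# Unit tests for the argument parser above. They can be run with the standard
# library runner, e.g. 'python -m unittest compare' from this directory
# (assuming the gbench package and its Inputs/ test data are present).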
class TestParser(unittest.TestCase):
    def setUp(self):
        self.parser = create_parser()
        testInputs = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "gbench", "Inputs"
        )
        self.testInput0 = os.path.join(testInputs, "test1_run1.json")
        self.testInput1 = os.path.join(testInputs, "test1_run2.json")

    def test_benchmarks_basic(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest(self):
        parsed = self.parser.parse_args(
            ["--no-utest", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.05)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_display_aggregates_only(self):
        parsed = self.parser.parse_args(
            ["-a", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertTrue(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ["--alpha=0.314", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            [
                "--no-utest",
                "--alpha=0.314",
                "benchmarks",
                self.testInput0,
                self.testInput1,
            ]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_with_remainder(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1, "d"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ["d"])

    def test_benchmarks_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1, "--", "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ["e"])

    def test_filters_basic(self):
        parsed = self.parser.parse_args(["filters", self.testInput0, "c", "d"])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertFalse(parsed.benchmark_options)

    def test_filters_with_remainder(self):
        parsed = self.parser.parse_args(["filters", self.testInput0, "c", "d", "e"])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertEqual(parsed.benchmark_options, ["e"])

    def test_filters_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ["filters", self.testInput0, "c", "d", "--", "f"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertEqual(parsed.benchmark_options, ["f"])

    def test_benchmarksfiltered_basic(self):
        parsed = self.parser.parse_args(
            ["benchmarksfiltered", self.testInput0, "c", self.testInput1, "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarksfiltered_with_remainder(self):
        parsed = self.parser.parse_args(
            ["benchmarksfiltered", self.testInput0, "c", self.testInput1, "e", "f"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertEqual(parsed.benchmark_options[0], "f")

    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            [
                "benchmarksfiltered",
                self.testInput0,
                "c",
                self.testInput1,
                "e",
                "--",
                "g",
            ]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertEqual(parsed.benchmark_options[0], "g")


if __name__ == "__main__":
    # unittest.main()
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;