"""
compare.py - versatile benchmark output compare tool
"""

import argparse
from argparse import ArgumentParser
import json
import os
import sys
import unittest

import gbench
from gbench import util, report
from gbench.util import *
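
# Illustrative invocations (the sub-commands are defined in create_parser()):
#   compare.py benchmarks <baseline> <contender> [benchmark_options...]
#   compare.py filters <benchmark> <filter_baseline> <filter_contender>
#   compare.py benchmarksfiltered <baseline> <filter_baseline> <contender> <filter_contender>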


def check_inputs(in1, in2, flags):
    """
    Perform checking on the user provided inputs and diagnose any abnormalities
    """
    in1_kind, in1_err = classify_input_file(in1)
    in2_kind, in2_err = classify_input_file(in2)
    output_file = find_benchmark_flag("--benchmark_out=", flags)
    output_type = find_benchmark_flag("--benchmark_out_format=", flags)
    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
        print(
            "WARNING: '--benchmark_out=%s' will be passed to both "
            "benchmarks causing it to be overwritten" % output_file
        )
    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
        print(
            "WARNING: passing optional flags has no effect since both "
            "inputs are JSON"
        )
    if output_type is not None and output_type != "json":
        print(
            "ERROR: passing '--benchmark_out_format=%s' to 'compare.py'"
            " is not supported." % output_type
        )
        sys.exit(1)


def create_parser():
    parser = ArgumentParser(
        description="versatile benchmark output compare tool"
    )

    parser.add_argument(
        "-a",
        "--display_aggregates_only",
        dest="display_aggregates_only",
        action="store_true",
        help="If there are repetitions, by default, we display everything - the"
        " actual runs, and the aggregates computed. Sometimes, it is "
        "desirable to only view the aggregates. E.g. when there are a lot "
        "of repetitions. Do note that only the display is affected. "
        "Internally, all the actual runs are still used, e.g. for U test.",
    )

    parser.add_argument(
        "--no-color",
        dest="color",
        default=True,
        action="store_false",
        help="Do not use colors in the terminal output",
    )

    parser.add_argument(
        "--dump_to_json",
        dest="dump_to_json",
        help="Additionally, dump benchmark comparison output to this file in JSON format.",
    )

    utest = parser.add_argument_group()
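    # Options controlling the optional two-tailed Mann-Whitney U test:
    # --no-utest disables it, --alpha sets the significance level used when
    # deciding whether a measured difference is statistically significant.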
    utest.add_argument(
        "--no-utest",
        dest="utest",
        default=True,
        action="store_false",
        help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is being done by default, if at least {} repetitions were done.\nThis option can disable the U Test.".format(
            report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS
        ),
    )
    alpha_default = 0.05
    utest.add_argument(
        "--alpha",
        dest="utest_alpha",
        default=alpha_default,
        type=float,
        help="significance level alpha. if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)"
        % alpha_default,
    )

    subparsers = parser.add_subparsers(
        help="This tool has multiple modes of operation:", dest="mode"
    )

    parser_a = subparsers.add_parser(
        "benchmarks",
        help="The most simple use-case, compare all the output of these two benchmarks",
    )
    baseline = parser_a.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test_baseline",
        metavar="test_baseline",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    contender = parser_a.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "test_contender",
        metavar="test_contender",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    parser_a.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

    parser_b = subparsers.add_parser(
        "filters", help="Compare filter one with the filter two of benchmark"
    )
    baseline = parser_b.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test",
        metavar="test",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    baseline.add_argument(
        "filter_baseline",
        metavar="filter_baseline",
        type=str,
        nargs=1,
        help="The first filter, that will be used as baseline",
    )
    contender = parser_b.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "filter_contender",
        metavar="filter_contender",
        type=str,
        nargs=1,
        help="The second filter, that will be compared against the baseline",
    )
    parser_b.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

    parser_c = subparsers.add_parser(
        "benchmarksfiltered",
        help="Compare filter one of first benchmark with filter two of the second benchmark",
    )
    baseline = parser_c.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test_baseline",
        metavar="test_baseline",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    baseline.add_argument(
        "filter_baseline",
        metavar="filter_baseline",
        type=str,
        nargs=1,
        help="The first filter, that will be used as baseline",
    )
    contender = parser_c.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "test_contender",
        metavar="test_contender",
        type=argparse.FileType("r"),
        nargs=1,
        help="The second benchmark executable or JSON output file, that will be compared against the baseline",
    )
    contender.add_argument(
        "filter_contender",
        metavar="filter_contender",
        type=str,
        nargs=1,
        help="The second filter, that will be compared against the baseline",
    )
    parser_c.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

    return parser
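

# main() ties everything together: parse the command line, run (or load) the
# baseline and contender benchmarks, build the difference report, print it,
# and optionally dump it to JSON.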
def main():
    # Parse the command line flags
    parser = create_parser()
    args, unknown_args = parser.parse_known_args()

    if args.mode is None:
        parser.print_help()
        exit(1)
    assert not unknown_args
    benchmark_options = args.benchmark_options
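
    # Each mode below decides which files/executables are compared, which
    # per-side filters apply, and the human-readable description printed
    # above the report.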
    if args.mode == "benchmarks":
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = ""
        filter_contender = ""

        # NOTE: if test_baseline == test_contender, you are analyzing the stdev

        description = "Comparing %s to %s" % (test_baseline, test_contender)
    elif args.mode == "filters":
        test_baseline = args.test[0].name
        test_contender = args.test[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if filter_baseline == filter_contender, you are analyzing the
        # stdev

        description = "Comparing %s to %s (from %s)" % (
            filter_baseline,
            filter_contender,
            args.test[0].name,
        )
    elif args.mode == "benchmarksfiltered":
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if test_baseline == test_contender and
        # filter_baseline == filter_contender, you are analyzing the stdev

        description = "Comparing %s (from %s) to %s (from %s)" % (
            filter_baseline,
            test_baseline,
            filter_contender,
            test_contender,
        )
    else:
        # should never happen
        print("Unrecognized mode of operation: '%s'" % args.mode)
        parser.print_help()
        exit(1)

    check_inputs(test_baseline, test_contender, benchmark_options)

    if args.display_aggregates_only:
        benchmark_options += ["--benchmark_display_aggregates_only=true"]

    options_baseline = []
    options_contender = []

    if filter_baseline and filter_contender:
        options_baseline = ["--benchmark_filter=%s" % filter_baseline]
        options_contender = ["--benchmark_filter=%s" % filter_contender]

    # Run the benchmarks and report the results
    json1 = json1_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_baseline, benchmark_options + options_baseline
        )
    )
    json2 = json2_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_contender, benchmark_options + options_contender
        )
    )

    # Now, filter the benchmarks so that the difference report can work
    if filter_baseline and filter_contender:
        replacement = "[%s vs. %s]" % (filter_baseline, filter_contender)
        json1 = gbench.report.filter_benchmark(
            json1_orig, filter_baseline, replacement
        )
        json2 = gbench.report.filter_benchmark(
            json2_orig, filter_contender, replacement
        )
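        # Renaming both sides to the same "[baseline vs. contender]" tag lets
        # the difference report match up benchmarks from the two filters by
        # name.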

    diff_report = gbench.report.get_difference_report(json1, json2, args.utest)
    output_lines = gbench.report.print_difference_report(
        diff_report,
        args.display_aggregates_only,
        args.utest,
        args.utest_alpha,
        args.color,
    )
    print(description)
    for ln in output_lines:
        print(ln)

    # Optionally, diff and output to JSON
    if args.dump_to_json is not None:
        with open(args.dump_to_json, "w") as f_json:
            json.dump(diff_report, f_json)
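

# Unit tests for the command-line parser. They only exercise create_parser()
# against the sample JSON files under gbench/Inputs; no benchmarks are run.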
class TestParser(unittest.TestCase):
    def setUp(self):
        self.parser = create_parser()
        testInputs = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "gbench", "Inputs"
        )
        self.testInput0 = os.path.join(testInputs, "test1_run1.json")
        self.testInput1 = os.path.join(testInputs, "test1_run2.json")

    def test_benchmarks_basic(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest(self):
        parsed = self.parser.parse_args(
            ["--no-utest", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.05)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_display_aggregates_only(self):
        parsed = self.parser.parse_args(
            ["-a", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertTrue(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ["--alpha=0.314", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            [
                "--no-utest",
                "--alpha=0.314",
                "benchmarks",
                self.testInput0,
                self.testInput1,
            ]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_with_remainder(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1, "d"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ["d"])

    def test_benchmarks_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1, "--", "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ["e"])

    def test_filters_basic(self):
        parsed = self.parser.parse_args(["filters", self.testInput0, "c", "d"])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertFalse(parsed.benchmark_options)

    def test_filters_with_remainder(self):
        parsed = self.parser.parse_args(
            ["filters", self.testInput0, "c", "d", "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertEqual(parsed.benchmark_options, ["e"])

    def test_filters_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ["filters", self.testInput0, "c", "d", "--", "f"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertEqual(parsed.benchmark_options, ["f"])

    def test_benchmarksfiltered_basic(self):
        parsed = self.parser.parse_args(
            ["benchmarksfiltered", self.testInput0, "c", self.testInput1, "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarksfiltered_with_remainder(self):
        parsed = self.parser.parse_args(
            ["benchmarksfiltered", self.testInput0, "c", self.testInput1, "e", "f"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertEqual(parsed.benchmark_options[0], "f")

    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            [
                "benchmarksfiltered",
                self.testInput0,
                "c",
                self.testInput1,
                "e",
                "--",
                "g",
            ]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertEqual(parsed.benchmark_options[0], "g")


if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;