Fix breakages in https://codereview.chromium.org/1155713003/
[chromium-blink-merge.git] / tools / auto_bisect / bisect_results.py
blobd4fb541eea5657d24b2474fbe4b3f73eee986fa4
1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 import math
6 import os
8 import bisect_utils
9 import math_utils
10 import source_control
11 import ttest
13 from bisect_state import RevisionState
16 class BisectResults(object):
17 """Contains results of the completed bisect.
19 Properties:
20 error: Error message if the bisect failed.
22 If the error is None, the following properties are present:
23 warnings: List of warnings from the bisect run.
24 state: BisectState object from which these results were generated.
25 first_working_revision: First good revision.
26 last_broken_revision: Last bad revision.
28 If both of above revisions are not None, the follow properties are present:
29 culprit_revisions: A list of revisions, which contain the bad change
30 introducing the failure.
31 other_regressions: A list of tuples representing other regressions, which
32 may have occurred.
33 regression_size: For performance bisects, this is a relative change of
34 the mean metric value. For other bisects this field always contains
35 'zero-to-nonzero'.
36 regression_std_err: For performance bisects, it is a pooled standard error
37 for groups of good and bad runs. Not used for other bisects.
38 confidence: For performance bisects, it is a confidence that the good and
39 bad runs are distinct groups. Not used for non-performance bisects.
40 """
42 def __init__(self, bisect_state=None, depot_registry=None, opts=None,
43 runtime_warnings=None, error=None, abort_reason=None):
44 """Computes final bisect results after a bisect run is complete.
46 This constructor should be called in one of the following ways:
47 BisectResults(state, depot_registry, opts, runtime_warnings)
48 BisectResults(error=error)
50 First option creates an object representing successful bisect results, while
51 second option creates an error result.
53 Args:
54 bisect_state: BisectState object representing latest bisect state.
55 depot_registry: DepotDirectoryRegistry object with information on each
56 repository in the bisect_state.
57 opts: Options passed to the bisect run.
58 runtime_warnings: A list of warnings from the bisect run.
59 error: Error message. When error is not None, other arguments are ignored.
60 """
61 self.error = error
62 self.abort_reason = abort_reason
63 if error is not None or abort_reason is not None:
64 return
66 assert (bisect_state is not None and depot_registry is not None and
67 opts is not None and runtime_warnings is not None), (
68 'Incorrect use of the BisectResults constructor. '
69 'When error is None, all other arguments are required.')
71 self.state = bisect_state
73 rev_states = bisect_state.GetRevisionStates()
74 first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
75 self.first_working_revision = first_working_rev
76 self.last_broken_revision = last_broken_rev
78 self.warnings = runtime_warnings
80 self.retest_results_tot = None
81 self.retest_results_reverted = None
83 if first_working_rev is not None and last_broken_rev is not None:
84 statistics = self._ComputeRegressionStatistics(
85 rev_states, first_working_rev, last_broken_rev)
87 self.regression_size = statistics['regression_size']
88 self.regression_std_err = statistics['regression_std_err']
89 self.confidence = statistics['confidence']
91 self.culprit_revisions = self._FindCulpritRevisions(
92 rev_states, depot_registry, first_working_rev, last_broken_rev)
94 self.other_regressions = self._FindOtherRegressions(
95 rev_states, statistics['bad_greater_than_good'])
97 self.warnings += self._GetResultBasedWarnings(
98 self.culprit_revisions, opts, self.confidence)
99 elif first_working_rev is not None:
100 # Setting these attributes so that bisect printer does not break when the
101 # regression cannot be reproduced (no broken revision was found)
102 self.regression_size = 0
103 self.regression_std_err = 0
104 self.confidence = 0
105 self.culprit_revisions = []
106 self.other_regressions = []
108 def AddRetestResults(self, results_tot, results_reverted):
109 if not results_tot or not results_reverted:
110 self.warnings.append(
111 'Failed to re-test reverted culprit CL against ToT.')
112 return
114 confidence_params = (results_reverted[0]['values'],
115 results_tot[0]['values'])
116 confidence = BisectResults.ConfidenceScore(*confidence_params)
118 self.retest_results_tot = RevisionState('ToT', 'n/a', 0)
119 self.retest_results_tot.value = results_tot[0]
121 self.retest_results_reverted = RevisionState('Reverted', 'n/a', 0)
122 self.retest_results_reverted.value = results_reverted[0]
124 if confidence <= bisect_utils.HIGH_CONFIDENCE:
125 self.warnings.append(
126 'Confidence of re-test with reverted CL is not high.'
127 ' Check that the regression hasn\'t already recovered. '
128 ' There\'s still a chance this is a regression, as performance of'
129 ' local builds may not match official builds.')
131 @staticmethod
132 def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
133 warnings = []
134 if len(culprit_revisions) > 1:
135 warnings.append('Due to build errors, regression range could '
136 'not be narrowed down to a single commit.')
137 if opts.repeat_test_count == 1:
138 warnings.append('Tests were only set to run once. This may '
139 'be insufficient to get meaningful results.')
140 if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
141 warnings.append('Confidence is not high. Try bisecting again '
142 'with increased repeat_count, larger range, or '
143 'on another metric.')
144 if not confidence:
145 warnings.append('Confidence score is 0%. Try bisecting again on '
146 'another platform or another metric.')
147 return warnings
149 @staticmethod
150 def ConfidenceScore(sample1, sample2,
151 accept_single_bad_or_good=False):
152 """Calculates a confidence score.
154 This score is a percentage which represents our degree of confidence in the
155 proposition that the good results and bad results are distinct groups, and
156 their differences aren't due to chance alone.
159 Args:
160 sample1: A flat list of "good" result numbers.
161 sample2: A flat list of "bad" result numbers.
162 accept_single_bad_or_good: If True, computes confidence even if there is
163 just one bad or good revision, otherwise single good or bad revision
164 always returns 0.0 confidence. This flag will probably get away when
165 we will implement expanding the bisect range by one more revision for
166 such case.
168 Returns:
169 A number in the range [0, 100].
171 # If there's only one item in either list, this means only one revision was
172 # classified good or bad; this isn't good enough evidence to make a
173 # decision. If an empty list was passed, that also implies zero confidence.
174 if not accept_single_bad_or_good:
175 if len(sample1) <= 1 or len(sample2) <= 1:
176 return 0.0
178 # If there were only empty lists in either of the lists (this is unexpected
179 # and normally shouldn't happen), then we also want to return 0.
180 if not sample1 or not sample2:
181 return 0.0
183 # The p-value is approximately the probability of obtaining the given set
184 # of good and bad values just by chance.
185 _, _, p_value = ttest.WelchsTTest(sample1, sample2)
186 return 100.0 * (1.0 - p_value)
188 @classmethod
189 def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
190 """Compiles a list of other possible regressions from the revision data.
192 Args:
193 revision_states: Sorted list of RevisionState objects.
194 bad_greater_than_good: Whether the result value at the "bad" revision is
195 numerically greater than the result value at the "good" revision.
197 Returns:
198 A list of [current_rev, previous_rev, confidence] for other places where
199 there may have been a regression.
201 other_regressions = []
202 previous_values = []
203 prev_state = None
204 for revision_state in revision_states:
205 if revision_state.value:
206 current_values = revision_state.value['values']
207 if previous_values:
208 confidence_params = (sum(previous_values, []),
209 sum([current_values], []))
210 confidence = cls.ConfidenceScore(*confidence_params,
211 accept_single_bad_or_good=True)
212 mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
213 mean_of_current_runs = math_utils.Mean(current_values)
215 # Check that the potential regression is in the same direction as
216 # the overall regression. If the mean of the previous runs < the
217 # mean of the current runs, this local regression is in same
218 # direction.
219 prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
220 if bad_greater_than_good:
221 is_same_direction = prev_greater_than_current
222 else:
223 is_same_direction = not prev_greater_than_current
225 # Only report potential regressions with high confidence.
226 if is_same_direction and confidence > 50:
227 other_regressions.append([revision_state, prev_state, confidence])
228 previous_values.append(current_values)
229 prev_state = revision_state
230 return other_regressions
232 @staticmethod
233 def FindBreakingRevRange(revision_states):
234 """Finds the last known good and first known bad revisions.
236 Note that since revision_states is expected to be in reverse chronological
237 order, the last known good revision is the first revision in the list that
238 has the passed property set to 1, therefore the name
239 `first_working_revision`. The inverse applies to `last_broken_revision`.
241 Args:
242 revision_states: A list of RevisionState instances.
244 Returns:
245 A tuple containing the two revision states at the border. (Last
246 known good and first known bad.)
248 first_working_revision = None
249 last_broken_revision = None
251 for revision_state in revision_states:
252 if revision_state.passed == 1 and not first_working_revision:
253 first_working_revision = revision_state
255 if not revision_state.passed:
256 last_broken_revision = revision_state
258 return first_working_revision, last_broken_revision
260 @staticmethod
261 def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
262 last_broken_rev):
263 cwd = os.getcwd()
265 culprit_revisions = []
266 for i in xrange(last_broken_rev.index, first_working_rev.index):
267 depot_registry.ChangeToDepotDir(revision_states[i].depot)
268 info = source_control.QueryRevisionInfo(revision_states[i].revision)
269 culprit_revisions.append((revision_states[i].revision, info,
270 revision_states[i].depot))
272 os.chdir(cwd)
273 return culprit_revisions
275 @classmethod
276 def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
277 last_broken_rev):
278 # TODO(sergiyb): We assume that value has "values" key, which may not be
279 # the case for failure-bisects, where there is a single value only.
280 broken_means = [state.value['values']
281 for state in rev_states[:last_broken_rev.index+1]
282 if state.value]
284 working_means = [state.value['values']
285 for state in rev_states[first_working_rev.index:]
286 if state.value]
288 # Flatten the lists to calculate mean of all values.
289 working_mean = sum(working_means, [])
290 broken_mean = sum(broken_means, [])
292 # Calculate the approximate size of the regression
293 mean_of_bad_runs = math_utils.Mean(broken_mean)
294 mean_of_good_runs = math_utils.Mean(working_mean)
296 regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
297 mean_of_bad_runs)
298 if math.isnan(regression_size):
299 regression_size = 'zero-to-nonzero'
301 regression_std_err = math.fabs(math_utils.PooledStandardError(
302 [working_mean, broken_mean]) /
303 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
305 # Give a "confidence" in the bisect. Currently, we consider the values of
306 # only the revisions at the breaking range (last known good and first known
307 # bad) see the note in the docstring for FindBreakingRange.
308 confidence_params = (
309 sum([first_working_rev.value['values']], []),
310 sum([last_broken_rev.value['values']], [])
312 confidence = cls.ConfidenceScore(*confidence_params)
314 bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs
316 return {'regression_size': regression_size,
317 'regression_std_err': regression_std_err,
318 'confidence': confidence,
319 'bad_greater_than_good': bad_greater_than_good}