Reland the ULONG -> SIZE_T change from 317177
[chromium-blink-merge.git] / tools / auto_bisect / bisect_results.py
blob07b7806809dced723375392e01a91447040d5540
1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 import math
6 import os
8 import bisect_utils
9 import math_utils
10 import source_control
11 import ttest
13 from bisect_state import RevisionState
class BisectResults(object):
  """Contains results of the completed bisect.

  Properties:
    error: Error message if the bisect failed.

    If the error is None, the following properties are present:
    warnings: List of warnings from the bisect run.
    state: BisectState object from which these results were generated.
    first_working_revision: First good revision.
    last_broken_revision: Last bad revision.

    If both of above revisions are not None, the follow properties are present:
    culprit_revisions: A list of revisions, which contain the bad change
        introducing the failure.
    other_regressions: A list of tuples representing other regressions, which
        may have occurred.
    regression_size: For performance bisects, this is a relative change of
        the mean metric value. For other bisects this field always contains
        'zero-to-nonzero'.
    regression_std_err: For performance bisects, it is a pooled standard error
        for groups of good and bad runs. Not used for other bisects.
    confidence: For performance bisects, it is a confidence that the good and
        bad runs are distinct groups. Not used for non-performance bisects.
  """

  def __init__(self, bisect_state=None, depot_registry=None, opts=None,
               runtime_warnings=None, error=None, abort_reason=None):
    """Computes final bisect results after a bisect run is complete.

    This constructor should be called in one of the following ways:
      BisectResults(state, depot_registry, opts, runtime_warnings)
      BisectResults(error=error)

    First option creates an object representing successful bisect results,
    while second option creates an error result.

    Args:
      bisect_state: BisectState object representing latest bisect state.
      depot_registry: DepotDirectoryRegistry object with information on each
          repository in the bisect_state.
      opts: Options passed to the bisect run.
      runtime_warnings: A list of warnings from the bisect run.
      error: Error message. When error is not None, other arguments are
          ignored.
    """
    self.error = error
    self.abort_reason = abort_reason
    # An error (or abort) result carries no further data; bail out before
    # touching the other arguments, which are ignored in that case.
    if error is not None or abort_reason is not None:
      return

    assert (bisect_state is not None and depot_registry is not None and
            opts is not None and runtime_warnings is not None), (
                'Incorrect use of the BisectResults constructor. When error '
                'is None, all other arguments are required')

    self.state = bisect_state

    rev_states = bisect_state.GetRevisionStates()
    first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
    self.first_working_revision = first_working_rev
    self.last_broken_revision = last_broken_rev

    self.warnings = runtime_warnings

    self.retest_results_tot = None
    self.retest_results_reverted = None

    if first_working_rev is not None and last_broken_rev is not None:
      statistics = self._ComputeRegressionStatistics(
          rev_states, first_working_rev, last_broken_rev)

      self.regression_size = statistics['regression_size']
      self.regression_std_err = statistics['regression_std_err']
      self.confidence = statistics['confidence']

      self.culprit_revisions = self._FindCulpritRevisions(
          rev_states, depot_registry, first_working_rev, last_broken_rev)

      self.other_regressions = self._FindOtherRegressions(
          rev_states, statistics['bad_greater_than_good'])

      self.warnings += self._GetResultBasedWarnings(
          self.culprit_revisions, opts, self.confidence)
    elif first_working_rev is not None:
      # Setting these attributes so that bisect printer does not break when
      # the regression cannot be reproduced (no broken revision was found).
      self.regression_size = 0
      self.regression_std_err = 0
      self.confidence = 0
      self.culprit_revisions = []
      self.other_regressions = []

  def AddRetestResults(self, results_tot, results_reverted):
    """Records results of re-testing ToT against the reverted culprit CL.

    Appends a warning (and returns early) when either retest produced no
    results, or when the confidence of the comparison is not high.

    Args:
      results_tot: Result tuple from the tip-of-tree retest; element 0 is
          expected to be a dict with a 'values' key.
      results_reverted: Result tuple from the reverted-culprit retest, same
          shape as results_tot.
    """
    if not results_tot or not results_reverted:
      self.warnings.append(
          'Failed to re-test reverted culprit CL against ToT.')
      return

    confidence_params = (results_reverted[0]['values'],
                         results_tot[0]['values'])
    confidence = BisectResults.ConfidenceScore(*confidence_params)

    self.retest_results_tot = RevisionState('ToT', 'n/a', 0)
    self.retest_results_tot.value = results_tot[0]

    self.retest_results_reverted = RevisionState('Reverted', 'n/a', 0)
    self.retest_results_reverted.value = results_reverted[0]

    if confidence <= bisect_utils.HIGH_CONFIDENCE:
      self.warnings.append(
          'Confidence of re-test with reverted CL is not high.'
          ' Check that the regression hasn\'t already recovered. '
          ' There\'s still a chance this is a regression, as performance of'
          ' local builds may not match official builds.')

  @staticmethod
  def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
    """Returns a list of warnings derived from the final bisect results."""
    warnings = []
    if len(culprit_revisions) > 1:
      warnings.append('Due to build errors, regression range could '
                      'not be narrowed down to a single commit.')
    if opts.repeat_test_count == 1:
      warnings.append('Tests were only set to run once. This may '
                      'be insufficient to get meaningful results.')
    if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
      warnings.append('Confidence is not high. Try bisecting again '
                      'with increased repeat_count, larger range, or '
                      'on another metric.')
    if not confidence:
      warnings.append('Confidence score is 0%. Try bisecting again on '
                      'another platform or another metric.')
    return warnings

  @staticmethod
  def ConfidenceScore(sample1, sample2,
                      accept_single_bad_or_good=False):
    """Calculates a confidence score.

    This score is a percentage which represents our degree of confidence in
    the proposition that the good results and bad results are distinct
    groups, and their differences aren't due to chance alone.

    Args:
      sample1: A flat list of "good" result numbers.
      sample2: A flat list of "bad" result numbers.
      accept_single_bad_or_good: If True, computes confidence even if there is
          just one bad or good revision, otherwise single good or bad revision
          always returns 0.0 confidence. This flag will probably get away when
          we will implement expanding the bisect range by one more revision
          for such case.

    Returns:
      A number in the range [0, 100].
    """
    # If there's only one item in either list, this means only one revision
    # was classified good or bad; this isn't good enough evidence to make a
    # decision. If an empty list was passed, that also implies zero
    # confidence.
    if not accept_single_bad_or_good:
      if len(sample1) <= 1 or len(sample2) <= 1:
        return 0.0

    # If there were only empty lists in either of the lists (this is
    # unexpected and normally shouldn't happen), then we also want to
    # return 0.
    if not sample1 or not sample2:
      return 0.0

    # The p-value is approximately the probability of obtaining the given set
    # of good and bad values just by chance.
    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
    return 100.0 * (1.0 - p_value)

  @classmethod
  def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_states: Sorted list of RevisionState objects.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
    other_regressions = []
    previous_values = []
    prev_state = None
    for revision_state in revision_states:
      if revision_state.value:
        current_values = revision_state.value['values']
        if previous_values:
          # previous_values is a list of lists; flatten it for comparison
          # against the (already flat) current values.
          confidence_params = (sum(previous_values, []), current_values)
          confidence = cls.ConfidenceScore(*confidence_params,
                                           accept_single_bad_or_good=True)
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs < the
          # mean of the current runs, this local regression is in same
          # direction.
          prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
          is_same_direction = (prev_greater_than_current if
              bad_greater_than_good else not prev_greater_than_current)

          # Only report potential regressions with high confidence.
          if is_same_direction and confidence > 50:
            other_regressions.append([revision_state, prev_state, confidence])
        previous_values.append(current_values)
        prev_state = revision_state
    return other_regressions

  @staticmethod
  def FindBreakingRevRange(revision_states):
    """Finds the last known good and first known bad revisions.

    Note that since revision_states is expected to be in reverse
    chronological order, the last known good revision is the first revision
    in the list that has the passed property set to 1, therefore the name
    `first_working_revision`. The inverse applies to `last_broken_revision`.

    Args:
      revision_states: A list of RevisionState instances.

    Returns:
      A tuple containing the two revision states at the border. (Last
      known good and first known bad.)
    """
    first_working_revision = None
    last_broken_revision = None

    for revision_state in revision_states:
      if revision_state.passed == 1 and not first_working_revision:
        first_working_revision = revision_state

      if not revision_state.passed:
        last_broken_revision = revision_state

    return first_working_revision, last_broken_revision

  @staticmethod
  def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
                            last_broken_rev):
    """Queries revision info for every revision inside the breaking range.

    Returns:
      A list of (revision, info, depot) tuples for the culprit candidates.
    """
    cwd = os.getcwd()

    culprit_revisions = []
    # range (not the Py2-only xrange) keeps this importable under Python 3;
    # behavior is identical for iteration in Python 2.
    for i in range(last_broken_rev.index, first_working_rev.index):
      depot_registry.ChangeToDepotDir(revision_states[i].depot)
      info = source_control.QueryRevisionInfo(revision_states[i].revision)
      culprit_revisions.append((revision_states[i].revision, info,
                                revision_states[i].depot))

    # ChangeToDepotDir may have changed the working directory; restore it.
    os.chdir(cwd)
    return culprit_revisions

  @classmethod
  def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                   last_broken_rev):
    """Computes regression size, pooled std error and confidence.

    Returns:
      A dict with keys 'regression_size', 'regression_std_err', 'confidence'
      and 'bad_greater_than_good'.
    """
    # TODO(sergiyb): We assume that value has "values" key, which may not be
    # the case for failure-bisects, where there is a single value only.
    broken_means = [state.value['values']
                    for state in rev_states[:last_broken_rev.index + 1]
                    if state.value]

    working_means = [state.value['values']
                     for state in rev_states[first_working_rev.index:]
                     if state.value]

    # Flatten the lists to calculate mean of all values.
    working_mean = sum(working_means, [])
    broken_mean = sum(broken_means, [])

    # Calculate the approximate size of the regression.
    mean_of_bad_runs = math_utils.Mean(broken_mean)
    mean_of_good_runs = math_utils.Mean(working_mean)

    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                      mean_of_bad_runs)
    # RelativeChange is NaN when one mean is zero and the other is not
    # representable as a relative change; report a symbolic size instead.
    if math.isnan(regression_size):
      regression_size = 'zero-to-nonzero'

    regression_std_err = math.fabs(math_utils.PooledStandardError(
        [working_mean, broken_mean]) /
        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

    # Give a "confidence" in the bisect. Currently, we consider the values of
    # only the revisions at the breaking range (last known good and first
    # known bad) see the note in the docstring for FindBreakingRevRange.
    confidence_params = (first_working_rev.value['values'],
                         last_broken_rev.value['values'])
    confidence = cls.ConfidenceScore(*confidence_params)

    bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

    return {'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'bad_greater_than_good': bad_greater_than_good}