Explicitly add python-numpy dependency to install-build-deps.
[chromium-blink-merge.git] / tools / auto_bisect / bisect_results.py
blobdf93de6799577b361816e83bc3cf736d2a1ac55c
1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 import math
6 import os
8 import bisect_utils
9 import math_utils
10 import source_control
11 import ttest
14 class BisectResults(object):
15 """Contains results of the completed bisect.
17 Properties:
18 error: Error message if the bisect failed.
20 If the error is None, the following properties are present:
21 warnings: List of warnings from the bisect run.
22 state: BisectState object from which these results were generated.
23 first_working_revision: First good revision.
24 last_broken_revision: Last bad revision.
26 If both of above revisions are not None, the follow properties are present:
27 culprit_revisions: A list of revisions, which contain the bad change
28 introducing the failure.
29 other_regressions: A list of tuples representing other regressions, which
30 may have occured.
31 regression_size: For performance bisects, this is a relative change of
32 the mean metric value. For other bisects this field always contains
33 'zero-to-nonzero'.
34 regression_std_err: For performance bisects, it is a pooled standard error
35 for groups of good and bad runs. Not used for other bisects.
36 confidence: For performance bisects, it is a confidence that the good and
37 bad runs are distinct groups. Not used for non-performance bisects.
38 """
40 def __init__(self, bisect_state=None, depot_registry=None, opts=None,
41 runtime_warnings=None, error=None, abort_reason=None):
42 """Computes final bisect results after a bisect run is complete.
44 This constructor should be called in one of the following ways:
45 BisectResults(state, depot_registry, opts, runtime_warnings)
46 BisectResults(error=error)
48 First option creates an object representing successful bisect results, while
49 second option creates an error result.
51 Args:
52 bisect_state: BisectState object representing latest bisect state.
53 depot_registry: DepotDirectoryRegistry object with information on each
54 repository in the bisect_state.
55 opts: Options passed to the bisect run.
56 runtime_warnings: A list of warnings from the bisect run.
57 error: Error message. When error is not None, other arguments are ignored.
58 """
60 self.error = error
61 self.abort_reason = abort_reason
62 if error is not None or abort_reason is not None:
63 return
65 assert (bisect_state is not None and depot_registry is not None and
66 opts is not None and runtime_warnings is not None), (
67 'Incorrect use of the BisectResults constructor. When error is '
68 'None, all other arguments are required')
70 self.state = bisect_state
72 rev_states = bisect_state.GetRevisionStates()
73 first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
74 self.first_working_revision = first_working_rev
75 self.last_broken_revision = last_broken_rev
77 self.warnings = runtime_warnings
79 if first_working_rev is not None and last_broken_rev is not None:
80 statistics = self._ComputeRegressionStatistics(
81 rev_states, first_working_rev, last_broken_rev)
83 self.regression_size = statistics['regression_size']
84 self.regression_std_err = statistics['regression_std_err']
85 self.confidence = statistics['confidence']
87 self.culprit_revisions = self._FindCulpritRevisions(
88 rev_states, depot_registry, first_working_rev, last_broken_rev)
90 self.other_regressions = self._FindOtherRegressions(
91 rev_states, statistics['bad_greater_than_good'])
93 self.warnings += self._GetResultBasedWarnings(
94 self.culprit_revisions, opts, self.confidence)
95 elif first_working_rev is not None:
96 # Setting these attributes so that bisect printer does not break when the
97 # regression cannot be reproduced (no broken revision was found)
98 self.regression_size = 0
99 self.regression_std_err = 0
100 self.confidence = 0
101 self.culprit_revisions = []
102 self.other_regressions = []
104 @staticmethod
105 def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
106 warnings = []
107 if len(culprit_revisions) > 1:
108 warnings.append('Due to build errors, regression range could '
109 'not be narrowed down to a single commit.')
110 if opts.repeat_test_count == 1:
111 warnings.append('Tests were only set to run once. This may '
112 'be insufficient to get meaningful results.')
113 if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
114 warnings.append('Confidence is not high. Try bisecting again '
115 'with increased repeat_count, larger range, or '
116 'on another metric.')
117 if not confidence:
118 warnings.append('Confidence score is 0%. Try bisecting again on '
119 'another platform or another metric.')
120 return warnings
122 @staticmethod
123 def ConfidenceScore(sample1, sample2,
124 accept_single_bad_or_good=False):
125 """Calculates a confidence score.
127 This score is a percentage which represents our degree of confidence in the
128 proposition that the good results and bad results are distinct groups, and
129 their differences aren't due to chance alone.
132 Args:
133 sample1: A flat list of "good" result numbers.
134 sample2: A flat list of "bad" result numbers.
135 accept_single_bad_or_good: If True, computes confidence even if there is
136 just one bad or good revision, otherwise single good or bad revision
137 always returns 0.0 confidence. This flag will probably get away when
138 we will implement expanding the bisect range by one more revision for
139 such case.
141 Returns:
142 A number in the range [0, 100].
144 # If there's only one item in either list, this means only one revision was
145 # classified good or bad; this isn't good enough evidence to make a
146 # decision. If an empty list was passed, that also implies zero confidence.
147 if not accept_single_bad_or_good:
148 if len(sample1) <= 1 or len(sample2) <= 1:
149 return 0.0
151 # If there were only empty lists in either of the lists (this is unexpected
152 # and normally shouldn't happen), then we also want to return 0.
153 if not sample1 or not sample2:
154 return 0.0
156 # The p-value is approximately the probability of obtaining the given set
157 # of good and bad values just by chance.
158 _, _, p_value = ttest.WelchsTTest(sample1, sample2)
159 return 100.0 * (1.0 - p_value)
161 @classmethod
162 def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
163 """Compiles a list of other possible regressions from the revision data.
165 Args:
166 revision_states: Sorted list of RevisionState objects.
167 bad_greater_than_good: Whether the result value at the "bad" revision is
168 numerically greater than the result value at the "good" revision.
170 Returns:
171 A list of [current_rev, previous_rev, confidence] for other places where
172 there may have been a regression.
174 other_regressions = []
175 previous_values = []
176 prev_state = None
177 for revision_state in revision_states:
178 if revision_state.value:
179 current_values = revision_state.value['values']
180 if previous_values:
181 confidence_params = (sum(previous_values, []),
182 sum([current_values], []))
183 confidence = cls.ConfidenceScore(*confidence_params,
184 accept_single_bad_or_good=True)
185 mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
186 mean_of_current_runs = math_utils.Mean(current_values)
188 # Check that the potential regression is in the same direction as
189 # the overall regression. If the mean of the previous runs < the
190 # mean of the current runs, this local regression is in same
191 # direction.
192 prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
193 is_same_direction = (prev_greater_than_current if
194 bad_greater_than_good else not prev_greater_than_current)
196 # Only report potential regressions with high confidence.
197 if is_same_direction and confidence > 50:
198 other_regressions.append([revision_state, prev_state, confidence])
199 previous_values.append(current_values)
200 prev_state = revision_state
201 return other_regressions
203 @staticmethod
204 def FindBreakingRevRange(revision_states):
205 first_working_revision = None
206 last_broken_revision = None
208 for revision_state in revision_states:
209 if revision_state.passed == 1 and not first_working_revision:
210 first_working_revision = revision_state
212 if not revision_state.passed:
213 last_broken_revision = revision_state
215 return first_working_revision, last_broken_revision
217 @staticmethod
218 def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
219 last_broken_rev):
220 cwd = os.getcwd()
222 culprit_revisions = []
223 for i in xrange(last_broken_rev.index, first_working_rev.index):
224 depot_registry.ChangeToDepotDir(revision_states[i].depot)
225 info = source_control.QueryRevisionInfo(revision_states[i].revision)
226 culprit_revisions.append((revision_states[i].revision, info,
227 revision_states[i].depot))
229 os.chdir(cwd)
230 return culprit_revisions
232 @classmethod
233 def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
234 last_broken_rev):
235 # TODO(sergiyb): We assume that value has "values" key, which may not be
236 # the case for failure-bisects, where there is a single value only.
237 broken_means = [state.value['values']
238 for state in rev_states[:last_broken_rev.index+1]
239 if state.value]
241 working_means = [state.value['values']
242 for state in rev_states[first_working_rev.index:]
243 if state.value]
245 # Flatten the lists to calculate mean of all values.
246 working_mean = sum(working_means, [])
247 broken_mean = sum(broken_means, [])
249 # Calculate the approximate size of the regression
250 mean_of_bad_runs = math_utils.Mean(broken_mean)
251 mean_of_good_runs = math_utils.Mean(working_mean)
253 regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
254 mean_of_bad_runs)
255 if math.isnan(regression_size):
256 regression_size = 'zero-to-nonzero'
258 regression_std_err = math.fabs(math_utils.PooledStandardError(
259 [working_mean, broken_mean]) /
260 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
262 # Give a "confidence" in the bisect. At the moment we use how distinct the
263 # values are before and after the last broken revision, and how noisy the
264 # overall graph is.
265 confidence_params = (sum(working_means, []), sum(broken_means, []))
266 confidence = cls.ConfidenceScore(*confidence_params)
268 bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs
270 return {'regression_size': regression_size,
271 'regression_std_err': regression_std_err,
272 'confidence': confidence,
273 'bad_greater_than_good': bad_greater_than_good}