Roll src/third_party/WebKit 3aea697:d9c6159 (svn 201973:201974)
[chromium-blink-merge.git] / tools / auto_bisect / bisect_results.py
blobf6ba0d856d54860e9ce86f3ff84a661085ce86ee
1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 import math
6 import os
8 import bisect_utils
9 import math_utils
10 import source_control
11 import ttest
13 from bisect_state import RevisionState
16 class BisectResults(object):
17 """Contains results of the completed bisect.
19 Properties:
20 error: Error message if the bisect failed.
22 If the error is None, the following properties are present:
23 warnings: List of warnings from the bisect run.
24 state: BisectState object from which these results were generated.
25 first_working_revision: First good revision.
26 last_broken_revision: Last bad revision.
28 If both of above revisions are not None, the follow properties are present:
29 culprit_revisions: A list of revisions, which contain the bad change
30 introducing the failure.
31 regression_size: For performance bisects, this is a relative change of
32 the mean metric value. For other bisects this field always contains
33 'zero-to-nonzero'.
34 regression_std_err: For performance bisects, it is a pooled standard error
35 for groups of good and bad runs. Not used for other bisects.
36 confidence: For performance bisects, it is a confidence that the good and
37 bad runs are distinct groups. Not used for non-performance bisects.
38 """
40 def __init__(self, bisect_state=None, depot_registry=None, opts=None,
41 runtime_warnings=None, error=None, abort_reason=None):
42 """Computes final bisect results after a bisect run is complete.
44 This constructor should be called in one of the following ways:
45 BisectResults(state, depot_registry, opts, runtime_warnings)
46 BisectResults(error=error)
48 First option creates an object representing successful bisect results, while
49 second option creates an error result.
51 Args:
52 bisect_state: BisectState object representing latest bisect state.
53 depot_registry: DepotDirectoryRegistry object with information on each
54 repository in the bisect_state.
55 opts: Options passed to the bisect run.
56 runtime_warnings: A list of warnings from the bisect run.
57 error: Error message. When error is not None, other arguments are ignored.
58 """
59 self.error = error
60 self.abort_reason = abort_reason
61 if error is not None or abort_reason is not None:
62 return
64 assert (bisect_state is not None and depot_registry is not None and
65 opts is not None and runtime_warnings is not None), (
66 'Incorrect use of the BisectResults constructor. '
67 'When error is None, all other arguments are required.')
69 self.state = bisect_state
71 rev_states = bisect_state.GetRevisionStates()
72 first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
73 self.first_working_revision = first_working_rev
74 self.last_broken_revision = last_broken_rev
76 self.warnings = runtime_warnings
78 self.retest_results_tot = None
79 self.retest_results_reverted = None
81 if first_working_rev is not None and last_broken_rev is not None:
82 statistics = self._ComputeRegressionStatistics(
83 rev_states, first_working_rev, last_broken_rev)
85 self.regression_size = statistics['regression_size']
86 self.regression_std_err = statistics['regression_std_err']
87 self.confidence = statistics['confidence']
89 self.culprit_revisions = self._FindCulpritRevisions(
90 rev_states, depot_registry, first_working_rev, last_broken_rev)
92 self.warnings += self._GetResultBasedWarnings(
93 self.culprit_revisions, opts, self.confidence)
94 elif first_working_rev is not None:
95 # Setting these attributes so that bisect printer does not break when the
96 # regression cannot be reproduced (no broken revision was found)
97 self.regression_size = 0
98 self.regression_std_err = 0
99 self.confidence = 0
100 self.culprit_revisions = []
102 def AddRetestResults(self, results_tot, results_reverted):
103 if not results_tot or not results_reverted:
104 self.warnings.append(
105 'Failed to re-test reverted culprit CL against ToT.')
106 return
108 confidence = BisectResults.ConfidenceScore(
109 results_reverted[0]['values'],
110 results_tot[0]['values'])
112 self.retest_results_tot = RevisionState('ToT', 'n/a', 0)
113 self.retest_results_tot.value = results_tot[0]
115 self.retest_results_reverted = RevisionState('Reverted', 'n/a', 0)
116 self.retest_results_reverted.value = results_reverted[0]
118 if confidence <= bisect_utils.HIGH_CONFIDENCE:
119 self.warnings.append(
120 'Confidence of re-test with reverted CL is not high.'
121 ' Check that the regression hasn\'t already recovered. '
122 ' There\'s still a chance this is a regression, as performance of'
123 ' local builds may not match official builds.')
125 @staticmethod
126 def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
127 warnings = []
128 if len(culprit_revisions) > 1:
129 warnings.append('Due to build errors, regression range could '
130 'not be narrowed down to a single commit.')
131 if opts.repeat_test_count == 1:
132 warnings.append('Tests were only set to run once. This may '
133 'be insufficient to get meaningful results.')
134 if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
135 warnings.append('Confidence is not high. Try bisecting again '
136 'with increased repeat_count, larger range, or '
137 'on another metric.')
138 if not confidence:
139 warnings.append('Confidence score is 0%. Try bisecting again on '
140 'another platform or another metric.')
141 return warnings
143 @staticmethod
144 def ConfidenceScore(sample1, sample2, accept_single_bad_or_good=False):
145 """Calculates a confidence score.
147 This score is based on a statistical hypothesis test. The null
148 hypothesis is that the two groups of results have no difference,
149 i.e. there is no performance regression. The alternative hypothesis
150 is that there is some difference between the groups that's unlikely
151 to occur by chance.
153 The score returned by this function represents our confidence in the
154 alternative hypothesis.
156 Note that if there's only one item in either sample, this means only
157 one revision was classified good or bad, so there's not much evidence
158 to make a decision.
160 Args:
161 sample1: A flat list of "good" result numbers.
162 sample2: A flat list of "bad" result numbers.
163 accept_single_bad_or_good: If True, compute a value even if
164 there is only one bad or good revision.
166 Returns:
167 A float between 0 and 100; 0 if the samples aren't large enough.
169 if ((len(sample1) <= 1 or len(sample2) <= 1) and
170 not accept_single_bad_or_good):
171 return 0.0
172 if not sample1 or not sample2:
173 return 0.0
174 _, _, p_value = ttest.WelchsTTest(sample1, sample2)
175 return 100.0 * (1.0 - p_value)
177 @staticmethod
178 def FindBreakingRevRange(revision_states):
179 """Finds the last known good and first known bad revisions.
181 Note that since revision_states is expected to be in reverse chronological
182 order, the last known good revision is the first revision in the list that
183 has the passed property set to 1, therefore the name
184 `first_working_revision`. The inverse applies to `last_broken_revision`.
186 Args:
187 revision_states: A list of RevisionState instances.
189 Returns:
190 A tuple containing the two revision states at the border. (Last
191 known good and first known bad.)
193 first_working_revision = None
194 last_broken_revision = None
196 for revision_state in revision_states:
197 if revision_state.passed == 1 and not first_working_revision:
198 first_working_revision = revision_state
200 if not revision_state.passed:
201 last_broken_revision = revision_state
203 return first_working_revision, last_broken_revision
205 @staticmethod
206 def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
207 last_broken_rev):
208 cwd = os.getcwd()
210 culprit_revisions = []
211 for i in xrange(last_broken_rev.index, first_working_rev.index):
212 depot_registry.ChangeToDepotDir(revision_states[i].depot)
213 info = source_control.QueryRevisionInfo(revision_states[i].revision)
214 culprit_revisions.append((revision_states[i].revision, info,
215 revision_states[i].depot))
217 os.chdir(cwd)
218 return culprit_revisions
220 @classmethod
221 def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
222 last_broken_rev):
223 # TODO(sergiyb): We assume that value has "values" key, which may not be
224 # the case for failure-bisects, where there is a single value only.
225 broken_means = [state.value['values']
226 for state in rev_states[:last_broken_rev.index+1]
227 if state.value]
229 working_means = [state.value['values']
230 for state in rev_states[first_working_rev.index:]
231 if state.value]
233 # Flatten the lists to calculate mean of all values.
234 working_mean = sum(working_means, [])
235 broken_mean = sum(broken_means, [])
237 # Calculate the approximate size of the regression
238 mean_of_bad_runs = math_utils.Mean(broken_mean)
239 mean_of_good_runs = math_utils.Mean(working_mean)
241 regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
242 mean_of_bad_runs)
243 if math.isnan(regression_size):
244 regression_size = 'zero-to-nonzero'
246 regression_std_err = math.fabs(math_utils.PooledStandardError(
247 [working_mean, broken_mean]) /
248 max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0
250 # Give a "confidence" in the bisect culprit by seeing whether the results
251 # of the culprit revision and the revision before that appear to be
252 # statistically significantly different.
253 confidence = cls.ConfidenceScore(
254 sum([first_working_rev.value['values']], []),
255 sum([last_broken_rev.value['values']], []))
257 bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs
259 return {'regression_size': regression_size,
260 'regression_std_err': regression_std_err,
261 'confidence': confidence,
262 'bad_greater_than_good': bad_greater_than_good}