# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import math
import os

import bisect_utils
import math_utils
import source_control
import ttest


class BisectResults(object):
  """Contains results of the completed bisect.

  Attributes:
    error: Error message if the bisect failed.

    If the error is None, the following properties are present:
    warnings: List of warnings from the bisect run.
    state: BisectState object from which these results were generated.
    first_working_revision: First good revision.
    last_broken_revision: Last bad revision.

    If both of the above revisions are not None, the following properties are
    present:
    culprit_revisions: A list of revisions containing the bad change that
        introduced the failure.
    other_regressions: A list of tuples representing other regressions, which
        may have occurred elsewhere in the tested revision range.
    regression_size: For performance bisects, this is a relative change of
        the mean metric value. For other bisects this field always contains
        'zero-to-nonzero'.
    regression_std_err: For performance bisects, it is a pooled standard error
        for groups of good and bad runs. Not used for other bisects.
    confidence: For performance bisects, it is a confidence that the good and
        bad runs are distinct groups. Not used for non-performance bisects.
  """

  def __init__(self, bisect_state=None, depot_registry=None, opts=None,
               runtime_warnings=None, error=None, abort_reason=None):
    """Computes final bisect results after a bisect run is complete.

    This constructor should be called in one of the following ways:
      BisectResults(state, depot_registry, opts, runtime_warnings)
      BisectResults(error=error)

    The first option creates an object representing successful bisect results,
    while the second option creates an error result.

    Args:
      bisect_state: BisectState object representing the latest bisect state.
      depot_registry: DepotDirectoryRegistry object with information on each
          repository in the bisect_state.
      opts: Options passed to the bisect run.
      runtime_warnings: A list of warnings from the bisect run.
      error: Error message. When error is not None, other arguments are
          ignored.
    """
    self.error = error
    self.abort_reason = abort_reason
    if error is not None or abort_reason is not None:
      return

    assert (bisect_state is not None and depot_registry is not None and
            opts is not None and runtime_warnings is not None), (
                'Incorrect use of the BisectResults constructor. When error is '
                'None, all other arguments are required.')

    self.state = bisect_state

    rev_states = bisect_state.GetRevisionStates()
    first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
    self.first_working_revision = first_working_rev
    self.last_broken_revision = last_broken_rev

    self.warnings = runtime_warnings

    if first_working_rev is not None and last_broken_rev is not None:
      statistics = self._ComputeRegressionStatistics(
          rev_states, first_working_rev, last_broken_rev)

      self.regression_size = statistics['regression_size']
      self.regression_std_err = statistics['regression_std_err']
      self.confidence = statistics['confidence']

      self.culprit_revisions = self._FindCulpritRevisions(
          rev_states, depot_registry, first_working_rev, last_broken_rev)

      self.other_regressions = self._FindOtherRegressions(
          rev_states, statistics['bad_greater_than_good'])

      self.warnings += self._GetResultBasedWarnings(
          self.culprit_revisions, opts, self.confidence)
    elif first_working_rev is not None:
      # Set these attributes so that the bisect printer does not break when
      # the regression cannot be reproduced (no broken revision was found).
      self.regression_size = 0
      self.regression_std_err = 0
      self.confidence = 0
      self.culprit_revisions = []
      self.other_regressions = []

  @staticmethod
  def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
    warnings = []
    if len(culprit_revisions) > 1:
      warnings.append('Due to build errors, regression range could '
                      'not be narrowed down to a single commit.')
    if opts.repeat_test_count == 1:
      warnings.append('Tests were only set to run once. This may '
                      'be insufficient to get meaningful results.')
    if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
      warnings.append('Confidence is not high. Try bisecting again '
                      'with increased repeat_count, larger range, or '
                      'on another metric.')
    if not confidence:
      warnings.append('Confidence score is 0%. Try bisecting again on '
                      'another platform or another metric.')
    return warnings

  @staticmethod
  def ConfidenceScore(sample1, sample2,
                      accept_single_bad_or_good=False):
    """Calculates a confidence score.

    This score is a percentage which represents our degree of confidence in
    the proposition that the good results and bad results are distinct groups,
    and that their differences aren't due to chance alone.

    Args:
      sample1: A flat list of "good" result numbers.
      sample2: A flat list of "bad" result numbers.
      accept_single_bad_or_good: If True, computes confidence even if there is
          just one bad or good revision; otherwise a single good or bad
          revision always returns 0.0 confidence. This flag will probably go
          away once expanding the bisect range by one more revision is
          implemented for such cases.

    Returns:
      A number in the range [0, 100].
    """
    # If there's only one item in either list, this means only one revision
    # was classified good or bad; this isn't good enough evidence to make a
    # decision. If an empty list was passed, that also implies zero confidence.
    if not accept_single_bad_or_good:
      if len(sample1) <= 1 or len(sample2) <= 1:
        return 0.0

    # If either list is empty, e.g. because it contained only empty sub-lists
    # (this is unexpected and normally shouldn't happen), also return 0.
    if not sample1 or not sample2:
      return 0.0

    # The p-value is approximately the probability of obtaining the given set
    # of good and bad values just by chance.
    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
    return 100.0 * (1.0 - p_value)

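  # Illustrative sketch (hypothetical numbers, not from a real bisect): for
  # well-separated samples such as sample1=[10.0, 10.1, 9.9] and
  # sample2=[12.0, 11.9, 12.1], Welch's t-test yields a p-value close to 0,
  # so ConfidenceScore returns a value close to 100; heavily overlapping
  # samples yield a score near 0.
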
  @classmethod
  def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_states: Sorted list of RevisionState objects.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
    other_regressions = []
    previous_values = []
    prev_state = None
    for revision_state in revision_states:
      if revision_state.value:
        current_values = revision_state.value['values']
        if previous_values:
          confidence_params = (sum(previous_values, []),
                               sum([current_values], []))
          confidence = cls.ConfidenceScore(*confidence_params,
                                           accept_single_bad_or_good=True)
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs < the
          # mean of the current runs, this local regression is in the same
          # direction.
          prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
          is_same_direction = (prev_greater_than_current if
              bad_greater_than_good else not prev_greater_than_current)

          # Only report potential regressions with high confidence.
          if is_same_direction and confidence > 50:
            other_regressions.append([revision_state, prev_state, confidence])
        previous_values.append(current_values)
        prev_state = revision_state
    return other_regressions

  @staticmethod
  def FindBreakingRevRange(revision_states):
    """Finds the first working and the last broken revision states, if any."""
    first_working_revision = None
    last_broken_revision = None

    for revision_state in revision_states:
      if revision_state.passed == 1 and not first_working_revision:
        first_working_revision = revision_state

      if not revision_state.passed:
        last_broken_revision = revision_state

    return first_working_revision, last_broken_revision

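  # Illustrative example (hypothetical data): assuming revision states are
  # ordered from bad to good, as the slicing in _ComputeRegressionStatistics
  # implies, passed flags of [0, 0, 1, 1] give last_broken_revision = the
  # state at index 1 and first_working_revision = the state at index 2.
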
  @staticmethod
  def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
                            last_broken_rev):
    """Returns a list of (revision, info, depot) tuples for the culprit range."""
    cwd = os.getcwd()

    culprit_revisions = []
    for i in xrange(last_broken_rev.index, first_working_rev.index):
      depot_registry.ChangeToDepotDir(revision_states[i].depot)
      info = source_control.QueryRevisionInfo(revision_states[i].revision)
      culprit_revisions.append((revision_states[i].revision, info,
                                revision_states[i].depot))
    # ChangeToDepotDir changes the working directory; restore it before
    # returning.
    os.chdir(cwd)

    return culprit_revisions

  @classmethod
  def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                   last_broken_rev):
    # TODO(sergiyb): We assume that value has the "values" key, which may not
    # be the case for failure-bisects, where there is a single value only.
    broken_means = [state.value['values']
                    for state in rev_states[:last_broken_rev.index + 1]
                    if state.value]

    working_means = [state.value['values']
                     for state in rev_states[first_working_rev.index:]
                     if state.value]

    # Flatten the lists to calculate the mean of all values.
    working_mean = sum(working_means, [])
    broken_mean = sum(broken_means, [])

    # Calculate the approximate size of the regression.
    mean_of_bad_runs = math_utils.Mean(broken_mean)
    mean_of_good_runs = math_utils.Mean(working_mean)

    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                      mean_of_bad_runs)
    if math.isnan(regression_size):
      regression_size = 'zero-to-nonzero'

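    # Worked example (hypothetical numbers, not from a real run): assuming
    # math_utils.RelativeChange(before, after) returns abs(after - before) /
    # before, good runs averaging 100.0 and bad runs averaging 110.0 give a
    # relative change of 0.1, so regression_size is reported as 10 (percent).
    # The 'zero-to-nonzero' fallback above covers the NaN case, which the
    # string suggests arises when the good mean is zero.
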
    regression_std_err = math.fabs(math_utils.PooledStandardError(
        [working_mean, broken_mean]) /
        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

    # Give a "confidence" in the bisect. At the moment we use how distinct the
    # values are before and after the last broken revision, and how noisy the
    # overall graph is.
    confidence_params = (sum(working_means, []), sum(broken_means, []))
    confidence = cls.ConfidenceScore(*confidence_params)

    bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

    return {'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'bad_greater_than_good': bad_greater_than_good}
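

# Usage sketch (hypothetical driver code; `state`, `depot_registry` and `opts`
# are assumed to come from the bisect driver and are not defined here):
#
#   results = BisectResults(state, depot_registry, opts, runtime_warnings=[])
#   if results.error:
#     print 'Bisect failed: %s' % results.error
#   elif results.first_working_revision and results.last_broken_revision:
#     print 'Culprit revisions: %r (confidence: %s%%)' % (
#         results.culprit_revisions, results.confidence)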