Reland the ULONG -> SIZE_T change from 317177
[chromium-blink-merge.git] / tools / auto_bisect / bisect_results.py
blob07b7806809dced723375392e01a91447040d5540
1 # Copyright 2014 The Chromium Authors. All rights reserved.
2 # Use of this source code is governed by a BSD-style license that can be
3 # found in the LICENSE file.
5 import math
6 import os
8 import bisect_utils
9 import math_utils
10 import source_control
11 import ttest
13 from bisect_state import RevisionState
class BisectResults(object):
  """Contains results of the completed bisect.

  Properties:
    error: Error message if the bisect failed.

    If the error is None, the following properties are present:
    warnings: List of warnings from the bisect run.
    state: BisectState object from which these results were generated.
    first_working_revision: First good revision.
    last_broken_revision: Last bad revision.

    If both of above revisions are not None, the follow properties are present:
    culprit_revisions: A list of revisions, which contain the bad change
        introducing the failure.
    other_regressions: A list of tuples representing other regressions, which
        may have occurred.
    regression_size: For performance bisects, this is a relative change of
        the mean metric value. For other bisects this field always contains
        'zero-to-nonzero'.
    regression_std_err: For performance bisects, it is a pooled standard error
        for groups of good and bad runs. Not used for other bisects.
    confidence: For performance bisects, it is a confidence that the good and
        bad runs are distinct groups. Not used for non-performance bisects.
  """

  def __init__(self, bisect_state=None, depot_registry=None, opts=None,
               runtime_warnings=None, error=None, abort_reason=None):
    """Computes final bisect results after a bisect run is complete.

    This constructor should be called in one of the following ways:
      BisectResults(state, depot_registry, opts, runtime_warnings)
      BisectResults(error=error)

    First option creates an object representing successful bisect results,
    while second option creates an error result.

    Args:
      bisect_state: BisectState object representing latest bisect state.
      depot_registry: DepotDirectoryRegistry object with information on each
          repository in the bisect_state.
      opts: Options passed to the bisect run.
      runtime_warnings: A list of warnings from the bisect run.
      error: Error message. When error is not None, other arguments are
          ignored.
    """
    self.error = error
    self.abort_reason = abort_reason
    # An error (or abort) result carries no further data; bail out before
    # touching the other arguments, which are ignored in that case.
    if error is not None or abort_reason is not None:
      return

    assert (bisect_state is not None and depot_registry is not None and
            opts is not None and runtime_warnings is not None), (
                'Incorrect use of the BisectResults constructor. When error '
                'is None, all other arguments are required')

    self.state = bisect_state

    rev_states = bisect_state.GetRevisionStates()
    first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
    self.first_working_revision = first_working_rev
    self.last_broken_revision = last_broken_rev

    self.warnings = runtime_warnings

    self.retest_results_tot = None
    self.retest_results_reverted = None

    if first_working_rev is not None and last_broken_rev is not None:
      statistics = self._ComputeRegressionStatistics(
          rev_states, first_working_rev, last_broken_rev)

      self.regression_size = statistics['regression_size']
      self.regression_std_err = statistics['regression_std_err']
      self.confidence = statistics['confidence']

      self.culprit_revisions = self._FindCulpritRevisions(
          rev_states, depot_registry, first_working_rev, last_broken_rev)

      self.other_regressions = self._FindOtherRegressions(
          rev_states, statistics['bad_greater_than_good'])

      self.warnings += self._GetResultBasedWarnings(
          self.culprit_revisions, opts, self.confidence)
    elif first_working_rev is not None:
      # Setting these attributes so that bisect printer does not break when
      # the regression cannot be reproduced (no broken revision was found).
      self.regression_size = 0
      self.regression_std_err = 0
      self.confidence = 0
      self.culprit_revisions = []
      self.other_regressions = []

  def AddRetestResults(self, results_tot, results_reverted):
    """Records results of re-testing ToT against the reverted culprit CL.

    Appends a warning (and returns early) when either retest produced no
    results, or when the confidence of the comparison is not high.

    Args:
      results_tot: Result tuple from the tip-of-tree retest; element 0 is
          expected to be a dict with a 'values' key.
      results_reverted: Result tuple from the reverted-culprit retest, same
          shape as results_tot.
    """
    if not results_tot or not results_reverted:
      self.warnings.append(
          'Failed to re-test reverted culprit CL against ToT.')
      return

    confidence_params = (results_reverted[0]['values'],
                         results_tot[0]['values'])
    confidence = BisectResults.ConfidenceScore(*confidence_params)

    self.retest_results_tot = RevisionState('ToT', 'n/a', 0)
    self.retest_results_tot.value = results_tot[0]

    self.retest_results_reverted = RevisionState('Reverted', 'n/a', 0)
    self.retest_results_reverted.value = results_reverted[0]

    if confidence <= bisect_utils.HIGH_CONFIDENCE:
      self.warnings.append(
          'Confidence of re-test with reverted CL is not high.'
          ' Check that the regression hasn\'t already recovered. '
          ' There\'s still a chance this is a regression, as performance of'
          ' local builds may not match official builds.')

  @staticmethod
  def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
    """Returns a list of warnings derived from the final bisect results."""
    warnings = []
    if len(culprit_revisions) > 1:
      warnings.append('Due to build errors, regression range could '
                      'not be narrowed down to a single commit.')
    if opts.repeat_test_count == 1:
      warnings.append('Tests were only set to run once. This may '
                      'be insufficient to get meaningful results.')
    if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
      warnings.append('Confidence is not high. Try bisecting again '
                      'with increased repeat_count, larger range, or '
                      'on another metric.')
    if not confidence:
      warnings.append('Confidence score is 0%. Try bisecting again on '
                      'another platform or another metric.')
    return warnings

  @staticmethod
  def ConfidenceScore(sample1, sample2,
                      accept_single_bad_or_good=False):
    """Calculates a confidence score.

    This score is a percentage which represents our degree of confidence in
    the proposition that the good results and bad results are distinct
    groups, and their differences aren't due to chance alone.

    Args:
      sample1: A flat list of "good" result numbers.
      sample2: A flat list of "bad" result numbers.
      accept_single_bad_or_good: If True, computes confidence even if there is
          just one bad or good revision, otherwise single good or bad revision
          always returns 0.0 confidence. This flag will probably get away when
          we will implement expanding the bisect range by one more revision
          for such case.

    Returns:
      A number in the range [0, 100].
    """
    # If there's only one item in either list, this means only one revision
    # was classified good or bad; this isn't good enough evidence to make a
    # decision. If an empty list was passed, that also implies zero
    # confidence.
    if not accept_single_bad_or_good:
      if len(sample1) <= 1 or len(sample2) <= 1:
        return 0.0

    # If there were only empty lists in either of the lists (this is
    # unexpected and normally shouldn't happen), then we also want to
    # return 0.
    if not sample1 or not sample2:
      return 0.0

    # The p-value is approximately the probability of obtaining the given set
    # of good and bad values just by chance.
    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
    return 100.0 * (1.0 - p_value)

  @classmethod
  def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_states: Sorted list of RevisionState objects.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
    other_regressions = []
    previous_values = []
    prev_state = None
    for revision_state in revision_states:
      if revision_state.value:
        current_values = revision_state.value['values']
        if previous_values:
          # previous_values is a list of lists; flatten it for comparison
          # against the (already flat) current values.
          confidence_params = (sum(previous_values, []), current_values)
          confidence = cls.ConfidenceScore(*confidence_params,
                                           accept_single_bad_or_good=True)
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs < the
          # mean of the current runs, this local regression is in same
          # direction.
          prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
          is_same_direction = (prev_greater_than_current if
              bad_greater_than_good else not prev_greater_than_current)

          # Only report potential regressions with high confidence.
          if is_same_direction and confidence > 50:
            other_regressions.append([revision_state, prev_state, confidence])
        previous_values.append(current_values)
        prev_state = revision_state
    return other_regressions

  @staticmethod
  def FindBreakingRevRange(revision_states):
    """Finds the last known good and first known bad revisions.

    Note that since revision_states is expected to be in reverse
    chronological order, the last known good revision is the first revision
    in the list that has the passed property set to 1, therefore the name
    `first_working_revision`. The inverse applies to `last_broken_revision`.

    Args:
      revision_states: A list of RevisionState instances.

    Returns:
      A tuple containing the two revision states at the border. (Last
      known good and first known bad.)
    """
    first_working_revision = None
    last_broken_revision = None

    for revision_state in revision_states:
      if revision_state.passed == 1 and not first_working_revision:
        first_working_revision = revision_state

      if not revision_state.passed:
        last_broken_revision = revision_state

    return first_working_revision, last_broken_revision

  @staticmethod
  def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
                            last_broken_rev):
    """Queries revision info for every revision inside the breaking range.

    Returns:
      A list of (revision, info, depot) tuples for the culprit candidates.
    """
    cwd = os.getcwd()

    culprit_revisions = []
    # range (not the Py2-only xrange) keeps this importable under Python 3;
    # behavior is identical for iteration in Python 2.
    for i in range(last_broken_rev.index, first_working_rev.index):
      depot_registry.ChangeToDepotDir(revision_states[i].depot)
      info = source_control.QueryRevisionInfo(revision_states[i].revision)
      culprit_revisions.append((revision_states[i].revision, info,
                                revision_states[i].depot))

    # ChangeToDepotDir may have changed the working directory; restore it.
    os.chdir(cwd)
    return culprit_revisions

  @classmethod
  def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                   last_broken_rev):
    """Computes regression size, pooled std error and confidence.

    Returns:
      A dict with keys 'regression_size', 'regression_std_err', 'confidence'
      and 'bad_greater_than_good'.
    """
    # TODO(sergiyb): We assume that value has "values" key, which may not be
    # the case for failure-bisects, where there is a single value only.
    broken_means = [state.value['values']
                    for state in rev_states[:last_broken_rev.index + 1]
                    if state.value]

    working_means = [state.value['values']
                     for state in rev_states[first_working_rev.index:]
                     if state.value]

    # Flatten the lists to calculate mean of all values.
    working_mean = sum(working_means, [])
    broken_mean = sum(broken_means, [])

    # Calculate the approximate size of the regression.
    mean_of_bad_runs = math_utils.Mean(broken_mean)
    mean_of_good_runs = math_utils.Mean(working_mean)

    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                      mean_of_bad_runs)
    # RelativeChange is NaN when one mean is zero and the other is not
    # representable as a relative change; report a symbolic size instead.
    if math.isnan(regression_size):
      regression_size = 'zero-to-nonzero'

    regression_std_err = math.fabs(math_utils.PooledStandardError(
        [working_mean, broken_mean]) /
        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

    # Give a "confidence" in the bisect. Currently, we consider the values of
    # only the revisions at the breaking range (last known good and first
    # known bad) see the note in the docstring for FindBreakingRevRange.
    confidence_params = (first_working_rev.value['values'],
                         last_broken_rev.value['values'])
    confidence = cls.ConfidenceScore(*confidence_params)

    bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

    return {'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'bad_greater_than_good': bad_greater_than_good}