# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import math
import os

import bisect_utils
import math_utils
import source_control
import ttest

from bisect_state import RevisionState


class BisectResults(object):
  """Contains results of the completed bisect.

  Properties:
    error: Error message if the bisect failed.

    If the error is None, the following properties are present:
    warnings: List of warnings from the bisect run.
    state: BisectState object from which these results were generated.
    first_working_revision: First good revision.
    last_broken_revision: Last bad revision.

    If both of the above revisions are not None, the following properties are
    present:
    culprit_revisions: A list of revisions which contain the bad change
        introducing the failure.
    other_regressions: A list of tuples representing other regressions which
        may have occurred.
    regression_size: For performance bisects, this is the relative change of
        the mean metric value. For other bisects this field always contains
        'zero-to-nonzero'.
    regression_std_err: For performance bisects, the pooled standard error
        for the groups of good and bad runs. Not used for other bisects.
    confidence: For performance bisects, the confidence that the good and
        bad runs are distinct groups. Not used for non-performance bisects.
  """

  def __init__(self, bisect_state=None, depot_registry=None, opts=None,
               runtime_warnings=None, error=None, abort_reason=None):
    """Computes final bisect results after a bisect run is complete.

    This constructor should be called in one of the following ways:
      BisectResults(state, depot_registry, opts, runtime_warnings)
      BisectResults(error=error)

    The first option creates an object representing successful bisect results;
    the second creates an error result.

    Args:
      bisect_state: BisectState object representing the latest bisect state.
      depot_registry: DepotDirectoryRegistry object with information on each
          repository in the bisect_state.
      opts: Options passed to the bisect run.
      runtime_warnings: A list of warnings from the bisect run.
      error: Error message. When error is not None, other arguments are
          ignored.
    """
    self.error = error
    self.abort_reason = abort_reason
    if error is not None or abort_reason is not None:
      return

    assert (bisect_state is not None and depot_registry is not None and
            opts is not None and runtime_warnings is not None), (
                'Incorrect use of the BisectResults constructor. '
                'When error is None, all other arguments are required.')

    self.state = bisect_state

    rev_states = bisect_state.GetRevisionStates()
    first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
    self.first_working_revision = first_working_rev
    self.last_broken_revision = last_broken_rev

    self.warnings = runtime_warnings

    self.retest_results_tot = None
    self.retest_results_reverted = None

    if first_working_rev is not None and last_broken_rev is not None:
      statistics = self._ComputeRegressionStatistics(
          rev_states, first_working_rev, last_broken_rev)

      self.regression_size = statistics['regression_size']
      self.regression_std_err = statistics['regression_std_err']
      self.confidence = statistics['confidence']

      self.culprit_revisions = self._FindCulpritRevisions(
          rev_states, depot_registry, first_working_rev, last_broken_rev)

      self.other_regressions = self._FindOtherRegressions(
          rev_states, statistics['bad_greater_than_good'])

      self.warnings += self._GetResultBasedWarnings(
          self.culprit_revisions, opts, self.confidence)
    elif first_working_rev is not None:
      # Setting these attributes so that the bisect printer does not break
      # when the regression cannot be reproduced (no broken revision found).
      self.regression_size = 0
      self.regression_std_err = 0
      self.confidence = 0
      self.culprit_revisions = []
      self.other_regressions = []

  def AddRetestResults(self, results_tot, results_reverted):
    if not results_tot or not results_reverted:
      self.warnings.append(
          'Failed to re-test reverted culprit CL against ToT.')
      return

    confidence_params = (results_reverted[0]['values'],
                         results_tot[0]['values'])
    confidence = BisectResults.ConfidenceScore(*confidence_params)

    self.retest_results_tot = RevisionState('ToT', 'n/a', 0)
    self.retest_results_tot.value = results_tot[0]

    self.retest_results_reverted = RevisionState('Reverted', 'n/a', 0)
    self.retest_results_reverted.value = results_reverted[0]

    if confidence <= bisect_utils.HIGH_CONFIDENCE:
      self.warnings.append(
          'Confidence of re-test with reverted CL is not high.'
          ' Check that the regression hasn\'t already recovered. '
          ' There\'s still a chance this is a regression, as performance of'
          ' local builds may not match official builds.')
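
  # Note on AddRetestResults inputs (inferred from the code above, not a
  # documented contract): results_tot and results_reverted are expected to be
  # non-empty sequences whose first element is a dict with a 'values' key
  # holding the individual measurements. A hypothetical call might look like:
  #   results.AddRetestResults([{'values': [10.2, 10.4]}],
  #                            [{'values': [10.1, 10.3]}])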

  @staticmethod
  def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
    warnings = []
    if len(culprit_revisions) > 1:
      warnings.append('Due to build errors, regression range could '
                      'not be narrowed down to a single commit.')
    if opts.repeat_test_count == 1:
      warnings.append('Tests were only set to run once. This may '
                      'be insufficient to get meaningful results.')
    if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
      warnings.append('Confidence is not high. Try bisecting again '
                      'with increased repeat_count, larger range, or '
                      'on another metric.')
    if not confidence:
      warnings.append('Confidence score is 0%. Try bisecting again on '
                      'another platform or another metric.')
    return warnings

  @staticmethod
  def ConfidenceScore(sample1, sample2,
                      accept_single_bad_or_good=False):
    """Calculates a confidence score.

    This score is a percentage which represents our degree of confidence in
    the proposition that the good results and bad results are distinct groups,
    and their differences aren't due to chance alone.

    Args:
      sample1: A flat list of "good" result numbers.
      sample2: A flat list of "bad" result numbers.
      accept_single_bad_or_good: If True, computes confidence even if there is
          just one bad or good revision; otherwise a single good or bad
          revision always returns 0.0 confidence. This flag will probably go
          away once we expand the bisect range by one more revision in such
          cases.

    Returns:
      A number in the range [0, 100].
    """
    # If there's only one item in either list, this means only one revision
    # was classified good or bad; this isn't good enough evidence to make a
    # decision. If an empty list was passed, that also implies zero confidence.
    if not accept_single_bad_or_good:
      if len(sample1) <= 1 or len(sample2) <= 1:
        return 0.0

    # If either list is empty (this is unexpected and normally shouldn't
    # happen), we also want to return 0.
    if not sample1 or not sample2:
      return 0.0

    # The p-value is approximately the probability of obtaining the given set
    # of good and bad values just by chance.
    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
    return 100.0 * (1.0 - p_value)
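
  # Illustrative behavior of ConfidenceScore (hypothetical numbers, not taken
  # from a real bisect run): two clearly separated samples such as
  #   BisectResults.ConfidenceScore([10.0, 10.1, 10.2], [12.0, 12.1, 12.2])
  # give a score close to 100, heavily overlapping samples give a score close
  # to 0, and a sample with a single element returns 0.0 unless
  # accept_single_bad_or_good is True.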

  @classmethod
  def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_states: Sorted list of RevisionState objects.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
    other_regressions = []
    previous_values = []
    prev_state = None
    for revision_state in revision_states:
      if revision_state.value:
        current_values = revision_state.value['values']
        if previous_values:
          confidence_params = (sum(previous_values, []),
                               sum([current_values], []))
          confidence = cls.ConfidenceScore(*confidence_params,
                                           accept_single_bad_or_good=True)
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs < the
          # mean of the current runs, this local regression is in the same
          # direction.
          prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
          if bad_greater_than_good:
            is_same_direction = prev_greater_than_current
          else:
            is_same_direction = not prev_greater_than_current

          # Only report potential regressions with high confidence.
          if is_same_direction and confidence > 50:
            other_regressions.append([revision_state, prev_state, confidence])
        previous_values.append(current_values)
        prev_state = revision_state
    return other_regressions
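
  # Note: sum(list_of_lists, []) above is the flattening idiom used throughout
  # this file, e.g. sum([[1, 2], [3]], []) == [1, 2, 3]. previous_values is a
  # list of per-revision value lists, so flattening it pools all earlier
  # measurements into one sample before comparing against the current
  # revision's values.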

  @staticmethod
  def FindBreakingRevRange(revision_states):
    """Finds the last known good and first known bad revisions.

    Note that since revision_states is expected to be in reverse chronological
    order, the last known good revision is the first revision in the list that
    has the passed property set to 1, therefore the name
    `first_working_revision`. The inverse applies to `last_broken_revision`.

    Args:
      revision_states: A list of RevisionState instances.

    Returns:
      A tuple containing the two revision states at the border (last known
      good and first known bad).
    """
    first_working_revision = None
    last_broken_revision = None

    for revision_state in revision_states:
      if revision_state.passed == 1 and not first_working_revision:
        first_working_revision = revision_state

      if not revision_state.passed:
        last_broken_revision = revision_state

    return first_working_revision, last_broken_revision
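
  # Worked example (hypothetical passed flags, newest revision first): for
  # states with passed == [0, 0, 1, 1], FindBreakingRevRange returns
  # (states[2], states[1]), i.e. the newest passing revision and the oldest
  # failing one, the adjacent pair that brackets the breakage.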

  @staticmethod
  def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
                            last_broken_rev):
    cwd = os.getcwd()

    culprit_revisions = []
    for i in xrange(last_broken_rev.index, first_working_rev.index):
      depot_registry.ChangeToDepotDir(revision_states[i].depot)
      info = source_control.QueryRevisionInfo(revision_states[i].revision)
      culprit_revisions.append((revision_states[i].revision, info,
                                revision_states[i].depot))

    os.chdir(cwd)
    return culprit_revisions

  @classmethod
  def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                   last_broken_rev):
    # TODO(sergiyb): We assume that value has a "values" key, which may not be
    # the case for failure bisects, where there is a single value only.
    broken_means = [state.value['values']
                    for state in rev_states[:last_broken_rev.index + 1]
                    if state.value]

    working_means = [state.value['values']
                     for state in rev_states[first_working_rev.index:]
                     if state.value]

    # Flatten the lists to calculate the mean of all values.
    working_mean = sum(working_means, [])
    broken_mean = sum(broken_means, [])

    # Calculate the approximate size of the regression.
    mean_of_bad_runs = math_utils.Mean(broken_mean)
    mean_of_good_runs = math_utils.Mean(working_mean)

    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                      mean_of_bad_runs)
    if math.isnan(regression_size):
      regression_size = 'zero-to-nonzero'

    regression_std_err = math.fabs(math_utils.PooledStandardError(
        [working_mean, broken_mean]) /
        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

    # Give a "confidence" in the bisect. Currently, we consider the values of
    # only the revisions at the breaking range (last known good and first
    # known bad); see the note in the docstring for FindBreakingRevRange.
    confidence_params = (
        sum([first_working_rev.value['values']], []),
        sum([last_broken_rev.value['values']], [])
    )
    confidence = cls.ConfidenceScore(*confidence_params)

    bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

    return {'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'bad_greater_than_good': bad_greater_than_good}
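
# Illustrative usage (sketch only; bisect_state, depot_registry and opts are
# produced elsewhere in the auto_bisect tool, so the names below are just
# placeholders):
#   results = BisectResults(bisect_state, depot_registry, opts,
#                           runtime_warnings=[])
#   if results.error is None and results.first_working_revision is not None:
#     print results.culprit_revisions, results.confidence
#
#   failed = BisectResults(error='Build failed.')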