tools/auto_bisect/bisect_results.py
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import math
import os

import bisect_utils
import math_utils
import source_control
import ttest


class BisectResults(object):
  """Contains results of the completed bisect.

  Properties:
    error: Error message if the bisect failed.

  If the error is None, the following properties are present:
    warnings: List of warnings from the bisect run.
    state: BisectState object from which these results were generated.
    first_working_revision: First good revision.
    last_broken_revision: Last bad revision.

  If both of the above revisions are not None, the following properties are
  present:
    culprit_revisions: A list of revisions which contain the bad change
        introducing the failure.
    other_regressions: A list of tuples representing other regressions which
        may have occurred.
    regression_size: For performance bisects, this is the relative change of
        the mean metric value. For other bisects this field always contains
        'zero-to-nonzero'.
    regression_std_err: For performance bisects, this is the pooled standard
        error for the groups of good and bad runs. Not used for other bisects.
    confidence: For performance bisects, this is the confidence that the good
        and bad runs are distinct groups. Not used for non-performance bisects.
  """

  def __init__(self, bisect_state=None, depot_registry=None, opts=None,
               runtime_warnings=None, error=None, abort_reason=None):
    """Computes final bisect results after a bisect run is complete.

    This constructor should be called in one of the following ways:
      BisectResults(state, depot_registry, opts, runtime_warnings)
      BisectResults(error=error)

    The first option creates an object representing successful bisect results,
    while the second creates an error result.

    Args:
      bisect_state: BisectState object representing the latest bisect state.
      depot_registry: DepotDirectoryRegistry object with information on each
          repository in the bisect_state.
      opts: Options passed to the bisect run.
      runtime_warnings: A list of warnings from the bisect run.
      error: Error message. When error is not None, other arguments are
          ignored.
    """
    self.error = error
    self.abort_reason = abort_reason
    if error is not None or abort_reason is not None:
      return

    assert (bisect_state is not None and depot_registry is not None and
            opts is not None and runtime_warnings is not None), (
                'Incorrect use of the BisectResults constructor. When error is '
                'None, all other arguments are required')

    self.state = bisect_state

    rev_states = bisect_state.GetRevisionStates()
    first_working_rev, last_broken_rev = self.FindBreakingRevRange(rev_states)
    self.first_working_revision = first_working_rev
    self.last_broken_revision = last_broken_rev

    self.warnings = runtime_warnings

    if first_working_rev is not None and last_broken_rev is not None:
      statistics = self._ComputeRegressionStatistics(
          rev_states, first_working_rev, last_broken_rev)

      self.regression_size = statistics['regression_size']
      self.regression_std_err = statistics['regression_std_err']
      self.confidence = statistics['confidence']

      self.culprit_revisions = self._FindCulpritRevisions(
          rev_states, depot_registry, first_working_rev, last_broken_rev)

      self.other_regressions = self._FindOtherRegressions(
          rev_states, statistics['bad_greater_than_good'])

      self.warnings += self._GetResultBasedWarnings(
          self.culprit_revisions, opts, self.confidence)
    elif first_working_rev is not None:
      # Set these attributes so that the bisect printer does not break when
      # the regression cannot be reproduced (no broken revision was found).
      self.regression_size = 0
      self.regression_std_err = 0
      self.confidence = 0
      self.culprit_revisions = []
      self.other_regressions = []

  @staticmethod
  def _GetResultBasedWarnings(culprit_revisions, opts, confidence):
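    """Returns warnings based on the quality of the bisect results.

    Args:
      culprit_revisions: List of culprit revisions found by the bisect.
      opts: Options passed to the bisect run.
      confidence: Confidence score for the bisect results.

    Returns:
      A list of warning messages.
    """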
    warnings = []
    if len(culprit_revisions) > 1:
      warnings.append('Due to build errors, regression range could '
                      'not be narrowed down to a single commit.')
    if opts.repeat_test_count == 1:
      warnings.append('Tests were only set to run once. This may '
                      'be insufficient to get meaningful results.')
    if 0 < confidence < bisect_utils.HIGH_CONFIDENCE:
      warnings.append('Confidence is not high. Try bisecting again '
                      'with increased repeat_count, larger range, or '
                      'on another metric.')
    if not confidence:
      warnings.append('Confidence score is 0%. Try bisecting again on '
                      'another platform or another metric.')
    return warnings

  @staticmethod
  def ConfidenceScore(sample1, sample2, accept_single_bad_or_good=False):
    """Calculates a confidence score.

    This score is a percentage which represents our degree of confidence in
    the proposition that the good results and bad results are distinct
    groups, and that their differences aren't due to chance alone.

    Args:
      sample1: A flat list of "good" result numbers.
      sample2: A flat list of "bad" result numbers.
      accept_single_bad_or_good: If True, the confidence is computed even if
          there is just one bad or good revision; otherwise a single good or
          bad revision always yields 0.0 confidence. This flag will probably
          go away once we implement expanding the bisect range by one more
          revision in that case.

    Returns:
      A number in the range [0, 100].
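    Example (illustrative):
      ConfidenceScore([10.0, 10.1, 10.2], [12.0, 12.1, 12.2]) returns a value
      close to 100, since the two samples are clearly distinct, while
      ConfidenceScore([10.0], [12.0]) returns 0.0 because single-element
      samples are rejected unless accept_single_bad_or_good is True.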
    """
    # If there's only one item in either list, this means only one revision
    # was classified good or bad; this isn't good enough evidence to make a
    # decision. If an empty list was passed, that also implies zero
    # confidence.
    if not accept_single_bad_or_good:
      if len(sample1) <= 1 or len(sample2) <= 1:
        return 0.0

    # If either list is empty (this is unexpected and normally shouldn't
    # happen), we also want to return 0.
    if not sample1 or not sample2:
      return 0.0

    # The p-value is approximately the probability of obtaining the given
    # sets of good and bad values just by chance.
    _, _, p_value = ttest.WelchsTTest(sample1, sample2)
    return 100.0 * (1.0 - p_value)

  @classmethod
  def _FindOtherRegressions(cls, revision_states, bad_greater_than_good):
    """Compiles a list of other possible regressions from the revision data.

    Args:
      revision_states: Sorted list of RevisionState objects.
      bad_greater_than_good: Whether the result value at the "bad" revision is
          numerically greater than the result value at the "good" revision.

    Returns:
      A list of [current_rev, previous_rev, confidence] for other places where
      there may have been a regression.
    """
    other_regressions = []
    previous_values = []
    prev_state = None
    for revision_state in revision_states:
      if revision_state.value:
        current_values = revision_state.value['values']
        if previous_values:
          confidence_params = (sum(previous_values, []),
                               sum([current_values], []))
          confidence = cls.ConfidenceScore(*confidence_params,
                                           accept_single_bad_or_good=True)
          mean_of_prev_runs = math_utils.Mean(sum(previous_values, []))
          mean_of_current_runs = math_utils.Mean(current_values)

          # Check that the potential regression is in the same direction as
          # the overall regression. If the mean of the previous runs is less
          # than the mean of the current runs, this local regression is in
          # the same direction.
          prev_greater_than_current = mean_of_prev_runs > mean_of_current_runs
          is_same_direction = (prev_greater_than_current if
              bad_greater_than_good else not prev_greater_than_current)

          # Only report potential regressions with high confidence.
          if is_same_direction and confidence > 50:
            other_regressions.append([revision_state, prev_state, confidence])
        previous_values.append(current_values)
        prev_state = revision_state
    return other_regressions

  @staticmethod
  def FindBreakingRevRange(revision_states):
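    """Finds the first working and last broken revisions in the given states.

    Args:
      revision_states: A list of RevisionState objects for the bisect range.

    Returns:
      A (first_working_revision, last_broken_revision) pair of RevisionState
      objects; either element may be None if no passing or no failing
      revision was seen.
    """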
    first_working_revision = None
    last_broken_revision = None

    for revision_state in revision_states:
      if revision_state.passed == 1 and not first_working_revision:
        first_working_revision = revision_state

      if not revision_state.passed:
        last_broken_revision = revision_state

    return first_working_revision, last_broken_revision

  @staticmethod
  def _FindCulpritRevisions(revision_states, depot_registry, first_working_rev,
                            last_broken_rev):
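    """Collects commit info for every revision in the culprit range.

    Args:
      revision_states: A list of RevisionState objects for the bisect range.
      depot_registry: DepotDirectoryRegistry used to change into the directory
          of each revision's depot.
      first_working_rev: RevisionState of the first good revision.
      last_broken_rev: RevisionState of the last bad revision.

    Returns:
      A list of (revision, commit_info, depot) tuples for revisions between
      the last broken and the first working revision.
    """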
    cwd = os.getcwd()

    culprit_revisions = []
    for i in xrange(last_broken_rev.index, first_working_rev.index):
      depot_registry.ChangeToDepotDir(revision_states[i].depot)
      info = source_control.QueryRevisionInfo(revision_states[i].revision)
      culprit_revisions.append((revision_states[i].revision, info,
                                revision_states[i].depot))

    os.chdir(cwd)
    return culprit_revisions

  @classmethod
  def _ComputeRegressionStatistics(cls, rev_states, first_working_rev,
                                   last_broken_rev):
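    """Computes the size, standard error and confidence of the regression.

    Args:
      rev_states: A list of RevisionState objects for the bisect range.
      first_working_rev: RevisionState of the first good revision.
      last_broken_rev: RevisionState of the last bad revision.

    Returns:
      A dict with the keys 'regression_size', 'regression_std_err',
      'confidence' and 'bad_greater_than_good'.
    """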
    # TODO(sergiyb): We assume that value has a "values" key, which may not be
    # the case for failure bisects, where there is a single value only.
    broken_means = [state.value['values']
                    for state in rev_states[:last_broken_rev.index + 1]
                    if state.value]

    working_means = [state.value['values']
                     for state in rev_states[first_working_rev.index:]
                     if state.value]

    # Flatten the lists to calculate the mean of all values.
    working_mean = sum(working_means, [])
    broken_mean = sum(broken_means, [])

    # Calculate the approximate size of the regression.
    mean_of_bad_runs = math_utils.Mean(broken_mean)
    mean_of_good_runs = math_utils.Mean(working_mean)

    regression_size = 100 * math_utils.RelativeChange(mean_of_good_runs,
                                                      mean_of_bad_runs)
    if math.isnan(regression_size):
      regression_size = 'zero-to-nonzero'

    regression_std_err = math.fabs(math_utils.PooledStandardError(
        [working_mean, broken_mean]) /
        max(0.0001, min(mean_of_good_runs, mean_of_bad_runs))) * 100.0

    # Give a "confidence" in the bisect result. At the moment we use how
    # distinct the values are before and after the last broken revision, and
    # how noisy the overall graph is.
    confidence_params = (sum(working_means, []), sum(broken_means, []))
    confidence = cls.ConfidenceScore(*confidence_params)

    bad_greater_than_good = mean_of_bad_runs > mean_of_good_runs

    return {'regression_size': regression_size,
            'regression_std_err': regression_std_err,
            'confidence': confidence,
            'bad_greater_than_good': bad_greater_than_good}