components/metrics/metrics_service.cc

   1 // Copyright 2014 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 //------------------------------------------------------------------------------
   6 // Description of the life cycle of an instance of MetricsService.
   7 //
   8 //  OVERVIEW
   9 //
  10 // A MetricsService instance is typically created at application startup.  It is
  11 // the central controller for the acquisition of log data, and the automatic
  12 // transmission of that log data to an external server.  Its major job is to
  13 // manage logs, grouping them for transmission, and transmitting them.  As part
  14 // of its grouping, MS finalizes logs by including some just-in-time gathered
  15 // memory statistics, snapshotting the current stats of numerous histograms,
  16 // closing the logs, translating to protocol buffer format, and compressing the
  17 // results for transmission.  Transmission includes submitting a compressed log
  18 // as data in a URL-post, and retransmitting (or retaining at process
  19 // termination) if the attempted transmission failed.  Retention across process
  20 // terminations is done using the PrefService's facilities. The retained logs
  21 // (the ones that never got transmitted) are compressed and base64-encoded
  22 // before being persisted.
  23 //
  24 // Logs fall into one of two categories: "initial logs," and "ongoing logs."
  25 // There is at most one initial log sent for each complete run of Chrome (from
  26 // startup, to browser shutdown).  An initial log is generally transmitted some
  27 // short time (1 minute?) after startup, and includes stats such as recent crash
  28 // info, the number and types of plugins, etc.  The external server's response
  29 // to the initial log conceptually tells this MS if it should continue
  30 // transmitting logs (during this session). The server response can actually be
  31 // much more detailed, and always includes (at a minimum) how often additional
  32 // ongoing logs should be sent.
  33 //
  34 // After the above initial log, a series of ongoing logs will be transmitted.
  35 // The first ongoing log actually begins to accumulate information starting when
  36 // the MS was first constructed.  Note that even though the initial log is
  37 // commonly sent a full minute after startup, the initial log does not include
  38 // much in the way of user stats.   The most common interlog period (delay)
  39 // is 30 minutes. That time period starts when the first user action causes a
  40 // logging event.  This means that if there is no user action, there may be long
  41 // periods without any (ongoing) log transmissions.  Ongoing logs typically
  42 // contain very detailed records of user activities (ex: opened tab, closed
  43 // tab, fetched URL, maximized window, etc.)  In addition, just before an
  44 // ongoing log is closed out, a call is made to gather memory statistics.  Those
  45 // memory statistics are deposited into a histogram, and the log finalization
  46 // code is then called.  In the finalization, a call to a Histogram server
  47 // acquires a list of all local histograms that have been flagged for upload
  48 // to the UMA server.  The finalization also acquires the most recent number
  49 // of page loads, along with any counts of renderer or plugin crashes.
  50 //
  51 // When the browser shuts down, there will typically be a fragment of an ongoing
  52 // log that has not yet been transmitted.  At shutdown time, that fragment is
  53 // closed (including snapshotting histograms), and persisted, for potential
  54 // transmission during a future run of the product.
  55 //
  56 // There are two slightly abnormal shutdown conditions.  There is a
  57 // "disconnected scenario," and a "really fast startup and shutdown" scenario.
  58 // In the "never connected" situation, the user has (during the running of the
  59 // process) never established an internet connection.  As a result, attempts to
  60 // transmit the initial log have failed, and a lot(?) of data has accumulated in
  61 // the ongoing log (which didn't yet get closed, because there was never even a
  62 // contemplation of sending it).  There is also a kindred "lost connection"
  63 // situation, where a loss of connection prevented an ongoing log from being
  64 // transmitted, and a (still open) log was stuck accumulating a lot(?) of data,
  65 // while the earlier log retried its transmission.  In both of these
  66 // disconnected situations, two logs need to be, and are, persistently stored
  67 // for future transmission.
  68 //
  69 // The other unusual shutdown condition, termed "really fast startup and
  70 // shutdown," involves the deliberate user termination of the process before
  71 // the initial log is even formed or transmitted. In that situation, no logging
  72 // is done, but the historical crash statistics remain (unlogged) for inclusion
  73 // in a future run's initial log.  (i.e., we don't lose crash stats).
  74 //
  75 // With the above overview, we can now describe the state machine's various
  76 // states, based on the State enum specified in the state_ member.  Those states
  77 // are:
  78 //
  79 //  INITIALIZED,          // Constructor was called.
  80 //  INIT_TASK_SCHEDULED,  // Waiting for deferred init tasks to finish.
  81 //  INIT_TASK_DONE,       // Waiting for timer to send initial log.
  82 //  SENDING_LOGS,         // Sending logs and creating new ones when we run out.
  83 //
  84 // In more detail, we have:
  85 //
  86 //    INITIALIZED,            // Constructor was called.
  87 // The MS has been constructed, but has taken no actions to compose the
  88 // initial log.
  89 //
  90 //    INIT_TASK_SCHEDULED,    // Waiting for deferred init tasks to finish.
  91 // Typically about 30 seconds after startup, a task is sent to a second thread
  92 // (the file thread) to perform deferred (lower priority and slower)
  93 // initialization steps such as getting the list of plugins.  That task will
  94 // (when complete) make an async callback (via a Task) to indicate the
  95 // completion.
  96 //
  97 //    INIT_TASK_DONE,         // Waiting for timer to send initial log.
  98 // The callback has arrived, and it is now possible for an initial log to be
  99 // created.  This callback typically arrives back less than one second after
 100 // the deferred init task is dispatched.
 101 //
 102 //    SENDING_LOGS,  // Sending logs and creating new ones when we run out.
 103 // Logs from previous sessions have been loaded, and initial logs have been
 104 // created (an optional stability log and the first metrics log).  We will
 105 // send all of these logs, and when we run out, we will start cutting new logs
 106 // to send.  We will also cut a new log if we expect a shutdown.
 107 //
 108 // The progression through the above states is simple, and sequential.
 109 // States proceed from INITIALIZED to SENDING_LOGS, and remain in the latter until
 110 // shutdown.
 111 //
 112 // Also note that whenever we successfully send a log, we mirror the list
 113 // of logs into the PrefService. This ensures that IF we crash, we won't start
 114 // up and retransmit our old logs again.
 115 //
 116 // Due to race conditions, it is always possible that a log file could be sent
 117 // twice.  For example, if a log file is sent, but not yet acknowledged by
 118 // the external server, and the user shuts down, then a copy of the log may be
 119 // saved for re-transmission.  These duplicates could be filtered out server
 120 // side, but are not expected to be a significant problem.
 121 //
 122 //
 123 //------------------------------------------------------------------------------
 124
 125 #include "components/metrics/metrics_service.h"
 126
 127 #include <algorithm>
 128
 129 #include "base/bind.h"
 130 #include "base/callback.h"
 131 #include "base/metrics/histogram.h"
 132 #include "base/metrics/histogram_base.h"
 133 #include "base/metrics/histogram_samples.h"
 134 #include "base/metrics/sparse_histogram.h"
 135 #include "base/metrics/statistics_recorder.h"
 136 #include "base/prefs/pref_registry_simple.h"
 137 #include "base/prefs/pref_service.h"
 138 #include "base/strings/string_number_conversions.h"
 139 #include "base/strings/utf_string_conversions.h"
 140 #include "base/threading/platform_thread.h"
 141 #include "base/threading/thread.h"
 142 #include "base/threading/thread_restrictions.h"
 143 #include "base/time/time.h"
 144 #include "base/tracked_objects.h"
 145 #include "base/values.h"
 146 #include "components/metrics/metrics_log.h"
 147 #include "components/metrics/metrics_log_manager.h"
 148 #include "components/metrics/metrics_log_uploader.h"
 149 #include "components/metrics/metrics_pref_names.h"
 150 #include "components/metrics/metrics_reporting_scheduler.h"
 151 #include "components/metrics/metrics_service_client.h"
 152 #include "components/metrics/metrics_state_manager.h"
 153 #include "components/variations/entropy_provider.h"
 154
 155 namespace metrics {
 156
 157 namespace {
 158
 159 // Check to see that we're being called on only one thread.
 160 bool IsSingleThreaded() {
 161   static base::PlatformThreadId thread_id = 0;
 162   if (!thread_id)
 163     thread_id = base::PlatformThread::CurrentId();
 164   return base::PlatformThread::CurrentId() == thread_id;
 165 }
 166
 167 // The delay, in seconds, after starting recording before doing expensive
 168 // initialization work.
 169 #if defined(OS_ANDROID) || defined(OS_IOS)
 170 // On mobile devices, a significant portion of sessions last less than a minute.
 171 // Use a shorter timer on these platforms to avoid losing data.
 172 // TODO(dfalcantara): To avoid delaying startup, tighten up initialization so
 173 //                    that it occurs after the user gets their initial page.
 174 const int kInitializationDelaySeconds = 5;
 175 #else
 176 const int kInitializationDelaySeconds = 30;
 177 #endif
 178
 179 // The maximum number of events in a log uploaded to the UMA server.
 180 const int kEventLimit = 2400;
 181
 182 // If an upload fails, and the transmission was over this byte count, then we
 183 // will discard the log, and not try to retransmit it.  We also don't persist
 184 // the log to the prefs for transmission during the next chrome session if this
 185 // limit is exceeded.
 186 const size_t kUploadLogAvoidRetransmitSize = 100 * 1024;
 187
 188 // Interval, in minutes, between state saves.
 189 const int kSaveStateIntervalMinutes = 5;
 190
 191 enum ResponseStatus {
 192   UNKNOWN_FAILURE,
 193   SUCCESS,
 194   BAD_REQUEST,  // Invalid syntax or log too large.
 195   NO_RESPONSE,
 196   NUM_RESPONSE_STATUSES
 197 };
 198
 199 ResponseStatus ResponseCodeToStatus(int response_code) {
 200   switch (response_code) {
 201     case -1:
 202       return NO_RESPONSE;
 203     case 200:
 204       return SUCCESS;
 205     case 400:
 206       return BAD_REQUEST;
 207     default:
 208       return UNKNOWN_FAILURE;
 209   }
 210 }
 211
 212 #if defined(OS_ANDROID) || defined(OS_IOS)
 213 void MarkAppCleanShutdownAndCommit(CleanExitBeacon* clean_exit_beacon,
 214                                    PrefService* local_state) {
 215   clean_exit_beacon->WriteBeaconValue(true);
 216   local_state->SetInteger(prefs::kStabilityExecutionPhase,
 217                           MetricsService::SHUTDOWN_COMPLETE);
 218   // Start writing right away (write happens on a different thread).
 219   local_state->CommitPendingWrite();
 220 }
 221 #endif  // defined(OS_ANDROID) || defined(OS_IOS)
 222
 223 }  // namespace
 224
 225
 226 SyntheticTrialGroup::SyntheticTrialGroup(uint32 trial, uint32 group) {
 227   id.name = trial;
 228   id.group = group;
 229 }
 230
 231 SyntheticTrialGroup::~SyntheticTrialGroup() {
 232 }
 233
 234 // static
 235 MetricsService::ShutdownCleanliness MetricsService::clean_shutdown_status_ =
 236     MetricsService::CLEANLY_SHUTDOWN;
 237
 238 MetricsService::ExecutionPhase MetricsService::execution_phase_ =
 239     MetricsService::UNINITIALIZED_PHASE;
 240
 241 // static
 242 void MetricsService::RegisterPrefs(PrefRegistrySimple* registry) {
 243   DCHECK(IsSingleThreaded());
 244   MetricsStateManager::RegisterPrefs(registry);
 245   MetricsLog::RegisterPrefs(registry);
 246
 247   registry->RegisterInt64Pref(prefs::kInstallDate, 0);
 248
 249   registry->RegisterInt64Pref(prefs::kStabilityLaunchTimeSec, 0);
 250   registry->RegisterInt64Pref(prefs::kStabilityLastTimestampSec, 0);
 251   registry->RegisterStringPref(prefs::kStabilityStatsVersion, std::string());
 252   registry->RegisterInt64Pref(prefs::kStabilityStatsBuildTime, 0);
 253   registry->RegisterBooleanPref(prefs::kStabilityExitedCleanly, true);
 254   registry->RegisterIntegerPref(prefs::kStabilityExecutionPhase,
 255                                 UNINITIALIZED_PHASE);
 256   registry->RegisterBooleanPref(prefs::kStabilitySessionEndCompleted, true);
 257   registry->RegisterIntegerPref(prefs::kMetricsSessionID, -1);
 258
 259   registry->RegisterListPref(prefs::kMetricsInitialLogs);
 260   registry->RegisterListPref(prefs::kMetricsOngoingLogs);
 261
 262   registry->RegisterInt64Pref(prefs::kUninstallLaunchCount, 0);
 263   registry->RegisterInt64Pref(prefs::kUninstallMetricsUptimeSec, 0);
 264 }
 265
 266 MetricsService::MetricsService(MetricsStateManager* state_manager,
 267                                MetricsServiceClient* client,
 268                                PrefService* local_state)
 269     : log_manager_(local_state, kUploadLogAvoidRetransmitSize),
 270       histogram_snapshot_manager_(this),
 271       state_manager_(state_manager),
 272       client_(client),
 273       local_state_(local_state),
 274       clean_exit_beacon_(client->GetRegistryBackupKey(), local_state),
 275       recording_active_(false),
 276       reporting_active_(false),
 277       test_mode_active_(false),
 278       state_(INITIALIZED),
 279       log_upload_in_progress_(false),
 280       idle_since_last_transmission_(false),
 281       session_id_(-1),
 282       self_ptr_factory_(this),
 283       state_saver_factory_(this) {
 284   DCHECK(IsSingleThreaded());
 285   DCHECK(state_manager_);
 286   DCHECK(client_);
 287   DCHECK(local_state_);
 288
 289   // Set the install date if this is our first run.
 290   int64 install_date = local_state_->GetInt64(prefs::kInstallDate);
 291   if (install_date == 0)
 292     local_state_->SetInt64(prefs::kInstallDate, base::Time::Now().ToTimeT());
 293 }
 294
 295 MetricsService::~MetricsService() {
 296   DisableRecording();
 297 }
 298
 299 void MetricsService::InitializeMetricsRecordingState() {
 300   InitializeMetricsState();
 301
 302   base::Closure upload_callback =
 303       base::Bind(&MetricsService::StartScheduledUpload,
 304                  self_ptr_factory_.GetWeakPtr());
 305   scheduler_.reset(
 306       new MetricsReportingScheduler(
 307           upload_callback,
 308           // MetricsServiceClient outlives MetricsService, and
 309           // MetricsReportingScheduler is tied to the lifetime of |this|.
 310           base::Bind(&MetricsServiceClient::GetStandardUploadInterval,
 311                      base::Unretained(client_))));
 312 }
 313
 314 void MetricsService::Start() {
 315   HandleIdleSinceLastTransmission(false);
 316   EnableRecording();
 317   EnableReporting();
 318 }
 319
 320 bool MetricsService::StartIfMetricsReportingEnabled() {
 321   const bool enabled = state_manager_->IsMetricsReportingEnabled();
 322   if (enabled)
 323     Start();
 324   return enabled;
 325 }
 326
 327 void MetricsService::StartRecordingForTests() {
 328   test_mode_active_ = true;
 329   EnableRecording();
 330   DisableReporting();
 331 }
 332
 333 void MetricsService::Stop() {
 334   HandleIdleSinceLastTransmission(false);
 335   DisableReporting();
 336   DisableRecording();
 337 }
 338
 339 void MetricsService::EnableReporting() {
 340   if (reporting_active_)
 341     return;
 342   reporting_active_ = true;
 343   StartSchedulerIfNecessary();
 344 }
 345
 346 void MetricsService::DisableReporting() {
 347   reporting_active_ = false;
 348 }
 349
 350 std::string MetricsService::GetClientId() {
 351   return state_manager_->client_id();
 352 }
 353
 354 int64 MetricsService::GetInstallDate() {
 355   return local_state_->GetInt64(prefs::kInstallDate);
 356 }
 357
 358 int64 MetricsService::GetMetricsReportingEnabledDate() {
 359   return local_state_->GetInt64(prefs::kMetricsReportingEnabledTimestamp);
 360 }
 361
 362 scoped_ptr<const base::FieldTrial::EntropyProvider>
 363 MetricsService::CreateEntropyProvider() {
 364   // TODO(asvitkine): Refactor the code so that MetricsService does not expose
 365   // this method.
 366   return state_manager_->CreateEntropyProvider();
 367 }
 368
 369 void MetricsService::EnableRecording() {
 370   DCHECK(IsSingleThreaded());
 371
 372   if (recording_active_)
 373     return;
 374   recording_active_ = true;
 375
 376   state_manager_->ForceClientIdCreation();
 377   client_->SetMetricsClientId(state_manager_->client_id());
 378   if (!log_manager_.current_log())
 379     OpenNewLog();
 380
 381   for (size_t i = 0; i < metrics_providers_.size(); ++i)
 382     metrics_providers_[i]->OnRecordingEnabled();
 383
 384   base::RemoveActionCallback(action_callback_);
 385   action_callback_ = base::Bind(&MetricsService::OnUserAction,
 386                                 base::Unretained(this));
 387   base::AddActionCallback(action_callback_);
 388 }
 389
 390 void MetricsService::DisableRecording() {
 391   DCHECK(IsSingleThreaded());
 392
 393   if (!recording_active_)
 394     return;
 395   recording_active_ = false;
 396
 397   client_->OnRecordingDisabled();
 398
 399   base::RemoveActionCallback(action_callback_);
 400
 401   for (size_t i = 0; i < metrics_providers_.size(); ++i)
 402     metrics_providers_[i]->OnRecordingDisabled();
 403
 404   PushPendingLogsToPersistentStorage();
 405 }
 406
 407 bool MetricsService::recording_active() const {
 408   DCHECK(IsSingleThreaded());
 409   return recording_active_;
 410 }
 411
 412 bool MetricsService::reporting_active() const {
 413   DCHECK(IsSingleThreaded());
 414   return reporting_active_;
 415 }
 416
 417 void MetricsService::RecordDelta(const base::HistogramBase& histogram,
 418                                  const base::HistogramSamples& snapshot) {
 419   log_manager_.current_log()->RecordHistogramDelta(histogram.histogram_name(),
 420                                                    snapshot);
 421 }
 422
 423 void MetricsService::InconsistencyDetected(
 424     base::HistogramBase::Inconsistency problem) {
 425   UMA_HISTOGRAM_ENUMERATION("Histogram.InconsistenciesBrowser",
 426                             problem, base::HistogramBase::NEVER_EXCEEDED_VALUE);
 427 }
 428
 429 void MetricsService::UniqueInconsistencyDetected(
 430     base::HistogramBase::Inconsistency problem) {
 431   UMA_HISTOGRAM_ENUMERATION("Histogram.InconsistenciesBrowserUnique",
 432                             problem, base::HistogramBase::NEVER_EXCEEDED_VALUE);
 433 }
 434
 435 void MetricsService::InconsistencyDetectedInLoggedCount(int amount) {
 436   UMA_HISTOGRAM_COUNTS("Histogram.InconsistentSnapshotBrowser",
 437                        std::abs(amount));
 438 }
 439
 440 void MetricsService::HandleIdleSinceLastTransmission(bool in_idle) {
 441   // If there wasn't a lot of action, maybe the computer was asleep, in which
 442   // case, the log transmissions should have stopped.  Here we start them up
 443   // again.
 444   if (!in_idle && idle_since_last_transmission_)
 445     StartSchedulerIfNecessary();
 446   idle_since_last_transmission_ = in_idle;
 447 }
 448
 449 void MetricsService::OnApplicationNotIdle() {
 450   if (recording_active_)
 451     HandleIdleSinceLastTransmission(false);
 452 }
 453
 454 void MetricsService::RecordStartOfSessionEnd() {
 455   LogCleanShutdown();
 456   RecordBooleanPrefValue(prefs::kStabilitySessionEndCompleted, false);
 457 }
 458
 459 void MetricsService::RecordCompletedSessionEnd() {
 460   LogCleanShutdown();
 461   RecordBooleanPrefValue(prefs::kStabilitySessionEndCompleted, true);
 462 }
 463
 464 #if defined(OS_ANDROID) || defined(OS_IOS)
 465 void MetricsService::OnAppEnterBackground() {
 466   scheduler_->Stop();
 467
 468   MarkAppCleanShutdownAndCommit(&clean_exit_beacon_, local_state_);
 469
 470   // At this point, there's no way of knowing when the process will be
 471   // killed, so this has to be treated similar to a shutdown, closing and
 472   // persisting all logs. Unlinke a shutdown, the state is primed to be ready
 473   // to continue logging and uploading if the process does return.
 474   if (recording_active() && state_ >= SENDING_LOGS) {
 475     PushPendingLogsToPersistentStorage();
 476     // Persisting logs closes the current log, so start recording a new log
 477     // immediately to capture any background work that might be done before the
 478     // process is killed.
 479     OpenNewLog();
 480   }
 481 }
 482
 483 void MetricsService::OnAppEnterForeground() {
 484   clean_exit_beacon_.WriteBeaconValue(false);
 485   StartSchedulerIfNecessary();
 486 }
 487 #else
 488 void MetricsService::LogNeedForCleanShutdown() {
 489   clean_exit_beacon_.WriteBeaconValue(false);
 490   // Redundant setting to be sure we call for a clean shutdown.
 491   clean_shutdown_status_ = NEED_TO_SHUTDOWN;
 492 }
 493 #endif  // defined(OS_ANDROID) || defined(OS_IOS)
 494
 495 // static
 496 void MetricsService::SetExecutionPhase(ExecutionPhase execution_phase,
 497                                        PrefService* local_state) {
 498   execution_phase_ = execution_phase;
 499   local_state->SetInteger(prefs::kStabilityExecutionPhase, execution_phase_);
 500 }
 501
 502 void MetricsService::RecordBreakpadRegistration(bool success) {
 503   if (!success)
 504     IncrementPrefValue(prefs::kStabilityBreakpadRegistrationFail);
 505   else
 506     IncrementPrefValue(prefs::kStabilityBreakpadRegistrationSuccess);
 507 }
 508
 509 void MetricsService::RecordBreakpadHasDebugger(bool has_debugger) {
 510   if (!has_debugger)
 511     IncrementPrefValue(prefs::kStabilityDebuggerNotPresent);
 512   else
 513     IncrementPrefValue(prefs::kStabilityDebuggerPresent);
 514 }
 515
 516 void MetricsService::ClearSavedStabilityMetrics() {
 517   for (size_t i = 0; i < metrics_providers_.size(); ++i)
 518     metrics_providers_[i]->ClearSavedStabilityMetrics();
 519
 520   // Reset the prefs that are managed by MetricsService/MetricsLog directly.
 521   local_state_->SetInteger(prefs::kStabilityCrashCount, 0);
 522   local_state_->SetInteger(prefs::kStabilityExecutionPhase,
 523                            UNINITIALIZED_PHASE);
 524   local_state_->SetInteger(prefs::kStabilityIncompleteSessionEndCount, 0);
 525   local_state_->SetInteger(prefs::kStabilityLaunchCount, 0);
 526   local_state_->SetBoolean(prefs::kStabilitySessionEndCompleted, true);
 527 }
 528
 529 void MetricsService::PushExternalLog(const std::string& log) {
 530   log_manager_.StoreLog(log, MetricsLog::ONGOING_LOG);
 531 }
 532
 533 //------------------------------------------------------------------------------
 534 // private methods
 535 //------------------------------------------------------------------------------
 536
 537
 538 //------------------------------------------------------------------------------
 539 // Initialization methods
 540
 541 void MetricsService::InitializeMetricsState() {
 542   const int64 buildtime = MetricsLog::GetBuildTime();
 543   const std::string version = client_->GetVersionString();
 544   bool version_changed = false;
 545   if (local_state_->GetInt64(prefs::kStabilityStatsBuildTime) != buildtime ||
 546       local_state_->GetString(prefs::kStabilityStatsVersion) != version) {
 547     local_state_->SetString(prefs::kStabilityStatsVersion, version);
 548     local_state_->SetInt64(prefs::kStabilityStatsBuildTime, buildtime);
 549     version_changed = true;
 550   }
 551
 552   log_manager_.LoadPersistedUnsentLogs();
 553
 554   session_id_ = local_state_->GetInteger(prefs::kMetricsSessionID);
 555
 556   if (!clean_exit_beacon_.exited_cleanly()) {
 557     IncrementPrefValue(prefs::kStabilityCrashCount);
 558     // Reset flag, and wait until we call LogNeedForCleanShutdown() before
 559     // monitoring.
 560     clean_exit_beacon_.WriteBeaconValue(true);
 561   }
 562
 563   bool has_initial_stability_log = false;
 564   if (!clean_exit_beacon_.exited_cleanly() || ProvidersHaveStabilityMetrics()) {
 565     // TODO(rtenneti): On windows, consider saving/getting execution_phase from
 566     // the registry.
 567     int execution_phase =
 568         local_state_->GetInteger(prefs::kStabilityExecutionPhase);
 569     UMA_HISTOGRAM_SPARSE_SLOWLY("Chrome.Browser.CrashedExecutionPhase",
 570                                 execution_phase);
 571
 572     // If the previous session didn't exit cleanly, or if any provider
 573     // explicitly requests it, prepare an initial stability log -
 574     // provided UMA is enabled.
 575     if (state_manager_->IsMetricsReportingEnabled())
 576       has_initial_stability_log = PrepareInitialStabilityLog();
 577   }
 578
 579   // If no initial stability log was generated and there was a version upgrade,
 580   // clear the stability stats from the previous version (so that they don't get
 581   // attributed to the current version). This could otherwise happen due to a
 582   // number of different edge cases, such as if the last version crashed before
 583   // it could save off a system profile or if UMA reporting is disabled (which
 584   // normally results in stats being accumulated).
 585   if (!has_initial_stability_log && version_changed)
 586     ClearSavedStabilityMetrics();
 587
 588   // Update session ID.
 589   ++session_id_;
 590   local_state_->SetInteger(prefs::kMetricsSessionID, session_id_);
 591
 592   // Stability bookkeeping
 593   IncrementPrefValue(prefs::kStabilityLaunchCount);
 594
 595   DCHECK_EQ(UNINITIALIZED_PHASE, execution_phase_);
 596   SetExecutionPhase(START_METRICS_RECORDING, local_state_);
 597
 598   if (!local_state_->GetBoolean(prefs::kStabilitySessionEndCompleted)) {
 599     IncrementPrefValue(prefs::kStabilityIncompleteSessionEndCount);
 600     // This is marked false when we get a WM_ENDSESSION.
 601     local_state_->SetBoolean(prefs::kStabilitySessionEndCompleted, true);
 602   }
 603
 604   // Call GetUptimes() for the first time, thus allowing all later calls
 605   // to record incremental uptimes accurately.
 606   base::TimeDelta ignored_uptime_parameter;
 607   base::TimeDelta startup_uptime;
 608   GetUptimes(local_state_, &startup_uptime, &ignored_uptime_parameter);
 609   DCHECK_EQ(0, startup_uptime.InMicroseconds());
 610   // For backwards compatibility, leave this intact in case Omaha is checking
 611   // them.  prefs::kStabilityLastTimestampSec may also be useless now.
 612   // TODO(jar): Delete these if they have no uses.
 613   local_state_->SetInt64(prefs::kStabilityLaunchTimeSec,
 614                          base::Time::Now().ToTimeT());
 615
 616   // Bookkeeping for the uninstall metrics.
 617   IncrementLongPrefsValue(prefs::kUninstallLaunchCount);
 618
 619   // Kick off the process of saving the state (so the uptime numbers keep
 620   // getting updated) every n minutes.
 621   ScheduleNextStateSave();
 622 }
 623
 624 void MetricsService::OnUserAction(const std::string& action) {
 625   if (!ShouldLogEvents())
 626     return;
 627
 628   log_manager_.current_log()->RecordUserAction(action);
 629   HandleIdleSinceLastTransmission(false);
 630 }
 631
 632 void MetricsService::FinishedGatheringInitialMetrics() {
 633   DCHECK_EQ(INIT_TASK_SCHEDULED, state_);
 634   state_ = INIT_TASK_DONE;
 635
 636   // Create the initial log.
 637   if (!initial_metrics_log_.get()) {
 638     initial_metrics_log_ = CreateLog(MetricsLog::ONGOING_LOG);
 639     NotifyOnDidCreateMetricsLog();
 640   }
 641
 642   scheduler_->InitTaskComplete();
 643 }
 644
 645 void MetricsService::GetUptimes(PrefService* pref,
 646                                 base::TimeDelta* incremental_uptime,
 647                                 base::TimeDelta* uptime) {
 648   base::TimeTicks now = base::TimeTicks::Now();
 649   // If this is the first call, init |first_updated_time_| and
 650   // |last_updated_time_|.
 651   if (last_updated_time_.is_null()) {
 652     first_updated_time_ = now;
 653     last_updated_time_ = now;
 654   }
 655   *incremental_uptime = now - last_updated_time_;
 656   *uptime = now - first_updated_time_;
 657   last_updated_time_ = now;
 658
 659   const int64 incremental_time_secs = incremental_uptime->InSeconds();
 660   if (incremental_time_secs > 0) {
 661     int64 metrics_uptime = pref->GetInt64(prefs::kUninstallMetricsUptimeSec);
 662     metrics_uptime += incremental_time_secs;
 663     pref->SetInt64(prefs::kUninstallMetricsUptimeSec, metrics_uptime);
 664   }
 665 }
 666
 667 void MetricsService::NotifyOnDidCreateMetricsLog() {
 668   DCHECK(IsSingleThreaded());
 669   for (size_t i = 0; i < metrics_providers_.size(); ++i)
 670     metrics_providers_[i]->OnDidCreateMetricsLog();
 671 }
 672
 673 //------------------------------------------------------------------------------
 674 // State save methods
 675
 676 void MetricsService::ScheduleNextStateSave() {
 677   state_saver_factory_.InvalidateWeakPtrs();
 678
 679   base::MessageLoop::current()->PostDelayedTask(FROM_HERE,
 680       base::Bind(&MetricsService::SaveLocalState,
 681                  state_saver_factory_.GetWeakPtr()),
 682       base::TimeDelta::FromMinutes(kSaveStateIntervalMinutes));
 683 }
 684
 685 void MetricsService::SaveLocalState() {
 686   RecordCurrentState(local_state_);
 687
 688   // TODO(jar):110021 Does this run down the batteries????
 689   ScheduleNextStateSave();
 690 }
 691
 692
 693 //------------------------------------------------------------------------------
 694 // Recording control methods
 695
 696 void MetricsService::OpenNewLog() {
 697   DCHECK(!log_manager_.current_log());
 698
 699   log_manager_.BeginLoggingWithLog(CreateLog(MetricsLog::ONGOING_LOG));
 700   NotifyOnDidCreateMetricsLog();
 701   if (state_ == INITIALIZED) {
 702     // We only need to schedule that run once.
 703     state_ = INIT_TASK_SCHEDULED;
 704
 705     base::MessageLoop::current()->PostDelayedTask(
 706         FROM_HERE,
 707         base::Bind(&MetricsService::StartGatheringMetrics,
 708                    self_ptr_factory_.GetWeakPtr()),
 709         base::TimeDelta::FromSeconds(kInitializationDelaySeconds));
 710   }
 711 }
 712
 713 void MetricsService::StartGatheringMetrics() {
 714   client_->StartGatheringMetrics(
 715       base::Bind(&MetricsService::FinishedGatheringInitialMetrics,
 716                  self_ptr_factory_.GetWeakPtr()));
 717 }
 718
 719 void MetricsService::CloseCurrentLog() {
 720   if (!log_manager_.current_log())
 721     return;
 722
 723   // TODO(jar): Integrate bounds on log recording more consistently, so that we
 724   // can stop recording logs that are too big much sooner.
 725   if (log_manager_.current_log()->num_events() > kEventLimit) {
 726     UMA_HISTOGRAM_COUNTS("UMA.Discarded Log Events",
 727                          log_manager_.current_log()->num_events());
 728     log_manager_.DiscardCurrentLog();
 729     OpenNewLog();  // Start trivial log to hold our histograms.
 730   }
 731
 732   // Put incremental data (histogram deltas, and realtime stats deltas) at the
 733   // end of all log transmissions (initial log handles this separately).
 734   // RecordIncrementalStabilityElements only exists on the derived
 735   // MetricsLog class.
 736   MetricsLog* current_log = log_manager_.current_log();
 737   DCHECK(current_log);
 738   RecordCurrentEnvironment(current_log);
 739   base::TimeDelta incremental_uptime;
 740   base::TimeDelta uptime;
 741   GetUptimes(local_state_, &incremental_uptime, &uptime);
 742   current_log->RecordStabilityMetrics(metrics_providers_.get(),
 743                                       incremental_uptime, uptime);
 744
 745   current_log->RecordGeneralMetrics(metrics_providers_.get());
 746   RecordCurrentHistograms();
 747
 748   log_manager_.FinishCurrentLog();
 749 }
 750
 751 void MetricsService::PushPendingLogsToPersistentStorage() {
 752   if (state_ < SENDING_LOGS)
 753     return;  // We didn't and still don't have time to get plugin list etc.
 754
 755   CloseCurrentLog();
 756   log_manager_.PersistUnsentLogs();
 757 }
 758
 759 //------------------------------------------------------------------------------
 760 // Transmission of logs methods
 761
 762 void MetricsService::StartSchedulerIfNecessary() {
 763   // Never schedule cutting or uploading of logs in test mode.
 764   if (test_mode_active_)
 765     return;
 766
 767   // Even if reporting is disabled, the scheduler is needed to trigger the
 768   // creation of the initial log, which must be done in order for any logs to be
 769   // persisted on shutdown or backgrounding.
 770   if (recording_active() &&
 771       (reporting_active() || state_ < SENDING_LOGS)) {
 772     scheduler_->Start();
 773   }
 774 }
 775
 776 void MetricsService::StartScheduledUpload() {
 777   DCHECK(state_ >= INIT_TASK_DONE);
 778   // If we're getting no notifications, then the log won't have much in it, and
 779   // it's possible the computer is about to go to sleep, so don't upload and
 780   // stop the scheduler.
 781   // If recording has been turned off, the scheduler doesn't need to run.
 782   // If reporting is off, proceed if the initial log hasn't been created, since
 783   // that has to happen in order for logs to be cut and stored when persisting.
 784   // TODO(stuartmorgan): Call Stop() on the scheduler when reporting and/or
 785   // recording are turned off instead of letting it fire and then aborting.
 786   if (idle_since_last_transmission_ ||
 787       !recording_active() ||
 788       (!reporting_active() && state_ >= SENDING_LOGS)) {
 789     scheduler_->Stop();
 790     scheduler_->UploadCancelled();
 791     return;
 792   }
 793
 794   // If there are unsent logs, send the next one. If not, start the asynchronous
 795   // process of finalizing the current log for upload.
 796   if (state_ == SENDING_LOGS && log_manager_.has_unsent_logs()) {
 797     SendNextLog();
 798   } else {
 799     // There are no logs left to send, so start creating a new one.
 800     client_->CollectFinalMetrics(
 801         base::Bind(&MetricsService::OnFinalLogInfoCollectionDone,
 802                    self_ptr_factory_.GetWeakPtr()));
 803   }
 804 }
 805
 806 void MetricsService::OnFinalLogInfoCollectionDone() {
 807   // If somehow there is a log upload in progress, we return and hope things
 808   // work out. The scheduler isn't informed since if this happens, the scheduler
 809   // will get a response from the upload.
 810   DCHECK(!log_upload_in_progress_);
 811   if (log_upload_in_progress_)
 812     return;
 813
 814   // Abort if metrics were turned off during the final info gathering.
 815   if (!recording_active()) {
 816     scheduler_->Stop();
 817     scheduler_->UploadCancelled();
 818     return;
 819   }
 820
 821   if (state_ == INIT_TASK_DONE) {
 822     PrepareInitialMetricsLog();
 823   } else {
 824     DCHECK_EQ(SENDING_LOGS, state_);
 825     CloseCurrentLog();
 826     OpenNewLog();
 827   }
 828   SendNextLog();
 829 }
 830
 831 void MetricsService::SendNextLog() {
 832   DCHECK_EQ(SENDING_LOGS, state_);
 833   if (!reporting_active()) {
 834     scheduler_->Stop();
 835     scheduler_->UploadCancelled();
 836     return;
 837   }
 838   if (!log_manager_.has_unsent_logs()) {
 839     // Should only get here if serializing the log failed somehow.
 840     // Just tell the scheduler it was uploaded and wait for the next log
 841     // interval.
 842     scheduler_->UploadFinished(true, log_manager_.has_unsent_logs());
 843     return;
 844   }
 845   if (!log_manager_.has_staged_log())
 846     log_manager_.StageNextLogForUpload();
 847   SendStagedLog();
 848 }
 849
 850 bool MetricsService::ProvidersHaveStabilityMetrics() {
 851   // Check whether any metrics provider has stability metrics.
 852   for (size_t i = 0; i < metrics_providers_.size(); ++i) {
 853     if (metrics_providers_[i]->HasStabilityMetrics())
 854       return true;
 855   }
 856
 857   return false;
 858 }
 859
 860 bool MetricsService::PrepareInitialStabilityLog() {
 861   DCHECK_EQ(INITIALIZED, state_);
 862
 863   scoped_ptr<MetricsLog> initial_stability_log(
 864       CreateLog(MetricsLog::INITIAL_STABILITY_LOG));
 865
 866   // Do not call NotifyOnDidCreateMetricsLog here because the stability
 867   // log describes stats from the _previous_ session.
 868
 869   if (!initial_stability_log->LoadSavedEnvironmentFromPrefs())
 870     return false;
 871
 872   log_manager_.PauseCurrentLog();
 873   log_manager_.BeginLoggingWithLog(initial_stability_log.Pass());
 874
 875   // Note: Some stability providers may record stability stats via histograms,
 876   //       so this call has to be after BeginLoggingWithLog().
 877   log_manager_.current_log()->RecordStabilityMetrics(
 878       metrics_providers_.get(), base::TimeDelta(), base::TimeDelta());
 879   RecordCurrentStabilityHistograms();
 880
 881   // Note: RecordGeneralMetrics() intentionally not called since this log is for
 882   //       stability stats from a previous session only.
 883
 884   log_manager_.FinishCurrentLog();
 885   log_manager_.ResumePausedLog();
 886
 887   // Store unsent logs, including the stability log that was just saved, so
 888   // that they're not lost in case of a crash before upload time.
 889   log_manager_.PersistUnsentLogs();
 890
 891   return true;
 892 }
 893
 894 void MetricsService::PrepareInitialMetricsLog() {
 895   DCHECK_EQ(INIT_TASK_DONE, state_);
 896
 897   RecordCurrentEnvironment(initial_metrics_log_.get());
 898   base::TimeDelta incremental_uptime;
 899   base::TimeDelta uptime;
 900   GetUptimes(local_state_, &incremental_uptime, &uptime);
 901
 902   // Histograms only get written to the current log, so make the new log current
 903   // before writing them.
 904   log_manager_.PauseCurrentLog();
 905   log_manager_.BeginLoggingWithLog(initial_metrics_log_.Pass());
 906
 907   // Note: Some stability providers may record stability stats via histograms,
 908   //       so this call has to be after BeginLoggingWithLog().
 909   MetricsLog* current_log = log_manager_.current_log();
 910   current_log->RecordStabilityMetrics(metrics_providers_.get(),
 911                                       base::TimeDelta(), base::TimeDelta());
 912   current_log->RecordGeneralMetrics(metrics_providers_.get());
 913   RecordCurrentHistograms();
 914
 915   log_manager_.FinishCurrentLog();
 916   log_manager_.ResumePausedLog();
 917
 918   // Store unsent logs, including the initial log that was just saved, so
 919   // that they're not lost in case of a crash before upload time.
 920   log_manager_.PersistUnsentLogs();
 921
 922   state_ = SENDING_LOGS;
 923 }
 924
 925 void MetricsService::SendStagedLog() {
 926   DCHECK(log_manager_.has_staged_log());
 927   if (!log_manager_.has_staged_log())
 928     return;
 929
 930   DCHECK(!log_upload_in_progress_);
 931   log_upload_in_progress_ = true;
 932
 933   if (!log_uploader_) {
 934     log_uploader_ = client_->CreateUploader(
 935         base::Bind(&MetricsService::OnLogUploadComplete,
 936                    self_ptr_factory_.GetWeakPtr()));
 937   }
 938
 939   const std::string hash =
 940       base::HexEncode(log_manager_.staged_log_hash().data(),
 941                       log_manager_.staged_log_hash().size());
 942   bool success = log_uploader_->UploadLog(log_manager_.staged_log(), hash);
 943   UMA_HISTOGRAM_BOOLEAN("UMA.UploadCreation", success);
 944   if (!success) {
 945     // Skip this upload and hope things work out next time.
 946     log_manager_.DiscardStagedLog();
 947     scheduler_->UploadCancelled();
 948     log_upload_in_progress_ = false;
 949     return;
 950   }
 951
 952   HandleIdleSinceLastTransmission(true);
 953 }
 954
 955
 956 void MetricsService::OnLogUploadComplete(int response_code) {
 957   DCHECK_EQ(SENDING_LOGS, state_);
 958   DCHECK(log_upload_in_progress_);
 959   log_upload_in_progress_ = false;
 960
 961   // Log a histogram to track response success vs. failure rates.
 962   UMA_HISTOGRAM_ENUMERATION("UMA.UploadResponseStatus.Protobuf",
 963                             ResponseCodeToStatus(response_code),
 964                             NUM_RESPONSE_STATUSES);
 965
 966   bool upload_succeeded = response_code == 200;
 967
 968   // Provide boolean for error recovery (allow us to ignore response_code).
 969   bool discard_log = false;
 970   const size_t log_size = log_manager_.staged_log().length();
 971   if (upload_succeeded) {
 972     UMA_HISTOGRAM_COUNTS_10000("UMA.LogSize.OnSuccess", log_size / 1024);
 973   } else if (log_size > kUploadLogAvoidRetransmitSize) {
 974     UMA_HISTOGRAM_COUNTS("UMA.Large Rejected Log was Discarded",
 975                          static_cast<int>(log_size));
 976     discard_log = true;
 977   } else if (response_code == 400) {
 978     // Bad syntax.  Retransmission won't work.
 979     discard_log = true;
 980   }
 981
 982   if (upload_succeeded || discard_log) {
 983     log_manager_.DiscardStagedLog();
 984     // Store the updated list to disk now that the removed log is uploaded.
 985     log_manager_.PersistUnsentLogs();
 986   }
 987
 988   // Error 400 indicates a problem with the log, not with the server, so
 989   // don't consider that a sign that the server is in trouble.
 990   bool server_is_healthy = upload_succeeded || response_code == 400;
 991   scheduler_->UploadFinished(server_is_healthy, log_manager_.has_unsent_logs());
 992
 993   if (server_is_healthy)
 994     client_->OnLogUploadComplete();
 995 }
 996
 997 void MetricsService::IncrementPrefValue(const char* path) {
 998   int value = local_state_->GetInteger(path);
 999   local_state_->SetInteger(path, value + 1);
1000 }
1001
1002 void MetricsService::IncrementLongPrefsValue(const char* path) {
1003   int64 value = local_state_->GetInt64(path);
1004   local_state_->SetInt64(path, value + 1);
1005 }
1006
// Returns true when clean_shutdown_status_ is CLEANLY_SHUTDOWN and false when
// it is NEED_TO_SHUTDOWN. Any other status value fails the CHECK below.
bool MetricsService::UmaMetricsProperlyShutdown() {
  CHECK(clean_shutdown_status_ == CLEANLY_SHUTDOWN ||
        clean_shutdown_status_ == NEED_TO_SHUTDOWN);
  return clean_shutdown_status_ == CLEANLY_SHUTDOWN;
}
1012
1013 void MetricsService::AddSyntheticTrialObserver(
1014     SyntheticTrialObserver* observer) {
1015   synthetic_trial_observer_list_.AddObserver(observer);
1016   if (!synthetic_trial_groups_.empty())
1017     observer->OnSyntheticTrialsChanged(synthetic_trial_groups_);
1018 }
1019
// Removes |observer| from synthetic_trial_observer_list_.
void MetricsService::RemoveSyntheticTrialObserver(
    SyntheticTrialObserver* observer) {
  synthetic_trial_observer_list_.RemoveObserver(observer);
}
1024
1025 void MetricsService::RegisterSyntheticFieldTrial(
1026     const SyntheticTrialGroup& trial) {
1027   for (size_t i = 0; i < synthetic_trial_groups_.size(); ++i) {
1028     if (synthetic_trial_groups_[i].id.name == trial.id.name) {
1029       if (synthetic_trial_groups_[i].id.group != trial.id.group) {
1030         synthetic_trial_groups_[i].id.group = trial.id.group;
1031         synthetic_trial_groups_[i].start_time = base::TimeTicks::Now();
1032         NotifySyntheticTrialObservers();
1033       }
1034       return;
1035     }
1036   }
1037
1038   SyntheticTrialGroup trial_group = trial;
1039   trial_group.start_time = base::TimeTicks::Now();
1040   synthetic_trial_groups_.push_back(trial_group);
1041   NotifySyntheticTrialObservers();
1042 }
1043
1044 void MetricsService::RegisterMetricsProvider(
1045     scoped_ptr<MetricsProvider> provider) {
1046   DCHECK_EQ(INITIALIZED, state_);
1047   metrics_providers_.push_back(provider.release());
1048 }
1049
// Delegates to state_manager_->CheckForClonedInstall(), forwarding
// |task_runner| unchanged.
void MetricsService::CheckForClonedInstall(
    scoped_refptr<base::SingleThreadTaskRunner> task_runner) {
  state_manager_->CheckForClonedInstall(task_runner);
}
1054
// Calls OnSyntheticTrialsChanged(synthetic_trial_groups_) on the observers in
// synthetic_trial_observer_list_.
void MetricsService::NotifySyntheticTrialObservers() {
  FOR_EACH_OBSERVER(SyntheticTrialObserver, synthetic_trial_observer_list_,
                    OnSyntheticTrialsChanged(synthetic_trial_groups_));
}
1059
1060 void MetricsService::GetCurrentSyntheticFieldTrials(
1061     std::vector<variations::ActiveGroupId>* synthetic_trials) {
1062   DCHECK(synthetic_trials);
1063   synthetic_trials->clear();
1064   const MetricsLog* current_log = log_manager_.current_log();
1065   for (size_t i = 0; i < synthetic_trial_groups_.size(); ++i) {
1066     if (synthetic_trial_groups_[i].start_time <= current_log->creation_time())
1067       synthetic_trials->push_back(synthetic_trial_groups_[i].id);
1068   }
1069 }
1070
1071 scoped_ptr<MetricsLog> MetricsService::CreateLog(MetricsLog::LogType log_type) {
1072   return make_scoped_ptr(new MetricsLog(state_manager_->client_id(),
1073                                         session_id_,
1074                                         log_type,
1075                                         client_,
1076                                         local_state_));
1077 }
1078
1079 void MetricsService::RecordCurrentEnvironment(MetricsLog* log) {
1080   std::vector<variations::ActiveGroupId> synthetic_trials;
1081   GetCurrentSyntheticFieldTrials(&synthetic_trials);
1082   log->RecordEnvironment(metrics_providers_.get(), synthetic_trials,
1083                          GetInstallDate(), GetMetricsReportingEnabledDate());
1084   UMA_HISTOGRAM_COUNTS_100("UMA.SyntheticTrials.Count",
1085                            synthetic_trials.size());
1086 }
1087
// Asks histogram_snapshot_manager_ to prepare deltas, passing
// base::Histogram::kNoFlags and base::Histogram::kUmaTargetedHistogramFlag.
// A current log must exist when this is called (DCHECKed).
void MetricsService::RecordCurrentHistograms() {
  DCHECK(log_manager_.current_log());
  histogram_snapshot_manager_.PrepareDeltas(
      base::Histogram::kNoFlags, base::Histogram::kUmaTargetedHistogramFlag);
}
1093
// Same as RecordCurrentHistograms(), except that
// base::Histogram::kUmaStabilityHistogramFlag is passed to PrepareDeltas().
// A current log must exist when this is called (DCHECKed).
void MetricsService::RecordCurrentStabilityHistograms() {
  DCHECK(log_manager_.current_log());
  histogram_snapshot_manager_.PrepareDeltas(
      base::Histogram::kNoFlags, base::Histogram::kUmaStabilityHistogramFlag);
}
1099
// Records that this run shut down cleanly: sets the in-memory shutdown
// status, writes true to the clean-exit beacon, refreshes the last-timestamp
// pref and stores SHUTDOWN_COMPLETE as the execution phase in local state.
void MetricsService::LogCleanShutdown() {
  // Redundant setting to assure that we always reset this value at shutdown
  // (and that we don't use some alternate path, and not call LogCleanShutdown).
  clean_shutdown_status_ = CLEANLY_SHUTDOWN;

  clean_exit_beacon_.WriteBeaconValue(true);
  RecordCurrentState(local_state_);
  local_state_->SetInteger(prefs::kStabilityExecutionPhase,
                           MetricsService::SHUTDOWN_COMPLETE);
}
1110
1111 bool MetricsService::ShouldLogEvents() {
1112   // We simply don't log events to UMA if there is a single incognito
1113   // session visible. The problem is that we always notify using the orginal
1114   // profile in order to simplify notification processing.
1115   return !client_->IsOffTheRecordSessionActive();
1116 }
1117
// Stores |value| under |path| in local state and then refreshes the
// last-timestamp pref through RecordCurrentState(). DCHECKs
// IsSingleThreaded().
void MetricsService::RecordBooleanPrefValue(const char* path, bool value) {
  DCHECK(IsSingleThreaded());
  local_state_->SetBoolean(path, value);
  RecordCurrentState(local_state_);
}
1123
// Writes the current time, taken from base::Time::Now().ToTimeT(), to
// prefs::kStabilityLastTimestampSec in |pref|.
void MetricsService::RecordCurrentState(PrefService* pref) {
  pref->SetInt64(prefs::kStabilityLastTimestampSec,
                 base::Time::Now().ToTimeT());
}
1128
1129 }  // namespace metrics