content/browser/speech/endpointer/endpointer.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "content/browser/speech/endpointer/endpointer.h"
   6
   7 #include "base/time.h"
   8 #include "content/browser/speech/audio_buffer.h"
   9
  10 using base::Time;
  11
  12 namespace {
  13 const int kFrameRate = 50;  // 1 frame = 20ms of audio.
  14 }
  15
  16 namespace content {
  17
  18 Endpointer::Endpointer(int sample_rate)
  19     : speech_input_possibly_complete_silence_length_us_(-1),
  20       speech_input_complete_silence_length_us_(-1),
  21       audio_frame_time_us_(0),
  22       sample_rate_(sample_rate),
  23       frame_size_(0) {
  24   Reset();
  25
  26   frame_size_ = static_cast<int>(sample_rate / static_cast<float>(kFrameRate));
  27
  28   speech_input_minimum_length_us_ =
  29       static_cast<int64>(1.7 * Time::kMicrosecondsPerSecond);
  30   speech_input_complete_silence_length_us_ =
  31       static_cast<int64>(0.5 * Time::kMicrosecondsPerSecond);
  32   long_speech_input_complete_silence_length_us_ = -1;
  33   long_speech_length_us_ = -1;
  34   speech_input_possibly_complete_silence_length_us_ =
  35       1 * Time::kMicrosecondsPerSecond;
  36
  37   // Set the default configuration for Push To Talk mode.
  38   EnergyEndpointerParams ep_config;
  39   ep_config.set_frame_period(1.0f / static_cast<float>(kFrameRate));
  40   ep_config.set_frame_duration(1.0f / static_cast<float>(kFrameRate));
  41   ep_config.set_endpoint_margin(0.2f);
  42   ep_config.set_onset_window(0.15f);
  43   ep_config.set_speech_on_window(0.4f);
  44   ep_config.set_offset_window(0.15f);
  45   ep_config.set_onset_detect_dur(0.09f);
  46   ep_config.set_onset_confirm_dur(0.075f);
  47   ep_config.set_on_maintain_dur(0.10f);
  48   ep_config.set_offset_confirm_dur(0.12f);
  49   ep_config.set_decision_threshold(1000.0f);
  50   ep_config.set_min_decision_threshold(50.0f);
  51   ep_config.set_fast_update_dur(0.2f);
  52   ep_config.set_sample_rate(static_cast<float>(sample_rate));
  53   ep_config.set_min_fundamental_frequency(57.143f);
  54   ep_config.set_max_fundamental_frequency(400.0f);
  55   ep_config.set_contamination_rejection_period(0.25f);
  56   energy_endpointer_.Init(ep_config);
  57 }
  58
  59 void Endpointer::Reset() {
  60   old_ep_status_ = EP_PRE_SPEECH;
  61   waiting_for_speech_possibly_complete_timeout_ = false;
  62   waiting_for_speech_complete_timeout_ = false;
  63   speech_previously_detected_ = false;
  64   speech_input_complete_ = false;
  65   audio_frame_time_us_ = 0; // Reset time for packets sent to endpointer.
  66   speech_end_time_us_ = -1;
  67   speech_start_time_us_ = -1;
  68 }
  69
  70 void Endpointer::StartSession() {
  71   Reset();
  72   energy_endpointer_.StartSession();
  73 }
  74
  75 void Endpointer::EndSession() {
  76   energy_endpointer_.EndSession();
  77 }
  78
  79 void Endpointer::SetEnvironmentEstimationMode() {
  80   Reset();
  81   energy_endpointer_.SetEnvironmentEstimationMode();
  82 }
  83
  84 void Endpointer::SetUserInputMode() {
  85   energy_endpointer_.SetUserInputMode();
  86 }
  87
  88 EpStatus Endpointer::Status(int64 *time) {
  89   return energy_endpointer_.Status(time);
  90 }
  91
  92 EpStatus Endpointer::ProcessAudio(const AudioChunk& raw_audio, float* rms_out) {
  93   const int16* audio_data = raw_audio.SamplesData16();
  94   const int num_samples = raw_audio.NumSamples();
  95   EpStatus ep_status = EP_PRE_SPEECH;
  96
  97   // Process the input data in blocks of frame_size_, dropping any incomplete
  98   // frames at the end (which is ok since typically the caller will be recording
  99   // audio in multiples of our frame size).
 100   int sample_index = 0;
 101   while (sample_index + frame_size_ <= num_samples) {
 102     // Have the endpointer process the frame.
 103     energy_endpointer_.ProcessAudioFrame(audio_frame_time_us_,
 104                                          audio_data + sample_index,
 105                                          frame_size_,
 106                                          rms_out);
 107     sample_index += frame_size_;
 108     audio_frame_time_us_ += (frame_size_ * Time::kMicrosecondsPerSecond) /
 109                          sample_rate_;
 110
 111     // Get the status of the endpointer.
 112     int64 ep_time;
 113     ep_status = energy_endpointer_.Status(&ep_time);
 114
 115     // Handle state changes.
 116     if ((EP_SPEECH_PRESENT == ep_status) &&
 117         (EP_POSSIBLE_ONSET == old_ep_status_)) {
 118       speech_end_time_us_ = -1;
 119       waiting_for_speech_possibly_complete_timeout_ = false;
 120       waiting_for_speech_complete_timeout_ = false;
 121       // Trigger SpeechInputDidStart event on first detection.
 122       if (false == speech_previously_detected_) {
 123         speech_previously_detected_ = true;
 124         speech_start_time_us_ = ep_time;
 125       }
 126     }
 127     if ((EP_PRE_SPEECH == ep_status) &&
 128         (EP_POSSIBLE_OFFSET == old_ep_status_)) {
 129       speech_end_time_us_ = ep_time;
 130       waiting_for_speech_possibly_complete_timeout_ = true;
 131       waiting_for_speech_complete_timeout_ = true;
 132     }
 133     if (ep_time > speech_input_minimum_length_us_) {
 134       // Speech possibly complete timeout.
 135       if ((waiting_for_speech_possibly_complete_timeout_) &&
 136           (ep_time - speech_end_time_us_ >
 137               speech_input_possibly_complete_silence_length_us_)) {
 138         waiting_for_speech_possibly_complete_timeout_ = false;
 139       }
 140       if (waiting_for_speech_complete_timeout_) {
 141         // The length of the silence timeout period can be held constant, or it
 142         // can be changed after a fixed amount of time from the beginning of
 143         // speech.
 144         bool has_stepped_silence =
 145             (long_speech_length_us_ > 0) &&
 146             (long_speech_input_complete_silence_length_us_ > 0);
 147         int64 requested_silence_length;
 148         if (has_stepped_silence &&
 149             (ep_time - speech_start_time_us_) > long_speech_length_us_) {
 150           requested_silence_length =
 151               long_speech_input_complete_silence_length_us_;
 152         } else {
 153           requested_silence_length =
 154               speech_input_complete_silence_length_us_;
 155         }
 156
 157         // Speech complete timeout.
 158         if ((ep_time - speech_end_time_us_) > requested_silence_length) {
 159           waiting_for_speech_complete_timeout_ = false;
 160           speech_input_complete_ = true;
 161         }
 162       }
 163     }
 164     old_ep_status_ = ep_status;
 165   }
 166   return ep_status;
 167 }
 168
 169 }  // namespace content