content/browser/speech/endpointer/endpointer.h

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
   6 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
   7
   8 #include "base/basictypes.h"
   9 #include "content/browser/speech/endpointer/energy_endpointer.h"
  10 #include "content/common/content_export.h"
  11
  12 class EpStatus;
  13
  14 namespace content {
  15
  16 class AudioChunk;
  17
// A simple interface to the underlying energy-endpointer implementation. This
// class lets callers provide audio as it is being recorded and lets them poll
// to find out when the user has stopped speaking.
  21 //
  22 // There are two events that may trigger the end of speech:
  23 //
  24 // speechInputPossiblyComplete event:
  25 //
// Signals that silence/noise has been detected for a *short* amount of
// time after some speech has been detected. It can be used for low latency
// UI feedback. To disable it, set its silence length (see
// set_speech_input_possibly_complete_silence_length()) to a large value.
  29 //
  30 // speechInputComplete event:
  31 //
  32 // This event is intended to signal end of input and to stop recording.
// The amount of time to wait after speech is set by
// speech_input_complete_silence_length_us_ and optionally two other
// parameters (see below).
  36 // This time can be held constant, or can change as more speech is detected.
  37 // In the latter case, the time changes after a set amount of time from the
  38 // *beginning* of speech.  This is motivated by the expectation that there
  39 // will be two distinct types of inputs: short search queries and longer
  40 // dictation style input.
  41 //
  42 // Three parameters are used to define the piecewise constant timeout function.
  43 // The timeout length is speech_input_complete_silence_length until
  44 // long_speech_length, when it changes to
  45 // long_speech_input_complete_silence_length.
  46 class CONTENT_EXPORT Endpointer {
  47  public:
  48   explicit Endpointer(int sample_rate);
  49
  50   // Start the endpointer. This should be called at the beginning of a session.
  51   void StartSession();
  52
  53   // Stop the endpointer.
  54   void EndSession();
  55
  56   // Start environment estimation. Audio will be used for environment estimation
  57   // i.e. noise level estimation.
  58   void SetEnvironmentEstimationMode();
  59
  60   // Start user input. This should be called when the user indicates start of
  61   // input, e.g. by pressing a button.
  62   void SetUserInputMode();
  63
  64   // Process a segment of audio, which may be more than one frame.
  65   // The status of the last frame will be returned.
  66   EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
  67
  68   // Get the status of the endpointer.
  69   EpStatus Status(int64 *time_us);
  70
  71   // Returns true if the endpointer detected reasonable audio levels above
  72   // background noise which could be user speech, false if not.
  73   bool DidStartReceivingSpeech() const {
  74     return speech_previously_detected_;
  75   }
  76
  77   bool IsEstimatingEnvironment() const {
  78     return energy_endpointer_.estimating_environment();
  79   }
  80
  81   void set_speech_input_complete_silence_length(int64 time_us) {
  82     speech_input_complete_silence_length_us_ = time_us;
  83   }
  84
  85   void set_long_speech_input_complete_silence_length(int64 time_us) {
  86     long_speech_input_complete_silence_length_us_ = time_us;
  87   }
  88
  89   void set_speech_input_possibly_complete_silence_length(int64 time_us) {
  90     speech_input_possibly_complete_silence_length_us_ = time_us;
  91   }
  92
  93   void set_long_speech_length(int64 time_us) {
  94     long_speech_length_us_ = time_us;
  95   }
  96
  97   bool speech_input_complete() const {
  98     return speech_input_complete_;
  99   }
 100
 101   // RMS background noise level in dB.
 102   float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
 103
 104  private:
 105   // Reset internal states. Helper method common to initial input utterance
 106   // and following input utternaces.
 107   void Reset();
 108
 109   // Minimum allowable length of speech input.
 110   int64 speech_input_minimum_length_us_;
 111
 112   // The speechInputPossiblyComplete event signals that silence/noise has been
 113   // detected for a *short* amount of time after some speech has been detected.
 114   // This proporty specifies the time period.
 115   int64 speech_input_possibly_complete_silence_length_us_;
 116
 117   // The speechInputComplete event signals that silence/noise has been
 118   // detected for a *long* amount of time after some speech has been detected.
 119   // This property specifies the time period.
 120   int64 speech_input_complete_silence_length_us_;
 121
 122   // Same as above, this specifies the required silence period after speech
 123   // detection. This period is used instead of
 124   // speech_input_complete_silence_length_ when the utterance is longer than
 125   // long_speech_length_. This parameter is optional.
 126   int64 long_speech_input_complete_silence_length_us_;
 127
 128   // The period of time after which the endpointer should consider
 129   // long_speech_input_complete_silence_length_ as a valid silence period
 130   // instead of speech_input_complete_silence_length_. This parameter is
 131   // optional.
 132   int64 long_speech_length_us_;
 133
 134   // First speech onset time, used in determination of speech complete timeout.
 135   int64 speech_start_time_us_;
 136
 137   // Most recent end time, used in determination of speech complete timeout.
 138   int64 speech_end_time_us_;
 139
 140   int64 audio_frame_time_us_;
 141   EpStatus old_ep_status_;
 142   bool waiting_for_speech_possibly_complete_timeout_;
 143   bool waiting_for_speech_complete_timeout_;
 144   bool speech_previously_detected_;
 145   bool speech_input_complete_;
 146   EnergyEndpointer energy_endpointer_;
 147   int sample_rate_;
 148   int32 frame_size_;
 149 };
 150
 151 }  // namespace content
 152
 153 #endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_