dom/media/webspeech/recognition/endpointer.h

   1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
   2 //
   3 // Redistribution and use in source and binary forms, with or without
   4 // modification, are permitted provided that the following conditions are
   5 // met:
   6 //
   7 //    * Redistributions of source code must retain the above copyright
   8 // notice, this list of conditions and the following disclaimer.
   9 //    * Redistributions in binary form must reproduce the above
  10 // copyright notice, this list of conditions and the following disclaimer
  11 // in the documentation and/or other materials provided with the
  12 // distribution.
  13 //    * Neither the name of Google Inc. nor the names of its
  14 // contributors may be used to endorse or promote products derived from
  15 // this software without specific prior written permission.
  16 //
  17 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  18 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  19 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  20 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  21 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  22 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  23 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  24 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  25 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  26 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  27 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  28
  29 #ifndef CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
  30 #define CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_
  31
  32 #include "energy_endpointer.h"
  33
  34 namespace mozilla {
  35
  36 struct AudioChunk;
  37
  38 // A simple interface to the underlying energy-endpointer implementation, this
  39 // class lets callers provide audio as being recorded and let them poll to find
  40 // when the user has stopped speaking.
  41 //
  42 // There are two events that may trigger the end of speech:
  43 //
  44 // speechInputPossiblyComplete event:
  45 //
  46 // Signals that silence/noise has  been detected for a *short* amount of
  47 // time after some speech has been detected. It can be used for low latency
  48 // UI feedback. To disable it, set it to a large amount.
  49 //
  50 // speechInputComplete event:
  51 //
  52 // This event is intended to signal end of input and to stop recording.
  53 // The amount of time to wait after speech is set by
  54 // speech_input_complete_silence_length_ and optionally two other
  55 // parameters (see below).
  56 // This time can be held constant, or can change as more speech is detected.
  57 // In the latter case, the time changes after a set amount of time from the
  58 // *beginning* of speech.  This is motivated by the expectation that there
  59 // will be two distinct types of inputs: short search queries and longer
  60 // dictation style input.
  61 //
  62 // Three parameters are used to define the piecewise constant timeout function.
  63 // The timeout length is speech_input_complete_silence_length until
  64 // long_speech_length, when it changes to
  65 // long_speech_input_complete_silence_length.
  66 class Endpointer {
  67  public:
  68   explicit Endpointer(int sample_rate);
  69
  70   // Start the endpointer. This should be called at the beginning of a session.
  71   void StartSession();
  72
  73   // Stop the endpointer.
  74   void EndSession();
  75
  76   // Start environment estimation. Audio will be used for environment estimation
  77   // i.e. noise level estimation.
  78   void SetEnvironmentEstimationMode();
  79
  80   // Start user input. This should be called when the user indicates start of
  81   // input, e.g. by pressing a button.
  82   void SetUserInputMode();
  83
  84   // Process a segment of audio, which may be more than one frame.
  85   // The status of the last frame will be returned.
  86   EpStatus ProcessAudio(const AudioChunk& raw_audio, float* rms_out);
  87
  88   // Get the status of the endpointer.
  89   EpStatus Status(int64_t *time_us);
  90
  91   // Get the expected frame size for audio chunks. Audio chunks are expected
  92   // to contain a number of samples that is a multiple of this number, and extra
  93   // samples will be dropped.
  94   int32_t FrameSize() const {
  95     return frame_size_;
  96   }
  97
  98   // Returns true if the endpointer detected reasonable audio levels above
  99   // background noise which could be user speech, false if not.
 100   bool DidStartReceivingSpeech() const {
 101     return speech_previously_detected_;
 102   }
 103
 104   bool IsEstimatingEnvironment() const {
 105     return energy_endpointer_.estimating_environment();
 106   }
 107
 108   void set_speech_input_complete_silence_length(int64_t time_us) {
 109     speech_input_complete_silence_length_us_ = time_us;
 110   }
 111
 112   void set_long_speech_input_complete_silence_length(int64_t time_us) {
 113     long_speech_input_complete_silence_length_us_ = time_us;
 114   }
 115
 116   void set_speech_input_possibly_complete_silence_length(int64_t time_us) {
 117     speech_input_possibly_complete_silence_length_us_ = time_us;
 118   }
 119
 120   void set_long_speech_length(int64_t time_us) {
 121     long_speech_length_us_ = time_us;
 122   }
 123
 124   bool speech_input_complete() const {
 125     return speech_input_complete_;
 126   }
 127
 128   // RMS background noise level in dB.
 129   float NoiseLevelDb() const { return energy_endpointer_.GetNoiseLevelDb(); }
 130
 131  private:
 132   // Reset internal states. Helper method common to initial input utterance
 133   // and following input utternaces.
 134   void Reset();
 135
 136   // Minimum allowable length of speech input.
 137   int64_t speech_input_minimum_length_us_;
 138
 139   // The speechInputPossiblyComplete event signals that silence/noise has been
 140   // detected for a *short* amount of time after some speech has been detected.
 141   // This proporty specifies the time period.
 142   int64_t speech_input_possibly_complete_silence_length_us_;
 143
 144   // The speechInputComplete event signals that silence/noise has been
 145   // detected for a *long* amount of time after some speech has been detected.
 146   // This property specifies the time period.
 147   int64_t speech_input_complete_silence_length_us_;
 148
 149   // Same as above, this specifies the required silence period after speech
 150   // detection. This period is used instead of
 151   // speech_input_complete_silence_length_ when the utterance is longer than
 152   // long_speech_length_. This parameter is optional.
 153   int64_t long_speech_input_complete_silence_length_us_;
 154
 155   // The period of time after which the endpointer should consider
 156   // long_speech_input_complete_silence_length_ as a valid silence period
 157   // instead of speech_input_complete_silence_length_. This parameter is
 158   // optional.
 159   int64_t long_speech_length_us_;
 160
 161   // First speech onset time, used in determination of speech complete timeout.
 162   int64_t speech_start_time_us_;
 163
 164   // Most recent end time, used in determination of speech complete timeout.
 165   int64_t speech_end_time_us_;
 166
 167   int64_t audio_frame_time_us_;
 168   EpStatus old_ep_status_;
 169   bool waiting_for_speech_possibly_complete_timeout_;
 170   bool waiting_for_speech_complete_timeout_;
 171   bool speech_previously_detected_;
 172   bool speech_input_complete_;
 173   EnergyEndpointer energy_endpointer_;
 174   int sample_rate_;
 175   int32_t frame_size_;
 176 };
 177
 178 }  // namespace mozilla
 179
 180 #endif  // CONTENT_BROWSER_SPEECH_ENDPOINTER_ENDPOINTER_H_