content/browser/speech/speech_recognizer_impl.h

   1 // Copyright (c) 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
   6 #define CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_
   7
   8 #include "base/basictypes.h"
   9 #include "base/memory/scoped_ptr.h"
  10 #include "content/browser/speech/endpointer/endpointer.h"
  11 #include "content/browser/speech/speech_recognition_engine.h"
  12 #include "content/browser/speech/speech_recognizer.h"
  13 #include "content/public/common/speech_recognition_error.h"
  14 #include "content/public/common/speech_recognition_result.h"
  15 #include "media/audio/audio_input_controller.h"
  16 #include "net/url_request/url_request_context_getter.h"
  17
  18 namespace media {
     // Forward declaration only: this header refers to AudioManager solely through
     // pointers (SetAudioManagerForTesting(), |audio_manager_for_tests_|).
  19 class AudioManager;
  20 }  // namespace media
  21
  22 namespace content {
  23
  24 class SpeechRecognitionEventListener;  // Used in this header only as a pointer.
  25
  26 // Handles speech recognition for a session (identified by |session_id|), taking
  27 // care of audio capture, silence detection/endpointer and interaction with the
  28 // SpeechRecognitionEngine.
     //
     // The recognizer is organized as a finite state machine: external inputs are
     // expressed as FSMEvent values, pushed through DispatchEvent(), and mapped to
     // a transition by ExecuteTransitionAndGetNextState(). Besides implementing
     // SpeechRecognizer, the class receives audio callbacks as a
     // media::AudioInputController::EventHandler and engine callbacks as a
     // SpeechRecognitionEngineDelegate.
  29 class CONTENT_EXPORT SpeechRecognizerImpl
  30     : public SpeechRecognizer,
  31       public media::AudioInputController::EventHandler,
  32       public NON_EXPORTED_BASE(SpeechRecognitionEngineDelegate) {
  33  public:
       // Audio format and timing parameters. They are only declared here; their
       // values are defined out of line.
  34   static const int kAudioSampleRate;
  35   static const media::ChannelLayout kChannelLayout;
  36   static const int kNumBitsPerAudioSample;
  37   static const int kNoSpeechTimeoutMs;
  38   static const int kEndpointerEstimationTimeMs;
  39
       // Test hook for substituting the media::AudioManager.
       // NOTE(review): presumably stored in |audio_manager_for_tests_| -- confirm
       // against the definition.
  40   static void SetAudioManagerForTesting(media::AudioManager* audio_manager);
  41
       // |session_id| identifies the session this recognizer serves.
       // NOTE(review): |engine| is passed as a raw pointer but the class holds a
       // scoped_ptr<SpeechRecognitionEngine> (|recognition_engine_|), so ownership
       // presumably transfers to this object -- confirm against the constructor
       // definition. The roles of |listener| and |continuous| are not visible in
       // this header.
  42   SpeechRecognizerImpl(SpeechRecognitionEventListener* listener,
  43                        int session_id,
  44                        bool continuous,
  45                        bool provisional_results,
  46                        SpeechRecognitionEngine* engine);
  47
       // SpeechRecognizer methods.
  48   virtual void StartRecognition(const std::string& device_id) OVERRIDE;
  49   virtual void AbortRecognition() OVERRIDE;
  50   virtual void StopAudioCapture() OVERRIDE;
  51   virtual bool IsActive() const OVERRIDE;
  52   virtual bool IsCapturingAudio() const OVERRIDE;
       // Read-only access to the recognition engine (presumably
       // |*recognition_engine_| -- confirm).
  53   const SpeechRecognitionEngine& recognition_engine() const;
  54
  55  private:
       // Grants SpeechRecognizerTest access to the private members below.
  56   friend class SpeechRecognizerTest;
  57
       // States of the recognizer FSM. STATE_MAX_VALUE aliases the last state.
  58   enum FSMState {
  59     STATE_IDLE = 0,
  60     STATE_STARTING,
  61     STATE_ESTIMATING_ENVIRONMENT,
  62     STATE_WAITING_FOR_SPEECH,
  63     STATE_RECOGNIZING,
  64     STATE_WAITING_FINAL_RESULT,
  65     STATE_ENDED,
  66     STATE_MAX_VALUE = STATE_ENDED
  67   };
  68
       // Events that can be fed to the FSM. EVENT_MAX_VALUE aliases the last event.
  69   enum FSMEvent {
  70     EVENT_ABORT = 0,
  71     EVENT_START,
  72     EVENT_STOP_CAPTURE,
  73     EVENT_AUDIO_DATA,
  74     EVENT_ENGINE_RESULT,
  75     EVENT_ENGINE_ERROR,
  76     EVENT_AUDIO_ERROR,
  77     EVENT_MAX_VALUE = EVENT_AUDIO_ERROR
  78   };
  79
       // An event together with its optional payload.
       // NOTE(review): which payload field accompanies which event is not visible
       // in this header; presumably |audio_data| goes with EVENT_AUDIO_DATA,
       // |engine_results| with EVENT_ENGINE_RESULT and |engine_error| with
       // EVENT_ENGINE_ERROR -- confirm.
  80   struct FSMEventArgs {
  81     explicit FSMEventArgs(FSMEvent event_value);
  82     ~FSMEventArgs();
  83
  84     FSMEvent event;
  85     scoped_refptr<AudioChunk> audio_data;
  86     SpeechRecognitionResults engine_results;
  87     SpeechRecognitionError engine_error;
  88   };
  89
       // Private destructor: outside code cannot destroy a SpeechRecognizerImpl
       // directly (e.g. on the stack or via delete on this type).
       // NOTE(review): presumably instances are ref-counted through the
       // SpeechRecognizer base -- confirm.
  90   virtual ~SpeechRecognizerImpl();
  91
  92   // Entry point for pushing any new external event into the recognizer FSM.
  93   void DispatchEvent(const FSMEventArgs& event_args);
  94
  95   // Defines the behavior of the recognizer FSM, selecting the appropriate
  96   // transition according to the current state and event.
  97   FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args);
  98
  99   // Process a new audio chunk in the audio pipeline (endpointer, vumeter, etc).
 100   void ProcessAudioPipeline(const AudioChunk& raw_audio);
 101
 102   // The methods below handle transitions of the recognizer FSM.
       // Each one returns an FSMState. NOTE(review): presumably the state the FSM
       // moves to, as with ExecuteTransitionAndGetNextState() -- confirm.
 103   FSMState StartRecording(const FSMEventArgs& event_args);
 104   FSMState StartRecognitionEngine(const FSMEventArgs& event_args);
 105   FSMState WaitEnvironmentEstimationCompletion(const FSMEventArgs& event_args);
 106   FSMState DetectUserSpeechOrTimeout(const FSMEventArgs& event_args);
 107   FSMState StopCaptureAndWaitForResult(const FSMEventArgs& event_args);
 108   FSMState ProcessIntermediateResult(const FSMEventArgs& event_args);
 109   FSMState ProcessFinalResult(const FSMEventArgs& event_args);
 110   FSMState AbortSilently(const FSMEventArgs& event_args);
 111   FSMState AbortWithError(const FSMEventArgs& event_args);
       // Unlike its siblings, takes the error directly rather than FSMEventArgs.
       // NOTE(review): presumably the shared helper behind the two Abort*
       // transitions above -- confirm.
 112   FSMState Abort(const SpeechRecognitionError& error);
 113   FSMState DetectEndOfSpeech(const FSMEventArgs& event_args);
 114   FSMState DoNothing(const FSMEventArgs& event_args) const;
 115   FSMState NotFeasible(const FSMEventArgs& event_args);
 116
 117   // Returns the time span of captured audio samples since the start of capture.
 118   int GetElapsedTimeMs() const;
 119
 120   // Calculates the input volume to be displayed in the UI, triggering the
 121   // OnAudioLevelsChange event accordingly.
 122   void UpdateSignalAndNoiseLevels(const float& rms, bool clip_detected);
 123
       // NOTE(review): presumably starts closing |audio_controller_| and arranges
       // for OnAudioClosed() to run on completion -- confirm against the
       // definition.
 124   void CloseAudioControllerAsynchronously();
 125
 126   // Callback called on IO thread by audio_controller->Close().
 127   void OnAudioClosed(media::AudioInputController*);
 128
 129   // AudioInputController::EventHandler methods.
       // OnCreated() and OnRecording() are no-ops (empty inline bodies); OnError()
       // and OnData() are defined out of line.
 130   virtual void OnCreated(media::AudioInputController* controller) OVERRIDE {}
 131   virtual void OnRecording(media::AudioInputController* controller) OVERRIDE {}
 132   virtual void OnError(media::AudioInputController* controller) OVERRIDE;
 133   virtual void OnData(media::AudioInputController* controller,
 134                       const uint8* data, uint32 size) OVERRIDE;
 135
 136   // SpeechRecognitionEngineDelegate methods.
 137   virtual void OnSpeechRecognitionEngineResults(
 138       const SpeechRecognitionResults& results) OVERRIDE;
 139   virtual void OnSpeechRecognitionEngineError(
 140       const SpeechRecognitionError& error) OVERRIDE;
 141
       // Test-only audio manager pointer; see SetAudioManagerForTesting().
 142   static media::AudioManager* audio_manager_for_tests_;
 143
       // Recognition engine, owned by this object (scoped_ptr).
 144   scoped_ptr<SpeechRecognitionEngine> recognition_engine_;
       // Endpointer instance (the class comment attributes silence detection to it).
 145   Endpointer endpointer_;
       // Ref-counted handle to the audio input controller.
 146   scoped_refptr<media::AudioInputController> audio_controller_;
       // NOTE(review): presumably the sample count behind GetElapsedTimeMs().
 147   int num_samples_recorded_;
       // NOTE(review): presumably maintained by UpdateSignalAndNoiseLevels().
 148   float audio_level_;
       // NOTE(review): looks like a re-entrancy flag for DispatchEvent() -- confirm.
 149   bool is_dispatching_event_;
       // NOTE(review): presumably the constructor argument of the same name.
 150   bool provisional_results_;
       // Current state of the recognizer FSM.
 151   FSMState state_;
       // NOTE(review): presumably the |device_id| given to StartRecognition().
 152   std::string device_id_;
 153
       // Nested helper type; only forward-declared here and defined out of line.
 154   class OnDataConverter;
 155
 156   // Converts data between native input format and a WebSpeech specific
 157   // output format.
 158   scoped_ptr<SpeechRecognizerImpl::OnDataConverter> audio_converter_;
 159
 160   DISALLOW_COPY_AND_ASSIGN(SpeechRecognizerImpl);
 161 };
 162
 163 }  // namespace content
 164
 165 #endif  // CONTENT_BROWSER_SPEECH_SPEECH_RECOGNIZER_IMPL_H_