content/browser/speech/google_streaming_remote_engine.h

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifndef CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
   6 #define CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
   7
   8 #include <string>
   9 #include <vector>
  10
  11 #include "base/basictypes.h"
  12 #include "base/memory/ref_counted.h"
  13 #include "base/memory/scoped_ptr.h"
  14 #include "base/threading/non_thread_safe.h"
  15 #include "content/browser/speech/audio_encoder.h"
  16 #include "content/browser/speech/chunked_byte_buffer.h"
  17 #include "content/browser/speech/speech_recognition_engine.h"
  18 #include "content/common/content_export.h"
  19 #include "content/public/common/speech_recognition_error.h"
  20 #include "net/url_request/url_fetcher_delegate.h"
  21
  22 namespace net {
     // Forward declaration only: this header refers to URLRequestContextGetter
     // solely through a raw pointer parameter and a scoped_refptr member.
  23 class URLRequestContextGetter;
  24 }  // namespace net
  25
  26 namespace content {
  27
  28 class AudioChunk;  // Used below by const reference and via scoped_refptr.
  29 struct SpeechRecognitionError;
  30 struct SpeechRecognitionResult;
     // NOTE(review): neither SpeechRecognitionError nor SpeechRecognitionResult
     // is named again in this header, and speech_recognition_error.h is already
     // included above, so these two forward declarations may be redundant.
     // Confirm against the .cc file and other includers before removing them.
  31
  32 // Implements a SpeechRecognitionEngine supporting continuous recognition by
  33 // means of interaction with Google streaming speech recognition webservice.
  34 // In more detail, this class establishes two HTTP(S) connections with the
  35 // webservice, for each session, herein called "upstream" and "downstream".
  36 // Audio chunks are sent on the upstream by means of a chunked HTTP POST upload.
  37 // Recognition results are retrieved in a full-duplex fashion (i.e. while
  38 // pushing audio on the upstream) on the downstream by means of a chunked
  39 // HTTP GET request. Pairing between the two streams is handled through a
  40 // randomly generated key, unique for each request, which is passed in the
  41 // &pair= arg to both stream request URLs.
  42 // In the case of a regular session, the upstream is closed when the audio
  43 // capture ends (notified through an |AudioChunksEnded| call) and the
  44 // downstream waits for a corresponding server closure (some late results
  45 // may still arrive after the upstream has been closed).
  46 // Both streams are guaranteed to be closed once |EndRecognition| is called.
     //
     // Internally the engine is organized as a finite state machine (see
     // FSMState / FSMEvent below); external events are pushed into it through
     // DispatchEvent().
     // NOTE(review): the class derives from base::NonThreadSafe, so it is
     // presumably meant to be used from a single thread - confirm with callers.
  47 class CONTENT_EXPORT GoogleStreamingRemoteEngine
  48     : public NON_EXPORTED_BASE(SpeechRecognitionEngine),
  49       public net::URLFetcherDelegate,
  50       public NON_EXPORTED_BASE(base::NonThreadSafe) {
  51  public:
  52   // Duration of each audio packet, in milliseconds (per the Ms suffix).
       // Declared without an initializer, so the value is defined out of line.
  53   static const int kAudioPacketIntervalMs;
  54
  55   // IDs passed to URLFetcher::Create(). Used for testing.
  56   static const int kUpstreamUrlFetcherIdForTesting;
  57   static const int kDownstreamUrlFetcherIdForTesting;
  58
       // |context| is the URL request context getter handed to the engine.
       // NOTE(review): presumably retained in |url_context_| and used for both
       // fetchers - confirm in the .cc file.
  59   explicit GoogleStreamingRemoteEngine(net::URLRequestContextGetter* context);
  60   ~GoogleStreamingRemoteEngine() override;
  61
  62   // SpeechRecognitionEngine methods.
  63   void SetConfig(const SpeechRecognitionEngineConfig& config) override;
  64   void StartRecognition() override;
  65   void EndRecognition() override;
  66   void TakeAudioChunk(const AudioChunk& data) override;
  67   void AudioChunksEnded() override;
  68   bool IsRecognitionPending() const override;
  69   int GetDesiredAudioChunkDurationMs() const override;
  70
  71   // net::URLFetcherDelegate methods.
  72   void OnURLFetchComplete(const net::URLFetcher* source) override;
  73   void OnURLFetchDownloadProgress(const net::URLFetcher* source,
  74                                   int64 current,
  75                                   int64 total) override;
  76
  77  private:
  78   // Response status codes from the speech recognition webservice.
       // Declared without initializers; the values are defined out of line.
  79   static const int kWebserviceStatusNoError;
  80   static const int kWebserviceStatusErrorNoMatch;
  81
  82   // Frame type for framed POST data. Do NOT change these. They must match
  83   // values the server expects.
  84   enum FrameType {
  85     FRAME_PREAMBLE_AUDIO = 0,
  86     FRAME_RECOGNITION_AUDIO = 1
  87   };
  88
  89   // Data types for the internal Finite State Machine (FSM).
       // STATE_MAX_VALUE aliases the last real state instead of adding a new
       // enumerator, so it must be updated whenever a state is appended.
  90   enum FSMState {
  91     STATE_IDLE = 0,
  92     STATE_BOTH_STREAMS_CONNECTED,
  93     STATE_WAITING_DOWNSTREAM_RESULTS,
  94     STATE_MAX_VALUE = STATE_WAITING_DOWNSTREAM_RESULTS
  95   };
  96
       // Events accepted by the FSM. EVENT_MAX_VALUE aliases the last real event
       // and must be updated whenever an event is appended.
  97   enum FSMEvent {
  98     EVENT_END_RECOGNITION = 0,
  99     EVENT_START_RECOGNITION,
 100     EVENT_AUDIO_CHUNK,
 101     EVENT_AUDIO_CHUNKS_ENDED,
 102     EVENT_UPSTREAM_ERROR,
 103     EVENT_DOWNSTREAM_ERROR,
 104     EVENT_DOWNSTREAM_RESPONSE,
 105     EVENT_DOWNSTREAM_CLOSED,
 106     EVENT_MAX_VALUE = EVENT_DOWNSTREAM_CLOSED
 107   };
 108
       // Argument bundle passed to DispatchEvent() and to every transition
       // handler: the event type plus optional payloads whose relevance depends
       // on |event|. Not copyable (DISALLOW_COPY_AND_ASSIGN).
 109   struct FSMEventArgs {
 110     explicit FSMEventArgs(FSMEvent event_value);
 111     ~FSMEventArgs();
 112
 113     FSMEvent event;
 114
 115     // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
 116     scoped_refptr<const AudioChunk> audio_data;
 117
 118     // In case of EVENT_DOWNSTREAM_RESPONSE, holds the current chunk bytes.
 119     scoped_ptr<std::vector<uint8> > response;
 120
 121    private:
 122     DISALLOW_COPY_AND_ASSIGN(FSMEventArgs);
 123   };
 124
 125   // Invoked by both upstream and downstream URLFetcher callbacks to handle
 126   // new chunk data, connection closed or errors notifications.
 127   void DispatchHTTPResponse(const net::URLFetcher* source,
 128                             bool end_of_response);
 129
 130   // Entry point for pushing any new external event into the recognizer FSM.
 131   void DispatchEvent(const FSMEventArgs& event_args);
 132
 133   // Defines the behavior of the recognizer FSM, selecting the appropriate
 134   // transition according to the current state and event.
 135   FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& event_args);
 136
 137   // The methods below handle transitions of the recognizer FSM.
       // Each one returns an FSMState. NOTE(review): presumably the state the
       // machine moves to next, going by ExecuteTransitionAndGetNextState's
       // name - confirm in the .cc file.
       // Abort() is the only one taking an error code instead of the event args.
 138   FSMState ConnectBothStreams(const FSMEventArgs& event_args);
 139   FSMState TransmitAudioUpstream(const FSMEventArgs& event_args);
 140   FSMState ProcessDownstreamResponse(const FSMEventArgs& event_args);
 141   FSMState RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs& event_args);
 142   FSMState CloseUpstreamAndWaitForResults(const FSMEventArgs& event_args);
 143   FSMState CloseDownstream(const FSMEventArgs& event_args);
 144   FSMState AbortSilently(const FSMEventArgs& event_args);
 145   FSMState AbortWithError(const FSMEventArgs& event_args);
 146   FSMState Abort(SpeechRecognitionErrorCode error);
 147   FSMState DoNothing(const FSMEventArgs& event_args);
 148   FSMState NotFeasible(const FSMEventArgs& event_args);
 149
       // String helpers used when talking to the webservice.
       // NOTE(review): GenerateRequestKey() presumably produces the random
       // pairing key described in the class comment - confirm in the .cc file.
 150   std::string GetAcceptedLanguages() const;
 151   std::string GenerateRequestKey() const;
 152
 153   // Upload a single chunk of audio data. Handles both unframed and framed
 154   // upload formats, and uses the appropriate one.
 155   void UploadAudioChunk(const std::string& data, FrameType type, bool is_final);
 156
       // Data members. Keep the declaration order: in C++ it determines the
       // order in which members are initialized by the constructor.
 157   SpeechRecognitionEngineConfig config_;
       // One fetcher per stream (see the class comment for upstream/downstream).
 158   scoped_ptr<net::URLFetcher> upstream_fetcher_;
 159   scoped_ptr<net::URLFetcher> downstream_fetcher_;
 160   scoped_refptr<net::URLRequestContextGetter> url_context_;
 161   scoped_ptr<AudioEncoder> encoder_;
 162   scoped_ptr<AudioEncoder> preamble_encoder_;
 163   ChunkedByteBuffer chunked_byte_buffer_;
 164   size_t previous_response_length_;
 165   bool got_last_definitive_result_;
 166   bool is_dispatching_event_;
 167   bool use_framed_post_data_;
       // Current state of the recognizer FSM.
 168   FSMState state_;
 169
 170   DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine);
 171 };
 172
 173 }  // namespace content
 174
 175 #endif  // CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_