1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #ifndef CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
6 #define CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_
11 #include "base/basictypes.h"
12 #include "base/memory/ref_counted.h"
13 #include "base/memory/scoped_ptr.h"
14 #include "base/threading/non_thread_safe.h"
15 #include "content/browser/speech/audio_encoder.h"
16 #include "content/browser/speech/chunked_byte_buffer.h"
17 #include "content/browser/speech/speech_recognition_engine.h"
18 #include "content/common/content_export.h"
19 #include "content/public/common/speech_recognition_error.h"
20 #include "net/url_request/url_fetcher_delegate.h"
// Forward declarations.
// NOTE(review): the enclosing namespace lines are not visible in this listing.
// The constructor below refers to net::URLRequestContextGetter, so the first
// declaration presumably lives in namespace net -- confirm against the
// original header.
23 class URLRequestContextGetter
;
29 struct SpeechRecognitionError
;
30 struct SpeechRecognitionResult
;
32 // Implements a SpeechRecognitionEngine supporting continuous recognition by
33 // means of interaction with the Google streaming speech recognition webservice.
34 // In more detail, this class establishes two HTTP(S) connections with the
35 // webservice for each session, herein called "upstream" and "downstream".
36 // Audio chunks are sent on the upstream by means of a chunked HTTP POST upload.
37 // Recognition results are retrieved in a full-duplex fashion (i.e. while
38 // pushing audio on the upstream) on the downstream by means of a chunked
39 // HTTP GET request. Pairing between the two streams is handled through a
40 // randomly generated key, unique for each request, which is passed in the
41 // &pair= arg to both stream request URLs.
42 // In the case of a regular session, the upstream is closed when the audio
43 // capture ends (notified through an |AudioChunksEnded| call) and the downstream
44 // waits for a corresponding server closure (some late results may still
45 // arrive after the upstream has been closed).
46 // Both streams are guaranteed to be closed when |EndRecognition| is called.
// The engine derives from SpeechRecognitionEngine (whose methods it overrides
// below) and from net::URLFetcherDelegate (so it can receive the URL fetch
// callbacks declared below).
// NOTE(review): the base::NonThreadSafe base presumably restricts use of an
// instance to a single thread -- confirm.
47 class CONTENT_EXPORT GoogleStreamingRemoteEngine
48 : public NON_EXPORTED_BASE(SpeechRecognitionEngine
),
49 public net::URLFetcherDelegate
,
50 public NON_EXPORTED_BASE(base::NonThreadSafe
) {
// NOTE(review): no access specifier is visible in this listing before the
// declarations that follow -- confirm against the original header.
// The constants below are declared without an initializer, so their values
// come from an out-of-line definition.
52 // Duration of each audio packet.
53 static const int kAudioPacketIntervalMs
;
55 // IDs passed to URLFetcher::Create(). Used for testing.
56 static const int kUpstreamUrlFetcherIdForTesting
;
57 static const int kDownstreamUrlFetcherIdForTesting
;
// |context| is a net::URLRequestContextGetter; presumably it is retained in
// |url_context_| (a scoped_refptr of the same type) -- TODO confirm in the
// implementation file.
59 explicit GoogleStreamingRemoteEngine(net::URLRequestContextGetter
* context
);
60 ~GoogleStreamingRemoteEngine() override
;
62 // SpeechRecognitionEngine methods.
// Supplies the engine configuration; presumably stored in |config_| -- TODO
// confirm in the implementation file.
63 void SetConfig(const SpeechRecognitionEngineConfig
& config
) override
;
// Starts a recognition session (see the FSM event EVENT_START_RECOGNITION).
64 void StartRecognition() override
;
// Ends the session; per the class comment both streams are guaranteed to be
// closed when this is called.
65 void EndRecognition() override
;
// Pushes one chunk of captured audio into the engine.
66 void TakeAudioChunk(const AudioChunk
& data
) override
;
// Notifies the engine that audio capture has ended; per the class comment the
// upstream is closed at this point while the downstream waits for the server.
67 void AudioChunksEnded() override
;
// NOTE(review): presumably true while a session is in progress -- confirm.
68 bool IsRecognitionPending() const override
;
// NOTE(review): presumably related to kAudioPacketIntervalMs -- confirm.
69 int GetDesiredAudioChunkDurationMs() const override
;
71 // net::URLFetcherDelegate methods.
// Both callbacks receive the fetcher that triggered them in |source|. The
// comment on DispatchHTTPResponse states that it is invoked by the URLFetcher
// callbacks.
72 void OnURLFetchComplete(const net::URLFetcher
* source
) override
;
// NOTE(review): the parameter list looks truncated in this listing (only
// |source| and |total| are visible) -- confirm the full signature against
// net::URLFetcherDelegate.
73 void OnURLFetchDownloadProgress(const net::URLFetcher
* source
,
75 int64 total
) override
;
// NOTE(review): any access specifier preceding this section is not visible in
// this listing -- confirm against the original header.
78 // Response status codes from the speech recognition webservice.
79 static const int kWebserviceStatusNoError
;
80 static const int kWebserviceStatusErrorNoMatch
;
82 // Frame type for framed POST data. Do NOT change these. They must match
83 // values the server expects.
// NOTE(review): the enum header and closing brace for these enumerators are
// not visible in this listing. UploadAudioChunk takes a |FrameType|, which is
// presumably this enum -- confirm.
85 FRAME_PREAMBLE_AUDIO
= 0,
86 FRAME_RECOGNITION_AUDIO
= 1
89 // Data types for the internal Finite State Machine (FSM).
// NOTE(review): the enum header, any enumerators before
// STATE_BOTH_STREAMS_CONNECTED and the closing brace are not visible in this
// listing. These are presumably the values of the |FSMState| type returned by
// the transition handlers -- confirm.
// STATE_MAX_VALUE aliases the last state listed.
92 STATE_BOTH_STREAMS_CONNECTED
,
93 STATE_WAITING_DOWNSTREAM_RESULTS
,
94 STATE_MAX_VALUE
= STATE_WAITING_DOWNSTREAM_RESULTS
// NOTE(review): the enum header and closing brace are not visible in this
// listing; these are presumably the values of the |FSMEvent| type taken by
// the FSMEventArgs constructor -- confirm. The FSMEventArgs comments also
// mention EVENT_AUDIO_CHUNK, which does not appear in the visible list.
// EVENT_MAX_VALUE aliases the last event listed.
98 EVENT_END_RECOGNITION
= 0,
99 EVENT_START_RECOGNITION
,
101 EVENT_AUDIO_CHUNKS_ENDED
,
102 EVENT_UPSTREAM_ERROR
,
103 EVENT_DOWNSTREAM_ERROR
,
104 EVENT_DOWNSTREAM_RESPONSE
,
105 EVENT_DOWNSTREAM_CLOSED
,
106 EVENT_MAX_VALUE
= EVENT_DOWNSTREAM_CLOSED
// Argument bundle for one FSM event. It is passed by const reference to
// DispatchEvent, ExecuteTransitionAndGetNextState and the transition handlers.
// NOTE(review): some members (and the closing brace) are not visible in this
// listing -- confirm against the original header.
109 struct FSMEventArgs
{
// |event_value| identifies which FSMEvent these arguments describe.
110 explicit FSMEventArgs(FSMEvent event_value
);
115 // In case of EVENT_AUDIO_CHUNK, holds the chunk pushed by |TakeAudioChunk|.
116 scoped_refptr
<const AudioChunk
> audio_data
;
118 // In case of EVENT_DOWNSTREAM_RESPONSE, holds the current chunk bytes.
119 scoped_ptr
<std::vector
<uint8
> > response
;
// As the macro name indicates, copying and assignment are disallowed.
122 DISALLOW_COPY_AND_ASSIGN(FSMEventArgs
);
125 // Invoked by both upstream and downstream URLFetcher callbacks to handle
126 // new chunk data, connection closed or error notifications.
// |source| is the fetcher the notification came from.
// NOTE(review): |end_of_response| presumably distinguishes a completed
// response from an intermediate progress notification -- confirm.
127 void DispatchHTTPResponse(const net::URLFetcher
* source
,
128 bool end_of_response
);
130 // Entry point for pushing any new external event into the recognizer FSM.
131 void DispatchEvent(const FSMEventArgs
& event_args
);
133 // Defines the behavior of the recognizer FSM, selecting the appropriate
134 // transition according to the current state and event.
// Returns an FSMState; going by the method name, this is the state the FSM
// moves to after the transition.
135 FSMState
ExecuteTransitionAndGetNextState(const FSMEventArgs
& event_args
);
137 // The methods below handle transitions of the recognizer FSM.
// Every handler returns an FSMState (presumably the state the FSM should be in
// afterwards, as with ExecuteTransitionAndGetNextState -- confirm). All but
// Abort take the arguments of the event being processed.
138 FSMState
ConnectBothStreams(const FSMEventArgs
& event_args
);
139 FSMState
TransmitAudioUpstream(const FSMEventArgs
& event_args
);
140 FSMState
ProcessDownstreamResponse(const FSMEventArgs
& event_args
);
141 FSMState
RaiseNoMatchErrorIfGotNoResults(const FSMEventArgs
& event_args
);
142 FSMState
CloseUpstreamAndWaitForResults(const FSMEventArgs
& event_args
);
143 FSMState
CloseDownstream(const FSMEventArgs
& event_args
);
144 FSMState
AbortSilently(const FSMEventArgs
& event_args
);
145 FSMState
AbortWithError(const FSMEventArgs
& event_args
);
// Unlike the other handlers, Abort takes the error code directly instead of
// the event arguments.
146 FSMState
Abort(SpeechRecognitionErrorCode error
);
147 FSMState
DoNothing(const FSMEventArgs
& event_args
);
// NOTE(review): presumably used for state/event combinations that should
// never occur -- confirm in the implementation file.
148 FSMState
NotFeasible(const FSMEventArgs
& event_args
);
// NOTE(review): presumably derives the language list from |config_| --
// confirm in the implementation file.
150 std::string
GetAcceptedLanguages() const;
// NOTE(review): presumably produces the randomly generated per-request key
// that the class comment says is passed in the &pair= argument -- confirm.
151 std::string
GenerateRequestKey() const;
153 // Upload a single chunk of audio data. Handles both unframed and framed
154 // upload formats, and uses the appropriate one.
// |data| holds the bytes to upload and |type| is the FrameType of the chunk.
// NOTE(review): |is_final| presumably marks the last chunk of the upload, and
// the format choice presumably depends on |use_framed_post_data_| -- confirm.
155 void UploadAudioChunk(const std::string
& data
, FrameType type
, bool is_final
);
// Engine state.
// NOTE(review): the roles described below are inferred from member names and
// the comments elsewhere in this header -- confirm in the implementation file.
// Engine configuration (same type as the SetConfig argument).
157 SpeechRecognitionEngineConfig config_
;
// Fetchers for the two connections the class comment calls "upstream" and
// "downstream".
158 scoped_ptr
<net::URLFetcher
> upstream_fetcher_
;
159 scoped_ptr
<net::URLFetcher
> downstream_fetcher_
;
// Same type as the constructor's |context| argument.
160 scoped_refptr
<net::URLRequestContextGetter
> url_context_
;
// Audio encoders; presumably one for recognition audio and one for preamble
// audio (compare FRAME_RECOGNITION_AUDIO / FRAME_PREAMBLE_AUDIO).
161 scoped_ptr
<AudioEncoder
> encoder_
;
162 scoped_ptr
<AudioEncoder
> preamble_encoder_
;
// Presumably buffers the chunked downstream response bytes.
163 ChunkedByteBuffer chunked_byte_buffer_
;
164 size_t previous_response_length_
;
165 bool got_last_definitive_result_
;
// Presumably set while DispatchEvent is running.
166 bool is_dispatching_event_
;
// Presumably selects between the framed and unframed upload formats mentioned
// on UploadAudioChunk.
167 bool use_framed_post_data_
;
// As the macro name indicates, copying and assignment are disallowed.
170 DISALLOW_COPY_AND_ASSIGN(GoogleStreamingRemoteEngine
);
173 } // namespace content
175 #endif // CONTENT_BROWSER_SPEECH_GOOGLE_STREAMING_REMOTE_ENGINE_H_