// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
7 #include "base/memory/scoped_ptr.h"
8 #include "base/message_loop.h"
9 #include "base/strings/utf_string_conversions.h"
10 #include "content/browser/speech/audio_buffer.h"
11 #include "content/browser/speech/google_streaming_remote_engine.h"
12 #include "content/browser/speech/proto/google_streaming_api.pb.h"
13 #include "content/public/common/speech_recognition_error.h"
14 #include "content/public/common/speech_recognition_result.h"
15 #include "net/url_request/test_url_fetcher_factory.h"
16 #include "net/url_request/url_request_context_getter.h"
17 #include "net/url_request/url_request_status.h"
18 #include "testing/gtest/include/gtest/gtest.h"
using net::URLRequestStatus;
using net::TestURLFetcher;
using net::TestURLFetcherFactory;

namespace content {

// Note: the terms upstream and downstream are from the point-of-view of the
// client (engine_under_test_).

class GoogleStreamingRemoteEngineTest : public SpeechRecognitionEngineDelegate,
                                        public testing::Test {
 public:
  GoogleStreamingRemoteEngineTest()
      : last_number_of_upstream_chunks_seen_(0U),
        error_(SPEECH_RECOGNITION_ERROR_NONE) { }

  // Creates a speech recognition request and invokes its URL fetcher delegate
  // with the given test data.
  void CreateAndTestRequest(bool success, const std::string& http_response);

  // SpeechRecognitionEngineDelegate methods.
  virtual void OnSpeechRecognitionEngineResults(
      const SpeechRecognitionResults& results) OVERRIDE {
    results_.push(results);
  }
  virtual void OnSpeechRecognitionEngineError(
      const SpeechRecognitionError& error) OVERRIDE {
    error_ = error.code;
  }

  // testing::Test methods.
  virtual void SetUp() OVERRIDE;
  virtual void TearDown() OVERRIDE;

 protected:
  enum DownstreamError {
    DOWNSTREAM_ERROR_NONE,
    DOWNSTREAM_ERROR_HTTP500,
    DOWNSTREAM_ERROR_NETWORK,
    DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH,
  };

  static bool ResultsAreEqual(const SpeechRecognitionResults& a,
                              const SpeechRecognitionResults& b);
  static std::string SerializeProtobufResponse(
      const proto::SpeechRecognitionEvent& msg);
  static std::string ToBigEndian32(uint32 value);

  TestURLFetcher* GetUpstreamFetcher();
  TestURLFetcher* GetDownstreamFetcher();
  void StartMockRecognition();
  void EndMockRecognition();
  void InjectDummyAudioChunk();
  size_t UpstreamChunksUploadedFromLastCall();
  void ProvideMockProtoResultDownstream(
      const proto::SpeechRecognitionEvent& result);
  void ProvideMockResultDownstream(const SpeechRecognitionResult& result);
  void ExpectResultsReceived(const SpeechRecognitionResults& result);
  void CloseMockDownstream(DownstreamError error);

  scoped_ptr<GoogleStreamingRemoteEngine> engine_under_test_;
  TestURLFetcherFactory url_fetcher_factory_;
  size_t last_number_of_upstream_chunks_seen_;
  base::MessageLoop message_loop_;
  std::string response_buffer_;
  SpeechRecognitionErrorCode error_;
  std::queue<SpeechRecognitionResults> results_;
};

TEST_F(GoogleStreamingRemoteEngineTest, SingleDefinitiveResult) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  // Inject some dummy audio chunks and check that a corresponding chunked
  // upload to the server is performed every time.
  for (int i = 0; i < 3; ++i) {
    InjectDummyAudioChunk();
    ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  }

  // Ensure that a final (empty) audio chunk is uploaded on chunks end.
  engine_under_test_->AudioChunksEnded();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Simulate a protobuf message streamed from the server containing a single
  // result with two hypotheses.
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  result.is_provisional = false;
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 1"), 0.1F));
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis 2"), 0.2F));

  ProvideMockResultDownstream(result);
  ExpectResultsReceived(results);
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Ensure everything is closed cleanly after the downstream is closed.
  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
  ASSERT_EQ(0U, results_.size());
}

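// Checks that a stream of alternating provisional and definitive results,
// delivered while audio is still being uploaded, reaches the delegate in
// order, followed by a final definitive result.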
TEST_F(GoogleStreamingRemoteEngineTest, SeveralStreamingResults) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  for (int i = 0; i < 4; ++i) {
    InjectDummyAudioChunk();
    ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());

    SpeechRecognitionResults results;
    results.push_back(SpeechRecognitionResult());
    SpeechRecognitionResult& result = results.back();
    result.is_provisional = (i % 2 == 0);  // Alternate result types.
    float confidence = result.is_provisional ? 0.0F : (i * 0.1F);
    result.hypotheses.push_back(
        SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), confidence));

    ProvideMockResultDownstream(result);
    ExpectResultsReceived(results);
    ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
  }

  // Ensure that a final (empty) audio chunk is uploaded on chunks end.
  engine_under_test_->AudioChunksEnded();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Simulate a final definitive result.
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  result.is_provisional = false;
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 1.0F));
  ProvideMockResultDownstream(result);
  ExpectResultsReceived(results);
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Ensure everything is closed cleanly after the downstream is closed.
  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
  ASSERT_EQ(0U, results_.size());
}

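// Checks that, if the server closes the downstream without a final result
// after |AudioChunksEnded|, the delegate still receives an empty result (and
// no error) so the recognition session can terminate.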
TEST_F(GoogleStreamingRemoteEngineTest, NoFinalResultAfterAudioChunksEnded) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  // Simulate one pushed audio chunk.
  InjectDummyAudioChunk();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());

  // Simulate the corresponding definitive result.
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("hypothesis"), 1.0F));
  ProvideMockResultDownstream(result);
  ExpectResultsReceived(results);
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Simulate a silent downstream closure after |AudioChunksEnded|.
  engine_under_test_->AudioChunksEnded();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());
  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);

  // Expect an empty result, aimed at notifying that recognition ended with no
  // actual results nor errors.
  SpeechRecognitionResults empty_results;
  ExpectResultsReceived(empty_results);

  // Ensure everything is closed cleanly after the downstream is closed.
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
  ASSERT_EQ(0U, results_.size());
}

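// Checks that a downstream delivering only provisional results ends up in an
// empty (no match) result rather than an error.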
TEST_F(GoogleStreamingRemoteEngineTest, NoMatchError) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  for (int i = 0; i < 3; ++i)
    InjectDummyAudioChunk();
  engine_under_test_->AudioChunksEnded();
  ASSERT_EQ(4U, UpstreamChunksUploadedFromLastCall());
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  // Simulate only a provisional result.
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  result.is_provisional = true;
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("The final result"), 0.0F));
  ProvideMockResultDownstream(result);
  ExpectResultsReceived(results);
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  CloseMockDownstream(DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH);

  // Expect an empty result.
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  SpeechRecognitionResults empty_result;
  ExpectResultsReceived(empty_result);
}

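// Checks that an HTTP 500 on the downstream is surfaced to the delegate as a
// SPEECH_RECOGNITION_ERROR_NETWORK error.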
TEST_F(GoogleStreamingRemoteEngineTest, HTTPError) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  InjectDummyAudioChunk();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());

  // Close the downstream with an HTTP 500 error.
  CloseMockDownstream(DOWNSTREAM_ERROR_HTTP500);

  // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
  ASSERT_EQ(0U, results_.size());
}

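// Checks that a network-level failure on the downstream is surfaced to the
// delegate as a SPEECH_RECOGNITION_ERROR_NETWORK error.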
TEST_F(GoogleStreamingRemoteEngineTest, NetworkError) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  InjectDummyAudioChunk();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());

  // Close the downstream fetcher simulating a network failure.
  CloseMockDownstream(DOWNSTREAM_ERROR_NETWORK);

  // Expect a SPEECH_RECOGNITION_ERROR_NETWORK error to be raised.
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NETWORK, error_);
  ASSERT_EQ(0U, results_.size());
}

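// Checks that a result carrying a stability value but no per-alternative
// confidence is reported as a provisional result whose hypothesis confidence
// equals the stability.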
TEST_F(GoogleStreamingRemoteEngineTest, Stability) {
  StartMockRecognition();
  ASSERT_TRUE(GetUpstreamFetcher());
  ASSERT_EQ(0U, UpstreamChunksUploadedFromLastCall());

  // Upload a dummy audio chunk.
  InjectDummyAudioChunk();
  ASSERT_EQ(1U, UpstreamChunksUploadedFromLastCall());
  engine_under_test_->AudioChunksEnded();

  // Simulate a protobuf message with an intermediate result without
  // confidence, but with stability.
  proto::SpeechRecognitionEvent proto_event;
  proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
  proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
  proto_result->set_stability(0.5);
  proto::SpeechRecognitionAlternative* proto_alternative =
      proto_result->add_alternative();
  proto_alternative->set_transcript("foo");
  ProvideMockProtoResultDownstream(proto_event);

  // Set up expectations.
  SpeechRecognitionResults results;
  results.push_back(SpeechRecognitionResult());
  SpeechRecognitionResult& result = results.back();
  result.is_provisional = true;
  result.hypotheses.push_back(
      SpeechRecognitionHypothesis(UTF8ToUTF16("foo"), 0.5));

  // Check that the protobuf generated the expected result.
  ExpectResultsReceived(results);

  // Since it was a provisional result, recognition is still pending.
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  CloseMockDownstream(DOWNSTREAM_ERROR_NONE);
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());
  EndMockRecognition();

  // Since there was no final result, we get an empty "no match" result.
  SpeechRecognitionResults empty_result;
  ExpectResultsReceived(empty_result);
  ASSERT_EQ(SPEECH_RECOGNITION_ERROR_NONE, error_);
  ASSERT_EQ(0U, results_.size());
}

void GoogleStreamingRemoteEngineTest::SetUp() {
  engine_under_test_.reset(
      new GoogleStreamingRemoteEngine(NULL /*URLRequestContextGetter*/));
  engine_under_test_->set_delegate(this);
}

void GoogleStreamingRemoteEngineTest::TearDown() {
  engine_under_test_.reset();
}

TestURLFetcher* GoogleStreamingRemoteEngineTest::GetUpstreamFetcher() {
  return url_fetcher_factory_.GetFetcherByID(
      GoogleStreamingRemoteEngine::kUpstreamUrlFetcherIdForTests);
}

TestURLFetcher* GoogleStreamingRemoteEngineTest::GetDownstreamFetcher() {
  return url_fetcher_factory_.GetFetcherByID(
      GoogleStreamingRemoteEngine::kDownstreamUrlFetcherIdForTests);
}

// Starts recognition on the engine, ensuring that both stream fetchers are
// created.
void GoogleStreamingRemoteEngineTest::StartMockRecognition() {
  DCHECK(engine_under_test_.get());

  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());

  engine_under_test_->StartRecognition();
  ASSERT_TRUE(engine_under_test_->IsRecognitionPending());

  TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
  ASSERT_TRUE(upstream_fetcher);
  upstream_fetcher->set_url(upstream_fetcher->GetOriginalURL());

  TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
  ASSERT_TRUE(downstream_fetcher);
  downstream_fetcher->set_url(downstream_fetcher->GetOriginalURL());
}

void GoogleStreamingRemoteEngineTest::EndMockRecognition() {
  DCHECK(engine_under_test_.get());
  engine_under_test_->EndRecognition();
  ASSERT_FALSE(engine_under_test_->IsRecognitionPending());

  // TODO(primiano): In order to be very pedantic we should check that both the
  // upstream and downstream URL fetchers have been disposed at this time.
  // Unfortunately it seems that there is no direct way to detect (in tests)
  // if a url_fetcher has been freed or not, since they are not automatically
  // de-registered from the TestURLFetcherFactory on destruction.
}

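// Feeds the engine with a single two-byte (one 16-bit sample) audio chunk.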
void GoogleStreamingRemoteEngineTest::InjectDummyAudioChunk() {
  unsigned char dummy_audio_buffer_data[2] = {'\0', '\0'};
  scoped_refptr<AudioChunk> dummy_audio_chunk(
      new AudioChunk(&dummy_audio_buffer_data[0],
                     sizeof(dummy_audio_buffer_data),
                     2 /* bytes per sample */));
  DCHECK(engine_under_test_.get());
  engine_under_test_->TakeAudioChunk(*dummy_audio_chunk.get());
}

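// Returns the number of chunks appended to the upstream (chunked) upload since
// the previous call to this method.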
size_t GoogleStreamingRemoteEngineTest::UpstreamChunksUploadedFromLastCall() {
  TestURLFetcher* upstream_fetcher = GetUpstreamFetcher();
  DCHECK(upstream_fetcher);
  const size_t number_of_chunks = upstream_fetcher->upload_chunks().size();
  DCHECK_GE(number_of_chunks, last_number_of_upstream_chunks_seen_);
  const size_t new_chunks = number_of_chunks -
                            last_number_of_upstream_chunks_seen_;
  last_number_of_upstream_chunks_seen_ = number_of_chunks;
  return new_chunks;
}

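// Appends the length-prefixed serialization of |result| to the accumulated
// downstream response buffer and pushes it to the engine by simulating a
// download progress notification on the downstream fetcher.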
void GoogleStreamingRemoteEngineTest::ProvideMockProtoResultDownstream(
    const proto::SpeechRecognitionEvent& result) {
  TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();

  ASSERT_TRUE(downstream_fetcher);
  downstream_fetcher->set_status(URLRequestStatus(/* default=SUCCESS */));
  downstream_fetcher->set_response_code(200);

  std::string response_string = SerializeProtobufResponse(result);
  response_buffer_.append(response_string);
  downstream_fetcher->SetResponseString(response_buffer_);
  downstream_fetcher->delegate()->OnURLFetchDownloadProgress(
      downstream_fetcher,
      response_buffer_.size(),
      -1 /* total response length not used */);
}

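// Converts |result| into the equivalent proto::SpeechRecognitionEvent and
// feeds it to the engine through ProvideMockProtoResultDownstream().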
void GoogleStreamingRemoteEngineTest::ProvideMockResultDownstream(
    const SpeechRecognitionResult& result) {
  proto::SpeechRecognitionEvent proto_event;
  proto_event.set_status(proto::SpeechRecognitionEvent::STATUS_SUCCESS);
  proto::SpeechRecognitionResult* proto_result = proto_event.add_result();
  proto_result->set_final(!result.is_provisional);
  for (size_t i = 0; i < result.hypotheses.size(); ++i) {
    proto::SpeechRecognitionAlternative* proto_alternative =
        proto_result->add_alternative();
    const SpeechRecognitionHypothesis& hypothesis = result.hypotheses[i];
    proto_alternative->set_confidence(hypothesis.confidence);
    proto_alternative->set_transcript(UTF16ToUTF8(hypothesis.utterance));
  }
  ProvideMockProtoResultDownstream(proto_event);
}

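// Closes the downstream fetcher, simulating the requested error condition (if
// any), and notifies the engine that the downstream fetch has completed.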
void GoogleStreamingRemoteEngineTest::CloseMockDownstream(
    DownstreamError error) {
  TestURLFetcher* downstream_fetcher = GetDownstreamFetcher();
  ASSERT_TRUE(downstream_fetcher);

  const URLRequestStatus::Status fetcher_status =
      (error == DOWNSTREAM_ERROR_NETWORK) ? URLRequestStatus::FAILED :
                                            URLRequestStatus::SUCCESS;
  downstream_fetcher->set_status(URLRequestStatus(fetcher_status, 0));
  downstream_fetcher->set_response_code(
      (error == DOWNSTREAM_ERROR_HTTP500) ? 500 : 200);

  if (error == DOWNSTREAM_ERROR_WEBSERVICE_NO_MATCH) {
    // Send empty response.
    proto::SpeechRecognitionEvent response;
    response_buffer_.append(SerializeProtobufResponse(response));
  }
  downstream_fetcher->SetResponseString(response_buffer_);
  downstream_fetcher->delegate()->OnURLFetchComplete(downstream_fetcher);
}

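// Checks that the next queued delegate callback carries |results| and removes
// it from the queue.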
void GoogleStreamingRemoteEngineTest::ExpectResultsReceived(
    const SpeechRecognitionResults& results) {
  ASSERT_GE(1U, results_.size());
  ASSERT_TRUE(ResultsAreEqual(results, results_.front()));
  results_.pop();
}

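// Compares two results lists element by element, checking the provisional
// flag and every hypothesis (utterance and confidence).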
bool GoogleStreamingRemoteEngineTest::ResultsAreEqual(
    const SpeechRecognitionResults& a, const SpeechRecognitionResults& b) {
  if (a.size() != b.size())
    return false;

  SpeechRecognitionResults::const_iterator it_a = a.begin();
  SpeechRecognitionResults::const_iterator it_b = b.begin();
  for (; it_a != a.end() && it_b != b.end(); ++it_a, ++it_b) {
    if (it_a->is_provisional != it_b->is_provisional ||
        it_a->hypotheses.size() != it_b->hypotheses.size()) {
      return false;
    }
    for (size_t i = 0; i < it_a->hypotheses.size(); ++i) {
      const SpeechRecognitionHypothesis& hyp_a = it_a->hypotheses[i];
      const SpeechRecognitionHypothesis& hyp_b = it_b->hypotheses[i];
      if (hyp_a.utterance != hyp_b.utterance ||
          hyp_a.confidence != hyp_b.confidence) {
        return false;
      }
    }
  }

  return true;
}

std::string GoogleStreamingRemoteEngineTest::SerializeProtobufResponse(
    const proto::SpeechRecognitionEvent& msg) {
  std::string msg_string;
  msg.SerializeToString(&msg_string);

  // Prepend a 4 byte prefix length indication to the protobuf message as
  // envisaged by the Google streaming recognition webservice protocol.
  msg_string.insert(0, ToBigEndian32(msg_string.size()));

  return msg_string;
}

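// Encodes |value| as a 4-character string in big-endian (network) byte order.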
std::string GoogleStreamingRemoteEngineTest::ToBigEndian32(uint32 value) {
  char raw_data[4];
  raw_data[0] = static_cast<uint8>((value >> 24) & 0xFF);
  raw_data[1] = static_cast<uint8>((value >> 16) & 0xFF);
  raw_data[2] = static_cast<uint8>((value >> 8) & 0xFF);
  raw_data[3] = static_cast<uint8>(value & 0xFF);
  return std::string(raw_data, sizeof(raw_data));
}

}  // namespace content