1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "content/browser/speech/google_one_shot_remote_engine.h"
9 #include "base/json/json_reader.h"
10 #include "base/strings/string_number_conversions.h"
11 #include "base/strings/string_util.h"
12 #include "base/values.h"
13 #include "content/browser/speech/audio_buffer.h"
14 #include "content/public/common/speech_recognition_error.h"
15 #include "content/public/common/speech_recognition_result.h"
16 #include "google_apis/google_api_keys.h"
17 #include "net/base/escape.h"
18 #include "net/base/load_flags.h"
19 #include "net/url_request/http_user_agent_settings.h"
20 #include "net/url_request/url_fetcher.h"
21 #include "net/url_request/url_request_context.h"
22 #include "net/url_request/url_request_context_getter.h"
23 #include "net/url_request/url_request_status.h"
// Base URL of the one-shot speech recognition web service. StartRecognition()
// appends its '&'-joined query parameters directly to this string, which is
// why the URL already ends in '&'.
const char* const kDefaultSpeechRecognitionUrl =
    "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";

// Keys that ParseServerResponse() looks up in the JSON response.
const char* const kStatusString = "status";
const char* const kHypothesesString = "hypotheses";
const char* const kUtteranceString = "utterance";
const char* const kConfidenceString = "confidence";

// Values of the "status" field that ParseServerResponse() tells apart.
const int kWebServiceStatusNoError = 0;
const int kWebServiceStatusNoSpeech = 4;
const int kWebServiceStatusNoMatch = 5;
37 const AudioEncoder::Codec kDefaultAudioCodec
= AudioEncoder::CODEC_FLAC
;
// Parses |response_body|, the text returned by the speech web service, as JSON
// (base::JSON_PARSE_RFC) and reads its integer "status" field and its
// "hypotheses" list.
//   |result| - receives one SpeechRecognitionHypothesis per list entry, built
//              from the entry's "utterance" string and its "confidence"
//              double (left at 0.0 when that field is absent).
//   |error|  - |error->code| is set to NO_SPEECH, NO_MATCH or NETWORK for the
//              corresponding non-success status values.
// OnURLFetchComplete() treats a false return value as a failure.
39 bool ParseServerResponse(const std::string
& response_body
,
40 SpeechRecognitionResult
* result
,
41 SpeechRecognitionError
* error
) {
42 if (response_body
.empty()) {
43 LOG(WARNING
) << "ParseServerResponse: Response was empty.";
46 DVLOG(1) << "ParseServerResponse: Parsing response " << response_body
;
48 // Parse the response, ignoring comments.
49 std::string error_msg
;
50 scoped_ptr
<base::Value
> response_value(
51 base::JSONReader::DeprecatedReadAndReturnError(
52 response_body
, base::JSON_PARSE_RFC
, NULL
, &error_msg
));
53 if (response_value
== NULL
) {
54 LOG(WARNING
) << "ParseServerResponse: JSONReader failed : " << error_msg
;
58 if (!response_value
->IsType(base::Value::TYPE_DICTIONARY
)) {
59 DVLOG(1) << "ParseServerResponse: Unexpected response type "
60 << response_value
->GetType();
// The TYPE_DICTIONARY check above is what justifies this static_cast.
63 const base::DictionaryValue
* response_object
=
64 static_cast<const base::DictionaryValue
*>(response_value
.get());
68 if (!response_object
->GetInteger(kStatusString
, &status
)) {
69 DVLOG(1) << "ParseServerResponse: " << kStatusString
70 << " is not a valid integer value.";
74 // Process the status.
76 case kWebServiceStatusNoError
:
78 case kWebServiceStatusNoSpeech
:
79 error
->code
= SPEECH_RECOGNITION_ERROR_NO_SPEECH
;
81 case kWebServiceStatusNoMatch
:
82 error
->code
= SPEECH_RECOGNITION_ERROR_NO_MATCH
;
85 error
->code
= SPEECH_RECOGNITION_ERROR_NETWORK
;
86 // Other status codes should not be returned by the server.
87 DVLOG(1) << "ParseServerResponse: unexpected status code " << status
;
91 // Get the hypotheses.
92 const base::Value
* hypotheses_value
= NULL
;
93 if (!response_object
->Get(kHypothesesString
, &hypotheses_value
)) {
94 DVLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";
98 DCHECK(hypotheses_value
);
99 if (!hypotheses_value
->IsType(base::Value::TYPE_LIST
)) {
100 DVLOG(1) << "ParseServerResponse: Unexpected hypotheses type "
101 << hypotheses_value
->GetType();
// The TYPE_LIST check above is what justifies this static_cast.
105 const base::ListValue
* hypotheses_list
=
106 static_cast<const base::ListValue
*>(hypotheses_value
);
108 // For now we support only single shot recognition, so we are giving only a
109 // final result, consisting of one fragment (with one or more hypotheses).
111 for (; index
< hypotheses_list
->GetSize(); ++index
) {
112 const base::Value
* hypothesis
= NULL
;
113 if (!hypotheses_list
->Get(index
, &hypothesis
)) {
114 LOG(WARNING
) << "ParseServerResponse: Unable to read hypothesis value.";
118 if (!hypothesis
->IsType(base::Value::TYPE_DICTIONARY
)) {
119 LOG(WARNING
) << "ParseServerResponse: Unexpected value type "
120 << hypothesis
->GetType();
124 const base::DictionaryValue
* hypothesis_value
=
125 static_cast<const base::DictionaryValue
*>(hypothesis
);
126 base::string16 utterance
;
128 if (!hypothesis_value
->GetString(kUtteranceString
, &utterance
)) {
129 LOG(WARNING
) << "ParseServerResponse: Missing utterance value.";
133 // It is not an error if the 'confidence' field is missing.
134 double confidence
= 0.0;
135 hypothesis_value
->GetDouble(kConfidenceString
, &confidence
);
136 result
->hypotheses
.push_back(SpeechRecognitionHypothesis(utterance
,
// If |index| did not reach the end of the list (presumably because the loop
// stopped early on a malformed entry), discard the hypotheses gathered so far.
140 if (index
< hypotheses_list
->GetSize()) {
141 result
->hypotheses
.clear();
149 const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs
= 100;
150 int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests
= 0;
// Stores |context| in |url_context_|. StartRecognition() later reads the
// accept-language setting through it and hands it to the URL fetcher.
152 GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
153 net::URLRequestContextGetter
* context
)
154 : url_context_(context
) {
157 GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}
// Supplies the SpeechRecognitionEngineConfig for this engine.
// NOTE(review): presumably stored in |config_|, which StartRecognition() and
// the audio methods read -- confirm.
159 void GoogleOneShotRemoteEngine::SetConfig(
160 const SpeechRecognitionEngineConfig
& config
) {
// Builds the recognition request URL from kDefaultSpeechRecognitionUrl plus
// the query parameters lang, lm, xhw, maxresults, pfilter and key, creates the
// audio encoder for kDefaultAudioCodec, and starts a chunked-upload POST
// through a net::URLFetcher that reports back to this object.
// Expects that no fetcher is currently alive (DCHECK on |url_fetcher_|).
164 void GoogleOneShotRemoteEngine::StartRecognition() {
166 DCHECK(!url_fetcher_
.get());
167 std::string lang_param
= config_
.language
;
169 if (lang_param
.empty() && url_context_
.get()) {
170 // If no language is provided then we use the first from the accepted
171 // language list. If this list is empty then it defaults to "en-US".
172 // Example of the contents of this list: "es,en-GB;q=0.8", ""
173 net::URLRequestContext
* request_context
=
174 url_context_
->GetURLRequestContext();
175 DCHECK(request_context
);
176 // TODO(pauljensen): GoogleOneShotRemoteEngine should be constructed with
177 // a reference to the HttpUserAgentSettings rather than accessing the
178 // accept language through the URLRequestContext.
179 if (request_context
->http_user_agent_settings()) {
180 std::string accepted_language_list
=
181 request_context
->http_user_agent_settings()->GetAcceptLanguage();
182 size_t separator
= accepted_language_list
.find_first_of(",;");
183 lang_param
= accepted_language_list
.substr(0, separator
);
187 if (lang_param
.empty())
188 lang_param
= "en-US";
// Query parameters; they are joined with '&' when the URL is assembled below.
190 std::vector
<std::string
> parts
;
191 parts
.push_back("lang=" + net::EscapeQueryParamValue(lang_param
, true));
193 if (!config_
.grammars
.empty()) {
194 DCHECK_EQ(config_
.grammars
.size(), 1U);
195 parts
.push_back("lm=" + net::EscapeQueryParamValue(config_
.grammars
[0].url
,
199 if (!config_
.hardware_info
.empty())
200 parts
.push_back("xhw=" + net::EscapeQueryParamValue(config_
.hardware_info
,
202 parts
.push_back("maxresults=" + base::UintToString(config_
.max_hypotheses
));
203 parts
.push_back(config_
.filter_profanities
? "pfilter=2" : "pfilter=0");
205 std::string api_key
= google_apis::GetAPIKey();
206 parts
.push_back("key=" + net::EscapeQueryParamValue(api_key
, true));
208 GURL
url(std::string(kDefaultSpeechRecognitionUrl
) +
209 base::JoinString(parts
, "&"));
211 encoder_
.reset(AudioEncoder::Create(kDefaultAudioCodec
,
212 config_
.audio_sample_rate
,
213 config_
.audio_num_bits_per_sample
));
214 DCHECK(encoder_
.get());
215 url_fetcher_
= net::URLFetcher::Create(url_fetcher_id_for_tests
, url
,
216 net::URLFetcher::POST
, this);
// The upload content type is whatever MIME type the encoder reports.
217 url_fetcher_
->SetChunkedUpload(encoder_
->mime_type());
218 url_fetcher_
->SetRequestContext(url_context_
.get());
219 url_fetcher_
->SetReferrer(config_
.origin_url
);
221 // The speech recognition API does not require user identification as part
222 // of requests, so we don't send cookies or auth data for these requests to
223 // prevent any accidental connection between users who are logged into the
224 // domain for other services (e.g. bookmark sync) with the speech requests.
225 url_fetcher_
->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES
|
226 net::LOAD_DO_NOT_SEND_COOKIES
|
227 net::LOAD_DO_NOT_SEND_AUTH_DATA
);
228 url_fetcher_
->Start();
// Releases the URL fetcher via |url_fetcher_.reset()|. IsRecognitionPending()
// compares |url_fetcher_| against NULL, so it reports false afterwards.
231 void GoogleOneShotRemoteEngine::EndRecognition() {
232 url_fetcher_
.reset();
// Encodes |data| with |encoder_| and appends the encoded bytes to the ongoing
// upload. Expects a live fetcher and encoder, and a chunk whose bytes per
// sample match |config_.audio_num_bits_per_sample| / 8 (all DCHECKed).
235 void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk
& data
) {
236 DCHECK(url_fetcher_
.get());
237 DCHECK(encoder_
.get());
238 DCHECK_EQ(data
.bytes_per_sample(), config_
.audio_num_bits_per_sample
/ 8);
239 encoder_
->Encode(data
);
240 scoped_refptr
<AudioChunk
> encoded_data(encoder_
->GetEncodedDataAndClear());
// Second argument is false here; AudioChunksEnded() passes true for the final
// chunk (presumably an is-last-chunk flag -- confirm against URLFetcher).
241 url_fetcher_
->AppendChunkToUpload(encoded_data
->AsString(), false);
// Finishes the upload: encodes one dummy AudioChunk covering
// kAudioPacketIntervalMs worth of samples at the configured sample rate and
// appends the encoded bytes with the second AppendChunkToUpload() argument set
// to true. Expects a live fetcher and encoder (DCHECKed).
244 void GoogleOneShotRemoteEngine::AudioChunksEnded() {
245 DCHECK(url_fetcher_
.get());
246 DCHECK(encoder_
.get());
248 // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
249 // of silence in case encoder had no data already.
// Number of samples in one packet interval: samples/second * ms / 1000.
250 size_t sample_count
=
251 config_
.audio_sample_rate
* kAudioPacketIntervalMs
/ 1000;
252 scoped_refptr
<AudioChunk
> dummy_chunk(new AudioChunk(
253 sample_count
* sizeof(int16
), encoder_
->bits_per_sample() / 8));
254 encoder_
->Encode(*dummy_chunk
.get());
256 scoped_refptr
<AudioChunk
> encoded_dummy_data(
257 encoder_
->GetEncodedDataAndClear());
258 DCHECK(!encoded_dummy_data
->IsEmpty());
261 url_fetcher_
->AppendChunkToUpload(encoded_dummy_data
->AsString(), true);
// URLFetcher completion callback for the request started in StartRecognition().
// |source| must be the fetcher this object owns (DCHECKed).
// The fetch counts as failed when the request status is not a success, the
// HTTP response code is not 200, the body cannot be read as a string, or
// ParseServerResponse() returns false. The delegate then receives either
// OnSpeechRecognitionEngineError(error) or
// OnSpeechRecognitionEngineResults(results).
264 void GoogleOneShotRemoteEngine::OnURLFetchComplete(
265 const net::URLFetcher
* source
) {
266 DCHECK_EQ(url_fetcher_
.get(), source
);
267 SpeechRecognitionResults results
;
268 results
.push_back(SpeechRecognitionResult());
269 SpeechRecognitionResult
& result
= results
.back();
270 SpeechRecognitionError
error(SPEECH_RECOGNITION_ERROR_NETWORK
);
273 // The default error code in case of parse errors is NETWORK_FAILURE, however
274 // ParseServerResponse can change the error to a more appropriate one.
275 bool error_occurred
= (!source
->GetStatus().is_success() ||
276 source
->GetResponseCode() != 200 ||
277 !source
->GetResponseAsString(&data
) ||
278 !ParseServerResponse(data
, &result
, &error
));
// The fetcher is released before the delegate is notified.
279 url_fetcher_
.reset();
280 if (error_occurred
) {
281 DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error
.code
;
282 delegate()->OnSpeechRecognitionEngineError(error
);
284 DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
285 delegate()->OnSpeechRecognitionEngineResults(results
);
// Returns true while |url_fetcher_| is non-null. The fetcher is created in
// StartRecognition() and reset in EndRecognition() and OnURLFetchComplete().
289 bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
290 return url_fetcher_
!= NULL
;
// Returns kAudioPacketIntervalMs, the audio chunk duration in milliseconds.
293 int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
294 return kAudioPacketIntervalMs
;
297 } // namespace content