content/browser/speech/google_one_shot_remote_engine.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "content/browser/speech/google_one_shot_remote_engine.h"
   6
   7 #include <vector>
   8
   9 #include "base/json/json_reader.h"
  10 #include "base/strings/string_number_conversions.h"
  11 #include "base/strings/string_util.h"
  12 #include "base/values.h"
  13 #include "content/browser/speech/audio_buffer.h"
  14 #include "content/public/common/speech_recognition_error.h"
  15 #include "content/public/common/speech_recognition_result.h"
  16 #include "google_apis/google_api_keys.h"
  17 #include "net/base/escape.h"
  18 #include "net/base/load_flags.h"
  19 #include "net/url_request/http_user_agent_settings.h"
  20 #include "net/url_request/url_fetcher.h"
  21 #include "net/url_request/url_request_context.h"
  22 #include "net/url_request/url_request_context_getter.h"
  23 #include "net/url_request/url_request_status.h"
  24
  25 namespace content {
  26 namespace {
  27
  28 const char* const kDefaultSpeechRecognitionUrl =
  29     "https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&";
  30 const char* const kStatusString = "status";
  31 const char* const kHypothesesString = "hypotheses";
  32 const char* const kUtteranceString = "utterance";
  33 const char* const kConfidenceString = "confidence";
  34 const int kWebServiceStatusNoError = 0;
  35 const int kWebServiceStatusNoSpeech = 4;
  36 const int kWebServiceStatusNoMatch = 5;
  37 const AudioEncoder::Codec kDefaultAudioCodec = AudioEncoder::CODEC_FLAC;
  38
  39 bool ParseServerResponse(const std::string& response_body,
  40                          SpeechRecognitionResult* result,
  41                          SpeechRecognitionError* error) {
  42   if (response_body.empty()) {
  43     LOG(WARNING) << "ParseServerResponse: Response was empty.";
  44     return false;
  45   }
  46   DVLOG(1) << "ParseServerResponse: Parsing response " << response_body;
  47
  48   // Parse the response, ignoring comments.
  49   std::string error_msg;
  50   scoped_ptr<base::Value> response_value(
  51       base::JSONReader::DeprecatedReadAndReturnError(
  52           response_body, base::JSON_PARSE_RFC, NULL, &error_msg));
  53   if (response_value == NULL) {
  54     LOG(WARNING) << "ParseServerResponse: JSONReader failed : " << error_msg;
  55     return false;
  56   }
  57
  58   if (!response_value->IsType(base::Value::TYPE_DICTIONARY)) {
  59     DVLOG(1) << "ParseServerResponse: Unexpected response type "
  60             << response_value->GetType();
  61     return false;
  62   }
  63   const base::DictionaryValue* response_object =
  64       static_cast<const base::DictionaryValue*>(response_value.get());
  65
  66   // Get the status.
  67   int status;
  68   if (!response_object->GetInteger(kStatusString, &status)) {
  69     DVLOG(1) << "ParseServerResponse: " << kStatusString
  70             << " is not a valid integer value.";
  71     return false;
  72   }
  73
  74   // Process the status.
  75   switch (status) {
  76     case kWebServiceStatusNoError:
  77       break;
  78     case kWebServiceStatusNoSpeech:
  79       error->code = SPEECH_RECOGNITION_ERROR_NO_SPEECH;
  80       return false;
  81     case kWebServiceStatusNoMatch:
  82       error->code = SPEECH_RECOGNITION_ERROR_NO_MATCH;
  83       return false;
  84     default:
  85       error->code = SPEECH_RECOGNITION_ERROR_NETWORK;
  86       // Other status codes should not be returned by the server.
  87       DVLOG(1) << "ParseServerResponse: unexpected status code " << status;
  88       return false;
  89   }
  90
  91   // Get the hypotheses.
  92   const base::Value* hypotheses_value = NULL;
  93   if (!response_object->Get(kHypothesesString, &hypotheses_value)) {
  94     DVLOG(1) << "ParseServerResponse: Missing hypotheses attribute.";
  95     return false;
  96   }
  97
  98   DCHECK(hypotheses_value);
  99   if (!hypotheses_value->IsType(base::Value::TYPE_LIST)) {
 100     DVLOG(1) << "ParseServerResponse: Unexpected hypotheses type "
 101              << hypotheses_value->GetType();
 102     return false;
 103   }
 104
 105   const base::ListValue* hypotheses_list =
 106       static_cast<const base::ListValue*>(hypotheses_value);
 107
 108   // For now we support only single shot recognition, so we are giving only a
 109   // final result, consisting of one fragment (with one or more hypotheses).
 110   size_t index = 0;
 111   for (; index < hypotheses_list->GetSize(); ++index) {
 112     const base::Value* hypothesis = NULL;
 113     if (!hypotheses_list->Get(index, &hypothesis)) {
 114       LOG(WARNING) << "ParseServerResponse: Unable to read hypothesis value.";
 115       break;
 116     }
 117     DCHECK(hypothesis);
 118     if (!hypothesis->IsType(base::Value::TYPE_DICTIONARY)) {
 119       LOG(WARNING) << "ParseServerResponse: Unexpected value type "
 120                    << hypothesis->GetType();
 121       break;
 122     }
 123
 124     const base::DictionaryValue* hypothesis_value =
 125         static_cast<const base::DictionaryValue*>(hypothesis);
 126     base::string16 utterance;
 127
 128     if (!hypothesis_value->GetString(kUtteranceString, &utterance)) {
 129       LOG(WARNING) << "ParseServerResponse: Missing utterance value.";
 130       break;
 131     }
 132
 133     // It is not an error if the 'confidence' field is missing.
 134     double confidence = 0.0;
 135     hypothesis_value->GetDouble(kConfidenceString, &confidence);
 136     result->hypotheses.push_back(SpeechRecognitionHypothesis(utterance,
 137                                                              confidence));
 138   }
 139
 140   if (index < hypotheses_list->GetSize()) {
 141     result->hypotheses.clear();
 142     return false;
 143   }
 144   return true;
 145 }
 146
 147 }  // namespace
 148
 149 const int GoogleOneShotRemoteEngine::kAudioPacketIntervalMs = 100;
 150 int GoogleOneShotRemoteEngine::url_fetcher_id_for_tests = 0;
 151
 152 GoogleOneShotRemoteEngine::GoogleOneShotRemoteEngine(
 153     net::URLRequestContextGetter* context)
 154     : url_context_(context) {
 155 }
 156
 157 GoogleOneShotRemoteEngine::~GoogleOneShotRemoteEngine() {}
 158
 159 void GoogleOneShotRemoteEngine::SetConfig(
 160     const SpeechRecognitionEngineConfig& config) {
 161   config_ = config;
 162 }
 163
 164 void GoogleOneShotRemoteEngine::StartRecognition() {
 165   DCHECK(delegate());
 166   DCHECK(!url_fetcher_.get());
 167   std::string lang_param = config_.language;
 168
 169   if (lang_param.empty() && url_context_.get()) {
 170     // If no language is provided then we use the first from the accepted
 171     // language list. If this list is empty then it defaults to "en-US".
 172     // Example of the contents of this list: "es,en-GB;q=0.8", ""
 173     net::URLRequestContext* request_context =
 174         url_context_->GetURLRequestContext();
 175     DCHECK(request_context);
 176     // TODO(pauljensen): GoogleOneShotRemoteEngine should be constructed with
 177     // a reference to the HttpUserAgentSettings rather than accessing the
 178     // accept language through the URLRequestContext.
 179     if (request_context->http_user_agent_settings()) {
 180       std::string accepted_language_list =
 181           request_context->http_user_agent_settings()->GetAcceptLanguage();
 182       size_t separator = accepted_language_list.find_first_of(",;");
 183       lang_param = accepted_language_list.substr(0, separator);
 184     }
 185   }
 186
 187   if (lang_param.empty())
 188     lang_param = "en-US";
 189
 190   std::vector<std::string> parts;
 191   parts.push_back("lang=" + net::EscapeQueryParamValue(lang_param, true));
 192
 193   if (!config_.grammars.empty()) {
 194     DCHECK_EQ(config_.grammars.size(), 1U);
 195     parts.push_back("lm=" + net::EscapeQueryParamValue(config_.grammars[0].url,
 196                                                        true));
 197   }
 198
 199   if (!config_.hardware_info.empty())
 200     parts.push_back("xhw=" + net::EscapeQueryParamValue(config_.hardware_info,
 201                                                         true));
 202   parts.push_back("maxresults=" + base::UintToString(config_.max_hypotheses));
 203   parts.push_back(config_.filter_profanities ? "pfilter=2" : "pfilter=0");
 204
 205   std::string api_key = google_apis::GetAPIKey();
 206   parts.push_back("key=" + net::EscapeQueryParamValue(api_key, true));
 207
 208   GURL url(std::string(kDefaultSpeechRecognitionUrl) +
 209            base::JoinString(parts, "&"));
 210
 211   encoder_.reset(AudioEncoder::Create(kDefaultAudioCodec,
 212                                       config_.audio_sample_rate,
 213                                       config_.audio_num_bits_per_sample));
 214   DCHECK(encoder_.get());
 215   url_fetcher_ = net::URLFetcher::Create(url_fetcher_id_for_tests, url,
 216                                          net::URLFetcher::POST, this);
 217   url_fetcher_->SetChunkedUpload(encoder_->mime_type());
 218   url_fetcher_->SetRequestContext(url_context_.get());
 219   url_fetcher_->SetReferrer(config_.origin_url);
 220
 221   // The speech recognition API does not require user identification as part
 222   // of requests, so we don't send cookies or auth data for these requests to
 223   // prevent any accidental connection between users who are logged into the
 224   // domain for other services (e.g. bookmark sync) with the speech requests.
 225   url_fetcher_->SetLoadFlags(net::LOAD_DO_NOT_SAVE_COOKIES |
 226                              net::LOAD_DO_NOT_SEND_COOKIES |
 227                              net::LOAD_DO_NOT_SEND_AUTH_DATA);
 228   url_fetcher_->Start();
 229 }
 230
 231 void GoogleOneShotRemoteEngine::EndRecognition() {
 232   url_fetcher_.reset();
 233 }
 234
 235 void GoogleOneShotRemoteEngine::TakeAudioChunk(const AudioChunk& data) {
 236   DCHECK(url_fetcher_.get());
 237   DCHECK(encoder_.get());
 238   DCHECK_EQ(data.bytes_per_sample(), config_.audio_num_bits_per_sample / 8);
 239   encoder_->Encode(data);
 240   scoped_refptr<AudioChunk> encoded_data(encoder_->GetEncodedDataAndClear());
 241   url_fetcher_->AppendChunkToUpload(encoded_data->AsString(), false);
 242 }
 243
 244 void GoogleOneShotRemoteEngine::AudioChunksEnded() {
 245   DCHECK(url_fetcher_.get());
 246   DCHECK(encoder_.get());
 247
 248   // UploadAudioChunk requires a non-empty final buffer. So we encode a packet
 249   // of silence in case encoder had no data already.
 250   size_t sample_count =
 251       config_.audio_sample_rate * kAudioPacketIntervalMs / 1000;
 252   scoped_refptr<AudioChunk> dummy_chunk(new AudioChunk(
 253       sample_count * sizeof(int16), encoder_->bits_per_sample() / 8));
 254   encoder_->Encode(*dummy_chunk.get());
 255   encoder_->Flush();
 256   scoped_refptr<AudioChunk> encoded_dummy_data(
 257       encoder_->GetEncodedDataAndClear());
 258   DCHECK(!encoded_dummy_data->IsEmpty());
 259   encoder_.reset();
 260
 261   url_fetcher_->AppendChunkToUpload(encoded_dummy_data->AsString(), true);
 262 }
 263
 264 void GoogleOneShotRemoteEngine::OnURLFetchComplete(
 265     const net::URLFetcher* source) {
 266   DCHECK_EQ(url_fetcher_.get(), source);
 267   SpeechRecognitionResults results;
 268   results.push_back(SpeechRecognitionResult());
 269   SpeechRecognitionResult& result = results.back();
 270   SpeechRecognitionError error(SPEECH_RECOGNITION_ERROR_NETWORK);
 271   std::string data;
 272
 273   // The default error code in case of parse errors is NETWORK_FAILURE, however
 274   // ParseServerResponse can change the error to a more appropriate one.
 275   bool error_occurred = (!source->GetStatus().is_success() ||
 276                         source->GetResponseCode() != 200 ||
 277                         !source->GetResponseAsString(&data) ||
 278                         !ParseServerResponse(data, &result, &error));
 279   url_fetcher_.reset();
 280   if (error_occurred) {
 281     DVLOG(1) << "GoogleOneShotRemoteEngine: Network Error " << error.code;
 282     delegate()->OnSpeechRecognitionEngineError(error);
 283   } else {
 284     DVLOG(1) << "GoogleOneShotRemoteEngine: Invoking delegate with result.";
 285     delegate()->OnSpeechRecognitionEngineResults(results);
 286   }
 287 }
 288
 289 bool GoogleOneShotRemoteEngine::IsRecognitionPending() const {
 290   return url_fetcher_ != NULL;
 291 }
 292
 293 int GoogleOneShotRemoteEngine::GetDesiredAudioChunkDurationMs() const {
 294   return kAudioPacketIntervalMs;
 295 }
 296
 297 }  // namespace content