Roll src/third_party/WebKit 3aea697:d9c6159 (svn 201973:201974)
[chromium-blink-merge.git] / components / search_engines / template_url_parser.cc
bloba09403da256255a7c438a7b54849c922ecf48097
1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/search_engines/template_url_parser.h"
7 #include <algorithm>
8 #include <map>
9 #include <vector>
11 #include "base/logging.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/string_util.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "components/search_engines/search_terms_data.h"
17 #include "components/search_engines/template_url.h"
18 #include "libxml/parser.h"
19 #include "libxml/xmlwriter.h"
20 #include "ui/gfx/favicon_size.h"
21 #include "url/gurl.h"
22 #include "url/url_constants.h"
24 namespace {
26 // NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
27 // to that of char, the following names are all in terms of char. This avoids
28 // having to convert to wide, then do comparisons.
// Element names recognized in an OpenSearch description (OSD) document.
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kShortNameElement[] = "ShortName";
const char kImageElement[] = "Image";
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
// Alternative root element name; it is mapped to the same element type as
// kOpenSearchDescriptionElement.
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kInputEncodingElement[] = "InputEncoding";
const char kAliasElement[] = "Alias";

// Attribute names read from the Url element.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";

// Attribute names read from the Image element.
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";

// Attribute names read from the Param element.
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";

// Despite its name, this attribute is read from the Url element; it selects
// the HTTP method of that URL.
const char kParamMethodAttribute[] = "method";

// Url "type" value identifying the search results page.
const char kHTMLType[] = "text/html";

// Url "type" value identifying the as-you-type suggestions endpoint.
const char kSuggestionType[] = "application/x-suggestions+json";
56 std::string XMLCharToString(const xmlChar* value) {
57 return std::string(reinterpret_cast<const char*>(value));
60 // Returns true if input_encoding contains a valid input encoding string. This
61 // doesn't verify that we have a valid encoding for the string, just that the
62 // string contains characters that constitute a valid input encoding.
63 bool IsValidEncodingString(const std::string& input_encoding) {
64 if (input_encoding.empty())
65 return false;
67 if (!base::IsAsciiAlpha(input_encoding[0]))
68 return false;
70 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
71 char c = input_encoding[i];
72 if (!base::IsAsciiAlpha(c) && !base::IsAsciiDigit(c) &&
73 c != '.' && c != '_' && c != '-') {
74 return false;
77 return true;
// Appends one parameter to the query string pointed to by |query|.
//
// A '&' separator is written first unless |query| is still empty. When |key|
// is non-empty the parameter is written as "key=value"; with an empty |key|
// only |value| is written. Nothing is escaped.
void AppendParamToQuery(const std::string& key,
                        const std::string& value,
                        std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += "&";
  if (!key.empty()) {
    out += key;
    out += "=";
  }
  out += value;
}
92 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
93 bool IsHTTPRef(const std::string& url) {
94 if (url.empty())
95 return true;
96 GURL gurl(url);
97 return gurl.is_valid() && (gurl.SchemeIs(url::kHttpScheme) ||
98 gurl.SchemeIs(url::kHttpsScheme));
101 } // namespace
104 // TemplateURLParsingContext --------------------------------------------------
106 // To minimize memory overhead while parsing, a SAX style parser is used.
107 // TemplateURLParsingContext is used to maintain the state we're in the document
108 // while parsing.
109 class TemplateURLParsingContext {
110 public:
111 // Enum of the known element types.
112 enum ElementType {
113 UNKNOWN,
114 OPEN_SEARCH_DESCRIPTION,
115 URL,
116 PARAM,
117 SHORT_NAME,
118 IMAGE,
119 INPUT_ENCODING,
120 ALIAS,
123 enum Method {
124 GET,
125 POST
128 // Key/value of a Param node.
129 typedef std::pair<std::string, std::string> Param;
131 explicit TemplateURLParsingContext(
132 TemplateURLParser::ParameterFilter* parameter_filter);
134 static void StartElementImpl(void* ctx,
135 const xmlChar* name,
136 const xmlChar** atts);
137 static void EndElementImpl(void* ctx, const xmlChar* name);
138 static void CharactersImpl(void* ctx, const xmlChar* ch, int len);
140 // Returns a heap-allocated TemplateURL representing the result of parsing.
141 // This will be NULL if parsing failed or if the results were invalid for some
142 // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied,
143 // a resulting TemplateURLRef was invalid, etc.).
144 TemplateURL* GetTemplateURL(const SearchTermsData& search_terms_data,
145 bool show_in_default_list);
147 private:
148 // Key is UTF8 encoded.
149 typedef std::map<std::string, ElementType> ElementNameToElementTypeMap;
151 static void InitMapping();
153 void ParseURL(const xmlChar** atts);
154 void ParseImage(const xmlChar** atts);
155 void ParseParam(const xmlChar** atts);
156 void ProcessURLParams();
158 // Returns the current ElementType.
159 ElementType GetKnownType();
161 static ElementNameToElementTypeMap* kElementNameToElementTypeMap;
163 // Data that gets updated as we parse, and is converted to a TemplateURL by
164 // GetTemplateURL().
165 TemplateURLData data_;
167 std::vector<ElementType> elements_;
168 bool image_is_valid_for_favicon_;
170 // Character content for the current element.
171 base::string16 string_;
173 TemplateURLParser::ParameterFilter* parameter_filter_;
175 // The list of parameters parsed in the Param nodes of a Url node.
176 std::vector<Param> extra_params_;
178 // The HTTP methods used.
179 Method method_;
180 Method suggestion_method_;
182 // If true, we are currently parsing a suggest URL, otherwise it is an HTML
183 // search. Note that we don't need a stack as URL nodes cannot be nested.
184 bool is_suggest_url_;
186 // If true, the user has set a keyword and we should use it. Otherwise,
187 // we generate a keyword based on the URL.
188 bool has_custom_keyword_;
190 // Whether we should derive the image from the URL (when images are data
191 // URLs).
192 bool derive_image_from_url_;
194 DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext);
197 // static
198 TemplateURLParsingContext::ElementNameToElementTypeMap*
199 TemplateURLParsingContext::kElementNameToElementTypeMap = NULL;
201 TemplateURLParsingContext::TemplateURLParsingContext(
202 TemplateURLParser::ParameterFilter* parameter_filter)
203 : image_is_valid_for_favicon_(false),
204 parameter_filter_(parameter_filter),
205 method_(GET),
206 suggestion_method_(GET),
207 is_suggest_url_(false),
208 has_custom_keyword_(false),
209 derive_image_from_url_(false) {
210 if (kElementNameToElementTypeMap == NULL)
211 InitMapping();
214 // static
215 void TemplateURLParsingContext::StartElementImpl(void* ctx,
216 const xmlChar* name,
217 const xmlChar** atts) {
218 // Remove the namespace from |name|, ex: os:Url -> Url.
219 std::string node_name(XMLCharToString(name));
220 size_t index = node_name.find_first_of(":");
221 if (index != std::string::npos)
222 node_name.erase(0, index + 1);
224 TemplateURLParsingContext* context =
225 reinterpret_cast<TemplateURLParsingContext*>(ctx);
226 context->elements_.push_back(
227 context->kElementNameToElementTypeMap->count(node_name) ?
228 (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN);
229 switch (context->GetKnownType()) {
230 case TemplateURLParsingContext::URL:
231 context->extra_params_.clear();
232 context->ParseURL(atts);
233 break;
234 case TemplateURLParsingContext::IMAGE:
235 context->ParseImage(atts);
236 break;
237 case TemplateURLParsingContext::PARAM:
238 context->ParseParam(atts);
239 break;
240 default:
241 break;
243 context->string_.clear();
246 // static
247 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) {
248 TemplateURLParsingContext* context =
249 reinterpret_cast<TemplateURLParsingContext*>(ctx);
250 switch (context->GetKnownType()) {
251 case TemplateURLParsingContext::URL:
252 context->ProcessURLParams();
253 break;
254 case TemplateURLParsingContext::SHORT_NAME:
255 context->data_.SetShortName(context->string_);
256 break;
257 case TemplateURLParsingContext::IMAGE: {
258 GURL image_url(base::UTF16ToUTF8(context->string_));
259 if (image_url.SchemeIs(url::kDataScheme)) {
260 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
261 // decode the data URL in the renderer. For now, we'll just point to the
262 // favicon from the URL.
263 context->derive_image_from_url_ = true;
264 } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() &&
265 (image_url.SchemeIs(url::kHttpScheme) ||
266 image_url.SchemeIs(url::kHttpsScheme))) {
267 context->data_.favicon_url = image_url;
269 context->image_is_valid_for_favicon_ = false;
270 break;
272 case TemplateURLParsingContext::INPUT_ENCODING: {
273 std::string input_encoding = base::UTF16ToASCII(context->string_);
274 if (IsValidEncodingString(input_encoding))
275 context->data_.input_encodings.push_back(input_encoding);
276 break;
278 case TemplateURLParsingContext::ALIAS: {
279 context->data_.SetKeyword(context->string_);
280 context->has_custom_keyword_ = true;
281 break;
283 default:
284 break;
286 context->string_.clear();
287 context->elements_.pop_back();
290 // static
291 void TemplateURLParsingContext::CharactersImpl(void* ctx,
292 const xmlChar* ch,
293 int len) {
294 reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ +=
295 base::UTF8ToUTF16(
296 base::StringPiece(reinterpret_cast<const char*>(ch), len));
299 TemplateURL* TemplateURLParsingContext::GetTemplateURL(
300 const SearchTermsData& search_terms_data,
301 bool show_in_default_list) {
302 // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
303 if (method_ == TemplateURLParsingContext::POST ||
304 data_.short_name().empty() || !IsHTTPRef(data_.url()) ||
305 !IsHTTPRef(data_.suggestions_url))
306 return NULL;
307 if (suggestion_method_ == TemplateURLParsingContext::POST)
308 data_.suggestions_url.clear();
310 // If the image was a data URL, use the favicon from the search URL instead.
311 // (see the TODO in EndElementImpl()).
312 GURL search_url(data_.url());
313 if (derive_image_from_url_ && data_.favicon_url.is_empty())
314 data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
316 // Generate a keyword for this search engine if a custom one was not present
317 // in the imported data.
318 if (!has_custom_keyword_)
319 data_.SetKeyword(TemplateURL::GenerateKeyword(
320 search_url, search_terms_data.GetAcceptLanguages()));
322 data_.show_in_default_list = show_in_default_list;
324 // Bail if the search URL is empty or if either TemplateURLRef is invalid.
325 scoped_ptr<TemplateURL> template_url(new TemplateURL(data_));
326 if (template_url->url().empty() ||
327 !template_url->url_ref().IsValid(search_terms_data) ||
328 (!template_url->suggestions_url().empty() &&
329 !template_url->suggestions_url_ref().IsValid(search_terms_data))) {
330 return NULL;
333 return template_url.release();
336 // static
337 void TemplateURLParsingContext::InitMapping() {
338 kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
339 (*kElementNameToElementTypeMap)[kURLElement] = URL;
340 (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
341 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
342 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
343 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
344 OPEN_SEARCH_DESCRIPTION;
345 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
346 OPEN_SEARCH_DESCRIPTION;
347 (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING;
348 (*kElementNameToElementTypeMap)[kAliasElement] = ALIAS;
351 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) {
352 if (!atts)
353 return;
355 std::string template_url;
356 bool is_post = false;
357 bool is_html_url = false;
358 bool is_suggest_url = false;
359 for (; *atts; atts += 2) {
360 std::string name(XMLCharToString(*atts));
361 const xmlChar* value = atts[1];
362 if (name == kURLTypeAttribute) {
363 std::string type = XMLCharToString(value);
364 is_html_url = (type == kHTMLType);
365 is_suggest_url = (type == kSuggestionType);
366 } else if (name == kURLTemplateAttribute) {
367 template_url = XMLCharToString(value);
368 } else if (name == kParamMethodAttribute) {
369 is_post = base::LowerCaseEqualsASCII(XMLCharToString(value), "post");
373 if (is_html_url && !template_url.empty()) {
374 data_.SetURL(template_url);
375 is_suggest_url_ = false;
376 if (is_post)
377 method_ = POST;
378 } else if (is_suggest_url) {
379 data_.suggestions_url = template_url;
380 is_suggest_url_ = true;
381 if (is_post)
382 suggestion_method_ = POST;
386 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) {
387 if (!atts)
388 return;
390 int width = 0;
391 int height = 0;
392 std::string type;
393 for (; *atts; atts += 2) {
394 std::string name(XMLCharToString(*atts));
395 const xmlChar* value = atts[1];
396 if (name == kImageTypeAttribute) {
397 type = XMLCharToString(value);
398 } else if (name == kImageWidthAttribute) {
399 base::StringToInt(XMLCharToString(value), &width);
400 } else if (name == kImageHeightAttribute) {
401 base::StringToInt(XMLCharToString(value), &height);
405 image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) &&
406 (height == gfx::kFaviconSize) &&
407 ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
410 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) {
411 if (!atts)
412 return;
414 std::string key, value;
415 for (; *atts; atts += 2) {
416 std::string name(XMLCharToString(*atts));
417 const xmlChar* val = atts[1];
418 if (name == kParamNameAttribute) {
419 key = XMLCharToString(val);
420 } else if (name == kParamValueAttribute) {
421 value = XMLCharToString(val);
425 if (!key.empty() &&
426 (!parameter_filter_ || parameter_filter_->KeepParameter(key, value)))
427 extra_params_.push_back(Param(key, value));
430 void TemplateURLParsingContext::ProcessURLParams() {
431 if (!parameter_filter_ && extra_params_.empty())
432 return;
434 GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url());
435 if (url.is_empty())
436 return;
438 // If there is a parameter filter, parse the existing URL and remove any
439 // unwanted parameter.
440 std::string new_query;
441 bool modified = false;
442 if (parameter_filter_) {
443 url::Component query = url.parsed_for_possibly_invalid_spec().query;
444 url::Component key, value;
445 const char* url_spec = url.spec().c_str();
446 while (url::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
447 std::string key_str(url_spec, key.begin, key.len);
448 std::string value_str(url_spec, value.begin, value.len);
449 if (parameter_filter_->KeepParameter(key_str, value_str)) {
450 AppendParamToQuery(key_str, value_str, &new_query);
451 } else {
452 modified = true;
456 if (!modified)
457 new_query = url.query();
459 // Add the extra parameters if any.
460 if (!extra_params_.empty()) {
461 modified = true;
462 for (std::vector<Param>::const_iterator iter(extra_params_.begin());
463 iter != extra_params_.end(); ++iter)
464 AppendParamToQuery(iter->first, iter->second, &new_query);
467 if (modified) {
468 GURL::Replacements repl;
469 repl.SetQueryStr(new_query);
470 url = url.ReplaceComponents(repl);
471 if (is_suggest_url_)
472 data_.suggestions_url = url.spec();
473 else if (url.is_valid())
474 data_.SetURL(url.spec());
478 TemplateURLParsingContext::ElementType
479 TemplateURLParsingContext::GetKnownType() {
480 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
481 return elements_[1];
482 // We only expect PARAM nodes under the URL node.
483 return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
484 elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN;
488 // TemplateURLParser ----------------------------------------------------------
490 // static
491 TemplateURL* TemplateURLParser::Parse(
492 const SearchTermsData& search_terms_data,
493 bool show_in_default_list,
494 const char* data,
495 size_t length,
496 TemplateURLParser::ParameterFilter* param_filter) {
497 // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
498 // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
499 // If this becomes problematic we'll need to provide our own entity
500 // type for &amp;, or strip out &#38; by hand after parsing.
501 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
502 TemplateURLParsingContext context(param_filter);
503 xmlSAXHandler sax_handler;
504 memset(&sax_handler, 0, sizeof(sax_handler));
505 sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl;
506 sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl;
507 sax_handler.characters = &TemplateURLParsingContext::CharactersImpl;
508 int error = xmlSAXUserParseMemory(&sax_handler, &context, data,
509 static_cast<int>(length));
510 xmlSubstituteEntitiesDefault(last_sub_entities_value);
512 return error ?
513 NULL : context.GetTemplateURL(search_terms_data, show_in_default_list);