Disable view source for Developer Tools.
[chromium-blink-merge.git] / chrome / browser / search_engines / template_url_parser.cc
blob93d11dec4795d9320142d5305a611895eab98f1c
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
#include "chrome/browser/search_engines/template_url_parser.h"

#include <algorithm>
#include <limits>
#include <map>
#include <vector>

#include "base/logging.h"
#include "base/memory/scoped_ptr.h"
#include "base/strings/string_number_conversions.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversions.h"
#include "chrome/browser/search_engines/search_terms_data.h"
#include "chrome/browser/search_engines/template_url.h"
#include "chrome/browser/search_engines/template_url_service.h"
#include "chrome/common/url_constants.h"
#include "libxml/parser.h"
#include "libxml/xmlwriter.h"
#include "ui/gfx/favicon_size.h"
#include "url/gurl.h"

25 namespace {
27 // NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
28 // to that of char, the following names are all in terms of char. This avoids
29 // having to convert to wide, then do comparisons.
31 // Defines for element names of the OSD document:
32 const char kURLElement[] = "Url";
33 const char kParamElement[] = "Param";
34 const char kShortNameElement[] = "ShortName";
35 const char kImageElement[] = "Image";
36 const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
37 const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
38 const char kInputEncodingElement[] = "InputEncoding";
40 // Various XML attributes used.
41 const char kURLTypeAttribute[] = "type";
42 const char kURLTemplateAttribute[] = "template";
43 const char kImageTypeAttribute[] = "type";
44 const char kImageWidthAttribute[] = "width";
45 const char kImageHeightAttribute[] = "height";
46 const char kParamNameAttribute[] = "name";
47 const char kParamValueAttribute[] = "value";
48 const char kParamMethodAttribute[] = "method";
50 // Mime type for search results.
51 const char kHTMLType[] = "text/html";
53 // Mime type for as you type suggestions.
54 const char kSuggestionType[] = "application/x-suggestions+json";
56 std::string XMLCharToString(const xmlChar* value) {
57 return std::string(reinterpret_cast<const char*>(value));
60 // Returns true if input_encoding contains a valid input encoding string. This
61 // doesn't verify that we have a valid encoding for the string, just that the
62 // string contains characters that constitute a valid input encoding.
63 bool IsValidEncodingString(const std::string& input_encoding) {
64 if (input_encoding.empty())
65 return false;
67 if (!IsAsciiAlpha(input_encoding[0]))
68 return false;
70 for (size_t i = 1, max = input_encoding.size(); i < max; ++i) {
71 char c = input_encoding[i];
72 if (!IsAsciiAlpha(c) && !IsAsciiDigit(c) && c != '.' && c != '_' &&
73 c != '-') {
74 return false;
77 return true;
// Appends one parameter to |*query|. A '&' separator is written first when
// |*query| already has content. A non-empty |key| is written as "key=value";
// an empty |key| results in just |value| being appended.
void AppendParamToQuery(const std::string& key,
                        const std::string& value,
                        std::string* query) {
  std::string& out = *query;
  if (!out.empty())
    out += "&";
  if (!key.empty()) {
    out += key;
    out += "=";
  }
  out += value;
}

92 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
93 bool IsHTTPRef(const std::string& url) {
94 if (url.empty())
95 return true;
96 GURL gurl(url);
97 return gurl.is_valid() && (gurl.SchemeIs(content::kHttpScheme) ||
98 gurl.SchemeIs(content::kHttpsScheme));
101 } // namespace
104 // TemplateURLParsingContext --------------------------------------------------
106 // To minimize memory overhead while parsing, a SAX style parser is used.
107 // TemplateURLParsingContext is used to maintain the state we're in the document
108 // while parsing.
109 class TemplateURLParsingContext {
110 public:
111 // Enum of the known element types.
112 enum ElementType {
113 UNKNOWN,
114 OPEN_SEARCH_DESCRIPTION,
115 URL,
116 PARAM,
117 SHORT_NAME,
118 IMAGE,
119 INPUT_ENCODING,
122 enum Method {
123 GET,
124 POST
127 // Key/value of a Param node.
128 typedef std::pair<std::string, std::string> Param;
130 explicit TemplateURLParsingContext(
131 TemplateURLParser::ParameterFilter* parameter_filter);
133 static void StartElementImpl(void* ctx,
134 const xmlChar* name,
135 const xmlChar** atts);
136 static void EndElementImpl(void* ctx, const xmlChar* name);
137 static void CharactersImpl(void* ctx, const xmlChar* ch, int len);
139 // Returns a heap-allocated TemplateURL representing the result of parsing.
140 // This will be NULL if parsing failed or if the results were invalid for some
141 // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied,
142 // a resulting TemplateURLRef was invalid, etc.).
143 TemplateURL* GetTemplateURL(Profile* profile, bool show_in_default_list);
145 private:
146 // Key is UTF8 encoded.
147 typedef std::map<std::string, ElementType> ElementNameToElementTypeMap;
149 static void InitMapping();
151 void ParseURL(const xmlChar** atts);
152 void ParseImage(const xmlChar** atts);
153 void ParseParam(const xmlChar** atts);
154 void ProcessURLParams();
156 // Returns the current ElementType.
157 ElementType GetKnownType();
159 static ElementNameToElementTypeMap* kElementNameToElementTypeMap;
161 // Data that gets updated as we parse, and is converted to a TemplateURL by
162 // GetTemplateURL().
163 TemplateURLData data_;
165 std::vector<ElementType> elements_;
166 bool image_is_valid_for_favicon_;
168 // Character content for the current element.
169 base::string16 string_;
171 TemplateURLParser::ParameterFilter* parameter_filter_;
173 // The list of parameters parsed in the Param nodes of a Url node.
174 std::vector<Param> extra_params_;
176 // The HTTP methods used.
177 Method method_;
178 Method suggestion_method_;
180 // If true, we are currently parsing a suggest URL, otherwise it is an HTML
181 // search. Note that we don't need a stack as URL nodes cannot be nested.
182 bool is_suggest_url_;
184 // Whether we should derive the image from the URL (when images are data
185 // URLs).
186 bool derive_image_from_url_;
188 DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext);
191 // static
192 TemplateURLParsingContext::ElementNameToElementTypeMap*
193 TemplateURLParsingContext::kElementNameToElementTypeMap = NULL;
195 TemplateURLParsingContext::TemplateURLParsingContext(
196 TemplateURLParser::ParameterFilter* parameter_filter)
197 : image_is_valid_for_favicon_(false),
198 parameter_filter_(parameter_filter),
199 method_(GET),
200 suggestion_method_(GET),
201 is_suggest_url_(false),
202 derive_image_from_url_(false) {
203 if (kElementNameToElementTypeMap == NULL)
204 InitMapping();
207 // static
208 void TemplateURLParsingContext::StartElementImpl(void* ctx,
209 const xmlChar* name,
210 const xmlChar** atts) {
211 // Remove the namespace from |name|, ex: os:Url -> Url.
212 std::string node_name(XMLCharToString(name));
213 size_t index = node_name.find_first_of(":");
214 if (index != std::string::npos)
215 node_name.erase(0, index + 1);
217 TemplateURLParsingContext* context =
218 reinterpret_cast<TemplateURLParsingContext*>(ctx);
219 context->elements_.push_back(
220 context->kElementNameToElementTypeMap->count(node_name) ?
221 (*context->kElementNameToElementTypeMap)[node_name] : UNKNOWN);
222 switch (context->GetKnownType()) {
223 case TemplateURLParsingContext::URL:
224 context->extra_params_.clear();
225 context->ParseURL(atts);
226 break;
227 case TemplateURLParsingContext::IMAGE:
228 context->ParseImage(atts);
229 break;
230 case TemplateURLParsingContext::PARAM:
231 context->ParseParam(atts);
232 break;
233 default:
234 break;
236 context->string_.clear();
239 // static
240 void TemplateURLParsingContext::EndElementImpl(void* ctx, const xmlChar* name) {
241 TemplateURLParsingContext* context =
242 reinterpret_cast<TemplateURLParsingContext*>(ctx);
243 switch (context->GetKnownType()) {
244 case TemplateURLParsingContext::SHORT_NAME:
245 context->data_.short_name = context->string_;
246 break;
247 case TemplateURLParsingContext::IMAGE: {
248 GURL image_url(base::UTF16ToUTF8(context->string_));
249 if (image_url.SchemeIs(chrome::kDataScheme)) {
250 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
251 // decode the data URL in the renderer. For now, we'll just point to the
252 // favicon from the URL.
253 context->derive_image_from_url_ = true;
254 } else if (context->image_is_valid_for_favicon_ && image_url.is_valid() &&
255 (image_url.SchemeIs(content::kHttpScheme) ||
256 image_url.SchemeIs(content::kHttpsScheme))) {
257 context->data_.favicon_url = image_url;
259 context->image_is_valid_for_favicon_ = false;
260 break;
262 case TemplateURLParsingContext::INPUT_ENCODING: {
263 std::string input_encoding = UTF16ToASCII(context->string_);
264 if (IsValidEncodingString(input_encoding))
265 context->data_.input_encodings.push_back(input_encoding);
266 break;
268 case TemplateURLParsingContext::URL:
269 context->ProcessURLParams();
270 break;
271 default:
272 break;
274 context->string_.clear();
275 context->elements_.pop_back();
278 // static
279 void TemplateURLParsingContext::CharactersImpl(void* ctx,
280 const xmlChar* ch,
281 int len) {
282 reinterpret_cast<TemplateURLParsingContext*>(ctx)->string_ +=
283 base::UTF8ToUTF16(std::string(reinterpret_cast<const char*>(ch), len));
286 TemplateURL* TemplateURLParsingContext::GetTemplateURL(
287 Profile* profile,
288 bool show_in_default_list) {
289 // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
290 if (method_ == TemplateURLParsingContext::POST || data_.short_name.empty() ||
291 !IsHTTPRef(data_.url()) || !IsHTTPRef(data_.suggestions_url))
292 return NULL;
293 if (suggestion_method_ == TemplateURLParsingContext::POST)
294 data_.suggestions_url.clear();
296 // If the image was a data URL, use the favicon from the search URL instead.
297 // (see the TODO in EndElementImpl()).
298 GURL search_url(data_.url());
299 if (derive_image_from_url_ && data_.favicon_url.is_empty())
300 data_.favicon_url = TemplateURL::GenerateFaviconURL(search_url);
302 data_.SetKeyword(TemplateURLService::GenerateKeyword(search_url));
303 data_.show_in_default_list = show_in_default_list;
305 // Bail if the search URL is empty or if either TemplateURLRef is invalid.
306 scoped_ptr<TemplateURL> template_url(new TemplateURL(profile, data_));
307 scoped_ptr<SearchTermsData> search_terms_data(profile ?
308 new UIThreadSearchTermsData(profile) : new SearchTermsData());
309 if (template_url->url().empty() ||
310 !template_url->url_ref().IsValidUsingTermsData(*search_terms_data) ||
311 (!template_url->suggestions_url().empty() &&
312 !template_url->suggestions_url_ref().
313 IsValidUsingTermsData(*search_terms_data))) {
314 return NULL;
317 return template_url.release();
320 // static
321 void TemplateURLParsingContext::InitMapping() {
322 kElementNameToElementTypeMap = new std::map<std::string, ElementType>;
323 (*kElementNameToElementTypeMap)[kURLElement] = URL;
324 (*kElementNameToElementTypeMap)[kParamElement] = PARAM;
325 (*kElementNameToElementTypeMap)[kShortNameElement] = SHORT_NAME;
326 (*kElementNameToElementTypeMap)[kImageElement] = IMAGE;
327 (*kElementNameToElementTypeMap)[kOpenSearchDescriptionElement] =
328 OPEN_SEARCH_DESCRIPTION;
329 (*kElementNameToElementTypeMap)[kFirefoxSearchDescriptionElement] =
330 OPEN_SEARCH_DESCRIPTION;
331 (*kElementNameToElementTypeMap)[kInputEncodingElement] = INPUT_ENCODING;
334 void TemplateURLParsingContext::ParseURL(const xmlChar** atts) {
335 if (!atts)
336 return;
338 std::string template_url;
339 bool is_post = false;
340 bool is_html_url = false;
341 bool is_suggest_url = false;
342 for (; *atts; atts += 2) {
343 std::string name(XMLCharToString(*atts));
344 const xmlChar* value = atts[1];
345 if (name == kURLTypeAttribute) {
346 std::string type = XMLCharToString(value);
347 is_html_url = (type == kHTMLType);
348 is_suggest_url = (type == kSuggestionType);
349 } else if (name == kURLTemplateAttribute) {
350 template_url = XMLCharToString(value);
351 } else if (name == kParamMethodAttribute) {
352 is_post = LowerCaseEqualsASCII(XMLCharToString(value), "post");
356 if (is_html_url && !template_url.empty()) {
357 data_.SetURL(template_url);
358 is_suggest_url_ = false;
359 if (is_post)
360 method_ = POST;
361 } else if (is_suggest_url) {
362 data_.suggestions_url = template_url;
363 is_suggest_url_ = true;
364 if (is_post)
365 suggestion_method_ = POST;
369 void TemplateURLParsingContext::ParseImage(const xmlChar** atts) {
370 if (!atts)
371 return;
373 int width = 0;
374 int height = 0;
375 std::string type;
376 for (; *atts; atts += 2) {
377 std::string name(XMLCharToString(*atts));
378 const xmlChar* value = atts[1];
379 if (name == kImageTypeAttribute) {
380 type = XMLCharToString(value);
381 } else if (name == kImageWidthAttribute) {
382 base::StringToInt(XMLCharToString(value), &width);
383 } else if (name == kImageHeightAttribute) {
384 base::StringToInt(XMLCharToString(value), &height);
388 image_is_valid_for_favicon_ = (width == gfx::kFaviconSize) &&
389 (height == gfx::kFaviconSize) &&
390 ((type == "image/x-icon") || (type == "image/vnd.microsoft.icon"));
393 void TemplateURLParsingContext::ParseParam(const xmlChar** atts) {
394 if (!atts)
395 return;
397 std::string key, value;
398 for (; *atts; atts += 2) {
399 std::string name(XMLCharToString(*atts));
400 const xmlChar* val = atts[1];
401 if (name == kParamNameAttribute) {
402 key = XMLCharToString(val);
403 } else if (name == kParamValueAttribute) {
404 value = XMLCharToString(val);
408 if (!key.empty() &&
409 (!parameter_filter_ || parameter_filter_->KeepParameter(key, value)))
410 extra_params_.push_back(Param(key, value));
413 void TemplateURLParsingContext::ProcessURLParams() {
414 if (!parameter_filter_ && extra_params_.empty())
415 return;
417 GURL url(is_suggest_url_ ? data_.suggestions_url : data_.url());
418 if (url.is_empty())
419 return;
421 // If there is a parameter filter, parse the existing URL and remove any
422 // unwanted parameter.
423 std::string new_query;
424 bool modified = false;
425 if (parameter_filter_) {
426 url_parse::Component query = url.parsed_for_possibly_invalid_spec().query;
427 url_parse::Component key, value;
428 const char* url_spec = url.spec().c_str();
429 while (url_parse::ExtractQueryKeyValue(url_spec, &query, &key, &value)) {
430 std::string key_str(url_spec, key.begin, key.len);
431 std::string value_str(url_spec, value.begin, value.len);
432 if (parameter_filter_->KeepParameter(key_str, value_str)) {
433 AppendParamToQuery(key_str, value_str, &new_query);
434 } else {
435 modified = true;
439 if (!modified)
440 new_query = url.query();
442 // Add the extra parameters if any.
443 if (!extra_params_.empty()) {
444 modified = true;
445 for (std::vector<Param>::const_iterator iter(extra_params_.begin());
446 iter != extra_params_.end(); ++iter)
447 AppendParamToQuery(iter->first, iter->second, &new_query);
450 if (modified) {
451 GURL::Replacements repl;
452 repl.SetQueryStr(new_query);
453 url = url.ReplaceComponents(repl);
454 if (is_suggest_url_)
455 data_.suggestions_url = url.spec();
456 else if (url.is_valid())
457 data_.SetURL(url.spec());
461 TemplateURLParsingContext::ElementType
462 TemplateURLParsingContext::GetKnownType() {
463 if (elements_.size() == 2 && elements_[0] == OPEN_SEARCH_DESCRIPTION)
464 return elements_[1];
465 // We only expect PARAM nodes under the URL node.
466 return (elements_.size() == 3 && elements_[0] == OPEN_SEARCH_DESCRIPTION &&
467 elements_[1] == URL && elements_[2] == PARAM) ? PARAM : UNKNOWN;
471 // TemplateURLParser ----------------------------------------------------------
473 // static
474 TemplateURL* TemplateURLParser::Parse(
475 Profile* profile,
476 bool show_in_default_list,
477 const char* data,
478 size_t length,
479 TemplateURLParser::ParameterFilter* param_filter) {
480 // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
481 // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
482 // If this becomes problematic we'll need to provide our own entity
483 // type for &amp;, or strip out &#38; by hand after parsing.
484 int last_sub_entities_value = xmlSubstituteEntitiesDefault(1);
485 TemplateURLParsingContext context(param_filter);
486 xmlSAXHandler sax_handler;
487 memset(&sax_handler, 0, sizeof(sax_handler));
488 sax_handler.startElement = &TemplateURLParsingContext::StartElementImpl;
489 sax_handler.endElement = &TemplateURLParsingContext::EndElementImpl;
490 sax_handler.characters = &TemplateURLParsingContext::CharactersImpl;
491 int error = xmlSAXUserParseMemory(&sax_handler, &context, data,
492 static_cast<int>(length));
493 xmlSubstituteEntitiesDefault(last_sub_entities_value);
495 return error ? NULL : context.GetTemplateURL(profile, show_in_default_list);