1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "components/search_engines/template_url_parser.h"
11 #include "base/logging.h"
12 #include "base/memory/scoped_ptr.h"
13 #include "base/strings/string_number_conversions.h"
14 #include "base/strings/string_util.h"
15 #include "base/strings/utf_string_conversions.h"
16 #include "components/search_engines/search_terms_data.h"
17 #include "components/search_engines/template_url.h"
18 #include "libxml/parser.h"
19 #include "libxml/xmlwriter.h"
20 #include "ui/gfx/favicon_size.h"
22 #include "url/url_constants.h"
// NOTE: libxml uses the UTF-8 encoding. As 0-127 of UTF-8 corresponds
// to that of char, the following names are all in terms of char. This avoids
// having to convert to wide, then do comparisons.

// Defines for element names of the OSD document. Both
// kOpenSearchDescriptionElement and kFirefoxSearchDescriptionElement are
// mapped to the same element type by InitMapping().
const char kURLElement[] = "Url";
const char kParamElement[] = "Param";
const char kShortNameElement[] = "ShortName";
const char kImageElement[] = "Image";
const char kOpenSearchDescriptionElement[] = "OpenSearchDescription";
const char kFirefoxSearchDescriptionElement[] = "SearchPlugin";
const char kInputEncodingElement[] = "InputEncoding";
const char kAliasElement[] = "Alias";

// Various XML attributes used.
// Attributes read from a Url element (see ParseURL()). Note that
// kParamMethodAttribute is matched against the Url element's attributes
// despite its name.
const char kURLTypeAttribute[] = "type";
const char kURLTemplateAttribute[] = "template";
const char kParamMethodAttribute[] = "method";
// Attributes read from an Image element (see ParseImage()).
const char kImageTypeAttribute[] = "type";
const char kImageWidthAttribute[] = "width";
const char kImageHeightAttribute[] = "height";
// Attributes read from a Param element (see ParseParam()).
const char kParamNameAttribute[] = "name";
const char kParamValueAttribute[] = "value";

// Mime type for search results.
const char kHTMLType[] = "text/html";

// Mime type for as you type suggestions.
const char kSuggestionType[] = "application/x-suggestions+json";
56 std::string
XMLCharToString(const xmlChar
* value
) {
57 return std::string(reinterpret_cast<const char*>(value
));
60 // Returns true if input_encoding contains a valid input encoding string. This
61 // doesn't verify that we have a valid encoding for the string, just that the
62 // string contains characters that constitute a valid input encoding.
63 bool IsValidEncodingString(const std::string
& input_encoding
) {
64 if (input_encoding
.empty())
67 if (!base::IsAsciiAlpha(input_encoding
[0]))
70 for (size_t i
= 1, max
= input_encoding
.size(); i
< max
; ++i
) {
71 char c
= input_encoding
[i
];
72 if (!base::IsAsciiAlpha(c
) && !base::IsAsciiDigit(c
) &&
73 c
!= '.' && c
!= '_' && c
!= '-') {
80 void AppendParamToQuery(const std::string
& key
,
81 const std::string
& value
,
92 // Returns true if |url| is empty or is a valid URL with a scheme of HTTP[S].
93 bool IsHTTPRef(const std::string
& url
) {
97 return gurl
.is_valid() && (gurl
.SchemeIs(url::kHttpScheme
) ||
98 gurl
.SchemeIs(url::kHttpsScheme
));
104 // TemplateURLParsingContext --------------------------------------------------
106 // To minimize memory overhead while parsing, a SAX style parser is used.
107 // TemplateURLParsingContext is used to maintain the state we're in the document
109 class TemplateURLParsingContext
{
111 // Enum of the known element types.
114 OPEN_SEARCH_DESCRIPTION
,
128 // Key/value of a Param node.
129 typedef std::pair
<std::string
, std::string
> Param
;
131 explicit TemplateURLParsingContext(
132 TemplateURLParser::ParameterFilter
* parameter_filter
);
134 static void StartElementImpl(void* ctx
,
136 const xmlChar
** atts
);
137 static void EndElementImpl(void* ctx
, const xmlChar
* name
);
138 static void CharactersImpl(void* ctx
, const xmlChar
* ch
, int len
);
140 // Returns a heap-allocated TemplateURL representing the result of parsing.
141 // This will be NULL if parsing failed or if the results were invalid for some
142 // reason (e.g. the resulting URL was not HTTP[S], a name wasn't supplied,
143 // a resulting TemplateURLRef was invalid, etc.).
144 TemplateURL
* GetTemplateURL(const SearchTermsData
& search_terms_data
,
145 bool show_in_default_list
);
148 // Key is UTF8 encoded.
149 typedef std::map
<std::string
, ElementType
> ElementNameToElementTypeMap
;
151 static void InitMapping();
153 void ParseURL(const xmlChar
** atts
);
154 void ParseImage(const xmlChar
** atts
);
155 void ParseParam(const xmlChar
** atts
);
156 void ProcessURLParams();
158 // Returns the current ElementType.
159 ElementType
GetKnownType();
161 static ElementNameToElementTypeMap
* kElementNameToElementTypeMap
;
163 // Data that gets updated as we parse, and is converted to a TemplateURL by
165 TemplateURLData data_
;
167 std::vector
<ElementType
> elements_
;
168 bool image_is_valid_for_favicon_
;
170 // Character content for the current element.
171 base::string16 string_
;
173 TemplateURLParser::ParameterFilter
* parameter_filter_
;
175 // The list of parameters parsed in the Param nodes of a Url node.
176 std::vector
<Param
> extra_params_
;
178 // The HTTP methods used.
180 Method suggestion_method_
;
182 // If true, we are currently parsing a suggest URL, otherwise it is an HTML
183 // search. Note that we don't need a stack as URL nodes cannot be nested.
184 bool is_suggest_url_
;
186 // If true, the user has set a keyword and we should use it. Otherwise,
187 // we generate a keyword based on the URL.
188 bool has_custom_keyword_
;
190 // Whether we should derive the image from the URL (when images are data
192 bool derive_image_from_url_
;
194 DISALLOW_COPY_AND_ASSIGN(TemplateURLParsingContext
);
198 TemplateURLParsingContext::ElementNameToElementTypeMap
*
199 TemplateURLParsingContext::kElementNameToElementTypeMap
= NULL
;
201 TemplateURLParsingContext::TemplateURLParsingContext(
202 TemplateURLParser::ParameterFilter
* parameter_filter
)
203 : image_is_valid_for_favicon_(false),
204 parameter_filter_(parameter_filter
),
206 suggestion_method_(GET
),
207 is_suggest_url_(false),
208 has_custom_keyword_(false),
209 derive_image_from_url_(false) {
210 if (kElementNameToElementTypeMap
== NULL
)
215 void TemplateURLParsingContext::StartElementImpl(void* ctx
,
217 const xmlChar
** atts
) {
218 // Remove the namespace from |name|, ex: os:Url -> Url.
219 std::string
node_name(XMLCharToString(name
));
220 size_t index
= node_name
.find_first_of(":");
221 if (index
!= std::string::npos
)
222 node_name
.erase(0, index
+ 1);
224 TemplateURLParsingContext
* context
=
225 reinterpret_cast<TemplateURLParsingContext
*>(ctx
);
226 context
->elements_
.push_back(
227 context
->kElementNameToElementTypeMap
->count(node_name
) ?
228 (*context
->kElementNameToElementTypeMap
)[node_name
] : UNKNOWN
);
229 switch (context
->GetKnownType()) {
230 case TemplateURLParsingContext::URL
:
231 context
->extra_params_
.clear();
232 context
->ParseURL(atts
);
234 case TemplateURLParsingContext::IMAGE
:
235 context
->ParseImage(atts
);
237 case TemplateURLParsingContext::PARAM
:
238 context
->ParseParam(atts
);
243 context
->string_
.clear();
247 void TemplateURLParsingContext::EndElementImpl(void* ctx
, const xmlChar
* name
) {
248 TemplateURLParsingContext
* context
=
249 reinterpret_cast<TemplateURLParsingContext
*>(ctx
);
250 switch (context
->GetKnownType()) {
251 case TemplateURLParsingContext::URL
:
252 context
->ProcessURLParams();
254 case TemplateURLParsingContext::SHORT_NAME
:
255 context
->data_
.SetShortName(context
->string_
);
257 case TemplateURLParsingContext::IMAGE
: {
258 GURL
image_url(base::UTF16ToUTF8(context
->string_
));
259 if (image_url
.SchemeIs(url::kDataScheme
)) {
260 // TODO (jcampan): bug 1169256: when dealing with data URL, we need to
261 // decode the data URL in the renderer. For now, we'll just point to the
262 // favicon from the URL.
263 context
->derive_image_from_url_
= true;
264 } else if (context
->image_is_valid_for_favicon_
&& image_url
.is_valid() &&
265 (image_url
.SchemeIs(url::kHttpScheme
) ||
266 image_url
.SchemeIs(url::kHttpsScheme
))) {
267 context
->data_
.favicon_url
= image_url
;
269 context
->image_is_valid_for_favicon_
= false;
272 case TemplateURLParsingContext::INPUT_ENCODING
: {
273 std::string input_encoding
= base::UTF16ToASCII(context
->string_
);
274 if (IsValidEncodingString(input_encoding
))
275 context
->data_
.input_encodings
.push_back(input_encoding
);
278 case TemplateURLParsingContext::ALIAS
: {
279 context
->data_
.SetKeyword(context
->string_
);
280 context
->has_custom_keyword_
= true;
286 context
->string_
.clear();
287 context
->elements_
.pop_back();
291 void TemplateURLParsingContext::CharactersImpl(void* ctx
,
294 reinterpret_cast<TemplateURLParsingContext
*>(ctx
)->string_
+=
296 base::StringPiece(reinterpret_cast<const char*>(ch
), len
));
299 TemplateURL
* TemplateURLParsingContext::GetTemplateURL(
300 const SearchTermsData
& search_terms_data
,
301 bool show_in_default_list
) {
302 // TODO(jcampan): Support engines that use POST; see http://crbug.com/18107
303 if (method_
== TemplateURLParsingContext::POST
||
304 data_
.short_name().empty() || !IsHTTPRef(data_
.url()) ||
305 !IsHTTPRef(data_
.suggestions_url
))
307 if (suggestion_method_
== TemplateURLParsingContext::POST
)
308 data_
.suggestions_url
.clear();
310 // If the image was a data URL, use the favicon from the search URL instead.
311 // (see the TODO in EndElementImpl()).
312 GURL
search_url(data_
.url());
313 if (derive_image_from_url_
&& data_
.favicon_url
.is_empty())
314 data_
.favicon_url
= TemplateURL::GenerateFaviconURL(search_url
);
316 // Generate a keyword for this search engine if a custom one was not present
317 // in the imported data.
318 if (!has_custom_keyword_
)
319 data_
.SetKeyword(TemplateURL::GenerateKeyword(
320 search_url
, search_terms_data
.GetAcceptLanguages()));
322 data_
.show_in_default_list
= show_in_default_list
;
324 // Bail if the search URL is empty or if either TemplateURLRef is invalid.
325 scoped_ptr
<TemplateURL
> template_url(new TemplateURL(data_
));
326 if (template_url
->url().empty() ||
327 !template_url
->url_ref().IsValid(search_terms_data
) ||
328 (!template_url
->suggestions_url().empty() &&
329 !template_url
->suggestions_url_ref().IsValid(search_terms_data
))) {
333 return template_url
.release();
337 void TemplateURLParsingContext::InitMapping() {
338 kElementNameToElementTypeMap
= new std::map
<std::string
, ElementType
>;
339 (*kElementNameToElementTypeMap
)[kURLElement
] = URL
;
340 (*kElementNameToElementTypeMap
)[kParamElement
] = PARAM
;
341 (*kElementNameToElementTypeMap
)[kShortNameElement
] = SHORT_NAME
;
342 (*kElementNameToElementTypeMap
)[kImageElement
] = IMAGE
;
343 (*kElementNameToElementTypeMap
)[kOpenSearchDescriptionElement
] =
344 OPEN_SEARCH_DESCRIPTION
;
345 (*kElementNameToElementTypeMap
)[kFirefoxSearchDescriptionElement
] =
346 OPEN_SEARCH_DESCRIPTION
;
347 (*kElementNameToElementTypeMap
)[kInputEncodingElement
] = INPUT_ENCODING
;
348 (*kElementNameToElementTypeMap
)[kAliasElement
] = ALIAS
;
351 void TemplateURLParsingContext::ParseURL(const xmlChar
** atts
) {
355 std::string template_url
;
356 bool is_post
= false;
357 bool is_html_url
= false;
358 bool is_suggest_url
= false;
359 for (; *atts
; atts
+= 2) {
360 std::string
name(XMLCharToString(*atts
));
361 const xmlChar
* value
= atts
[1];
362 if (name
== kURLTypeAttribute
) {
363 std::string type
= XMLCharToString(value
);
364 is_html_url
= (type
== kHTMLType
);
365 is_suggest_url
= (type
== kSuggestionType
);
366 } else if (name
== kURLTemplateAttribute
) {
367 template_url
= XMLCharToString(value
);
368 } else if (name
== kParamMethodAttribute
) {
369 is_post
= base::LowerCaseEqualsASCII(XMLCharToString(value
), "post");
373 if (is_html_url
&& !template_url
.empty()) {
374 data_
.SetURL(template_url
);
375 is_suggest_url_
= false;
378 } else if (is_suggest_url
) {
379 data_
.suggestions_url
= template_url
;
380 is_suggest_url_
= true;
382 suggestion_method_
= POST
;
386 void TemplateURLParsingContext::ParseImage(const xmlChar
** atts
) {
393 for (; *atts
; atts
+= 2) {
394 std::string
name(XMLCharToString(*atts
));
395 const xmlChar
* value
= atts
[1];
396 if (name
== kImageTypeAttribute
) {
397 type
= XMLCharToString(value
);
398 } else if (name
== kImageWidthAttribute
) {
399 base::StringToInt(XMLCharToString(value
), &width
);
400 } else if (name
== kImageHeightAttribute
) {
401 base::StringToInt(XMLCharToString(value
), &height
);
405 image_is_valid_for_favicon_
= (width
== gfx::kFaviconSize
) &&
406 (height
== gfx::kFaviconSize
) &&
407 ((type
== "image/x-icon") || (type
== "image/vnd.microsoft.icon"));
410 void TemplateURLParsingContext::ParseParam(const xmlChar
** atts
) {
414 std::string key
, value
;
415 for (; *atts
; atts
+= 2) {
416 std::string
name(XMLCharToString(*atts
));
417 const xmlChar
* val
= atts
[1];
418 if (name
== kParamNameAttribute
) {
419 key
= XMLCharToString(val
);
420 } else if (name
== kParamValueAttribute
) {
421 value
= XMLCharToString(val
);
426 (!parameter_filter_
|| parameter_filter_
->KeepParameter(key
, value
)))
427 extra_params_
.push_back(Param(key
, value
));
430 void TemplateURLParsingContext::ProcessURLParams() {
431 if (!parameter_filter_
&& extra_params_
.empty())
434 GURL
url(is_suggest_url_
? data_
.suggestions_url
: data_
.url());
438 // If there is a parameter filter, parse the existing URL and remove any
439 // unwanted parameter.
440 std::string new_query
;
441 bool modified
= false;
442 if (parameter_filter_
) {
443 url::Component query
= url
.parsed_for_possibly_invalid_spec().query
;
444 url::Component key
, value
;
445 const char* url_spec
= url
.spec().c_str();
446 while (url::ExtractQueryKeyValue(url_spec
, &query
, &key
, &value
)) {
447 std::string
key_str(url_spec
, key
.begin
, key
.len
);
448 std::string
value_str(url_spec
, value
.begin
, value
.len
);
449 if (parameter_filter_
->KeepParameter(key_str
, value_str
)) {
450 AppendParamToQuery(key_str
, value_str
, &new_query
);
457 new_query
= url
.query();
459 // Add the extra parameters if any.
460 if (!extra_params_
.empty()) {
462 for (std::vector
<Param
>::const_iterator
iter(extra_params_
.begin());
463 iter
!= extra_params_
.end(); ++iter
)
464 AppendParamToQuery(iter
->first
, iter
->second
, &new_query
);
468 GURL::Replacements repl
;
469 repl
.SetQueryStr(new_query
);
470 url
= url
.ReplaceComponents(repl
);
472 data_
.suggestions_url
= url
.spec();
473 else if (url
.is_valid())
474 data_
.SetURL(url
.spec());
478 TemplateURLParsingContext::ElementType
479 TemplateURLParsingContext::GetKnownType() {
480 if (elements_
.size() == 2 && elements_
[0] == OPEN_SEARCH_DESCRIPTION
)
482 // We only expect PARAM nodes under the URL node.
483 return (elements_
.size() == 3 && elements_
[0] == OPEN_SEARCH_DESCRIPTION
&&
484 elements_
[1] == URL
&& elements_
[2] == PARAM
) ? PARAM
: UNKNOWN
;
488 // TemplateURLParser ----------------------------------------------------------
491 TemplateURL
* TemplateURLParser::Parse(
492 const SearchTermsData
& search_terms_data
,
493 bool show_in_default_list
,
496 TemplateURLParser::ParameterFilter
* param_filter
) {
497 // xmlSubstituteEntitiesDefault(1) makes it so that &amp; isn't mapped to
498 // &#38; . Unfortunately xmlSubstituteEntitiesDefault affects global state.
499 // If this becomes problematic we'll need to provide our own entity
500 // type for &amp;, or strip out &#38; by hand after parsing.
501 int last_sub_entities_value
= xmlSubstituteEntitiesDefault(1);
502 TemplateURLParsingContext
context(param_filter
);
503 xmlSAXHandler sax_handler
;
504 memset(&sax_handler
, 0, sizeof(sax_handler
));
505 sax_handler
.startElement
= &TemplateURLParsingContext::StartElementImpl
;
506 sax_handler
.endElement
= &TemplateURLParsingContext::EndElementImpl
;
507 sax_handler
.characters
= &TemplateURLParsingContext::CharactersImpl
;
508 int error
= xmlSAXUserParseMemory(&sax_handler
, &context
, data
,
509 static_cast<int>(length
));
510 xmlSubstituteEntitiesDefault(last_sub_entities_value
);
513 NULL
: context
.GetTemplateURL(search_terms_data
, show_in_default_list
);