1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/autocomplete/autocomplete_input.h"
7 #include "base/strings/string_util.h"
8 #include "base/strings/utf_string_conversions.h"
9 #include "chrome/browser/external_protocol/external_protocol_handler.h"
10 #include "chrome/browser/profiles/profile_io_data.h"
11 #include "chrome/common/net/url_fixer_upper.h"
12 #include "content/public/common/url_constants.h"
13 #include "net/base/net_util.h"
14 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
15 #include "url/url_canon_ip.h"
16 #include "url/url_util.h"
// Shifts |*cursor_position| left to compensate for |num_leading_chars_removed|
// characters having been stripped from the front of the text it indexes into.
//
// |num_leading_chars_removed|: number of characters dropped from the start.
// |cursor_position|: in/out; read and updated through the pointer.
//
// NOTE(review): this excerpt omits some lines of the function. The statement
// guarded by the npos check (presumably an early return) and the handling of
// the case where the removed count is >= the cursor offset are not visible
// here -- confirm against the full file.
20 void AdjustCursorPositionIfNecessary(size_t num_leading_chars_removed
,
21 size_t* cursor_position
) {
// base::string16::npos is tested first, before any arithmetic is attempted.
22 if (*cursor_position
== base::string16::npos
)
// Subtract only when strictly fewer characters were removed than the current
// offset, so the unsigned subtraction below cannot wrap around.
24 if (num_leading_chars_removed
< *cursor_position
)
25 *cursor_position
-= num_leading_chars_removed
;
// Default constructor. The initializers visible here set:
//   * cursor_position_ to base::string16::npos,
//   * current_page_classification_ to INVALID_SPEC,
//   * prevent_inline_autocomplete_ and prefer_keyword_ to false,
//   * allow_exact_keyword_match_ and want_asynchronous_matches_ to true.
// NOTE(review): at least one initializer line is missing from this excerpt
// (the embedded numbering skips a line) -- confirm against the full file.
32 AutocompleteInput::AutocompleteInput()
33 : cursor_position_(base::string16::npos
),
34 current_page_classification_(AutocompleteInput::INVALID_SPEC
),
36 prevent_inline_autocomplete_(false),
37 prefer_keyword_(false),
38 allow_exact_keyword_match_(true),
39 want_asynchronous_matches_(true) {
// Constructs an input from user-typed |text| and classifies it.
//
// Steps visible in this excerpt, in order:
//   1. Copy the arguments into the corresponding members.
//   2. DCHECK that |cursor_position| is within |text| or is npos.
//   3. Trim leading whitespace from |text| into text_ and adjust the cursor.
//   4. Call Parse() to fill type_, parts_, scheme_ and a local canonical URL.
//   5. Keep the canonical URL in canonicalized_url_ only if it passes the
//      validity/host checks below.
//   6. Remove a forced-query '?' prefix via
//      RemoveForcedQueryStringIfNecessary() and adjust the cursor by the
//      number of characters it reports.
//   7. Trim leading whitespace again (after the '?') and adjust the cursor.
//
// NOTE(review): several lines are missing from this excerpt, including the
// declaration of the |prefer_keyword| parameter used in the initializer list
// and the tails of two AdjustCursorPositionIfNecessary() calls. Confirm
// against the full file before relying on this description.
42 AutocompleteInput::AutocompleteInput(
43 const base::string16
& text
,
44 size_t cursor_position
,
45 const base::string16
& desired_tld
,
46 const GURL
& current_url
,
47 AutocompleteInput::PageClassification current_page_classification
,
48 bool prevent_inline_autocomplete
,
50 bool allow_exact_keyword_match
,
51 bool want_asynchronous_matches
)
52 : cursor_position_(cursor_position
),
53 current_url_(current_url
),
54 current_page_classification_(current_page_classification
),
55 prevent_inline_autocomplete_(prevent_inline_autocomplete
),
56 prefer_keyword_(prefer_keyword
),
57 allow_exact_keyword_match_(allow_exact_keyword_match
),
58 want_asynchronous_matches_(want_asynchronous_matches
) {
59 DCHECK(cursor_position
<= text
.length() ||
60 cursor_position
== base::string16::npos
)
61 << "Text: '" << text
<< "', cp: " << cursor_position
;
62 // None of the providers care about leading white space so we always trim it.
63 // Providers that care about trailing white space handle trimming themselves.
64 if ((base::TrimWhitespace(text
, base::TRIM_LEADING
, &text_
) &
65 base::TRIM_LEADING
) != 0)
// The number of trimmed characters is the length difference between the
// caller's |text| and the stored text_.
66 AdjustCursorPositionIfNecessary(text
.length() - text_
.length(),
// Parse into a local GURL first; it is copied to the member only if it passes
// the checks below.
69 GURL canonicalized_url
;
70 type_
= Parse(text_
, desired_tld
, &parts_
, &scheme_
, &canonicalized_url
);
// Store the canonical URL only for UNKNOWN/URL input whose URL is valid and is
// either non-standard, a file: or filesystem: URL, or has a non-empty host.
75 if (((type_
== UNKNOWN
) || (type_
== URL
)) &&
76 canonicalized_url
.is_valid() &&
77 (!canonicalized_url
.IsStandard() || canonicalized_url
.SchemeIsFile() ||
78 canonicalized_url
.SchemeIsFileSystem() ||
79 !canonicalized_url
.host().empty()))
80 canonicalized_url_
= canonicalized_url
;
// The helper's return value is treated as the count of characters it removed
// from the front of text_.
82 size_t chars_removed
= RemoveForcedQueryStringIfNecessary(type_
, &text_
);
83 AdjustCursorPositionIfNecessary(chars_removed
, &cursor_position_
);
85 // Remove spaces between opening question mark and first actual character.
86 base::string16 trimmed_text
;
87 if ((base::TrimWhitespace(text_
, base::TRIM_LEADING
, &trimmed_text
) &
88 base::TRIM_LEADING
) != 0) {
89 AdjustCursorPositionIfNecessary(text_
.length() - trimmed_text
.length(),
// Destructor. NOTE(review): the body is not visible in this excerpt.
96 AutocompleteInput::~AutocompleteInput() {
// Strips the leading '?' from |*text| when the input is a forced query.
//
// The guard below tests three things: |type| is not FORCED_QUERY, |*text| is
// empty, or its first character is not '?'. The constructor treats the
// size_t return value as the number of characters removed.
//
// NOTE(review): the declaration of the |type| parameter, the return
// statements and the statement that erases the '?' are missing from this
// excerpt -- confirm the exact return values against the full file.
100 size_t AutocompleteInput::RemoveForcedQueryStringIfNecessary(
102 base::string16
* text
) {
103 if (type
!= FORCED_QUERY
|| text
->empty() || (*text
)[0] != L
'?')
105 // Drop the leading '?'.
// Returns a lowercase ASCII name for |type|:
//   INVALID -> "invalid", UNKNOWN -> "unknown", URL -> "url",
//   QUERY -> "query", FORCED_QUERY -> "forced-query".
// The final statement visible here returns an empty string.
// NOTE(review): the switch header and any default-case handling are missing
// from this excerpt.
111 std::string
AutocompleteInput::TypeToString(Type type
) {
113 case INVALID
: return "invalid";
114 case UNKNOWN
: return "unknown";
115 case URL
: return "url";
116 case QUERY
: return "query";
117 case FORCED_QUERY
: return "forced-query";
121 return std::string();
// Classifies |text| as one of the Type values and reports parsing results
// through the output parameters.
//
// Parameters visible in this excerpt:
//   |text|              - the user input to classify.
//   |desired_tld|       - optional TLD; passed to URLFixerUpper::FixupURL(),
//                         appended to the host for a second registry lookup,
//                         and passed to the host-compliance check.
//   |parts|             - receives the segmentation produced by
//                         URLFixerUpper::SegmentURL(). Its declaration line is
//                         missing from this excerpt. It is repointed at a
//                         local url::Parsed below (presumably only when the
//                         caller passed NULL -- the guard is not visible).
//   |scheme|            - receives the scheme returned by SegmentURL(). Some
//                         callers in this file pass NULL, so the assignment is
//                         presumably guarded by a line not shown -- confirm.
//   |canonicalized_url| - receives FixupURL()'s result; a local placeholder is
//                         substituted when the caller passes NULL.
//
// Return values visible in this excerpt:
//   * INVALID when |text| is entirely whitespace.
//   * URL or QUERY when the registry lookup on the host fails (URL only when
//     appending |desired_tld| makes the lookup succeed).
//   * UNKNOWN or QUERY for a host that fails the compliance check.
//   * UNKNOWN or URL for a non-IPv4 host with a known registry, a port, or the
//     literal "localhost" (UNKNOWN when a username is present).
// NOTE(review): many return statements and some guards are missing from this
// excerpt. The existing comments describe the intent of the omitted branches,
// but their exact results must be confirmed against the full file. The order
// of the checks is significant: each heuristic relies on earlier ones having
// already returned.
126 AutocompleteInput::Type
AutocompleteInput::Parse(
127 const base::string16
& text
,
128 const base::string16
& desired_tld
,
130 base::string16
* scheme
,
131 GURL
* canonicalized_url
) {
132 size_t first_non_white
= text
.find_first_not_of(base::kWhitespaceUTF16
, 0);
133 if (first_non_white
== base::string16::npos
)
134 return INVALID
; // All whitespace.
136 if (text
.at(first_non_white
) == L
'?') {
137 // If the first non-whitespace character is a '?', we magically treat this
142 // Ask our parsing back-end to help us understand what the user typed. We
143 // use the URLFixerUpper here because we want to be smart about what we
144 // consider a scheme. For example, we shouldn't consider www.google.com:80
146 url::Parsed local_parts
;
148 parts
= &local_parts
;
149 const base::string16
parsed_scheme(URLFixerUpper::SegmentURL(text
, parts
));
151 *scheme
= parsed_scheme
;
153 // If we can't canonicalize the user's input, the rest of the autocomplete
154 // system isn't going to be able to produce a navigable URL match for it.
155 // So we just return QUERY immediately in these cases.
156 GURL placeholder_canonicalized_url
;
157 if (!canonicalized_url
)
158 canonicalized_url
= &placeholder_canonicalized_url
;
159 *canonicalized_url
= URLFixerUpper::FixupURL(base::UTF16ToUTF8(text
),
160 base::UTF16ToUTF8(desired_tld
));
161 if (!canonicalized_url
->is_valid())
164 if (LowerCaseEqualsASCII(parsed_scheme
, content::kFileScheme
)) {
165 // A user might or might not type a scheme when entering a file URL. In
166 // either case, |parsed_scheme| will tell us that this is a file URL, but
167 // |parts->scheme| might be empty, e.g. if the user typed "C:\foo".
171 // If the user typed a scheme, and it's HTTP or HTTPS, we know how to parse it
172 // well enough that we can fall through to the heuristics below. If it's
173 // something else, we can just determine our action based on what we do with
174 // any input of this scheme. In theory we could do better with some schemes
175 // (e.g. "ftp" or "view-source") but I'll wait to spend the effort on that
176 // until I run into some cases that really need it.
177 if (parts
->scheme
.is_nonempty() &&
178 !LowerCaseEqualsASCII(parsed_scheme
, content::kHttpScheme
) &&
179 !LowerCaseEqualsASCII(parsed_scheme
, content::kHttpsScheme
)) {
180 // See if we know how to handle the URL internally. There are some schemes
181 // that we convert to other things before they reach the renderer or else
182 // the renderer handles internally without reaching the net::URLRequest
183 // logic. They thus won't be listed as "handled protocols", but we should
184 // still claim to handle them.
185 if (ProfileIOData::IsHandledProtocol(base::UTF16ToASCII(parsed_scheme
)) ||
186 LowerCaseEqualsASCII(parsed_scheme
, content::kViewSourceScheme
) ||
187 LowerCaseEqualsASCII(parsed_scheme
, content::kJavaScriptScheme
) ||
188 LowerCaseEqualsASCII(parsed_scheme
, content::kDataScheme
))
191 // Not an internal protocol. Check and see if the user has explicitly
192 // opened this scheme as a URL before, or if the "scheme" is actually a
193 // username. We need to do this after the check above because some
194 // handlable schemes (e.g. "javascript") may be treated as "blocked" by the
195 // external protocol handler because we don't want pages to open them, but
197 // Note that the protocol handler needs to be informed that omnibox input
198 // should always be considered "user gesture-triggered", lest it always
200 ExternalProtocolHandler::BlockState block_state
=
201 ExternalProtocolHandler::GetBlockState(
202 base::UTF16ToUTF8(parsed_scheme
), true);
203 switch (block_state
) {
204 case ExternalProtocolHandler::DONT_BLOCK
:
207 case ExternalProtocolHandler::BLOCK
:
208 // If we don't want the user to open the URL, don't let it be navigated
213 // We don't know about this scheme. It might be that the user typed a
214 // URL of the form "username:password@foo.com".
// Re-parse the input with "http" + the standard scheme separator prepended, to
// see whether it then parses as a URL with both a username and a password.
215 const base::string16 http_scheme_prefix
=
216 base::ASCIIToUTF16(std::string(content::kHttpScheme
) +
217 content::kStandardSchemeSeparator
);
218 url::Parsed http_parts
;
219 base::string16 http_scheme
;
220 GURL http_canonicalized_url
;
221 Type http_type
= Parse(http_scheme_prefix
+ text
, desired_tld
,
222 &http_parts
, &http_scheme
,
223 &http_canonicalized_url
);
224 DCHECK_EQ(std::string(content::kHttpScheme
),
225 base::UTF16ToUTF8(http_scheme
));
227 if ((http_type
== URL
) && http_parts
.username
.is_nonempty() &&
228 http_parts
.password
.is_nonempty()) {
229 // Manually re-jigger the parsed parts to match |text| (without the
230 // http scheme added).
231 http_parts
.scheme
.reset();
232 url::Component
* components
[] = {
233 &http_parts
.username
,
234 &http_parts
.password
,
// Shift each listed component back by the length of the prepended prefix so
// the offsets index into the original |text| again.
241 for (size_t i
= 0; i
< arraysize(components
); ++i
) {
242 URLFixerUpper::OffsetComponent(
243 -static_cast<int>(http_scheme_prefix
.length()), components
[i
]);
249 *canonicalized_url
= http_canonicalized_url
;
254 // We don't know about this scheme and it doesn't look like the user
255 // typed a username and password. It's likely to be a search operator
256 // like "site:" or "link:". We classify it as UNKNOWN so the user has
257 // the option of treating it as a URL if we're wrong.
258 // Note that SegmentURL() is smart so we aren't tricked by "c:\foo" or
259 // "www.example.com:81" in this case.
265 // Either the user didn't type a scheme, in which case we need to distinguish
266 // between an HTTP URL and a query, or the scheme is HTTP or HTTPS, in which
267 // case we should reject invalid formulations.
269 // If we have an empty host it can't be a valid HTTP[S] URL. (This should
270 // only trigger for input that begins with a colon, which GURL will parse as a
271 // valid, non-standard URL; for standard URLs, an empty host would have
272 // resulted in an invalid |canonicalized_url| above.)
273 if (!parts
->host
.is_nonempty())
276 // Sanity-check: GURL should have failed to canonicalize this URL if it had an
278 DCHECK_NE(url::PORT_INVALID
, url::ParsePort(text
.c_str(), parts
->port
));
280 // Likewise, the RCDS can reject certain obviously-invalid hosts. (We also
281 // use the registry length later below.)
282 const base::string16
host(text
.substr(parts
->host
.begin
, parts
->host
.len
));
283 const size_t registry_length
=
284 net::registry_controlled_domains::GetRegistryLength(
285 base::UTF16ToUTF8(host
),
286 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES
,
287 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES
);
288 if (registry_length
== std::string::npos
) {
289 // Try to append the desired_tld.
290 if (!desired_tld
.empty()) {
291 base::string16
host_with_tld(host
);
// Indexing the last character relies on |host| being non-empty; the host
// check above appears to guarantee that (its guarded statement is not visible
// in this excerpt).
292 if (host
[host
.length() - 1] != '.')
293 host_with_tld
+= '.';
294 host_with_tld
+= desired_tld
;
295 const size_t tld_length
=
296 net::registry_controlled_domains::GetRegistryLength(
297 base::UTF16ToUTF8(host_with_tld
),
298 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES
,
299 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES
);
300 if (tld_length
!= std::string::npos
)
301 return URL
; // Something like "99999999999" that looks like a bad IP
302 // address, but becomes valid on attaching a TLD.
304 return QUERY
; // Could be a broken IP address, etc.
308 // See if the hostname is valid. While IE and GURL allow hostnames to contain
309 // many other characters (perhaps for weird intranet machines), it's extremely
310 // unlikely that a user would be trying to type those in for anything other
311 // than a search query.
312 url::CanonHostInfo host_info
;
313 const std::string
canonicalized_host(net::CanonicalizeHost(
314 base::UTF16ToUTF8(host
), &host_info
));
315 if ((host_info
.family
== url::CanonHostInfo::NEUTRAL
) &&
316 !net::IsCanonicalizedHostCompliant(canonicalized_host
,
317 base::UTF16ToUTF8(desired_tld
))) {
318 // Invalid hostname. There are several possible cases:
319 // * Our checker is too strict and the user pasted in a real-world URL
320 // that's "invalid" but resolves. To catch these, we return UNKNOWN when
321 // the user explicitly typed a scheme, so we'll still search by default
322 // but we'll show the accidental search infobar if necessary.
323 // * The user is typing a multi-word query. If we see a space anywhere in
324 // the hostname we assume this is a search and return QUERY.
325 // * Our checker is too strict and the user is typing a real-world hostname
326 // that's "invalid" but resolves. We return UNKNOWN if the TLD is known.
327 // Note that we explicitly excluded hosts with spaces above so that
328 // "toys at amazon.com" will be treated as a search.
329 // * The user is typing some garbage string. Return QUERY.
331 // Thus we fall down in the following cases:
332 // * Trying to navigate to a hostname with spaces
333 // * Trying to navigate to a hostname with invalid characters and an unknown
335 // These are rare, though probably possible in intranets.
336 return (parts
->scheme
.is_nonempty() ||
337 ((registry_length
!= 0) &&
338 (host
.find(' ') == base::string16::npos
))) ? UNKNOWN
: QUERY
;
341 // Now that we've ruled out all schemes other than http or https and done a
342 // little more sanity checking, the presence of a scheme means this is likely
344 if (parts
->scheme
.is_nonempty())
347 // See if the host is an IP address.
348 if (host_info
.family
== url::CanonHostInfo::IPV6
)
350 // If the user originally typed a host that looks like an IP address (a
351 // dotted quad), they probably want to open it. If the original input was
352 // something else (like a single number), they probably wanted to search for
353 // it, unless they explicitly typed a scheme. This is true even if the URL
354 // appears to have a path: "1.2/45" is more likely a search (for the answer
355 // to a math problem) than a URL. However, if there are more non-host
356 // components, then maybe this really was intended to be a navigation. For
357 // this reason we only check the dotted-quad case here, and save the "other
358 // IP addresses" case for after we check the number of non-host components
360 if ((host_info
.family
== url::CanonHostInfo::IPV4
) &&
361 (host_info
.num_ipv4_components
== 4))
364 // Presence of a password means this is likely a URL. Note that unless the
365 // user has typed an explicit "http://" or similar, we'll probably think that
366 // the username is some unknown scheme, and bail out in the scheme-handling
368 if (parts
->password
.is_nonempty())
371 // Trailing slashes force the input to be treated as a URL.
372 if (parts
->path
.is_nonempty()) {
// NOTE(review): |text| is a base::string16, so this assignment narrows a
// 16-bit character to char before comparing it with '\\' and '/'.
373 char c
= text
[parts
->path
.end() - 1];
374 if ((c
== '\\') || (c
== '/'))
378 // If there is more than one recognized non-host component, this is likely to
379 // be a URL, even if the TLD is unknown (in which case this is likely an
381 if (NumNonHostComponents(*parts
) > 1)
384 // If the host has a known TLD or a port, it's probably a URL, with the
385 // following exceptions:
386 // * Any "IP addresses" that make it here are more likely searches
388 // * If we reach here with a username, our input looks like "user@host[.tld]".
389 // Because there is no scheme explicitly specified, we think this is more
390 // likely an email address than an HTTP auth attempt. Hence, we search by
391 // default and let users correct us on a case-by-case basis.
392 // Note that we special-case "localhost" as a known hostname.
393 if ((host_info
.family
!= url::CanonHostInfo::IPV4
) &&
394 ((registry_length
!= 0) || (host
== base::ASCIIToUTF16("localhost") ||
395 parts
->port
.is_nonempty())))
396 return parts
->username
.is_nonempty() ? UNKNOWN
: URL
;
398 // If we reach this point, we know there's no known TLD on the input, so if
399 // the user wishes to add a desired_tld, the fixup code will oblige; thus this
401 if (!desired_tld
.empty())
404 // No scheme, password, port, path, and no known TLD on the host.
406 // * An "incomplete IP address"; likely a search (see above).
407 // * An email-like input like "user@host", where "host" has no known TLD.
408 // It's not clear what the user means here and searching seems reasonable.
409 // * A single word "foo"; possibly an intranet site, but more likely a search.
410 // This is ideally an UNKNOWN, and we can let the Alternate Nav URL code
411 // catch our mistakes.
412 // * A URL with a valid TLD we don't know about yet. If e.g. a registrar adds
413 // "xxx" as a TLD, then until we add it to our data file, Chrome won't know
414 // "foo.xxx" is a real URL. So ideally this is a URL, but we can't really
415 // distinguish this case from:
416 // * A "URL-like" string that's not really a URL (like
417 // "browser.tabs.closeButtons" or "java.awt.event.*"). This is ideally a
418 // QUERY. Since this is indistinguishable from the case above, and this
419 // case is much more likely, claim these are UNKNOWN, which should default
420 // to the right thing and let users correct us on a case-by-case basis.
// Computes which spans of |text| hold the scheme and the host, writing them
// to |*scheme| and |*host| as url::Component ranges.
//
// Visible behavior:
//   * |text| is run through Parse() with an empty desired TLD, and |*scheme|
//     is set from the resulting parts.
//   * For a "view-source" scheme with text following the colon, the remainder
//     is parsed again, and the inner scheme/host components are reported
//     shifted by |after_scheme_and_colon| so they index into |text|.
//   * For a "filesystem" scheme with a valid inner parsed scheme, |*host| is
//     taken from the inner parsed URL.
// NOTE(review): the declaration of |parts|, the default assignment of |*host|
// and the else-branches for the view-source case are missing from this
// excerpt -- confirm against the full file.
425 void AutocompleteInput::ParseForEmphasizeComponents(const base::string16
& text
,
426 url::Component
* scheme
,
427 url::Component
* host
) {
429 base::string16 scheme_str
;
430 Parse(text
, base::string16(), &parts
, &scheme_str
, NULL
);
432 *scheme
= parts
.scheme
;
// Index of the first character after the scheme and its ':' separator.
435 int after_scheme_and_colon
= parts
.scheme
.end() + 1;
436 // For the view-source scheme, we should emphasize the scheme and host of the
437 // URL qualified by the view-source prefix.
438 if (LowerCaseEqualsASCII(scheme_str
, content::kViewSourceScheme
) &&
439 (static_cast<int>(text
.length()) > after_scheme_and_colon
)) {
440 // Obtain the URL prefixed by view-source and parse it.
441 base::string16
real_url(text
.substr(after_scheme_and_colon
));
442 url::Parsed real_parts
;
443 AutocompleteInput::Parse(real_url
, base::string16(), &real_parts
, NULL
, NULL
);
444 if (real_parts
.scheme
.is_nonempty() || real_parts
.host
.is_nonempty()) {
445 if (real_parts
.scheme
.is_nonempty()) {
// Components of |real_parts| are relative to |real_url|; adding the prefix
// length converts them to offsets within the full |text|.
446 *scheme
= url::Component(
447 after_scheme_and_colon
+ real_parts
.scheme
.begin
,
448 real_parts
.scheme
.len
);
452 if (real_parts
.host
.is_nonempty()) {
453 *host
= url::Component(after_scheme_and_colon
+ real_parts
.host
.begin
,
454 real_parts
.host
.len
);
459 } else if (LowerCaseEqualsASCII(scheme_str
, content::kFileSystemScheme
) &&
460 parts
.inner_parsed() && parts
.inner_parsed()->scheme
.is_valid()) {
461 *host
= parts
.inner_parsed()->host
;
// Returns either |formatted_url| or |formatted_url| with a trailing '/'
// appended, depending on how the two strings classify under Parse().
//
// Visible behavior:
//   * If net::CanStripTrailingSlash(url) is false, |formatted_url| is
//     returned unchanged.
//   * Otherwise both |formatted_url| and the slash-terminated variant are
//     parsed with an empty desired TLD, and the results select which string
//     is returned.
// NOTE(review): the declaration of the |url| parameter and the operator that
// compares the two Parse() results are missing from this excerpt. Presumably
// the bare form is kept only when both classify identically -- confirm.
466 base::string16
AutocompleteInput::FormattedStringWithEquivalentMeaning(
468 const base::string16
& formatted_url
) {
469 if (!net::CanStripTrailingSlash(url
))
470 return formatted_url
;
471 const base::string16
url_with_path(formatted_url
+ base::char16('/'));
472 return (AutocompleteInput::Parse(formatted_url
, base::string16(), NULL
, NULL
,
474 AutocompleteInput::Parse(url_with_path
, base::string16(), NULL
, NULL
,
476 formatted_url
: url_with_path
;
// Returns how many of the non-host components of |parts| are non-empty.
// The components checked are scheme, username, password, port, path, query
// and ref, so the result is in the range [0, 7]. The host is deliberately
// not counted.
480 int AutocompleteInput::NumNonHostComponents(const url::Parsed
& parts
) {
481 int num_nonhost_components
= 0;
482 if (parts
.scheme
.is_nonempty())
483 ++num_nonhost_components
;
484 if (parts
.username
.is_nonempty())
485 ++num_nonhost_components
;
486 if (parts
.password
.is_nonempty())
487 ++num_nonhost_components
;
488 if (parts
.port
.is_nonempty())
489 ++num_nonhost_components
;
490 if (parts
.path
.is_nonempty())
491 ++num_nonhost_components
;
492 if (parts
.query
.is_nonempty())
493 ++num_nonhost_components
;
494 if (parts
.ref
.is_nonempty())
495 ++num_nonhost_components
;
496 return num_nonhost_components
;
// Returns the result of comparing |input|'s scheme against the "http" scheme
// constant, after first stripping a leading "view-source:" prefix if present.
//
// The input is converted to UTF-8 before the scheme is inspected.
// NOTE(review): the tail of the first FindAndCompareScheme() call is missing
// from this excerpt. Presumably it passes &scheme so the erase below can use
// the scheme's end offset -- confirm.
500 bool AutocompleteInput::HasHTTPScheme(const base::string16
& input
) {
501 std::string
utf8_input(base::UTF16ToUTF8(input
));
502 url::Component scheme
;
503 if (url::FindAndCompareScheme(utf8_input
, content::kViewSourceScheme
,
// Remove everything through the character following the scheme (the ':').
505 utf8_input
.erase(0, scheme
.end() + 1);
507 return url::FindAndCompareScheme(utf8_input
, content::kHttpScheme
, NULL
);
// Replaces the stored text state with caller-supplied values.
//
// DCHECKs that |cursor_position| is within |text| or is npos, then stores it
// in cursor_position_.
// NOTE(review): the assignments that store |text| and |parts| are missing
// from this excerpt -- confirm against the full file.
510 void AutocompleteInput::UpdateText(const base::string16
& text
,
511 size_t cursor_position
,
512 const url::Parsed
& parts
) {
513 DCHECK(cursor_position
<= text
.length() ||
514 cursor_position
== base::string16::npos
)
515 << "Text: '" << text
<< "', cp: " << cursor_position
;
517 cursor_position_
= cursor_position
;
// Resets this object's members to default values.
//
// The assignments visible here reset cursor_position_ to npos, current_url_
// and canonicalized_url_ to empty GURLs, current_page_classification_ to
// INVALID_SPEC, parts_ to an empty url::Parsed, and the boolean flags.
// NOTE(review): allow_exact_keyword_match_ is reset to false here, whereas
// the default constructor initializes it to true -- confirm this difference
// is intentional. Some member resets are missing from this excerpt.
521 void AutocompleteInput::Clear() {
523 cursor_position_
= base::string16::npos
;
524 current_url_
= GURL();
525 current_page_classification_
= AutocompleteInput::INVALID_SPEC
;
527 parts_
= url::Parsed();
529 canonicalized_url_
= GURL();
530 prevent_inline_autocomplete_
= false;
531 prefer_keyword_
= false;
532 allow_exact_keyword_match_
= false;
533 want_asynchronous_matches_
= true;