1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "chrome/browser/autocomplete/autocomplete_input.h"
7 #include "base/strings/string_util.h"
8 #include "base/strings/utf_string_conversions.h"
9 #include "chrome/browser/external_protocol/external_protocol_handler.h"
10 #include "chrome/browser/profiles/profile_io_data.h"
11 #include "chrome/common/net/url_fixer_upper.h"
12 #include "content/public/common/url_constants.h"
13 #include "net/base/net_util.h"
14 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
15 #include "url/url_canon_ip.h"
16 #include "url/url_util.h"
// Adjusts |*cursor_position| after |num_leading_chars_removed| characters have
// been stripped from the start of the text the cursor refers to (callers in
// this file pass the length difference produced by trimming leading text).
//   num_leading_chars_removed: count of characters removed from the front.
//   cursor_position: in/out cursor offset. It is first compared against
//       base::string16::npos.
// When fewer characters were removed than the current cursor offset, the
// offset is reduced by the number of removed characters.
// NOTE(review): the statement guarded by the npos check, and whatever handles
// a cursor that sat inside the removed prefix, are not visible in this
// extract -- confirm against the full file.
20 void AdjustCursorPositionIfNecessary(size_t num_leading_chars_removed
,
21 size_t* cursor_position
) {
22 if (*cursor_position
== base::string16::npos
)
24 if (num_leading_chars_removed
< *cursor_position
)
25 *cursor_position
-= num_leading_chars_removed
;
// Default constructor. The visible initializers set:
//   cursor_position_              -> base::string16::npos
//   current_page_classification_  -> AutocompleteInput::INVALID_SPEC
//   prevent_inline_autocomplete_  -> false
//   prefer_keyword_               -> false
//   allow_exact_keyword_match_    -> true
//   matches_requested_            -> ALL_MATCHES
// NOTE(review): one initializer line between current_page_classification_ and
// prevent_inline_autocomplete_ is not visible in this extract.
32 AutocompleteInput::AutocompleteInput()
33 : cursor_position_(base::string16::npos
),
34 current_page_classification_(AutocompleteInput::INVALID_SPEC
),
36 prevent_inline_autocomplete_(false),
37 prefer_keyword_(false),
38 allow_exact_keyword_match_(true),
39 matches_requested_(ALL_MATCHES
) {
// Constructs an input from the user-typed |text|.
//
// The initializer list copies |cursor_position|, |current_url|,
// |current_page_classification|, |prevent_inline_autocomplete|,
// |prefer_keyword|, |allow_exact_keyword_match| and |matches_requested| into
// the matching members. (The declaration of the |prefer_keyword| parameter is
// not visible in this extract.)
//
// The body then, in order:
//   1. DCHECKs that |cursor_position| is at most text.length() or equals
//      base::string16::npos.
//   2. Trims leading whitespace from |text| into text_ and, if anything was
//      trimmed, calls AdjustCursorPositionIfNecessary() with the number of
//      characters removed.
//   3. Calls Parse() on text_ with |desired_tld|, storing the result in type_
//      and filling parts_, scheme_ and a local canonicalized URL.
//   4. Stores that URL in canonicalized_url_ only when type_ is UNKNOWN or
//      URL, the URL is valid, and it is non-standard, a file URL, a
//      filesystem URL, or has a non-empty host.
//   5. Calls RemoveForcedQueryStringIfNecessary() on text_ and adjusts the
//      cursor by the number of characters it reports removing.
//   6. Trims leading whitespace again into a temporary (the spaces between
//      the opening question mark and the first real character) and adjusts
//      the cursor for that as well.
// NOTE(review): both truncated AdjustCursorPositionIfNecessary() calls and
// the remainder of step 6 are cut off in this extract -- confirm against the
// full file.
42 AutocompleteInput::AutocompleteInput(
43 const base::string16
& text
,
44 size_t cursor_position
,
45 const base::string16
& desired_tld
,
46 const GURL
& current_url
,
47 AutocompleteInput::PageClassification current_page_classification
,
48 bool prevent_inline_autocomplete
,
50 bool allow_exact_keyword_match
,
51 MatchesRequested matches_requested
)
52 : cursor_position_(cursor_position
),
53 current_url_(current_url
),
54 current_page_classification_(current_page_classification
),
55 prevent_inline_autocomplete_(prevent_inline_autocomplete
),
56 prefer_keyword_(prefer_keyword
),
57 allow_exact_keyword_match_(allow_exact_keyword_match
),
58 matches_requested_(matches_requested
) {
59 DCHECK(cursor_position
<= text
.length() ||
60 cursor_position
== base::string16::npos
)
61 << "Text: '" << text
<< "', cp: " << cursor_position
;
62 // None of the providers care about leading white space so we always trim it.
63 // Providers that care about trailing white space handle trimming themselves.
64 if ((TrimWhitespace(text
, TRIM_LEADING
, &text_
) & TRIM_LEADING
) != 0)
65 AdjustCursorPositionIfNecessary(text
.length() - text_
.length(),
68 GURL canonicalized_url
;
69 type_
= Parse(text_
, desired_tld
, &parts_
, &scheme_
, &canonicalized_url
);
74 if (((type_
== UNKNOWN
) || (type_
== URL
)) &&
75 canonicalized_url
.is_valid() &&
76 (!canonicalized_url
.IsStandard() || canonicalized_url
.SchemeIsFile() ||
77 canonicalized_url
.SchemeIsFileSystem() ||
78 !canonicalized_url
.host().empty()))
79 canonicalized_url_
= canonicalized_url
;
81 size_t chars_removed
= RemoveForcedQueryStringIfNecessary(type_
, &text_
);
82 AdjustCursorPositionIfNecessary(chars_removed
, &cursor_position_
);
84 // Remove spaces between opening question mark and first actual character.
85 base::string16 trimmed_text
;
86 if ((TrimWhitespace(text_
, TRIM_LEADING
, &trimmed_text
) & TRIM_LEADING
) !=
88 AdjustCursorPositionIfNecessary(text_
.length() - trimmed_text
.length(),
// Destructor.
// NOTE(review): only the opening line is visible in this extract; the body
// (if any) must be checked against the full file.
95 AutocompleteInput::~AutocompleteInput() {
// Drops the leading '?' from |*text| when the input is a forced query.
//   type: the classification of the input (its parameter declaration is not
//       visible in this extract); only FORCED_QUERY input is modified.
//   text: in/out string to edit.
// The guard below covers the cases where nothing is removed: |type| is not
// FORCED_QUERY, |*text| is empty, or its first character is not '?'.
// Returns a size_t that the constructor uses as the number of characters
// removed when adjusting the cursor.
// NOTE(review): the statements following the guard (including the return
// values) are not visible in this extract.
99 size_t AutocompleteInput::RemoveForcedQueryStringIfNecessary(
101 base::string16
* text
) {
102 if (type
!= FORCED_QUERY
|| text
->empty() || (*text
)[0] != L
'?')
104 // Drop the leading '?'.
// Maps an input Type to its string name:
//   INVALID -> "invalid", UNKNOWN -> "unknown", URL -> "url",
//   QUERY -> "query", FORCED_QUERY -> "forced-query".
// The final statement returns an empty std::string.
// NOTE(review): the code that selects on |type| and the path leading to the
// final empty-string return (presumably an unrecognized value) are not
// visible in this extract.
110 std::string
AutocompleteInput::TypeToString(Type type
) {
112 case INVALID
: return "invalid";
113 case UNKNOWN
: return "unknown";
114 case URL
: return "url";
115 case QUERY
: return "query";
116 case FORCED_QUERY
: return "forced-query";
120 return std::string();
// Classifies |text| as one of the Type values (INVALID, UNKNOWN, URL, QUERY
// are returned by the statements visible here).
//
// Parameters:
//   text: the user input to classify.
//   desired_tld: a TLD to try appending to the host when the host has no
//       recognized registry; may be empty.
//   parts: receives the segmentation produced by URLFixerUpper::SegmentURL().
//       A local Parsed is substituted in one case -- presumably when the
//       caller passes NULL, as other functions in this file do; TODO confirm.
//   scheme: receives the scheme string returned by SegmentURL(). Other
//       functions in this file pass NULL, so the write is presumably guarded;
//       TODO confirm.
//   canonicalized_url: when non-NULL, receives
//       URLFixerUpper::FixupURL(text, desired_tld).
//
// Order of the visible checks:
//   1. All-whitespace input returns INVALID.
//   2. Input whose first non-whitespace character is '?' is special-cased.
//   3. The text is segmented and (optionally) fixed up into a URL.
//   4. "file" and "filesystem" schemes are examined.
//   5. For an explicit scheme other than http/https: internally handled
//      protocols, view-source/javascript/data, and the external protocol
//      handler's block state are consulted. For an unknown scheme the input
//      is re-parsed with "http://" prepended to detect the
//      "username:password@host" form; on a match the parsed components are
//      shifted back by the prefix length.
//   6. Host checks: empty host; registry length (retrying with |desired_tld|
//      appended, returning URL if that yields a registry and QUERY if not);
//      canonicalized-host compliance (returning UNKNOWN or QUERY).
//   7. Remaining heuristics: invalid port, explicit scheme, IPv6 host,
//      four-component IPv4 host, password, trailing slash or backslash in the
//      path, more than one non-host component, known TLD / "localhost" / port
//      (UNKNOWN when a username is present, otherwise URL), and a non-empty
//      |desired_tld|.
// NOTE(review): many of the statements that these checks guard (mostly the
// return values) are missing from this extract; the list above describes the
// order of the tests, not every outcome.
125 AutocompleteInput::Type
AutocompleteInput::Parse(
126 const base::string16
& text
,
127 const base::string16
& desired_tld
,
128 url_parse::Parsed
* parts
,
129 base::string16
* scheme
,
130 GURL
* canonicalized_url
) {
131 size_t first_non_white
= text
.find_first_not_of(base::kWhitespaceUTF16
, 0);
132 if (first_non_white
== base::string16::npos
)
133 return INVALID
; // All whitespace.
135 if (text
.at(first_non_white
) == L
'?') {
136 // If the first non-whitespace character is a '?', we magically treat this
141 // Ask our parsing back-end to help us understand what the user typed. We
142 // use the URLFixerUpper here because we want to be smart about what we
143 // consider a scheme. For example, we shouldn't consider www.google.com:80
145 url_parse::Parsed local_parts
;
147 parts
= &local_parts
;
148 const base::string16
parsed_scheme(URLFixerUpper::SegmentURL(text
, parts
));
150 *scheme
= parsed_scheme
;
151 if (canonicalized_url
) {
152 *canonicalized_url
= URLFixerUpper::FixupURL(
153 base::UTF16ToUTF8(text
), base::UTF16ToUTF8(desired_tld
));
156 if (LowerCaseEqualsASCII(parsed_scheme
, content::kFileScheme
)) {
157 // A user might or might not type a scheme when entering a file URL. In
158 // either case, |parsed_scheme| will tell us that this is a file URL, but
159 // |parts->scheme| might be empty, e.g. if the user typed "C:\foo".
163 if (LowerCaseEqualsASCII(parsed_scheme
, content::kFileSystemScheme
)) {
164 // This could theoretically be a strange search, but let's check.
165 // If it's got an inner_url with a scheme, it's a URL, whether it's valid or
167 if (parts
->inner_parsed() && parts
->inner_parsed()->scheme
.is_valid())
171 // If the user typed a scheme, and it's HTTP or HTTPS, we know how to parse it
172 // well enough that we can fall through to the heuristics below. If it's
173 // something else, we can just determine our action based on what we do with
174 // any input of this scheme. In theory we could do better with some schemes
175 // (e.g. "ftp" or "view-source") but I'll wait to spend the effort on that
176 // until I run into some cases that really need it.
177 if (parts
->scheme
.is_nonempty() &&
178 !LowerCaseEqualsASCII(parsed_scheme
, content::kHttpScheme
) &&
179 !LowerCaseEqualsASCII(parsed_scheme
, content::kHttpsScheme
)) {
180 // See if we know how to handle the URL internally.
181 if (ProfileIOData::IsHandledProtocol(UTF16ToASCII(parsed_scheme
)))
184 // There are also some schemes that we convert to other things before they
185 // reach the renderer or else the renderer handles internally without
186 // reaching the net::URLRequest logic. We thus won't catch these above, but
187 // we should still claim to handle them.
188 if (LowerCaseEqualsASCII(parsed_scheme
, content::kViewSourceScheme
) ||
189 LowerCaseEqualsASCII(parsed_scheme
, content::kJavaScriptScheme
) ||
190 LowerCaseEqualsASCII(parsed_scheme
, chrome::kDataScheme
))
193 // Finally, check and see if the user has explicitly opened this scheme as
194 // a URL before, or if the "scheme" is actually a username. We need to do
195 // this last because some schemes (e.g. "javascript") may be treated as
196 // "blocked" by the external protocol handler because we don't want pages to
197 // open them, but users still can.
198 // TODO(viettrungluu): get rid of conversion.
199 ExternalProtocolHandler::BlockState block_state
=
200 ExternalProtocolHandler::GetBlockState(
201 base::UTF16ToUTF8(parsed_scheme
));
202 switch (block_state
) {
203 case ExternalProtocolHandler::DONT_BLOCK
:
206 case ExternalProtocolHandler::BLOCK
:
207 // If we don't want the user to open the URL, don't let it be navigated
212 // We don't know about this scheme. It might be that the user typed a
213 // URL of the form "username:password@foo.com".
214 const base::string16 http_scheme_prefix
=
215 base::ASCIIToUTF16(std::string(content::kHttpScheme
) +
216 content::kStandardSchemeSeparator
);
217 url_parse::Parsed http_parts
;
218 base::string16 http_scheme
;
219 GURL http_canonicalized_url
;
220 Type http_type
= Parse(http_scheme_prefix
+ text
, desired_tld
,
221 &http_parts
, &http_scheme
,
222 &http_canonicalized_url
);
223 DCHECK_EQ(std::string(content::kHttpScheme
),
224 base::UTF16ToUTF8(http_scheme
));
226 if (http_type
== URL
&&
227 http_parts
.username
.is_nonempty() &&
228 http_parts
.password
.is_nonempty()) {
229 // Manually re-jigger the parsed parts to match |text| (without the
230 // http scheme added).
231 http_parts
.scheme
.reset();
232 url_parse::Component
* components
[] = {
233 &http_parts
.username
,
234 &http_parts
.password
,
241 for (size_t i
= 0; i
< arraysize(components
); ++i
) {
242 URLFixerUpper::OffsetComponent(
243 -static_cast<int>(http_scheme_prefix
.length()), components
[i
]);
249 if (canonicalized_url
)
250 *canonicalized_url
= http_canonicalized_url
;
255 // We don't know about this scheme and it doesn't look like the user
256 // typed a username and password. It's likely to be a search operator
257 // like "site:" or "link:". We classify it as UNKNOWN so the user has
258 // the option of treating it as a URL if we're wrong.
259 // Note that SegmentURL() is smart so we aren't tricked by "c:\foo" or
260 // "www.example.com:81" in this case.
266 // Either the user didn't type a scheme, in which case we need to distinguish
267 // between an HTTP URL and a query, or the scheme is HTTP or HTTPS, in which
268 // case we should reject invalid formulations.
270 // If we have an empty host it can't be a URL.
271 if (!parts
->host
.is_nonempty())
274 // Likewise, the RCDS can reject certain obviously-invalid hosts. (We also
275 // use the registry length later below.)
276 const base::string16
host(text
.substr(parts
->host
.begin
, parts
->host
.len
));
277 const size_t registry_length
=
278 net::registry_controlled_domains::GetRegistryLength(
279 base::UTF16ToUTF8(host
),
280 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES
,
281 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES
);
282 if (registry_length
== std::string::npos
) {
283 // Try to append the desired_tld.
284 if (!desired_tld
.empty()) {
285 base::string16
host_with_tld(host
);
286 if (host
[host
.length() - 1] != '.')
287 host_with_tld
+= '.';
288 host_with_tld
+= desired_tld
;
289 const size_t tld_length
=
290 net::registry_controlled_domains::GetRegistryLength(
291 base::UTF16ToUTF8(host_with_tld
),
292 net::registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES
,
293 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES
);
294 if (tld_length
!= std::string::npos
)
295 return URL
; // Something like "99999999999" that looks like a bad IP
296 // address, but becomes valid on attaching a TLD.
298 return QUERY
; // Could be a broken IP address, etc.
302 // See if the hostname is valid. While IE and GURL allow hostnames to contain
303 // many other characters (perhaps for weird intranet machines), it's extremely
304 // unlikely that a user would be trying to type those in for anything other
305 // than a search query.
306 url_canon::CanonHostInfo host_info
;
307 const std::string
canonicalized_host(net::CanonicalizeHost(
308 base::UTF16ToUTF8(host
), &host_info
));
309 if ((host_info
.family
== url_canon::CanonHostInfo::NEUTRAL
) &&
310 !net::IsCanonicalizedHostCompliant(canonicalized_host
,
311 base::UTF16ToUTF8(desired_tld
))) {
312 // Invalid hostname. There are several possible cases:
313 // * Our checker is too strict and the user pasted in a real-world URL
314 // that's "invalid" but resolves. To catch these, we return UNKNOWN when
315 // the user explicitly typed a scheme, so we'll still search by default
316 // but we'll show the accidental search infobar if necessary.
317 // * The user is typing a multi-word query. If we see a space anywhere in
318 // the hostname we assume this is a search and return QUERY.
319 // * Our checker is too strict and the user is typing a real-world hostname
320 // that's "invalid" but resolves. We return UNKNOWN if the TLD is known.
321 // Note that we explicitly excluded hosts with spaces above so that
322 // "toys at amazon.com" will be treated as a search.
323 // * The user is typing some garbage string. Return QUERY.
325 // Thus we fall down in the following cases:
326 // * Trying to navigate to a hostname with spaces
327 // * Trying to navigate to a hostname with invalid characters and an unknown
329 // These are rare, though probably possible in intranets.
330 return (parts
->scheme
.is_nonempty() ||
331 ((registry_length
!= 0) &&
332 (host
.find(' ') == base::string16::npos
))) ? UNKNOWN
: QUERY
;
335 // A port number is a good indicator that this is a URL. However, it might
336 // also be a query like "1.66:1" that looks kind of like an IP address and
337 // port number. So here we only check for "port numbers" that are illegal and
338 // thus mean this can't be navigated to (e.g. "1.2.3.4:garbage"), and we save
339 // handling legal port numbers until after the "IP address" determination
341 if (url_parse::ParsePort(text
.c_str(), parts
->port
) ==
342 url_parse::PORT_INVALID
)
345 // Now that we've ruled out all schemes other than http or https and done a
346 // little more sanity checking, the presence of a scheme means this is likely
348 if (parts
->scheme
.is_nonempty())
351 // See if the host is an IP address.
352 if (host_info
.family
== url_canon::CanonHostInfo::IPV6
)
354 // If the user originally typed a host that looks like an IP address (a
355 // dotted quad), they probably want to open it. If the original input was
356 // something else (like a single number), they probably wanted to search for
357 // it, unless they explicitly typed a scheme. This is true even if the URL
358 // appears to have a path: "1.2/45" is more likely a search (for the answer
359 // to a math problem) than a URL. However, if there are more non-host
360 // components, then maybe this really was intended to be a navigation. For
361 // this reason we only check the dotted-quad case here, and save the "other
362 // IP addresses" case for after we check the number of non-host components
364 if ((host_info
.family
== url_canon::CanonHostInfo::IPV4
) &&
365 (host_info
.num_ipv4_components
== 4))
368 // Presence of a password means this is likely a URL. Note that unless the
369 // user has typed an explicit "http://" or similar, we'll probably think that
370 // the username is some unknown scheme, and bail out in the scheme-handling
372 if (parts
->password
.is_nonempty())
375 // Trailing slashes force the input to be treated as a URL.
376 if (parts
->path
.is_nonempty()) {
377 char c
= text
[parts
->path
.end() - 1];
378 if ((c
== '\\') || (c
== '/'))
382 // If there is more than one recognized non-host component, this is likely to
383 // be a URL, even if the TLD is unknown (in which case this is likely an
385 if (NumNonHostComponents(*parts
) > 1)
388 // If the host has a known TLD or a port, it's probably a URL, with the
389 // following exceptions:
390 // * Any "IP addresses" that make it here are more likely searches
392 // * If we reach here with a username, our input looks like "user@host[.tld]".
393 // Because there is no scheme explicitly specified, we think this is more
394 // likely an email address than an HTTP auth attempt. Hence, we search by
395 // default and let users correct us on a case-by-case basis.
396 // Note that we special-case "localhost" as a known hostname.
397 if ((host_info
.family
!= url_canon::CanonHostInfo::IPV4
) &&
398 ((registry_length
!= 0) || (host
== base::ASCIIToUTF16("localhost") ||
399 parts
->port
.is_nonempty())))
400 return parts
->username
.is_nonempty() ? UNKNOWN
: URL
;
402 // If we reach this point, we know there's no known TLD on the input, so if
403 // the user wishes to add a desired_tld, the fixup code will oblige; thus this
405 if (!desired_tld
.empty())
408 // No scheme, password, port, path, and no known TLD on the host.
410 // * An "incomplete IP address"; likely a search (see above).
411 // * An email-like input like "user@host", where "host" has no known TLD.
412 // It's not clear what the user means here and searching seems reasonable.
413 // * A single word "foo"; possibly an intranet site, but more likely a search.
414 // This is ideally an UNKNOWN, and we can let the Alternate Nav URL code
415 // catch our mistakes.
416 // * A URL with a valid TLD we don't know about yet. If e.g. a registrar adds
417 // "xxx" as a TLD, then until we add it to our data file, Chrome won't know
418 // "foo.xxx" is a real URL. So ideally this is a URL, but we can't really
419 // distinguish this case from:
420 // * A "URL-like" string that's not really a URL (like
421 // "browser.tabs.closeButtons" or "java.awt.event.*"). This is ideally a
422 // QUERY. Since this is indistinguishable from the case above, and this
423 // case is much more likely, claim these are UNKNOWN, which should default
424 // to the right thing and let users correct us on a case-by-case basis.
// Reports which spans of |text| hold the scheme and the host (as the name
// suggests, for callers that emphasize those parts).
//   text: the string to analyse.
//   scheme: out; set from the parsed scheme component.
//   host: out; set in the view-source and filesystem branches below.
// Steps visible here:
//   * |text| is run through Parse() with an empty desired TLD, and *scheme is
//     assigned the resulting scheme component.
//   * If the scheme is view-source and there is text after "<scheme>:", that
//     remainder is parsed on its own. When it has a non-empty scheme and/or
//     host, *scheme and *host are set to those components, shifted by the
//     offset of the remainder so they index into |text|.
//   * Otherwise, for a filesystem scheme whose inner URL has a valid scheme,
//     *host is set to the inner URL's host component.
// NOTE(review): several lines of this function (including what is done with
// *host in the common case) are missing from this extract.
429 void AutocompleteInput::ParseForEmphasizeComponents(
430 const base::string16
& text
,
431 url_parse::Component
* scheme
,
432 url_parse::Component
* host
) {
433 url_parse::Parsed parts
;
434 base::string16 scheme_str
;
435 Parse(text
, base::string16(), &parts
, &scheme_str
, NULL
);
437 *scheme
= parts
.scheme
;
440 int after_scheme_and_colon
= parts
.scheme
.end() + 1;
441 // For the view-source scheme, we should emphasize the scheme and host of the
442 // URL qualified by the view-source prefix.
443 if (LowerCaseEqualsASCII(scheme_str
, content::kViewSourceScheme
) &&
444 (static_cast<int>(text
.length()) > after_scheme_and_colon
)) {
445 // Obtain the URL prefixed by view-source and parse it.
446 base::string16
real_url(text
.substr(after_scheme_and_colon
));
447 url_parse::Parsed real_parts
;
448 AutocompleteInput::Parse(real_url
, base::string16(), &real_parts
, NULL
, NULL
);
449 if (real_parts
.scheme
.is_nonempty() || real_parts
.host
.is_nonempty()) {
450 if (real_parts
.scheme
.is_nonempty()) {
451 *scheme
= url_parse::Component(
452 after_scheme_and_colon
+ real_parts
.scheme
.begin
,
453 real_parts
.scheme
.len
);
457 if (real_parts
.host
.is_nonempty()) {
458 *host
= url_parse::Component(
459 after_scheme_and_colon
+ real_parts
.host
.begin
,
460 real_parts
.host
.len
);
465 } else if (LowerCaseEqualsASCII(scheme_str
, content::kFileSystemScheme
) &&
466 parts
.inner_parsed() && parts
.inner_parsed()->scheme
.is_valid()) {
467 *host
= parts
.inner_parsed()->host
;
// Returns either |formatted_url| or |formatted_url| with a '/' appended.
//   url: the URL that |formatted_url| was produced from (its parameter
//       declaration is not visible in this extract).
//   formatted_url: the display string to return or extend.
// If net::CanStripTrailingSlash(url) is false, |formatted_url| is returned
// unchanged. Otherwise Parse() is run on both |formatted_url| and the
// slash-terminated variant, and one of the two strings is chosen from those
// results.
// NOTE(review): the lines holding the comparison between the two Parse()
// calls are missing from this extract, so the exact selection rule cannot be
// stated here.
472 base::string16
AutocompleteInput::FormattedStringWithEquivalentMeaning(
474 const base::string16
& formatted_url
) {
475 if (!net::CanStripTrailingSlash(url
))
476 return formatted_url
;
477 const base::string16
url_with_path(formatted_url
+ base::char16('/'));
478 return (AutocompleteInput::Parse(formatted_url
, base::string16(), NULL
, NULL
,
480 AutocompleteInput::Parse(url_with_path
, base::string16(), NULL
, NULL
,
482 formatted_url
: url_with_path
;
486 int AutocompleteInput::NumNonHostComponents(const url_parse::Parsed
& parts
) {
487 int num_nonhost_components
= 0;
488 if (parts
.scheme
.is_nonempty())
489 ++num_nonhost_components
;
490 if (parts
.username
.is_nonempty())
491 ++num_nonhost_components
;
492 if (parts
.password
.is_nonempty())
493 ++num_nonhost_components
;
494 if (parts
.port
.is_nonempty())
495 ++num_nonhost_components
;
496 if (parts
.path
.is_nonempty())
497 ++num_nonhost_components
;
498 if (parts
.query
.is_nonempty())
499 ++num_nonhost_components
;
500 if (parts
.ref
.is_nonempty())
501 ++num_nonhost_components
;
502 return num_nonhost_components
;
506 bool AutocompleteInput::HasHTTPScheme(const base::string16
& input
) {
507 std::string
utf8_input(base::UTF16ToUTF8(input
));
508 url_parse::Component scheme
;
509 if (url_util::FindAndCompareScheme(utf8_input
, content::kViewSourceScheme
,
511 utf8_input
.erase(0, scheme
.end() + 1);
512 return url_util::FindAndCompareScheme(utf8_input
, content::kHttpScheme
, NULL
);
// Replaces the input's text-related state.
//   text: the new text.
//   cursor_position: the new cursor offset; must be at most text.length() or
//       equal to base::string16::npos (enforced by the DCHECK below).
//   parts: the parsed components that go with |text|.
// The visible statement stores |cursor_position| in cursor_position_.
// NOTE(review): the statements that consume |text| and |parts| are missing
// from this extract -- confirm against the full file.
515 void AutocompleteInput::UpdateText(const base::string16
& text
,
516 size_t cursor_position
,
517 const url_parse::Parsed
& parts
) {
518 DCHECK(cursor_position
<= text
.length() ||
519 cursor_position
== base::string16::npos
)
520 << "Text: '" << text
<< "', cp: " << cursor_position
;
522 cursor_position_
= cursor_position
;
// Resets this input's members. The visible statements set:
//   cursor_position_              -> base::string16::npos
//   current_url_                  -> GURL()
//   current_page_classification_  -> AutocompleteInput::INVALID_SPEC
//   parts_                        -> url_parse::Parsed()
//   canonicalized_url_            -> GURL()
//   prevent_inline_autocomplete_  -> false
//   prefer_keyword_               -> false
//   allow_exact_keyword_match_    -> false
//   matches_requested_            -> ALL_MATCHES
// NOTE(review): allow_exact_keyword_match_ is reset to false here, whereas
// the default constructor initializes it to true -- confirm the difference is
// intentional.
// NOTE(review): a few statements of this function are missing from this
// extract, so other members may be reset as well.
526 void AutocompleteInput::Clear() {
528 cursor_position_
= base::string16::npos
;
529 current_url_
= GURL();
530 current_page_classification_
= AutocompleteInput::INVALID_SPEC
;
532 parts_
= url_parse::Parsed();
534 canonicalized_url_
= GURL();
535 prevent_inline_autocomplete_
= false;
536 prefer_keyword_
= false;
537 allow_exact_keyword_match_
= false;
538 matches_requested_
= ALL_MATCHES
;