1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // url_formatter contains routines for formatting URLs in a way that can be
6 // safely and securely displayed to users. For example, it is responsible
7 // for determining when to convert an IDN A-Label (e.g. "xn--[something]")
8 // into the IDN U-Label.
10 // Note that this formatting is only intended for display purposes; it would
11 // be insecure and insufficient to make comparisons solely on formatted URLs
12 // (that is, it should not be used for normalizing URLs for comparison for
13 // security decisions).
15 #ifndef COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_
16 #define COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_
23 #include "base/strings/string16.h"
24 #include "base/strings/utf_offset_string_conversions.h"
25 #include "net/base/escape.h"
33 namespace url_formatter
{
// Used by FormatUrl to specify handling of certain parts of the url.
// FormatUrlType is the type of each individual kFormatUrl* flag declared
// below; FormatUrlTypes is a bitmask combining such flags.
using FormatUrlType = uint32_t;
using FormatUrlTypes = uint32_t;
39 // Nothing is ommitted.
40 extern const FormatUrlType kFormatUrlOmitNothing
;
42 // If set, any username and password are removed.
43 extern const FormatUrlType kFormatUrlOmitUsernamePassword
;
45 // If the scheme is 'http://', it's removed.
46 extern const FormatUrlType kFormatUrlOmitHTTP
;
48 // Omits the path if it is just a slash and there is no query or ref. This is
49 // meaningful for non-file "standard" URLs.
50 extern const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname
;
52 // Convenience for omitting all unecessary types.
53 extern const FormatUrlType kFormatUrlOmitAll
;
55 // Creates a string representation of |url|. The IDN host name may be in Unicode
56 // if |languages| accepts the Unicode representation. |format_type| is a bitmask
57 // of FormatUrlTypes, see it for details. |unescape_rules| defines how to clean
58 // the URL for human readability. You will generally want |UnescapeRule::SPACES|
59 // for display to the user if you can handle spaces, or |UnescapeRule::NORMAL|
60 // if not. If the path part and the query part seem to be encoded in %-encoded
61 // UTF-8, decodes %-encoding and UTF-8.
63 // The last three parameters may be NULL.
65 // |new_parsed| will be set to the parsing parameters of the resultant URL.
67 // |prefix_end| will be the length before the hostname of the resultant URL.
69 // |offset[s]_for_adjustment| specifies one or more offsets into the original
70 // URL, representing insertion or selection points between characters: if the
71 // input is "http://foo.com/", offset 0 is before the entire URL, offset 7 is
72 // between the scheme and the host, and offset 15 is after the end of the URL.
73 // Valid input offsets range from 0 to the length of the input URL string. On
74 // exit, each offset will have been modified to reflect any changes made to the
75 // output string. For example, if |url| is "http://a:b@c.com/",
76 // |omit_username_password| is true, and an offset is 12 (pointing between 'c'
77 // and '.'), then on return the output string will be "http://c.com/" and the
78 // offset will be 8. If an offset cannot be successfully adjusted (e.g. because
79 // it points into the middle of a component that was entirely removed or into
80 // the middle of an encoding sequence), it will be set to base::string16::npos.
81 // For consistency, if an input offset points between the scheme and the
82 // username/password, and both are removed, on output this offset will be 0
83 // rather than npos; this means that offsets at the starts and ends of removed
84 // components are always transformed the same way regardless of what other
85 // components are adjacent.
86 base::string16
FormatUrl(const GURL
& url
,
87 const std::string
& languages
,
88 FormatUrlTypes format_types
,
89 net::UnescapeRule::Type unescape_rules
,
90 url::Parsed
* new_parsed
,
92 size_t* offset_for_adjustment
);
94 base::string16
FormatUrlWithOffsets(
96 const std::string
& languages
,
97 FormatUrlTypes format_types
,
98 net::UnescapeRule::Type unescape_rules
,
99 url::Parsed
* new_parsed
,
101 std::vector
<size_t>* offsets_for_adjustment
);
103 // This function is like those above except it takes |adjustments| rather
104 // than |offset[s]_for_adjustment|. |adjustments| will be set to reflect all
105 // the transformations that happened to |url| to convert it into the returned
107 base::string16
FormatUrlWithAdjustments(
109 const std::string
& languages
,
110 FormatUrlTypes format_types
,
111 net::UnescapeRule::Type unescape_rules
,
112 url::Parsed
* new_parsed
,
114 base::OffsetAdjuster::Adjustments
* adjustments
);
116 // This is a convenience function for FormatUrl() with
117 // format_types = kFormatUrlOmitAll and unescape = SPACES. This is the typical
118 // set of flags for "URLs to display to the user". You should be cautious about
119 // using this for URLs which will be parsed or sent to other applications.
120 inline base::string16
FormatUrl(const GURL
& url
, const std::string
& languages
) {
121 return FormatUrl(url
, languages
, kFormatUrlOmitAll
, net::UnescapeRule::SPACES
,
122 nullptr, nullptr, nullptr);
125 // Returns whether FormatUrl() would strip a trailing slash from |url|, given a
126 // format flag including kFormatUrlOmitTrailingSlashOnBareHostname.
127 bool CanStripTrailingSlash(const GURL
& url
);
129 // Formats the host in |url| and appends it to |output|. The host formatter
130 // takes the same accept languages component as ElideURL().
131 void AppendFormattedHost(const GURL
& url
,
132 const std::string
& languages
,
133 base::string16
* output
);
135 // Converts the given host name to unicode characters. This can be called for
136 // any host name, if the input is not IDN or is invalid in some way, we'll just
137 // return the ASCII source so it is still usable.
139 // The input should be the canonicalized ASCII host name from GURL. This
140 // function does NOT accept UTF-8!
142 // |languages| is a comma separated list of ISO 639 language codes. It
143 // is used to determine whether a hostname is 'comprehensible' to a user
144 // who understands languages listed. |host| will be converted to a
145 // human-readable form (Unicode) ONLY when each component of |host| is
146 // regarded as 'comprehensible'. Scipt-mixing is not allowed except that
147 // Latin letters in the ASCII range can be mixed with a limited set of
148 // script-language pairs (currently Han, Kana and Hangul for zh,ja and ko).
149 // When |languages| is empty, even that mixing is not allowed.
150 base::string16
IDNToUnicode(const std::string
& host
,
151 const std::string
& languages
);
155 #endif // COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_