Roll harfbuzz-ng to 1.0.2
[chromium-blink-merge.git] / components / url_formatter / url_formatter.h
blob01c8795ce0662e0edc465e1505e515923b26173b
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 // url_formatter contains routines for formatting URLs in a way that can be
6 // safely and securely displayed to users. For example, it is responsible
7 // for determining when to convert an IDN A-Label (e.g. "xn--[something]")
8 // into the IDN U-Label.
9 //
10 // Note that this formatting is only intended for display purposes; it would
11 // be insecure and insufficient to make comparisons solely on formatted URLs
12 // (that is, it should not be used for normalizing URLs for comparison for
13 // security decisions).
15 #ifndef COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_
16 #define COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_
18 #include <stdint.h>
20 #include <string>
21 #include <vector>
23 #include "base/strings/string16.h"
24 #include "base/strings/utf_offset_string_conversions.h"
25 #include "net/base/escape.h"
27 class GURL;
29 namespace url {
30 struct Parsed;
31 } // url
33 namespace url_formatter {
35 // Used by FormatUrl to specify handling of certain parts of the url.
36 typedef uint32_t FormatUrlType;  // Type of the kFormatUrl* constants below.
37 typedef uint32_t FormatUrlTypes;  // Bitmask passed to FormatUrl() as |format_types|.
39 // Nothing is omitted.
40 extern const FormatUrlType kFormatUrlOmitNothing;
42 // If set, any username and password are removed.
43 extern const FormatUrlType kFormatUrlOmitUsernamePassword;
45 // If set and the scheme is 'http://', the scheme is removed.
46 extern const FormatUrlType kFormatUrlOmitHTTP;
48 // Omits the path if it is just a slash and there is no query or ref. This is
49 // meaningful for non-file "standard" URLs.
50 extern const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname;
52 // Convenience for omitting all unnecessary types.
53 extern const FormatUrlType kFormatUrlOmitAll;
55 // Creates a string representation of |url|. The IDN host name may be in Unicode
56 // if |languages| accepts the Unicode representation. |format_types| is a bitmask
57 // of the kFormatUrl* flags above. |unescape_rules| defines how to clean
58 // the URL for human readability. You will generally want |UnescapeRule::SPACES|
59 // for display to the user if you can handle spaces, or |UnescapeRule::NORMAL|
60 // if not. If the path part and the query part seem to be encoded in %-encoded
61 // UTF-8, decodes %-encoding and UTF-8.
63 // The last three parameters may be NULL.
65 // |new_parsed| will be set to the parsing parameters of the resultant URL.
67 // |prefix_end| will be the length before the hostname of the resultant URL.
69 // |offset[s]_for_adjustment| specifies one or more offsets into the original
70 // URL, representing insertion or selection points between characters: if the
71 // input is "http://foo.com/", offset 0 is before the entire URL, offset 7 is
72 // between the scheme and the host, and offset 15 is after the end of the URL.
73 // Valid input offsets range from 0 to the length of the input URL string. On
74 // exit, each offset will have been modified to reflect any changes made to the
75 // output string. For example, if |url| is "http://a:b@c.com/",
76 // kFormatUrlOmitUsernamePassword is set, and an offset is 12 (between 'c'
77 // and '.'), then on return the output string will be "http://c.com/" and the
78 // offset will be 8. If an offset cannot be successfully adjusted (e.g. because
79 // it points into the middle of a component that was entirely removed or into
80 // the middle of an encoding sequence), it will be set to base::string16::npos.
81 // For consistency, if an input offset points between the scheme and the
82 // username/password, and both are removed, on output this offset will be 0
83 // rather than npos; this means that offsets at the starts and ends of removed
84 // components are always transformed the same way regardless of what other
85 // components are adjacent.
86 base::string16 FormatUrl(const GURL& url,
87 const std::string& languages,
88 FormatUrlTypes format_types,
89 net::UnescapeRule::Type unescape_rules,
90 url::Parsed* new_parsed,
91 size_t* prefix_end,
92 size_t* offset_for_adjustment);
94 base::string16 FormatUrlWithOffsets(
95 const GURL& url,
96 const std::string& languages,
97 FormatUrlTypes format_types,
98 net::UnescapeRule::Type unescape_rules,
99 url::Parsed* new_parsed,
100 size_t* prefix_end,
101 std::vector<size_t>* offsets_for_adjustment);
103 // This function is like FormatUrl() and FormatUrlWithOffsets() above except it
104 // takes |adjustments| rather than |offset[s]_for_adjustment|. |adjustments|
105 // will be set to reflect all the transformations that happened to |url| to
106 // convert it into the returned value.
107 base::string16 FormatUrlWithAdjustments(
108 const GURL& url,
109 const std::string& languages,
110 FormatUrlTypes format_types,
111 net::UnescapeRule::Type unescape_rules,
112 url::Parsed* new_parsed,
113 size_t* prefix_end,
114 base::OffsetAdjuster::Adjustments* adjustments);
116 // Convenience overload of FormatUrl() with format_types = kFormatUrlOmitAll,
117 // unescape_rules = SPACES and no out-parameters. This is the typical set of
118 // flags for "URLs to display to the user". You should be cautious about
119 // using this for URLs which will be parsed or sent to other applications.
120 inline base::string16 FormatUrl(const GURL& url, const std::string& languages) {
121 return FormatUrl(url, languages, kFormatUrlOmitAll, net::UnescapeRule::SPACES,
122 nullptr, nullptr, nullptr);
123 }
125 // Returns whether FormatUrl() would strip a trailing slash from |url|, given
126 // |format_types| that include kFormatUrlOmitTrailingSlashOnBareHostname.
127 bool CanStripTrailingSlash(const GURL& url);
129 // Formats the host in |url| and appends it to |output|. The host formatter
130 // takes the same accept languages component (|languages|) as ElideURL().
131 void AppendFormattedHost(const GURL& url,
132 const std::string& languages,
133 base::string16* output);
135 // Converts the given host name to Unicode characters. This can be called for
136 // any host name; if the input is not IDN or is invalid in some way, we'll just
137 // return the ASCII source so it is still usable.
139 // The input should be the canonicalized ASCII host name from GURL. This
140 // function does NOT accept UTF-8!
142 // |languages| is a comma separated list of ISO 639 language codes. It
143 // is used to determine whether a hostname is 'comprehensible' to a user
144 // who understands the languages listed. |host| will be converted to a
145 // human-readable form (Unicode) ONLY when each component of |host| is
146 // regarded as 'comprehensible'. Script-mixing is not allowed except that
147 // Latin letters in the ASCII range can be mixed with a limited set of
148 // script-language pairs (currently Han, Kana and Hangul for zh, ja and ko).
149 // When |languages| is empty, even that mixing is not allowed.
150 base::string16 IDNToUnicode(const std::string& host,
151 const std::string& languages);
153 } // url_formatter
155 #endif // COMPONENTS_URL_FORMATTER_URL_FORMATTER_H_