// Copyright 2007, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Functions to canonicalize "standard" URLs, which are ones that have an
// authority section including a host name.

#include <string.h>

#include "googleurl/src/url_canon.h"
#include "googleurl/src/url_canon_internal.h"

40 template<typename CHAR
, typename UCHAR
>
41 bool DoCanonicalizeStandardURL(const URLComponentSource
<CHAR
>& source
,
42 const url_parse::Parsed
& parsed
,
43 CharsetConverter
* query_converter
,
45 url_parse::Parsed
* new_parsed
) {
46 // Scheme: this will append the colon.
47 bool success
= CanonicalizeScheme(source
.scheme
, parsed
.scheme
,
48 output
, &new_parsed
->scheme
);
50 // Authority (username, password, host, port)
52 if (parsed
.username
.is_valid() || parsed
.password
.is_valid() ||
53 parsed
.host
.is_nonempty() || parsed
.port
.is_valid()) {
54 have_authority
= true;
56 // Only write the authority separators when we have a scheme.
57 if (parsed
.scheme
.is_valid()) {
58 output
->push_back('/');
59 output
->push_back('/');
62 // User info: the canonicalizer will handle the : and @.
63 success
&= CanonicalizeUserInfo(source
.username
, parsed
.username
,
64 source
.password
, parsed
.password
,
66 &new_parsed
->username
,
67 &new_parsed
->password
);
69 success
&= CanonicalizeHost(source
.host
, parsed
.host
,
70 output
, &new_parsed
->host
);
72 // Host must not be empty for standard URLs.
73 if (!parsed
.host
.is_nonempty())
76 // Port: the port canonicalizer will handle the colon.
77 int default_port
= DefaultPortForScheme(
78 &output
->data()[new_parsed
->scheme
.begin
], new_parsed
->scheme
.len
);
79 success
&= CanonicalizePort(source
.port
, parsed
.port
, default_port
,
80 output
, &new_parsed
->port
);
82 // No authority, clear the components.
83 have_authority
= false;
84 new_parsed
->host
.reset();
85 new_parsed
->username
.reset();
86 new_parsed
->password
.reset();
87 new_parsed
->port
.reset();
88 success
= false; // Standard URLs must have an authority.
92 if (parsed
.path
.is_valid()) {
93 success
&= CanonicalizePath(source
.path
, parsed
.path
,
94 output
, &new_parsed
->path
);
95 } else if (have_authority
||
96 parsed
.query
.is_valid() || parsed
.ref
.is_valid()) {
97 // When we have an empty path, make up a path when we have an authority
98 // or something following the path. The only time we allow an empty
99 // output path is when there is nothing else.
100 new_parsed
->path
= url_parse::Component(output
->length(), 1);
101 output
->push_back('/');
104 new_parsed
->path
.reset();
108 CanonicalizeQuery(source
.query
, parsed
.query
, query_converter
,
109 output
, &new_parsed
->query
);
111 // Ref: ignore failure for this, since the page can probably still be loaded.
112 CanonicalizeRef(source
.ref
, parsed
.ref
, output
, &new_parsed
->ref
);
120 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
121 // if the scheme is unknown.
122 int DefaultPortForScheme(const char* scheme
, int scheme_len
) {
123 int default_port
= url_parse::PORT_UNSPECIFIED
;
124 switch (scheme_len
) {
126 if (!strncmp(scheme
, "http", scheme_len
))
130 if (!strncmp(scheme
, "https", scheme_len
))
134 if (!strncmp(scheme
, "ftp", scheme_len
))
136 else if (!strncmp(scheme
, "wss", scheme_len
))
140 if (!strncmp(scheme
, "gopher", scheme_len
))
144 if (!strncmp(scheme
, "ws", scheme_len
))
151 bool CanonicalizeStandardURL(const char* spec
,
153 const url_parse::Parsed
& parsed
,
154 CharsetConverter
* query_converter
,
156 url_parse::Parsed
* new_parsed
) {
157 return DoCanonicalizeStandardURL
<char, unsigned char>(
158 URLComponentSource
<char>(spec
), parsed
, query_converter
,
162 bool CanonicalizeStandardURL(const char16
* spec
,
164 const url_parse::Parsed
& parsed
,
165 CharsetConverter
* query_converter
,
167 url_parse::Parsed
* new_parsed
) {
168 return DoCanonicalizeStandardURL
<char16
, char16
>(
169 URLComponentSource
<char16
>(spec
), parsed
, query_converter
,
173 bool ReplaceStandardURL(const char* base
,
174 const url_parse::Parsed
& base_parsed
,
175 const Replacements
<char>& replacements
,
176 CharsetConverter
* query_converter
,
178 url_parse::Parsed
* new_parsed
) {
179 URLComponentSource
<char> source(base
);
180 url_parse::Parsed
parsed(base_parsed
);
181 SetupOverrideComponents(base
, replacements
, &source
, &parsed
);
182 return DoCanonicalizeStandardURL
<char, unsigned char>(
183 source
, parsed
, query_converter
, output
, new_parsed
);
186 // For 16-bit replacements, we turn all the replacements into UTF-8 so the
187 // regular codepath can be used.
188 bool ReplaceStandardURL(const char* base
,
189 const url_parse::Parsed
& base_parsed
,
190 const Replacements
<char16
>& replacements
,
191 CharsetConverter
* query_converter
,
193 url_parse::Parsed
* new_parsed
) {
194 RawCanonOutput
<1024> utf8
;
195 URLComponentSource
<char> source(base
);
196 url_parse::Parsed
parsed(base_parsed
);
197 SetupUTF16OverrideComponents(base
, replacements
, &utf8
, &source
, &parsed
);
198 return DoCanonicalizeStandardURL
<char, unsigned char>(
199 source
, parsed
, query_converter
, output
, new_parsed
);
202 } // namespace url_canon