1 // Copyright 2007, Google Inc.
2 // All rights reserved.
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 // Canonicalizer functions for working with and resolving relative URLs.
32 #include "base/logging.h"
33 #include "googleurl/src/url_canon.h"
34 #include "googleurl/src/url_canon_internal.h"
35 #include "googleurl/src/url_file.h"
36 #include "googleurl/src/url_parse_internal.h"
42 // Firefox does a case-sensitive compare (which is probably wrong--Mozilla bug
43 // 379034), whereas IE is case-insensitive.
45 // We choose to be more permissive like IE. We don't need to worry about
46 // unescaping or anything here: neither IE nor Firefox allow this. We also
47 // don't have to worry about invalid scheme characters since we are comparing
48 // against the canonical scheme of the base.
50 // The base URL should always be canonical, therefore is ASCII.
//
// The comparison first checks that both scheme components have the same
// length, then walks them character by character, passing each candidate
// character through CanonicalSchemeChar before comparing it with the
// corresponding character of |base|.
// NOTE(review): the |cmp| buffer indexed below and the return statements are
// not visible in this listing -- confirm against the complete file.
51 template<typename CHAR
>
52 bool AreSchemesEqual(const char* base
,
53 const url_parse::Component
& base_scheme
,
55 const url_parse::Component
& cmp_scheme
) {
// Schemes of different lengths are handled before the per-character loop.
56 if (base_scheme
.len
!= cmp_scheme
.len
)
58 for (int i
= 0; i
< base_scheme
.len
; i
++) {
59 // We assume the base is already canonical, so we don't have to
// canonicalize its characters; only the candidate's characters go through
// CanonicalSchemeChar.
61 if (CanonicalSchemeChar(cmp
[cmp_scheme
.begin
+ i
]) !=
62 base
[base_scheme
.begin
+ i
])
70 // Here, we also allow Windows paths to be represented as "/C:/" so we can be
71 // consistent about URL paths beginning with slashes. This function is like
72 // DoesBeginWindowsDriveSpec except that it also requires a slash at the
// beginning: the character at |start_offset| must be a URL slash and the
// drive spec must start at |start_offset| + 1.
74 template<typename CHAR
>
75 bool DoesBeginSlashWindowsDriveSpec(const CHAR
* spec
, int start_offset
,
// Bounds check before |spec| is indexed at |start_offset|.
77 if (start_offset
>= spec_len
)
79 return url_parse::IsURLSlash(spec
[start_offset
]) &&
80 url_parse::DoesBeginWindowsDriveSpec(spec
, start_offset
+ 1, spec_len
);
85 // See IsRelativeURL in the header file for usage.
//
// Visible flow: default |*is_relative| to false, trim the input, treat an
// empty result as an empty relative component, check for Windows drive / UNC
// prefixes, then extract and validate a scheme and compare it with the
// base's scheme. Where the input is classified as relative,
// |*relative_component| is set to the range that should be resolved.
// NOTE(review): several parameters and all return statements are not visible
// in this listing -- confirm details against the complete file.
86 template<typename CHAR
>
87 bool DoIsRelativeURL(const char* base
,
88 const url_parse::Parsed
& base_parsed
,
91 bool is_base_hierarchical
,
93 url_parse::Component
* relative_component
) {
94 *is_relative
= false; // So we can default later to not relative.
96 // Trim whitespace and construct a new range for the substring.
98 url_parse::TrimURL(url
, &begin
, &url_len
);
99 if (begin
>= url_len
) {
100 // Empty URLs are relative, but do nothing.
101 *relative_component
= url_parse::Component(begin
, 0);
107 // We special case paths like "C:\foo" so they can link directly to the
108 // file on Windows (IE compatibility). The security domain stuff should
109 // prevent a link like this from actually being followed if it's on a
112 // We treat "C:/foo" as an absolute URL. We can go ahead and treat "/c:/"
113 // as relative, as this will just replace the path when the base scheme
114 // is a file and the answer will still be correct.
116 // We require strict backslashes when detecting UNC since two forward
117 // slashes should be treated as a relative URL with a hostname.
118 if (url_parse::DoesBeginWindowsDriveSpec(url
, begin
, url_len
) ||
119 url_parse::DoesBeginUNCPath(url
, begin
, url_len
, true))
123 // See if we've got a scheme, if not, we know this is a relative URL.
124 // BUT: Just because we have a scheme, doesn't make it absolute.
125 // "http:foo.html" is a relative URL with path "foo.html". If the scheme is
126 // empty, we treat it as relative (":foo") like IE does.
127 url_parse::Component scheme
;
128 if (!url_parse::ExtractScheme(url
, url_len
, &scheme
) || scheme
.len
== 0) {
129 // Don't allow relative URLs if the base scheme doesn't support it.
130 if (!is_base_hierarchical
)
// No usable scheme: the whole trimmed input is the relative component.
133 *relative_component
= url_parse::MakeRange(begin
, url_len
);
138 // If the scheme isn't valid, then it's relative.
139 int scheme_end
= scheme
.end();
140 for (int i
= scheme
.begin
; i
< scheme_end
; i
++) {
141 if (!CanonicalSchemeChar(url
[i
])) {
142 *relative_component
= url_parse::MakeRange(begin
, url_len
);
148 // If the scheme is not the same, then we can't count it as relative.
149 if (!AreSchemesEqual(base
, base_parsed
.scheme
, url
, scheme
))
152 // When the scheme that they both share is not hierarchical, treat the
153 // incoming scheme as absolute (this way with the base of "data:foo",
154 // "data:bar" will be reported as absolute).
155 if (!is_base_hierarchical
)
158 // ExtractScheme guarantees that the colon immediately follows what it
159 // considers to be the scheme. CountConsecutiveSlashes will handle the
160 // case where the begin offset is the end of the input.
161 int colon_offset
= scheme
.end();
162 int num_slashes
= url_parse::CountConsecutiveSlashes(url
, colon_offset
+ 1,
165 if (num_slashes
== 0 || num_slashes
== 1) {
166 // No slashes means it's a relative path like "http:foo.html". One slash
167 // is an absolute path. "http:/home/foo.html"
// Either way the relative part starts right after the colon.
169 *relative_component
= url_parse::MakeRange(colon_offset
+ 1, url_len
);
173 // Two or more slashes after the scheme we treat as absolute.
177 // Copies all characters in the range [begin, end) of |spec| to the output,
178 // up until and including the last slash. There should be a slash in the
179 // range, if not, nothing will be copied.
181 // The input is assumed to be canonical, so we search only for exact slashes
182 // and not backslashes as well. We also know that it's ASCII.
183 void CopyToLastSlash(const char* spec
,
186 CanonOutput
* output
) {
187 // Find the last slash.
// The scan runs backwards from |end| - 1, so the first '/' encountered is the
// last one in the range.
189 for (int i
= end
- 1; i
>= begin
; i
--) {
190 if (spec
[i
] == '/') {
// Copy everything from |begin| up to and including |last_slash|.
199 for (int i
= begin
; i
<= last_slash
; i
++)
200 output
->push_back(spec
[i
]);
203 // Copies a single component from the source to the output. This is used
204 // when resolving relative URLs and a given component is unchanged. Since the
205 // source should already be canonical, we don't have to do anything special,
206 // and the input is ASCII.
//
// |*output_component| receives the range the copied characters occupy in
// |output|; for a component with a negative length it is set to a
// default-constructed Component instead.
207 void CopyOneComponent(const char* source
,
208 const url_parse::Component
& source_component
,
210 url_parse::Component
* output_component
) {
211 if (source_component
.len
< 0) {
212 // This component is not present.
213 *output_component
= url_parse::Component();
// Record where the component starts in the output, copy its characters
// verbatim, then derive the length from how much the output grew.
217 output_component
->begin
= output
->length();
218 int source_end
= source_component
.end();
219 for (int i
= source_component
.begin
; i
< source_end
; i
++)
220 output
->push_back(source
[i
]);
221 output_component
->len
= output
->length() - output_component
->begin
;
226 // Called on Windows when the base URL is a file URL, this will copy the "C:"
227 // to the output, if there is a drive letter and if that drive letter is not
228 // being overridden by the relative URL. Otherwise, do nothing.
230 // It will return the index of the beginning of the next character in the
231 // base to be processed: if there is a "C:", the slash after it, or if
232 // there is no drive letter, the slash at the beginning of the path, or
233 // the end of the base. This can be used as the starting offset for further
// NOTE(review): some parameters (the base path bounds and |path_start|) are
// used below but not visible in this listing -- confirm against the complete
// file.
235 template<typename CHAR
>
236 int CopyBaseDriveSpecIfNecessary(const char* base_url
,
239 const CHAR
* relative_url
,
241 int relative_url_len
,
242 CanonOutput
* output
) {
243 if (base_path_begin
>= base_path_end
)
244 return base_path_begin
; // No path.
246 // If the relative begins with a drive spec, don't do anything. The existing
247 // drive spec in the base will be replaced.
248 if (url_parse::DoesBeginWindowsDriveSpec(relative_url
,
249 path_start
, relative_url_len
)) {
250 return base_path_begin
; // Relative URL path is "C:/foo"
253 // The path should begin with a slash (as all canonical paths do). We check
254 // if it is followed by a drive letter and copy it.
255 if (DoesBeginSlashWindowsDriveSpec(base_url
,
258 // Copy the two-character drive spec to the output. It will now look like
259 // "file:///C:" so the rest of it can be treated like a standard path.
// Three characters are emitted in total (the slash plus the two drive-spec
// characters), which is why the returned offset advances by three.
260 output
->push_back('/');
261 output
->push_back(base_url
[base_path_begin
+ 1]);
262 output
->push_back(base_url
[base_path_begin
+ 2]);
263 return base_path_begin
+ 3;
// Nothing was copied: processing resumes at the start of the base path.
266 return base_path_begin
;
271 // A subroutine of DoResolveRelativeURL, this resolves the URL knowing that
272 // the input is a relative path or less (query or ref).
//
// Visible flow: parse the relative input into path / query / ref, copy the
// base up to the start of its path, then take one of three routes depending
// on which relative components are present -- a new path (with the relative
// query and ref), only a new query (base path kept), or only a new ref (base
// path and query kept).
// NOTE(review): some parameters, branch conditions and return statements are
// not visible in this listing -- confirm against the complete file.
273 template<typename CHAR
>
274 bool DoResolveRelativePath(const char* base_url
,
275 const url_parse::Parsed
& base_parsed
,
277 const CHAR
* relative_url
,
278 const url_parse::Component
& relative_component
,
279 CharsetConverter
* query_converter
,
281 url_parse::Parsed
* out_parsed
) {
284 // We know the authority section didn't change, copy it to the output. We
285 // also know we have a path so can copy up to there.
286 url_parse::Component path
, query
, ref
;
287 url_parse::ParsePathInternal(relative_url
,
292 // Canonical URLs always have a path, so we can use that offset.
293 output
->Append(base_url
, base_parsed
.path
.begin
);
296 // The path is replaced or modified.
297 int true_path_begin
= output
->length();
299 // For file: URLs on Windows, we don't want to treat the drive letter and
300 // colon as part of the path for relative file resolution when the
301 // incoming URL does not provide a drive spec. We save the true path
302 // beginning so we can fix it up after we are done.
303 int base_path_begin
= base_parsed
.path
.begin
;
306 base_path_begin
= CopyBaseDriveSpecIfNecessary(
307 base_url
, base_parsed
.path
.begin
, base_parsed
.path
.end(),
308 relative_url
, relative_component
.begin
, relative_component
.end(),
310 // Now the output looks like either "file://" or "file:///C:"
311 // and we can start appending the rest of the path. |base_path_begin|
312 // points to the character in the base that comes next.
316 if (url_parse::IsURLSlash(relative_url
[path
.begin
])) {
317 // Easy case: the path is an absolute path on the server, so we can
318 // just replace everything from the path on with the new versions.
319 // Since the input should be canonical hierarchical URL, we should
320 // always have a path.
321 success
&= CanonicalizePath(relative_url
, path
,
322 output
, &out_parsed
->path
);
324 // Relative path, replace the query, and reference. We take the
325 // original path with the file part stripped, and append the new path.
326 // The canonicalizer will take care of resolving ".." and "."
327 int path_begin
= output
->length();
328 CopyToLastSlash(base_url
, base_path_begin
, base_parsed
.path
.end(),
330 success
&= CanonicalizePartialPath(relative_url
, path
, path_begin
,
// The output path spans from where the base directory was copied to the
// current end of the output.
332 out_parsed
->path
= url_parse::MakeRange(path_begin
, output
->length());
334 // Copy the rest of the stuff after the path from the relative path.
337 // Finish with the query and reference part (these can't fail).
338 CanonicalizeQuery(relative_url
, query
, query_converter
,
339 output
, &out_parsed
->query
);
340 CanonicalizeRef(relative_url
, ref
, output
, &out_parsed
->ref
);
342 // Fix the path beginning to add back the "C:" we may have written above.
343 out_parsed
->path
= url_parse::MakeRange(true_path_begin
,
344 out_parsed
->path
.end());
348 // If we get here, the path is unchanged: copy to output.
349 CopyOneComponent(base_url
, base_parsed
.path
, output
, &out_parsed
->path
);
351 if (query
.is_valid()) {
352 // Just the query specified, replace the query and reference (ignore
353 // failures for refs)
354 CanonicalizeQuery(relative_url
, query
, query_converter
,
355 output
, &out_parsed
->query
);
356 CanonicalizeRef(relative_url
, ref
, output
, &out_parsed
->ref
);
360 // If we get here, the query is unchanged: copy to output. Note that the
361 // range of the query parameter doesn't include the question mark, so we
362 // have to add it manually if there is a component.
363 if (base_parsed
.query
.is_valid())
364 output
->push_back('?');
365 CopyOneComponent(base_url
, base_parsed
.query
, output
, &out_parsed
->query
);
367 if (ref
.is_valid()) {
368 // Just the reference specified: replace it (ignoring failures).
369 CanonicalizeRef(relative_url
, ref
, output
, &out_parsed
->ref
);
373 // We should always have something to do in this function, the caller checks
374 // that some component is being replaced.
375 DCHECK(false) << "Not reached";
379 // Resolves a relative URL that contains a host. Typically, these will
380 // be of the form "//www.google.com/foo/bar?baz#ref" and the only thing which
381 // should be kept from the original URL is the scheme.
//
// Implemented by parsing everything after the scheme out of the relative
// input and handing all seven parsed components to ReplaceStandardURL as
// replacements for the base.
382 template<typename CHAR
>
383 bool DoResolveRelativeHost(const char* base_url
,
384 const url_parse::Parsed
& base_parsed
,
385 const CHAR
* relative_url
,
386 const url_parse::Component
& relative_component
,
387 CharsetConverter
* query_converter
,
389 url_parse::Parsed
* out_parsed
) {
390 // Parse the relative URL, just like we would for anything following a
392 url_parse::Parsed relative_parsed
; // Everything but the scheme is valid.
393 url_parse::ParseAfterScheme(&relative_url
[relative_component
.begin
],
394 relative_component
.len
, relative_component
.begin
,
397 // Now we can just use the replacement function to replace all the necessary
398 // parts of the old URL with the new one.
399 Replacements
<CHAR
> replacements
;
400 replacements
.SetUsername(relative_url
, relative_parsed
.username
);
401 replacements
.SetPassword(relative_url
, relative_parsed
.password
);
402 replacements
.SetHost(relative_url
, relative_parsed
.host
);
403 replacements
.SetPort(relative_url
, relative_parsed
.port
);
404 replacements
.SetPath(relative_url
, relative_parsed
.path
);
405 replacements
.SetQuery(relative_url
, relative_parsed
.query
);
406 replacements
.SetRef(relative_url
, relative_parsed
.ref
);
// The scheme is the only component not replaced, so it comes from the base.
408 return ReplaceStandardURL(base_url
, base_parsed
, replacements
,
409 query_converter
, output
, out_parsed
);
412 // Resolves a relative URL that happens to be an absolute file path. Examples
413 // include: "//hostname/path", "/c:/foo", and "//hostname/c:/foo".
//
// The base URL is not consulted here: the relative component alone is parsed
// and canonicalized as a file URL.
414 template<typename CHAR
>
415 bool DoResolveAbsoluteFile(const CHAR
* relative_url
,
416 const url_parse::Component
& relative_component
,
417 CharsetConverter
* query_converter
,
419 url_parse::Parsed
* out_parsed
) {
420 // Parse the file URL. The file URL parsing function uses the same logic
421 // as we do for determining if the file is absolute, in which case it will
422 // not bother to look for a scheme.
423 url_parse::Parsed relative_parsed
;
424 url_parse::ParseFileURL(&relative_url
[relative_component
.begin
],
425 relative_component
.len
, &relative_parsed
);
// Both calls receive a pointer to the start of the relative component plus
// its length, i.e. the same sub-buffer.
427 return CanonicalizeFileURL(&relative_url
[relative_component
.begin
],
428 relative_component
.len
, relative_parsed
,
429 query_converter
, output
, out_parsed
);
432 // TODO(brettw) treat two slashes as root like Mozilla for FTP?
//
// Dispatcher for relative resolution. Visible flow: start |*out_parsed| as a
// copy of the base, copy the base through when it has no path, handle an
// empty relative component by dropping the ref, then count the leading
// slashes of the relative input and route to DoResolveAbsoluteFile,
// DoResolveRelativeHost or DoResolveRelativePath.
// NOTE(review): some parameters, conditions and return statements are not
// visible in this listing -- confirm against the complete file.
433 template<typename CHAR
>
434 bool DoResolveRelativeURL(const char* base_url
,
435 const url_parse::Parsed
& base_parsed
,
437 const CHAR
* relative_url
,
438 const url_parse::Component
& relative_component
,
439 CharsetConverter
* query_converter
,
441 url_parse::Parsed
* out_parsed
) {
442 // Starting point for our output parsed. We'll fix what we change.
443 *out_parsed
= base_parsed
;
445 // Sanity check: the input should have a host or we'll break badly below.
446 // We can only resolve relative URLs with base URLs that have hosts and
447 // paths (even the default path of "/" is OK).
449 // We allow hosts with no length so we can handle file URLs, for example.
450 if (base_parsed
.path
.len
<= 0) {
451 // On error, return the input (resolving a relative URL on a non-relative
453 int base_len
= base_parsed
.Length();
454 for (int i
= 0; i
< base_len
; i
++)
455 output
->push_back(base_url
[i
]);
459 if (relative_component
.len
<= 0) {
460 // Empty relative URL, leave unchanged, only removing the ref component.
461 int base_len
= base_parsed
.Length();
// Drops the ref's length plus one extra character (presumably the '#').
// NOTE(review): if an absent ref has len == -1 this subtracts zero -- confirm.
462 base_len
-= base_parsed
.ref
.len
+ 1;
463 out_parsed
->ref
.reset();
464 output
->Append(base_url
, base_len
);
468 int num_slashes
= url_parse::CountConsecutiveSlashes(
469 relative_url
, relative_component
.begin
, relative_component
.end());
472 // On Windows, two slashes for a file path (regardless of which direction
473 // they are) means that it's UNC. Two backslashes on any base scheme mean
474 // that it's an absolute UNC path (we use the base_is_file flag to control
475 // how strict the UNC finder is).
477 // We also allow Windows absolute drive specs on any scheme (for example
478 // "c:\foo") like IE does. There must be no preceding slashes in this
479 // case (we reject anything like "/c:/foo") because that should be treated
480 // as a path. For file URLs, we allow any number of slashes since that would
481 // be setting the path.
483 // This assumes the absolute path resolver handles absolute URLs like this
484 // properly. url_util::DoCanonicalize does this.
485 int after_slashes
= relative_component
.begin
+ num_slashes
;
486 if (url_parse::DoesBeginUNCPath(relative_url
, relative_component
.begin
,
487 relative_component
.end(), !base_is_file
) ||
488 ((num_slashes
== 0 || base_is_file
) &&
489 url_parse::DoesBeginWindowsDriveSpec(relative_url
, after_slashes
,
490 relative_component
.end()))) {
491 return DoResolveAbsoluteFile(relative_url
, relative_component
,
492 query_converter
, output
, out_parsed
);
495 // Other platforms need explicit handling for file: URLs with multiple
496 // slashes because the generic scheme parsing always extracts a host, but a
497 // file: URL only has a host if it has exactly 2 slashes. This also
498 // handles the special case where the URL is only slashes, since that
499 // doesn't have a host part either.
501 (num_slashes
> 2 || num_slashes
== relative_component
.len
)) {
502 return DoResolveAbsoluteFile(relative_url
, relative_component
,
503 query_converter
, output
, out_parsed
);
507 // Any other double-slashes mean that this is relative to the scheme.
508 if (num_slashes
>= 2) {
509 return DoResolveRelativeHost(base_url
, base_parsed
,
510 relative_url
, relative_component
,
511 query_converter
, output
, out_parsed
);
514 // When we get here, we know that the relative URL is on the same host.
515 return DoResolveRelativePath(base_url
, base_parsed
, base_is_file
,
516 relative_url
, relative_component
,
517 query_converter
, output
, out_parsed
);
// 8-bit (char) entry point: forwards every argument unchanged to
// DoIsRelativeURL<char> and returns its result.
522 bool IsRelativeURL(const char* base
,
523 const url_parse::Parsed
& base_parsed
,
524 const char* fragment
,
526 bool is_base_hierarchical
,
528 url_parse::Component
* relative_component
) {
529 return DoIsRelativeURL
<char>(
530 base
, base_parsed
, fragment
, fragment_len
, is_base_hierarchical
,
531 is_relative
, relative_component
);
// 16-bit (char16) entry point: identical to the char overload except that
// |fragment| is char16 and the call is forwarded to DoIsRelativeURL<char16>.
534 bool IsRelativeURL(const char* base
,
535 const url_parse::Parsed
& base_parsed
,
536 const char16
* fragment
,
538 bool is_base_hierarchical
,
540 url_parse::Component
* relative_component
) {
541 return DoIsRelativeURL
<char16
>(
542 base
, base_parsed
, fragment
, fragment_len
, is_base_hierarchical
,
543 is_relative
, relative_component
);
// 8-bit (char) entry point: forwards every argument unchanged to
// DoResolveRelativeURL<char> and returns its result.
546 bool ResolveRelativeURL(const char* base_url
,
547 const url_parse::Parsed
& base_parsed
,
549 const char* relative_url
,
550 const url_parse::Component
& relative_component
,
551 CharsetConverter
* query_converter
,
553 url_parse::Parsed
* out_parsed
) {
554 return DoResolveRelativeURL
<char>(
555 base_url
, base_parsed
, base_is_file
, relative_url
,
556 relative_component
, query_converter
, output
, out_parsed
);
// 16-bit (char16) entry point: identical to the char overload except that
// |relative_url| is char16 and the call is forwarded to
// DoResolveRelativeURL<char16>.
559 bool ResolveRelativeURL(const char* base_url
,
560 const url_parse::Parsed
& base_parsed
,
562 const char16
* relative_url
,
563 const url_parse::Component
& relative_component
,
564 CharsetConverter
* query_converter
,
566 url_parse::Parsed
* out_parsed
) {
567 return DoResolveRelativeURL
<char16
>(
568 base_url
, base_parsed
, base_is_file
, relative_url
,
569 relative_component
, query_converter
, output
, out_parsed
);
572 } // namespace url_canon