url/gurl.cc

   1 // Copyright 2013 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #ifdef WIN32
   6 #include <windows.h>
   7 #else
   8 #include <pthread.h>
   9 #endif
  10
  11 #include <algorithm>
  12 #include <ostream>
  13
  14 #include "url/gurl.h"
  15
  16 #include "base/logging.h"
  17 #include "base/strings/string_util.h"
  18 #include "url/url_canon_stdstring.h"
  19 #include "url/url_util.h"
  20
  21 namespace {
  22
  23 static std::string* empty_string = NULL;
  24 static GURL* empty_gurl = NULL;
  25
  26 #ifdef WIN32
  27
  28 // Returns a static reference to an empty string for returning a reference
  29 // when there is no underlying string.
  30 const std::string& EmptyStringForGURL() {
  31   // Avoid static object construction/destruction on startup/shutdown.
  32   if (!empty_string) {
  33     // Create the string. Be careful that we don't break in the case that this
  34     // is being called from multiple threads. Statics are not threadsafe.
  35     std::string* new_empty_string = new std::string;
  36     if (InterlockedCompareExchangePointer(
  37         reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
  38       // The old value was non-NULL, so no replacement was done. Another
  39       // thread did the initialization out from under us.
  40       delete new_empty_string;
  41     }
  42   }
  43   return *empty_string;
  44 }
  45
  46 #else
  47
  48 static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
  49 static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
  50
  51 void EmptyStringForGURLOnce(void) {
  52   empty_string = new std::string;
  53 }
  54
  55 const std::string& EmptyStringForGURL() {
  56   // Avoid static object construction/destruction on startup/shutdown.
  57   pthread_once(&empty_string_once, EmptyStringForGURLOnce);
  58   return *empty_string;
  59 }
  60
  61 #endif  // WIN32
  62
  63 } // namespace
  64
  65 GURL::GURL() : is_valid_(false) {
  66 }
  67
  68 GURL::GURL(const GURL& other)
  69     : spec_(other.spec_),
  70       is_valid_(other.is_valid_),
  71       parsed_(other.parsed_) {
  72   if (other.inner_url_)
  73     inner_url_.reset(new GURL(*other.inner_url_));
  74   // Valid filesystem urls should always have an inner_url_.
  75   DCHECK(!is_valid_ || !SchemeIsFileSystem() || inner_url_);
  76 }
  77
  78 GURL::GURL(const std::string& url_string) {
  79   InitCanonical(url_string, true);
  80 }
  81
  82 GURL::GURL(const base::string16& url_string) {
  83   InitCanonical(url_string, true);
  84 }
  85
  86 GURL::GURL(const std::string& url_string, RetainWhiteSpaceSelector) {
  87   InitCanonical(url_string, false);
  88 }
  89
  90 GURL::GURL(const char* canonical_spec,
  91            size_t canonical_spec_len,
  92            const url::Parsed& parsed,
  93            bool is_valid)
  94     : spec_(canonical_spec, canonical_spec_len),
  95       is_valid_(is_valid),
  96       parsed_(parsed) {
  97   InitializeFromCanonicalSpec();
  98 }
  99
 100 GURL::GURL(std::string canonical_spec, const url::Parsed& parsed, bool is_valid)
 101     : is_valid_(is_valid),
 102       parsed_(parsed) {
 103   spec_.swap(canonical_spec);
 104   InitializeFromCanonicalSpec();
 105 }
 106
 107 template<typename STR>
 108 void GURL::InitCanonical(const STR& input_spec, bool trim_path_end) {
 109   // Reserve enough room in the output for the input, plus some extra so that
 110   // we have room if we have to escape a few things without reallocating.
 111   spec_.reserve(input_spec.size() + 32);
 112   url::StdStringCanonOutput output(&spec_);
 113   is_valid_ = url::Canonicalize(
 114       input_spec.data(), static_cast<int>(input_spec.length()), trim_path_end,
 115       NULL, &output, &parsed_);
 116
 117   output.Complete();  // Must be done before using string.
 118   if (is_valid_ && SchemeIsFileSystem()) {
 119     inner_url_.reset(new GURL(spec_.data(), parsed_.Length(),
 120                               *parsed_.inner_parsed(), true));
 121   }
 122 }
 123
 124 void GURL::InitializeFromCanonicalSpec() {
 125   if (is_valid_ && SchemeIsFileSystem()) {
 126     inner_url_.reset(
 127         new GURL(spec_.data(), parsed_.Length(),
 128                  *parsed_.inner_parsed(), true));
 129   }
 130
 131 #ifndef NDEBUG
 132   // For testing purposes, check that the parsed canonical URL is identical to
 133   // what we would have produced. Skip checking for invalid URLs have no meaning
 134   // and we can't always canonicalize then reproducabely.
 135   if (is_valid_) {
 136     url::Component scheme;
 137     // We can't do this check on the inner_url of a filesystem URL, as
 138     // canonical_spec actually points to the start of the outer URL, so we'd
 139     // end up with infinite recursion in this constructor.
 140     if (!url::FindAndCompareScheme(spec_.data(), spec_.length(),
 141                                    url::kFileSystemScheme, &scheme) ||
 142         scheme.begin == parsed_.scheme.begin) {
 143       // We need to retain trailing whitespace on path URLs, as the |parsed_|
 144       // spec we originally received may legitimately contain trailing white-
 145       // space on the path or  components e.g. if the #ref has been
 146       // removed from a "foo:hello #ref" URL (see http://crbug.com/291747).
 147       GURL test_url(spec_, RETAIN_TRAILING_PATH_WHITEPACE);
 148
 149       DCHECK(test_url.is_valid_ == is_valid_);
 150       DCHECK(test_url.spec_ == spec_);
 151
 152       DCHECK(test_url.parsed_.scheme == parsed_.scheme);
 153       DCHECK(test_url.parsed_.username == parsed_.username);
 154       DCHECK(test_url.parsed_.password == parsed_.password);
 155       DCHECK(test_url.parsed_.host == parsed_.host);
 156       DCHECK(test_url.parsed_.port == parsed_.port);
 157       DCHECK(test_url.parsed_.path == parsed_.path);
 158       DCHECK(test_url.parsed_.query == parsed_.query);
 159       DCHECK(test_url.parsed_.ref == parsed_.ref);
 160     }
 161   }
 162 #endif
 163 }
 164
 165 GURL::~GURL() {
 166 }
 167
 168 GURL& GURL::operator=(GURL other) {
 169   Swap(&other);
 170   return *this;
 171 }
 172
 173 const std::string& GURL::spec() const {
 174   if (is_valid_ || spec_.empty())
 175     return spec_;
 176
 177   DCHECK(false) << "Trying to get the spec of an invalid URL!";
 178   return EmptyStringForGURL();
 179 }
 180
 181 bool GURL::operator==(const GURL& other) const {
 182   return spec_ == other.spec_;
 183 }
 184
 185 bool GURL::operator!=(const GURL& other) const {
 186   return spec_ != other.spec_;
 187 }
 188
 189 bool GURL::operator<(const GURL& other) const {
 190   return spec_ < other.spec_;
 191 }
 192
 193 bool GURL::operator>(const GURL& other) const {
 194   return spec_ > other.spec_;
 195 }
 196
 197 // Note: code duplicated below (it's inconvenient to use a template here).
 198 GURL GURL::Resolve(const std::string& relative) const {
 199   // Not allowed for invalid URLs.
 200   if (!is_valid_)
 201     return GURL();
 202
 203   GURL result;
 204
 205   // Reserve enough room in the output for the input, plus some extra so that
 206   // we have room if we have to escape a few things without reallocating.
 207   result.spec_.reserve(spec_.size() + 32);
 208   url::StdStringCanonOutput output(&result.spec_);
 209
 210   if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
 211                             parsed_, relative.data(),
 212                             static_cast<int>(relative.length()),
 213                             nullptr, &output, &result.parsed_)) {
 214     // Error resolving, return an empty URL.
 215     return GURL();
 216   }
 217
 218   output.Complete();
 219   result.is_valid_ = true;
 220   if (result.SchemeIsFileSystem()) {
 221     result.inner_url_.reset(
 222         new GURL(result.spec_.data(), result.parsed_.Length(),
 223                  *result.parsed_.inner_parsed(), true));
 224   }
 225   return result;
 226 }
 227
 228 // Note: code duplicated above (it's inconvenient to use a template here).
 229 GURL GURL::Resolve(const base::string16& relative) const {
 230   // Not allowed for invalid URLs.
 231   if (!is_valid_)
 232     return GURL();
 233
 234   GURL result;
 235
 236   // Reserve enough room in the output for the input, plus some extra so that
 237   // we have room if we have to escape a few things without reallocating.
 238   result.spec_.reserve(spec_.size() + 32);
 239   url::StdStringCanonOutput output(&result.spec_);
 240
 241   if (!url::ResolveRelative(spec_.data(), static_cast<int>(spec_.length()),
 242                             parsed_, relative.data(),
 243                             static_cast<int>(relative.length()),
 244                             nullptr, &output, &result.parsed_)) {
 245     // Error resolving, return an empty URL.
 246     return GURL();
 247   }
 248
 249   output.Complete();
 250   result.is_valid_ = true;
 251   if (result.SchemeIsFileSystem()) {
 252     result.inner_url_.reset(
 253         new GURL(result.spec_.data(), result.parsed_.Length(),
 254                  *result.parsed_.inner_parsed(), true));
 255   }
 256   return result;
 257 }
 258
 259 // Note: code duplicated below (it's inconvenient to use a template here).
 260 GURL GURL::ReplaceComponents(
 261     const url::Replacements<char>& replacements) const {
 262   GURL result;
 263
 264   // Not allowed for invalid URLs.
 265   if (!is_valid_)
 266     return GURL();
 267
 268   // Reserve enough room in the output for the input, plus some extra so that
 269   // we have room if we have to escape a few things without reallocating.
 270   result.spec_.reserve(spec_.size() + 32);
 271   url::StdStringCanonOutput output(&result.spec_);
 272
 273   result.is_valid_ = url::ReplaceComponents(
 274       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
 275       NULL, &output, &result.parsed_);
 276
 277   output.Complete();
 278   if (result.is_valid_ && result.SchemeIsFileSystem()) {
 279     result.inner_url_.reset(new GURL(spec_.data(), result.parsed_.Length(),
 280                                      *result.parsed_.inner_parsed(), true));
 281   }
 282   return result;
 283 }
 284
 285 // Note: code duplicated above (it's inconvenient to use a template here).
 286 GURL GURL::ReplaceComponents(
 287     const url::Replacements<base::char16>& replacements) const {
 288   GURL result;
 289
 290   // Not allowed for invalid URLs.
 291   if (!is_valid_)
 292     return GURL();
 293
 294   // Reserve enough room in the output for the input, plus some extra so that
 295   // we have room if we have to escape a few things without reallocating.
 296   result.spec_.reserve(spec_.size() + 32);
 297   url::StdStringCanonOutput output(&result.spec_);
 298
 299   result.is_valid_ = url::ReplaceComponents(
 300       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
 301       NULL, &output, &result.parsed_);
 302
 303   output.Complete();
 304   if (result.is_valid_ && result.SchemeIsFileSystem()) {
 305     result.inner_url_.reset(new GURL(spec_.data(), result.parsed_.Length(),
 306                                      *result.parsed_.inner_parsed(), true));
 307   }
 308   return result;
 309 }
 310
 311 GURL GURL::GetOrigin() const {
 312   // This doesn't make sense for invalid or nonstandard URLs, so return
 313   // the empty URL
 314   if (!is_valid_ || !IsStandard())
 315     return GURL();
 316
 317   if (SchemeIsFileSystem())
 318     return inner_url_->GetOrigin();
 319
 320   url::Replacements<char> replacements;
 321   replacements.ClearUsername();
 322   replacements.ClearPassword();
 323   replacements.ClearPath();
 324   replacements.ClearQuery();
 325   replacements.ClearRef();
 326
 327   return ReplaceComponents(replacements);
 328 }
 329
 330 GURL GURL::GetAsReferrer() const {
 331   if (!is_valid_ || !SchemeIsHTTPOrHTTPS())
 332     return GURL();
 333
 334   if (!has_ref() && !has_username() && !has_password())
 335     return GURL(*this);
 336
 337   url::Replacements<char> replacements;
 338   replacements.ClearRef();
 339   replacements.ClearUsername();
 340   replacements.ClearPassword();
 341   return ReplaceComponents(replacements);
 342 }
 343
 344 GURL GURL::GetWithEmptyPath() const {
 345   // This doesn't make sense for invalid or nonstandard URLs, so return
 346   // the empty URL.
 347   if (!is_valid_ || !IsStandard())
 348     return GURL();
 349
 350   // We could optimize this since we know that the URL is canonical, and we are
 351   // appending a canonical path, so avoiding re-parsing.
 352   GURL other(*this);
 353   if (parsed_.path.len == 0)
 354     return other;
 355
 356   // Clear everything after the path.
 357   other.parsed_.query.reset();
 358   other.parsed_.ref.reset();
 359
 360   // Set the path, since the path is longer than one, we can just set the
 361   // first character and resize.
 362   other.spec_[other.parsed_.path.begin] = '/';
 363   other.parsed_.path.len = 1;
 364   other.spec_.resize(other.parsed_.path.begin + 1);
 365   return other;
 366 }
 367
 368 bool GURL::IsStandard() const {
 369   return url::IsStandard(spec_.data(), parsed_.scheme);
 370 }
 371
 372 bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
 373   if (parsed_.scheme.len <= 0)
 374     return lower_ascii_scheme == NULL;
 375   return base::LowerCaseEqualsASCII(
 376       base::StringPiece(spec_.data() + parsed_.scheme.begin,
 377                         parsed_.scheme.len),
 378       lower_ascii_scheme);
 379 }
 380
 381 bool GURL::SchemeIsHTTPOrHTTPS() const {
 382   return SchemeIs(url::kHttpScheme) || SchemeIs(url::kHttpsScheme);
 383 }
 384
 385 bool GURL::SchemeIsWSOrWSS() const {
 386   return SchemeIs(url::kWsScheme) || SchemeIs(url::kWssScheme);
 387 }
 388
 389 int GURL::IntPort() const {
 390   if (parsed_.port.is_nonempty())
 391     return url::ParsePort(spec_.data(), parsed_.port);
 392   return url::PORT_UNSPECIFIED;
 393 }
 394
 395 int GURL::EffectiveIntPort() const {
 396   int int_port = IntPort();
 397   if (int_port == url::PORT_UNSPECIFIED && IsStandard())
 398     return url::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
 399                                      parsed_.scheme.len);
 400   return int_port;
 401 }
 402
 403 std::string GURL::ExtractFileName() const {
 404   url::Component file_component;
 405   url::ExtractFileName(spec_.data(), parsed_.path, &file_component);
 406   return ComponentString(file_component);
 407 }
 408
 409 std::string GURL::PathForRequest() const {
 410   DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
 411   if (parsed_.ref.len >= 0) {
 412     // Clip off the reference when it exists. The reference starts after the #
 413     // sign, so we have to subtract one to also remove it.
 414     return std::string(spec_, parsed_.path.begin,
 415                        parsed_.ref.begin - parsed_.path.begin - 1);
 416   }
 417   // Compute the actual path length, rather than depending on the spec's
 418   // terminator.  If we're an inner_url, our spec continues on into our outer
 419   // url's path/query/ref.
 420   int path_len = parsed_.path.len;
 421   if (parsed_.query.is_valid())
 422     path_len = parsed_.query.end() - parsed_.path.begin;
 423
 424   return std::string(spec_, parsed_.path.begin, path_len);
 425 }
 426
 427 std::string GURL::HostNoBrackets() const {
 428   // If host looks like an IPv6 literal, strip the square brackets.
 429   url::Component h(parsed_.host);
 430   if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
 431     h.begin++;
 432     h.len -= 2;
 433   }
 434   return ComponentString(h);
 435 }
 436
 437 std::string GURL::GetContent() const {
 438   return is_valid_ ? ComponentString(parsed_.GetContent()) : std::string();
 439 }
 440
 441 bool GURL::HostIsIPAddress() const {
 442   if (!is_valid_ || spec_.empty())
 443      return false;
 444
 445   url::RawCanonOutputT<char, 128> ignored_output;
 446   url::CanonHostInfo host_info;
 447   url::CanonicalizeIPAddress(spec_.c_str(), parsed_.host, &ignored_output,
 448                              &host_info);
 449   return host_info.IsIPAddress();
 450 }
 451
 452 #ifdef WIN32
 453
 454 const GURL& GURL::EmptyGURL() {
 455   // Avoid static object construction/destruction on startup/shutdown.
 456   if (!empty_gurl) {
 457     // Create the string. Be careful that we don't break in the case that this
 458     // is being called from multiple threads.
 459     GURL* new_empty_gurl = new GURL;
 460     if (InterlockedCompareExchangePointer(
 461         reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
 462       // The old value was non-NULL, so no replacement was done. Another
 463       // thread did the initialization out from under us.
 464       delete new_empty_gurl;
 465     }
 466   }
 467   return *empty_gurl;
 468 }
 469
 470 #else
 471
 472 void EmptyGURLOnce(void) {
 473   empty_gurl = new GURL;
 474 }
 475
 476 const GURL& GURL::EmptyGURL() {
 477   // Avoid static object construction/destruction on startup/shutdown.
 478   pthread_once(&empty_gurl_once, EmptyGURLOnce);
 479   return *empty_gurl;
 480 }
 481
 482 #endif  // WIN32
 483
 484 bool GURL::DomainIs(const char* lower_ascii_domain,
 485                     int domain_len) const {
 486   // Return false if this URL is not valid or domain is empty.
 487   if (!is_valid_ || !domain_len)
 488     return false;
 489
 490   // FileSystem URLs have empty parsed_.host, so check this first.
 491   if (SchemeIsFileSystem() && inner_url_)
 492     return inner_url_->DomainIs(lower_ascii_domain, domain_len);
 493
 494   if (!parsed_.host.is_nonempty())
 495     return false;
 496
 497   // Check whether the host name is end with a dot. If yes, treat it
 498   // the same as no-dot unless the input comparison domain is end
 499   // with dot.
 500   const char* last_pos = spec_.data() + parsed_.host.end() - 1;
 501   int host_len = parsed_.host.len;
 502   if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
 503     last_pos--;
 504     host_len--;
 505   }
 506
 507   // Return false if host's length is less than domain's length.
 508   if (host_len < domain_len)
 509     return false;
 510
 511   // Compare this url whether belong specific domain.
 512   const char* start_pos = spec_.data() + parsed_.host.begin +
 513                           host_len - domain_len;
 514
 515   if (!base::LowerCaseEqualsASCII(
 516            base::StringPiece(start_pos, last_pos - start_pos + 1),
 517            base::StringPiece(lower_ascii_domain, domain_len)))
 518     return false;
 519
 520   // Check whether host has right domain start with dot, make sure we got
 521   // right domain range. For example www.google.com has domain
 522   // "google.com" but www.iamnotgoogle.com does not.
 523   if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
 524       '.' != *(start_pos - 1))
 525     return false;
 526
 527   return true;
 528 }
 529
 530 void GURL::Swap(GURL* other) {
 531   spec_.swap(other->spec_);
 532   std::swap(is_valid_, other->is_valid_);
 533   std::swap(parsed_, other->parsed_);
 534   inner_url_.swap(other->inner_url_);
 535 }
 536
 537 std::ostream& operator<<(std::ostream& out, const GURL& url) {
 538   return out << url.possibly_invalid_spec();
 539 }