src/gurl.cc

   1 // Copyright 2007, Google Inc.
   2 // All rights reserved.
   3 //
   4 // Redistribution and use in source and binary forms, with or without
   5 // modification, are permitted provided that the following conditions are
   6 // met:
   7 //
   8 //     * Redistributions of source code must retain the above copyright
   9 // notice, this list of conditions and the following disclaimer.
  10 //     * Redistributions in binary form must reproduce the above
  11 // copyright notice, this list of conditions and the following disclaimer
  12 // in the documentation and/or other materials provided with the
  13 // distribution.
  14 //     * Neither the name of Google Inc. nor the names of its
  15 // contributors may be used to endorse or promote products derived from
  16 // this software without specific prior written permission.
  17 //
  18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  29
  30 #ifdef WIN32
  31 #include <windows.h>
  32 #else
  33 #include <pthread.h>
  34 #endif
  35
  36 #include <algorithm>
  37
  38 #include "googleurl/src/gurl.h"
  39
  40 #include "base/logging.h"
  41 #include "googleurl/src/url_canon_stdstring.h"
  42 #include "googleurl/src/url_util.h"
  43
  44 namespace {
  45
  46 // External template that can handle initialization of either character type.
  47 // The input spec is given, and the canonical version will be placed in
  48 // |*canonical|, along with the parsing of the canonical spec in |*parsed|.
  49 template<typename STR>
  50 bool InitCanonical(const STR& input_spec,
  51                    std::string* canonical,
  52                    url_parse::Parsed* parsed) {
  53   // Reserve enough room in the output for the input, plus some extra so that
  54   // we have room if we have to escape a few things without reallocating.
  55   canonical->reserve(input_spec.size() + 32);
  56   url_canon::StdStringCanonOutput output(canonical);
  57   bool success = url_util::Canonicalize(
  58       input_spec.data(), static_cast<int>(input_spec.length()),
  59       NULL, &output, parsed);
  60
  61   output.Complete();  // Must be done before using string.
  62   return success;
  63 }
  64
  65 static std::string* empty_string = NULL;
  66 static GURL* empty_gurl = NULL;
  67
  68 #ifdef WIN32
  69
  70 // Returns a static reference to an empty string for returning a reference
  71 // when there is no underlying string.
  72 const std::string& EmptyStringForGURL() {
  73   // Avoid static object construction/destruction on startup/shutdown.
  74   if (!empty_string) {
  75     // Create the string. Be careful that we don't break in the case that this
  76     // is being called from multiple threads. Statics are not threadsafe.
  77     std::string* new_empty_string = new std::string;
  78     if (InterlockedCompareExchangePointer(
  79         reinterpret_cast<PVOID*>(&empty_string), new_empty_string, NULL)) {
  80       // The old value was non-NULL, so no replacement was done. Another
  81       // thread did the initialization out from under us.
  82       delete new_empty_string;
  83     }
  84   }
  85   return *empty_string;
  86 }
  87
  88 #else
  89
  90 static pthread_once_t empty_string_once = PTHREAD_ONCE_INIT;
  91 static pthread_once_t empty_gurl_once = PTHREAD_ONCE_INIT;
  92
  93 void EmptyStringForGURLOnce(void) {
  94   empty_string = new std::string;
  95 }
  96
  97 const std::string& EmptyStringForGURL() {
  98   // Avoid static object construction/destruction on startup/shutdown.
  99   pthread_once(&empty_string_once, EmptyStringForGURLOnce);
 100   return *empty_string;
 101 }
 102
 103 #endif  // WIN32
 104
 105 } // namespace
 106
 107 GURL::GURL() : is_valid_(false) {
 108 }
 109
 110 GURL::GURL(const GURL& other)
 111     : spec_(other.spec_),
 112       is_valid_(other.is_valid_),
 113       parsed_(other.parsed_) {
 114 }
 115
 116 GURL::GURL(const std::string& url_string) {
 117   is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
 118 }
 119
 120 GURL::GURL(const string16& url_string) {
 121   is_valid_ = InitCanonical(url_string, &spec_, &parsed_);
 122 }
 123
 124 GURL::GURL(const char* canonical_spec, size_t canonical_spec_len,
 125            const url_parse::Parsed& parsed, bool is_valid)
 126     : spec_(canonical_spec, canonical_spec_len),
 127       is_valid_(is_valid),
 128       parsed_(parsed) {
 129 #ifndef NDEBUG
 130   // For testing purposes, check that the parsed canonical URL is identical to
 131   // what we would have produced. Skip checking for invalid URLs have no meaning
 132   // and we can't always canonicalize then reproducabely.
 133   if (is_valid_) {
 134     GURL test_url(spec_);
 135
 136     DCHECK(test_url.is_valid_ == is_valid_);
 137     DCHECK(test_url.spec_ == spec_);
 138
 139     DCHECK(test_url.parsed_.scheme == parsed_.scheme);
 140     DCHECK(test_url.parsed_.username == parsed_.username);
 141     DCHECK(test_url.parsed_.password == parsed_.password);
 142     DCHECK(test_url.parsed_.host == parsed_.host);
 143     DCHECK(test_url.parsed_.port == parsed_.port);
 144     DCHECK(test_url.parsed_.path == parsed_.path);
 145     DCHECK(test_url.parsed_.query == parsed_.query);
 146     DCHECK(test_url.parsed_.ref == parsed_.ref);
 147   }
 148 #endif
 149 }
 150
 151 const std::string& GURL::spec() const {
 152   if (is_valid_ || spec_.empty())
 153     return spec_;
 154
 155   DCHECK(false) << "Trying to get the spec of an invalid URL!";
 156   return EmptyStringForGURL();
 157 }
 158
 159 GURL GURL::Resolve(const std::string& relative) const {
 160   return ResolveWithCharsetConverter(relative, NULL);
 161 }
 162 GURL GURL::Resolve(const string16& relative) const {
 163   return ResolveWithCharsetConverter(relative, NULL);
 164 }
 165
 166 // Note: code duplicated below (it's inconvenient to use a template here).
 167 GURL GURL::ResolveWithCharsetConverter(
 168     const std::string& relative,
 169     url_canon::CharsetConverter* charset_converter) const {
 170   // Not allowed for invalid URLs.
 171   if (!is_valid_)
 172     return GURL();
 173
 174   GURL result;
 175
 176   // Reserve enough room in the output for the input, plus some extra so that
 177   // we have room if we have to escape a few things without reallocating.
 178   result.spec_.reserve(spec_.size() + 32);
 179   url_canon::StdStringCanonOutput output(&result.spec_);
 180
 181   if (!url_util::ResolveRelative(
 182           spec_.data(), static_cast<int>(spec_.length()), parsed_,
 183           relative.data(), static_cast<int>(relative.length()),
 184           charset_converter, &output, &result.parsed_)) {
 185     // Error resolving, return an empty URL.
 186     return GURL();
 187   }
 188
 189   output.Complete();
 190   result.is_valid_ = true;
 191   return result;
 192 }
 193
 194 // Note: code duplicated above (it's inconvenient to use a template here).
 195 GURL GURL::ResolveWithCharsetConverter(
 196     const string16& relative,
 197     url_canon::CharsetConverter* charset_converter) const {
 198   // Not allowed for invalid URLs.
 199   if (!is_valid_)
 200     return GURL();
 201
 202   GURL result;
 203
 204   // Reserve enough room in the output for the input, plus some extra so that
 205   // we have room if we have to escape a few things without reallocating.
 206   result.spec_.reserve(spec_.size() + 32);
 207   url_canon::StdStringCanonOutput output(&result.spec_);
 208
 209   if (!url_util::ResolveRelative(
 210           spec_.data(), static_cast<int>(spec_.length()), parsed_,
 211           relative.data(), static_cast<int>(relative.length()),
 212           charset_converter, &output, &result.parsed_)) {
 213     // Error resolving, return an empty URL.
 214     return GURL();
 215   }
 216
 217   output.Complete();
 218   result.is_valid_ = true;
 219   return result;
 220 }
 221
 222 // Note: code duplicated below (it's inconvenient to use a template here).
 223 GURL GURL::ReplaceComponents(
 224     const url_canon::Replacements<char>& replacements) const {
 225   GURL result;
 226
 227   // Not allowed for invalid URLs.
 228   if (!is_valid_)
 229     return GURL();
 230
 231   // Reserve enough room in the output for the input, plus some extra so that
 232   // we have room if we have to escape a few things without reallocating.
 233   result.spec_.reserve(spec_.size() + 32);
 234   url_canon::StdStringCanonOutput output(&result.spec_);
 235
 236   result.is_valid_ = url_util::ReplaceComponents(
 237       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
 238       NULL, &output, &result.parsed_);
 239
 240   output.Complete();
 241   return result;
 242 }
 243
 244 // Note: code duplicated above (it's inconvenient to use a template here).
 245 GURL GURL::ReplaceComponents(
 246     const url_canon::Replacements<char16>& replacements) const {
 247   GURL result;
 248
 249   // Not allowed for invalid URLs.
 250   if (!is_valid_)
 251     return GURL();
 252
 253   // Reserve enough room in the output for the input, plus some extra so that
 254   // we have room if we have to escape a few things without reallocating.
 255   result.spec_.reserve(spec_.size() + 32);
 256   url_canon::StdStringCanonOutput output(&result.spec_);
 257
 258   result.is_valid_ = url_util::ReplaceComponents(
 259       spec_.data(), static_cast<int>(spec_.length()), parsed_, replacements,
 260       NULL, &output, &result.parsed_);
 261
 262   output.Complete();
 263   return result;
 264 }
 265
 266 GURL GURL::GetOrigin() const {
 267   // This doesn't make sense for invalid or nonstandard URLs, so return
 268   // the empty URL
 269   if (!is_valid_ || !IsStandard())
 270     return GURL();
 271
 272   url_canon::Replacements<char> replacements;
 273   replacements.ClearUsername();
 274   replacements.ClearPassword();
 275   replacements.ClearPath();
 276   replacements.ClearQuery();
 277   replacements.ClearRef();
 278
 279   return ReplaceComponents(replacements);
 280 }
 281
 282 GURL GURL::GetWithEmptyPath() const {
 283   // This doesn't make sense for invalid or nonstandard URLs, so return
 284   // the empty URL.
 285   if (!is_valid_ || !IsStandard())
 286     return GURL();
 287
 288   // We could optimize this since we know that the URL is canonical, and we are
 289   // appending a canonical path, so avoiding re-parsing.
 290   GURL other(*this);
 291   if (parsed_.path.len == 0)
 292     return other;
 293
 294   // Clear everything after the path.
 295   other.parsed_.query.reset();
 296   other.parsed_.ref.reset();
 297
 298   // Set the path, since the path is longer than one, we can just set the
 299   // first character and resize.
 300   other.spec_[other.parsed_.path.begin] = '/';
 301   other.parsed_.path.len = 1;
 302   other.spec_.resize(other.parsed_.path.begin + 1);
 303   return other;
 304 }
 305
 306 bool GURL::IsStandard() const {
 307   return url_util::IsStandard(spec_.data(), static_cast<int>(spec_.length()),
 308                               parsed_.scheme);
 309 }
 310
 311 bool GURL::SchemeIs(const char* lower_ascii_scheme) const {
 312   if (parsed_.scheme.len <= 0)
 313     return lower_ascii_scheme == NULL;
 314   return url_util::LowerCaseEqualsASCII(spec_.data() + parsed_.scheme.begin,
 315                                         spec_.data() + parsed_.scheme.end(),
 316                                         lower_ascii_scheme);
 317 }
 318
 319 int GURL::IntPort() const {
 320   if (parsed_.port.is_nonempty())
 321     return url_parse::ParsePort(spec_.data(), parsed_.port);
 322   return url_parse::PORT_UNSPECIFIED;
 323 }
 324
 325 int GURL::EffectiveIntPort() const {
 326   int int_port = IntPort();
 327   if (int_port == url_parse::PORT_UNSPECIFIED && IsStandard())
 328     return url_canon::DefaultPortForScheme(spec_.data() + parsed_.scheme.begin,
 329                                            parsed_.scheme.len);
 330   return int_port;
 331 }
 332
 333 std::string GURL::ExtractFileName() const {
 334   url_parse::Component file_component;
 335   url_parse::ExtractFileName(spec_.data(), parsed_.path, &file_component);
 336   return ComponentString(file_component);
 337 }
 338
 339 std::string GURL::PathForRequest() const {
 340   DCHECK(parsed_.path.len > 0) << "Canonical path for requests should be non-empty";
 341   if (parsed_.ref.len >= 0) {
 342     // Clip off the reference when it exists. The reference starts after the #
 343     // sign, so we have to subtract one to also remove it.
 344     return std::string(spec_, parsed_.path.begin,
 345                        parsed_.ref.begin - parsed_.path.begin - 1);
 346   }
 347
 348   // Use everything form the path to the end.
 349   return std::string(spec_, parsed_.path.begin);
 350 }
 351
 352 std::string GURL::HostNoBrackets() const {
 353   // If host looks like an IPv6 literal, strip the square brackets.
 354   url_parse::Component h(parsed_.host);
 355   if (h.len >= 2 && spec_[h.begin] == '[' && spec_[h.end() - 1] == ']') {
 356     h.begin++;
 357     h.len -= 2;
 358   }
 359   return ComponentString(h);
 360 }
 361
 362 bool GURL::HostIsIPAddress() const {
 363   if (!is_valid_ || spec_.empty())
 364      return false;
 365
 366   url_canon::RawCanonOutputT<char, 128> ignored_output;
 367   url_canon::CanonHostInfo host_info;
 368   url_canon::CanonicalizeIPAddress(spec_.c_str(), parsed_.host,
 369                                    &ignored_output, &host_info);
 370   return host_info.IsIPAddress();
 371 }
 372
 373 #ifdef WIN32
 374
 375 const GURL& GURL::EmptyGURL() {
 376   // Avoid static object construction/destruction on startup/shutdown.
 377   if (!empty_gurl) {
 378     // Create the string. Be careful that we don't break in the case that this
 379     // is being called from multiple threads.
 380     GURL* new_empty_gurl = new GURL;
 381     if (InterlockedCompareExchangePointer(
 382         reinterpret_cast<PVOID*>(&empty_gurl), new_empty_gurl, NULL)) {
 383       // The old value was non-NULL, so no replacement was done. Another
 384       // thread did the initialization out from under us.
 385       delete new_empty_gurl;
 386     }
 387   }
 388   return *empty_gurl;
 389 }
 390
 391 #else
 392
 393 void EmptyGURLOnce(void) {
 394   empty_gurl = new GURL;
 395 }
 396
 397 const GURL& GURL::EmptyGURL() {
 398   // Avoid static object construction/destruction on startup/shutdown.
 399   pthread_once(&empty_gurl_once, EmptyGURLOnce);
 400   return *empty_gurl;
 401 }
 402
 403 #endif  // WIN32
 404
 405 bool GURL::DomainIs(const char* lower_ascii_domain,
 406                     int domain_len) const {
 407   // Return false if this URL is not valid or domain is empty.
 408   if (!is_valid_ || !parsed_.host.is_nonempty() || !domain_len)
 409     return false;
 410
 411   // Check whether the host name is end with a dot. If yes, treat it
 412   // the same as no-dot unless the input comparison domain is end
 413   // with dot.
 414   const char* last_pos = spec_.data() + parsed_.host.end() - 1;
 415   int host_len = parsed_.host.len;
 416   if ('.' == *last_pos && '.' != lower_ascii_domain[domain_len - 1]) {
 417     last_pos--;
 418     host_len--;
 419   }
 420
 421   // Return false if host's length is less than domain's length.
 422   if (host_len < domain_len)
 423     return false;
 424
 425   // Compare this url whether belong specific domain.
 426   const char* start_pos = spec_.data() + parsed_.host.begin +
 427                           host_len - domain_len;
 428
 429   if (!url_util::LowerCaseEqualsASCII(start_pos,
 430                                       last_pos + 1,
 431                                       lower_ascii_domain,
 432                                       lower_ascii_domain + domain_len))
 433     return false;
 434
 435   // Check whether host has right domain start with dot, make sure we got
 436   // right domain range. For example www.google.com has domain
 437   // "google.com" but www.iamnotgoogle.com does not.
 438   if ('.' != lower_ascii_domain[0] && host_len > domain_len &&
 439       '.' != *(start_pos - 1))
 440     return false;
 441
 442   return true;
 443 }
 444
 445 void GURL::Swap(GURL* other) {
 446   spec_.swap(other->spec_);
 447   std::swap(is_valid_, other->is_valid_);
 448   std::swap(parsed_, other->parsed_);
 449 }
 450