net/base/escape.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "net/base/escape.h"
   6
   7 #include <algorithm>
   8
   9 #include "base/logging.h"
  10 #include "base/memory/scoped_ptr.h"
  11 #include "base/strings/string_piece.h"
  12 #include "base/strings/string_util.h"
  13 #include "base/strings/utf_offset_string_conversions.h"
  14 #include "base/strings/utf_string_conversions.h"
  15
  16 namespace net {
  17
  18 namespace {
  19
  20 const char kHexString[] = "0123456789ABCDEF";
  21 inline char IntToHex(int i) {
  22   DCHECK_GE(i, 0) << i << " not a hex value";
  23   DCHECK_LE(i, 15) << i << " not a hex value";
  24   return kHexString[i];
  25 }
  26
  27 // A fast bit-vector map for ascii characters.
  28 //
  29 // Internally stores 256 bits in an array of 8 ints.
  30 // Does quick bit-flicking to lookup needed characters.
  31 struct Charmap {
  32   bool Contains(unsigned char c) const {
  33     return ((map[c >> 5] & (1 << (c & 31))) != 0);
  34   }
  35
  36   uint32_t map[8];
  37 };
  38
  39 // Given text to escape and a Charmap defining which values to escape,
  40 // return an escaped string.  If use_plus is true, spaces are converted
  41 // to +, otherwise, if spaces are in the charmap, they are converted to
  42 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
  43 // '%' is in the charmap, it is converted to %25.
  44 std::string Escape(const std::string& text,
  45                    const Charmap& charmap,
  46                    bool use_plus,
  47                    bool keep_escaped = false) {
  48   std::string escaped;
  49   escaped.reserve(text.length() * 3);
  50   for (unsigned int i = 0; i < text.length(); ++i) {
  51     unsigned char c = static_cast<unsigned char>(text[i]);
  52     if (use_plus && ' ' == c) {
  53       escaped.push_back('+');
  54     } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
  55                base::IsHexDigit(text[i + 1]) && base::IsHexDigit(text[i + 2])) {
  56       escaped.push_back('%');
  57     } else if (charmap.Contains(c)) {
  58       escaped.push_back('%');
  59       escaped.push_back(IntToHex(c >> 4));
  60       escaped.push_back(IntToHex(c & 0xf));
  61     } else {
  62       escaped.push_back(c);
  63     }
  64   }
  65   return escaped;
  66 }
  67
// Indexed by 7-bit ASCII code. Contains nonzero when the corresponding
// character is unescapable (i.e. may be unescaped) for normal URLs. The
// characters with a zero entry are the ones that may change the parsing of a
// URL, so we don't want to unescape them sometimes. In many cases we won't
// want to unescape spaces, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing,
// like # or ?. We also can't unescape &, =, or + since that could be part of a
// query and that could change the server's parsing of the query. Nor can we
// unescape \ since src/url/ will convert it to a /.
//
// Lastly, we can't unescape anything that doesn't have a canonical
// representation in a URL. This means that unescaping will change the URL, and
// you could get different behavior if you copy and paste the URL, or press
// enter in the URL bar. The list of characters that fall into this category
// are the ones labeled PASS (allow either escaped or unescaped) in the big
// lookup table at the top of url/url_canon_path.cc.  Also, characters
// that have CHAR_QUERY set in url/url_canon_internal.cc but are not
// allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid url according to spec into an
// invalid one.
//
// The table only has 128 entries, so it must not be indexed with a value of
// 0x80 or above.
const char kUrlUnescape[128] = {
//   NULL, control chars...
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
//  ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
     0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
//   0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
//   @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//   P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
//   `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
     0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
//   p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <DEL>
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0
};
 105
 106 // Attempts to unescape the sequence at |index| within |escaped_text|.  If
 107 // successful, sets |value| to the unescaped value.  Returns whether
 108 // unescaping succeeded.
 109 template<typename STR>
 110 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
 111                                  size_t index,
 112                                  unsigned char* value) {
 113   if ((index + 2) >= escaped_text.size())
 114     return false;
 115   if (escaped_text[index] != '%')
 116     return false;
 117   const typename STR::value_type most_sig_digit(
 118       static_cast<typename STR::value_type>(escaped_text[index + 1]));
 119   const typename STR::value_type least_sig_digit(
 120       static_cast<typename STR::value_type>(escaped_text[index + 2]));
 121   if (base::IsHexDigit(most_sig_digit) && base::IsHexDigit(least_sig_digit)) {
 122     *value = base::HexDigitToInt(most_sig_digit) * 16 +
 123              base::HexDigitToInt(least_sig_digit);
 124     return true;
 125   }
 126   return false;
 127 }
 128
 129 // Returns true if there is an Arabic Language Mark at |index|. |first_byte|
 130 // is the byte at |index|.
 131 template<typename STR>
 132 bool HasArabicLanguageMarkAtIndex(const STR& escaped_text,
 133                                   unsigned char first_byte,
 134                                   size_t index) {
 135   if (first_byte != 0xD8)
 136     return false;
 137   unsigned char second_byte;
 138   if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
 139     return false;
 140   return second_byte == 0x9c;
 141 }
 142
 143 // Returns true if there is a BiDi control char at |index|. |first_byte| is the
 144 // byte at |index|.
 145 template<typename STR>
 146 bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text,
 147                                         unsigned char first_byte,
 148                                         size_t index) {
 149   if (first_byte != 0xE2)
 150     return false;
 151   unsigned char second_byte;
 152   if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
 153     return false;
 154   if (second_byte != 0x80 && second_byte != 0x81)
 155     return false;
 156   unsigned char third_byte;
 157   if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))
 158     return false;
 159   if (second_byte == 0x80) {
 160     return third_byte == 0x8E ||
 161            third_byte == 0x8F ||
 162            (third_byte >= 0xAA && third_byte <= 0xAE);
 163   }
 164   return third_byte >= 0xA6 && third_byte <= 0xA9;
 165 }
 166
 167 // Returns true if there is a four-byte banned char at |index|. |first_byte| is
 168 // the byte at |index|.
 169 template <typename STR>
 170 bool HasFourByteBannedCharAtIndex(const STR& escaped_text,
 171                                   unsigned char first_byte,
 172                                   size_t index) {
 173   // The following characters are blacklisted for spoofability concerns.
 174   // U+1F50F LOCK WITH INK PEN         (%F0%9F%94%8F)
 175   // U+1F510 CLOSED LOCK WITH KEY      (%F0%9F%94%90)
 176   // U+1F512 LOCK                      (%F0%9F%94%92)
 177   // U+1F513 OPEN LOCK                 (%F0%9F%94%93)
 178   if (first_byte != 0xF0)
 179     return false;
 180
 181   unsigned char second_byte;
 182   if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte) ||
 183       second_byte != 0x9F) {
 184     return false;
 185   }
 186
 187   unsigned char third_byte;
 188   if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte) ||
 189       third_byte != 0x94) {
 190     return false;
 191   }
 192
 193   unsigned char fourth_byte;
 194   return UnescapeUnsignedCharAtIndex(escaped_text, index + 9, &fourth_byte) &&
 195          (fourth_byte == 0x8F || fourth_byte == 0x90 || fourth_byte == 0x92 ||
 196           fourth_byte == 0x93);
 197 }
 198
 199 // Unescapes |escaped_text| according to |rules|, returning the resulting
 200 // string.  Fills in an |adjustments| parameter, if non-NULL, so it reflects
 201 // the alterations done to the string that are not one-character-to-one-
 202 // character.  The resulting |adjustments| will always be sorted by increasing
 203 // offset.
 204 template<typename STR>
 205 STR UnescapeURLWithAdjustmentsImpl(
 206     const STR& escaped_text,
 207     UnescapeRule::Type rules,
 208     base::OffsetAdjuster::Adjustments* adjustments) {
 209   if (adjustments)
 210     adjustments->clear();
 211   // Do not unescape anything, return the |escaped_text| text.
 212   if (rules == UnescapeRule::NONE)
 213     return escaped_text;
 214
 215   // The output of the unescaping is always smaller than the input, so we can
 216   // reserve the input size to make sure we have enough buffer and don't have
 217   // to allocate in the loop below.
 218   STR result;
 219   result.reserve(escaped_text.length());
 220
 221   // Locations of adjusted text.
 222   for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
 223     if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
 224       // Non ASCII character, append as is.
 225       result.push_back(escaped_text[i]);
 226       continue;
 227     }
 228
 229     unsigned char first_byte;
 230     if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) {
 231       // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi
 232       // control characters are not allowed to appear unescaped in URLs:
 233       //
 234       // U+200E LEFT-TO-RIGHT MARK         (%E2%80%8E)
 235       // U+200F RIGHT-TO-LEFT MARK         (%E2%80%8F)
 236       // U+202A LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
 237       // U+202B RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
 238       // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)
 239       // U+202D LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
 240       // U+202E RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)
 241       //
 242       // Additionally, the Unicode Technical Report (TR9) as referenced by RFC
 243       // 3987 above has since added some new BiDi control characters.
 244       // http://www.unicode.org/reports/tr9
 245       //
 246       // U+061C ARABIC LETTER MARK         (%D8%9C)
 247       // U+2066 LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
 248       // U+2067 RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
 249       // U+2068 FIRST STRONG ISOLATE       (%E2%81%A8)
 250       // U+2069 POP DIRECTIONAL ISOLATE    (%E2%81%A9)
 251       //
 252       // The following spoofable characters are also banned, because they could
 253       // be used to imitate parts of a web browser's UI.
 254       //
 255       // U+1F50F LOCK WITH INK PEN         (%F0%9F%94%8F)
 256       // U+1F510 CLOSED LOCK WITH KEY      (%F0%9F%94%90)
 257       // U+1F512 LOCK                      (%F0%9F%94%92)
 258       // U+1F513 OPEN LOCK                 (%F0%9F%94%93)
 259       //
 260       // However, some schemes such as data: and file: need to parse the exact
 261       // binary data when loading the URL. For that reason,
 262       // SPOOFING_AND_CONTROL_CHARS allows unescaping BiDi control characters.
 263       // DO NOT use SPOOFING_AND_CONTROL_CHARS if the parsed URL is going to be
 264       // displayed in the UI.
 265       if (!(rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)) {
 266         if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {
 267           // Keep Arabic Language Mark escaped.
 268           result.append(escaped_text, i, 6);
 269           i += 5;
 270           continue;
 271         }
 272         if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) {
 273           // Keep BiDi control char escaped.
 274           result.append(escaped_text, i, 9);
 275           i += 8;
 276           continue;
 277         }
 278         if (HasFourByteBannedCharAtIndex(escaped_text, first_byte, i)) {
 279           // Keep banned char escaped.
 280           result.append(escaped_text, i, 12);
 281           i += 11;
 282           continue;
 283         }
 284       }
 285
 286       if (first_byte >= 0x80 ||  // Unescape all high-bit characters.
 287           // For 7-bit characters, the lookup table tells us all valid chars.
 288           (kUrlUnescape[first_byte] ||
 289            // ...and we allow some additional unescaping when flags are set.
 290            (first_byte == ' ' && (rules & UnescapeRule::SPACES)) ||
 291            // Allow any of the prohibited but non-control characters when
 292            // we're doing "special" chars.
 293            (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
 294            // Additionally allow non-display characters if requested.
 295            (first_byte < ' ' &&
 296             (rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)))) {
 297         // Use the unescaped version of the character.
 298         if (adjustments)
 299           adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1));
 300         result.push_back(first_byte);
 301         i += 2;
 302       } else {
 303         // Keep escaped. Append a percent and we'll get the following two
 304         // digits on the next loops through.
 305         result.push_back('%');
 306       }
 307     } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
 308                escaped_text[i] == '+') {
 309       result.push_back(' ');
 310     } else {
 311       // Normal case for unescaped characters.
 312       result.push_back(escaped_text[i]);
 313     }
 314   }
 315
 316   return result;
 317 }
 318
 319 template <class str>
 320 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
 321   static const struct {
 322     char key;
 323     const char* replacement;
 324   } kCharsToEscape[] = {
 325     { '<', "&lt;" },
 326     { '>', "&gt;" },
 327     { '&', "&amp;" },
 328     { '"', "&quot;" },
 329     { '\'', "&#39;" },
 330   };
 331   size_t k;
 332   for (k = 0; k < arraysize(kCharsToEscape); ++k) {
 333     if (c == kCharsToEscape[k].key) {
 334       const char* p = kCharsToEscape[k].replacement;
 335       while (*p)
 336         output->push_back(*p++);
 337       break;
 338     }
 339   }
 340   if (k == arraysize(kCharsToEscape))
 341     output->push_back(c);
 342 }
 343
 344 template <class str>
 345 str EscapeForHTMLImpl(const str& input) {
 346   str result;
 347   result.reserve(input.size());  // Optimize for no escaping.
 348
 349   for (typename str::const_iterator i = input.begin(); i != input.end(); ++i)
 350     AppendEscapedCharForHTMLImpl(*i, &result);
 351
 352   return result;
 353 }
 354
// Characters that Escape() percent-encodes for query parameter values: a set
// bit marks a character to escape.
// Everything except alphanumerics and !'()*-._~
// See RFC 2396 for the list of reserved characters.
static const Charmap kQueryCharmap = {{
  0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
 361
// Characters that Escape() percent-encodes for paths: a set bit marks a
// character to escape.
// non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
static const Charmap kPathCharmap = {{
  0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
 367
#if defined(OS_MACOSX)
// Characters that Escape() percent-encodes for NSURL precursors: a set bit
// marks a character to escape. Same as kPathCharmap except that ':' and '?'
// are left alone.
// non-printable, non-7bit, and (including space)  "#%<>[\]^`{|}
static const Charmap kNSURLCharmap = {{
  0xffffffffL, 0x5000002dL, 0x78000000L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
#endif  // defined(OS_MACOSX)
 375
// Characters that Escape() percent-encodes for URL-encoded data: a set bit
// marks a character to escape.
// non-printable, non-7bit, and (including space) ?>=<;+'&%$#"@[\]^`{|}
// Note that '!' is not escaped (its bit is clear) while '@' is (its bit is
// set).
static const Charmap kUrlEscape = {{
  0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
 381
// Characters that Escape() percent-encodes when only non-ASCII bytes should be
// escaped: a set bit marks a character to escape.
// non-7bit (0x80-0xFF); every 7-bit character is left alone.
static const Charmap kNonASCIICharmap = {{
  0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
 387
// Characters that Escape() percent-encodes for external handler values: a set
// bit marks a character to escape.
// Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
// !'()*-._~#[]
static const Charmap kExternalHandlerCharmap = {{
  0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L,
  0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
}};
 394
 395 }  // namespace
 396
 397 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
 398   return Escape(text, kQueryCharmap, use_plus);
 399 }
 400
 401 std::string EscapePath(const std::string& path) {
 402   return Escape(path, kPathCharmap, false);
 403 }
 404
 405 #if defined(OS_MACOSX)
 406 std::string EscapeNSURLPrecursor(const std::string& precursor) {
 407   return Escape(precursor, kNSURLCharmap, false, true);
 408 }
 409 #endif  // defined(OS_MACOSX)
 410
 411 std::string EscapeUrlEncodedData(const std::string& path, bool use_plus) {
 412   return Escape(path, kUrlEscape, use_plus);
 413 }
 414
 415 std::string EscapeNonASCII(const std::string& input) {
 416   return Escape(input, kNonASCIICharmap, false);
 417 }
 418
 419 std::string EscapeExternalHandlerValue(const std::string& text) {
 420   return Escape(text, kExternalHandlerCharmap, false, true);
 421 }
 422
 423 void AppendEscapedCharForHTML(char c, std::string* output) {
 424   AppendEscapedCharForHTMLImpl(c, output);
 425 }
 426
 427 std::string EscapeForHTML(const std::string& input) {
 428   return EscapeForHTMLImpl(input);
 429 }
 430
 431 base::string16 EscapeForHTML(const base::string16& input) {
 432   return EscapeForHTMLImpl(input);
 433 }
 434
 435 std::string UnescapeURLComponent(const std::string& escaped_text,
 436                                  UnescapeRule::Type rules) {
 437   return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL);
 438 }
 439
 440 base::string16 UnescapeURLComponent(const base::string16& escaped_text,
 441                                     UnescapeRule::Type rules) {
 442   return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL);
 443 }
 444
 445 base::string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
 446                                                  UnescapeRule::Type rules) {
 447   return UnescapeAndDecodeUTF8URLComponentWithAdjustments(text, rules, NULL);
 448 }
 449
 450 base::string16 UnescapeAndDecodeUTF8URLComponentWithAdjustments(
 451     const std::string& text,
 452     UnescapeRule::Type rules,
 453     base::OffsetAdjuster::Adjustments* adjustments) {
 454   base::string16 result;
 455   base::OffsetAdjuster::Adjustments unescape_adjustments;
 456   std::string unescaped_url(UnescapeURLWithAdjustmentsImpl(
 457       text, rules, &unescape_adjustments));
 458   if (base::UTF8ToUTF16WithAdjustments(unescaped_url.data(),
 459                                        unescaped_url.length(),
 460                                        &result, adjustments)) {
 461     // Character set looks like it's valid.
 462     if (adjustments) {
 463       base::OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
 464                                                        adjustments);
 465     }
 466     return result;
 467   }
 468   // Character set is not valid.  Return the escaped version.
 469   return base::UTF8ToUTF16WithAdjustments(text, adjustments);
 470 }
 471
 472 base::string16 UnescapeForHTML(const base::string16& input) {
 473   static const struct {
 474     const char* ampersand_code;
 475     const char replacement;
 476   } kEscapeToChars[] = {
 477     { "&lt;", '<' },
 478     { "&gt;", '>' },
 479     { "&amp;", '&' },
 480     { "&quot;", '"' },
 481     { "&#39;", '\''},
 482   };
 483
 484   if (input.find(base::ASCIIToUTF16("&")) == std::string::npos)
 485     return input;
 486
 487   base::string16 ampersand_chars[arraysize(kEscapeToChars)];
 488   base::string16 text(input);
 489   for (base::string16::iterator iter = text.begin();
 490        iter != text.end(); ++iter) {
 491     if (*iter == '&') {
 492       // Potential ampersand encode char.
 493       size_t index = iter - text.begin();
 494       for (size_t i = 0; i < arraysize(kEscapeToChars); i++) {
 495         if (ampersand_chars[i].empty()) {
 496           ampersand_chars[i] =
 497               base::ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
 498         }
 499         if (text.find(ampersand_chars[i], index) == index) {
 500           text.replace(iter, iter + ampersand_chars[i].length(),
 501                        1, kEscapeToChars[i].replacement);
 502           break;
 503         }
 504       }
 505     }
 506   }
 507   return text;
 508 }
 509
 510 }  // namespace net