net/base/escape.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   #include "net/base/escape.h"

   #include <stdint.h>

   #include <algorithm>

   #include "base/logging.h"
   #include "base/memory/scoped_ptr.h"
   #include "base/strings/string_piece.h"
   #include "base/strings/string_util.h"
   #include "base/strings/utf_offset_string_conversions.h"
   #include "base/strings/utf_string_conversions.h"
  15
  16 namespace net {
  17
  18 namespace {
  19
  20 const char kHexString[] = "0123456789ABCDEF";
  21 inline char IntToHex(int i) {
  22   DCHECK_GE(i, 0) << i << " not a hex value";
  23   DCHECK_LE(i, 15) << i << " not a hex value";
  24   return kHexString[i];
  25 }
  26
  // A fast bit-vector map for ASCII characters.
  //
  // Stores 256 bits in an array of eight 32-bit words: the bit for character
  // |c| lives in word |c >> 5| at bit position |c & 31|.  A set bit means the
  // character is a member of the map.
  struct Charmap {
    // Returns true if the bit for |c| is set.
    bool Contains(unsigned char c) const {
      // Shift the unsigned word down instead of shifting a signed 1 up, so
      // that testing bit 31 never shifts into the sign bit of an int.
      return ((map[c >> 5] >> (c & 31)) & 1u) != 0;
    }

    uint32_t map[8];
  };
  38
  39 // Given text to escape and a Charmap defining which values to escape,
  40 // return an escaped string.  If use_plus is true, spaces are converted
  41 // to +, otherwise, if spaces are in the charmap, they are converted to
  42 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
  43 // '%' is in the charmap, it is converted to %25.
  44 std::string Escape(const std::string& text,
  45                    const Charmap& charmap,
  46                    bool use_plus,
  47                    bool keep_escaped = false) {
  48   std::string escaped;
  49   escaped.reserve(text.length() * 3);
  50   for (unsigned int i = 0; i < text.length(); ++i) {
  51     unsigned char c = static_cast<unsigned char>(text[i]);
  52     if (use_plus && ' ' == c) {
  53       escaped.push_back('+');
  54     } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
  55                IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) {
  56       escaped.push_back('%');
  57     } else if (charmap.Contains(c)) {
  58       escaped.push_back('%');
  59       escaped.push_back(IntToHex(c >> 4));
  60       escaped.push_back(IntToHex(c & 0xf));
  61     } else {
  62       escaped.push_back(c);
  63     }
  64   }
  65   return escaped;
  66 }
  67
  // Lookup table indexed by a 7-bit character value: a nonzero entry means the
  // character may be unescaped in a normal URL.  Characters that may change
  // the parsing of a URL are marked 0 so that they stay escaped.  In many
  // cases we won't want to unescape spaces either, but that is controlled by
  // the parameters to Unescape*.
  //
  // The basic rule is that we can't unescape anything that would change
  // parsing, like # or ?.  We also can't unescape &, =, or + since they could
  // be part of a query and unescaping them could change the server's parsing
  // of the query.  Nor can we unescape \ since src/url/ will convert it to /.
  //
  // Lastly, we can't unescape anything that doesn't have a canonical
  // representation in a URL.  Unescaping such a character would change the
  // URL, and you could get different behavior if you copy and paste the URL,
  // or press enter in the URL bar.  The characters in this category are the
  // ones labeled PASS (allow either escaped or unescaped) in the big lookup
  // table at the top of url/url_canon_path.cc.  Also, characters that have
  // CHAR_QUERY set in url/url_canon_internal.cc but are not allowed in query
  // strings according to http://www.ietf.org/rfc/rfc3261.txt are not
  // unescaped, to avoid turning a valid url according to spec into an invalid
  // one.
  const char kUrlUnescape[128] = {
      // 0x00 - 0x0F: NULL and control characters.
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x10 - 0x1F: control characters.
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      // 0x20 - 0x2F:
      // ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
      0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
      // 0x30 - 0x3F:
      // 0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
      // 0x40 - 0x4F:
      // @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      // 0x50 - 0x5F:
      // P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
      // 0x60 - 0x6F:
      // `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      // 0x70 - 0x7F (the last entry, 0x7F, is DEL):
      // p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  DEL
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0
  };
 105
 // Tries to decode the three-character escape "%XX" that starts at |index| in
 // |escaped_text|.  On success stores the decoded byte in |*value| and returns
 // true.  Returns false, without writing |*value|, when fewer than two
 // characters follow |index|, when the character at |index| is not '%', or
 // when either of the next two characters is not a hex digit.
 template<typename STR>
 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
                                  size_t index,
                                  unsigned char* value) {
   typedef typename STR::value_type Char;

   // The length test comes first so that |index| is only dereferenced when a
   // complete three-character sequence fits in the string.
   const bool has_room = (index + 2) < escaped_text.size();
   if (!has_room || escaped_text[index] != '%')
     return false;

   const Char high_digit = static_cast<Char>(escaped_text[index + 1]);
   const Char low_digit = static_cast<Char>(escaped_text[index + 2]);
   if (!IsHexDigit(high_digit) || !IsHexDigit(low_digit))
     return false;

   *value = HexDigitToInt(high_digit) * 16 + HexDigitToInt(low_digit);
   return true;
 }
 128
 // Returns true if the escape sequences starting at |index| spell %D8%9C, the
 // percent-encoded UTF-8 bytes of U+061C.  |first_byte| is the already decoded
 // value of the escape at |index|; the second escape is decoded here, starting
 // at |index| + 3.
 template<typename STR>
 bool HasArabicLanguageMarkAtIndex(const STR& escaped_text,
                                   unsigned char first_byte,
                                   size_t index) {
   unsigned char second_byte;
   // Short-circuit evaluation guarantees |second_byte| is only read after it
   // has been decoded successfully.
   return first_byte == 0xD8 &&
          UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte) &&
          second_byte == 0x9c;
 }
 142
 // Returns true if the three escape sequences starting at |index| decode to
 // one of these byte triples:
 //   E2 80 8E, E2 80 8F, E2 80 AA through E2 80 AE, or
 //   E2 81 A6 through E2 81 A9.
 // |first_byte| is the already decoded value of the escape at |index|; the
 // second and third escapes are decoded here from |index| + 3 and |index| + 6.
 template<typename STR>
 bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text,
                                         unsigned char first_byte,
                                         size_t index) {
   if (first_byte != 0xE2)
     return false;

   unsigned char second_byte;
   if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
     return false;
   const bool second_is_80 = (second_byte == 0x80);
   const bool second_is_81 = (second_byte == 0x81);
   if (!second_is_80 && !second_is_81)
     return false;

   unsigned char third_byte;
   if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))
     return false;

   if (second_is_81)
     return third_byte >= 0xA6 && third_byte <= 0xA9;

   // Second byte is 0x80.
   if (third_byte == 0x8E || third_byte == 0x8F)
     return true;
   return third_byte >= 0xAA && third_byte <= 0xAE;
 }
 166
 167 // Unescapes |escaped_text| according to |rules|, returning the resulting
 168 // string.  Fills in an |adjustments| parameter, if non-NULL, so it reflects
 169 // the alterations done to the string that are not one-character-to-one-
 170 // character.  The resulting |adjustments| will always be sorted by increasing
 171 // offset.
 172 template<typename STR>
 173 STR UnescapeURLWithAdjustmentsImpl(
 174     const STR& escaped_text,
 175     UnescapeRule::Type rules,
 176     base::OffsetAdjuster::Adjustments* adjustments) {
 177   if (adjustments)
 178     adjustments->clear();
 179   // Do not unescape anything, return the |escaped_text| text.
 180   if (rules == UnescapeRule::NONE)
 181     return escaped_text;
 182
 183   // The output of the unescaping is always smaller than the input, so we can
 184   // reserve the input size to make sure we have enough buffer and don't have
 185   // to allocate in the loop below.
 186   STR result;
 187   result.reserve(escaped_text.length());
 188
 189   // Locations of adjusted text.
 190   for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
 191     if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
 192       // Non ASCII character, append as is.
 193       result.push_back(escaped_text[i]);
 194       continue;
 195     }
 196
 197     unsigned char first_byte;
 198     if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) {
 199       // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi
 200       // control characters are not allowed to appear unescaped in URLs:
 201       //
 202       // U+200E LEFT-TO-RIGHT MARK         (%E2%80%8E)
 203       // U+200F RIGHT-TO-LEFT MARK         (%E2%80%8F)
 204       // U+202A LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
 205       // U+202B RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
 206       // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)
 207       // U+202D LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
 208       // U+202E RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)
 209       //
 210       // Additionally, the Unicode Technical Report (TR9) as referenced by RFC
 211       // 3987 above has since added some new BiDi control characters.
 212       // http://www.unicode.org/reports/tr9
 213       //
 214       // U+061C ARABIC LETTER MARK         (%D8%9C)
 215       // U+2066 LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
 216       // U+2067 RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
 217       // U+2068 FIRST STRONG ISOLATE       (%E2%81%A8)
 218       // U+2069 POP DIRECTIONAL ISOLATE    (%E2%81%A9)
 219       //
 220       // However, some schemes such as data: and file: need to parse the exact
 221       // binary data when loading the URL. For that reason, CONTROL_CHARS allows
 222       // unescaping BiDi control characters.
 223       // DO NOT use CONTROL_CHARS if the parsed URL is going to be displayed
 224       // in the UI.
 225       if (!(rules & UnescapeRule::CONTROL_CHARS)) {
 226         if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {
 227           // Keep Arabic Language Mark escaped.
 228           result.append(escaped_text, i, 6);
 229           i += 5;
 230           continue;
 231         }
 232         if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) {
 233           // Keep BiDi control char escaped.
 234           result.append(escaped_text, i, 9);
 235           i += 8;
 236           continue;
 237         }
 238       }
 239
 240       if (first_byte >= 0x80 ||  // Unescape all high-bit characters.
 241           // For 7-bit characters, the lookup table tells us all valid chars.
 242           (kUrlUnescape[first_byte] ||
 243            // ...and we allow some additional unescaping when flags are set.
 244            (first_byte == ' ' && (rules & UnescapeRule::SPACES)) ||
 245            // Allow any of the prohibited but non-control characters when
 246            // we're doing "special" chars.
 247            (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
 248            // Additionally allow control characters if requested.
 249            (first_byte < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
 250         // Use the unescaped version of the character.
 251         if (adjustments)
 252           adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1));
 253         result.push_back(first_byte);
 254         i += 2;
 255       } else {
 256         // Keep escaped. Append a percent and we'll get the following two
 257         // digits on the next loops through.
 258         result.push_back('%');
 259       }
 260     } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
 261                escaped_text[i] == '+') {
 262       result.push_back(' ');
 263     } else {
 264       // Normal case for unescaped characters.
 265       result.push_back(escaped_text[i]);
 266     }
 267   }
 268
 269   return result;
 270 }
 271
 // Appends |c| to |output|, substituting the HTML entity for the five
 // characters that have one here: < > & " and '.  Every other character is
 // appended unchanged.
 template <class str>
 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
   const char* entity = NULL;
   switch (c) {
     case '<':
       entity = "&lt;";
       break;
     case '>':
       entity = "&gt;";
       break;
     case '&':
       entity = "&amp;";
       break;
     case '"':
       entity = "&quot;";
       break;
     case '\'':
       entity = "&#39;";
       break;
     default:
       break;
   }

   if (!entity) {
     output->push_back(c);
     return;
   }

   // Append one character at a time so that the narrow entity text can be
   // added to |output| whatever its character type is.
   for (const char* p = entity; *p; ++p)
     output->push_back(*p);
 }
 296
 // Returns a copy of |input| in which every character has been passed through
 // AppendEscapedCharForHTMLImpl().
 template <class str>
 str EscapeForHTMLImpl(const str& input) {
   const size_t input_length = input.size();

   str escaped;
   // Sized for the common case in which nothing needs escaping.
   escaped.reserve(input_length);

   for (size_t pos = 0; pos < input_length; ++pos)
     AppendEscapedCharForHTMLImpl(input[pos], &escaped);

   return escaped;
 }
 307
 308 // Everything except alphanumerics and !'()*-._~
 309 // See RFC 2396 for the list of reserved characters.
 310 static const Charmap kQueryCharmap = {{
 311   0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
 312   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 313 }};
 314
 315 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
 316 static const Charmap kPathCharmap = {{
 317   0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
 318   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 319 }};
 320
 321 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
 322 static const Charmap kUrlEscape = {{
 323   0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
 324   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 325 }};
 326
 327 // non-7bit
 328 static const Charmap kNonASCIICharmap = {{
 329   0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
 330   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 331 }};
 332
 333 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
 334 // !'()*-._~#[]
 335 static const Charmap kExternalHandlerCharmap = {{
 336   0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L,
 337   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 338 }};
 339
 340 }  // namespace
 341
 342 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
 343   return Escape(text, kQueryCharmap, use_plus);
 344 }
 345
 346 std::string EscapePath(const std::string& path) {
 347   return Escape(path, kPathCharmap, false);
 348 }
 349
 350 std::string EscapeUrlEncodedData(const std::string& path, bool use_plus) {
 351   return Escape(path, kUrlEscape, use_plus);
 352 }
 353
 354 std::string EscapeNonASCII(const std::string& input) {
 355   return Escape(input, kNonASCIICharmap, false);
 356 }
 357
 358 std::string EscapeExternalHandlerValue(const std::string& text) {
 359   return Escape(text, kExternalHandlerCharmap, false, true);
 360 }
 361
 362 void AppendEscapedCharForHTML(char c, std::string* output) {
 363   AppendEscapedCharForHTMLImpl(c, output);
 364 }
 365
 366 std::string EscapeForHTML(const std::string& input) {
 367   return EscapeForHTMLImpl(input);
 368 }
 369
 370 base::string16 EscapeForHTML(const base::string16& input) {
 371   return EscapeForHTMLImpl(input);
 372 }
 373
 374 std::string UnescapeURLComponent(const std::string& escaped_text,
 375                                  UnescapeRule::Type rules) {
 376   return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL);
 377 }
 378
 379 base::string16 UnescapeURLComponent(const base::string16& escaped_text,
 380                                     UnescapeRule::Type rules) {
 381   return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL);
 382 }
 383
 384 base::string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
 385                                                  UnescapeRule::Type rules) {
 386   return UnescapeAndDecodeUTF8URLComponentWithAdjustments(text, rules, NULL);
 387 }
 388
 389 base::string16 UnescapeAndDecodeUTF8URLComponentWithAdjustments(
 390     const std::string& text,
 391     UnescapeRule::Type rules,
 392     base::OffsetAdjuster::Adjustments* adjustments) {
 393   base::string16 result;
 394   base::OffsetAdjuster::Adjustments unescape_adjustments;
 395   std::string unescaped_url(UnescapeURLWithAdjustmentsImpl(
 396       text, rules, &unescape_adjustments));
 397   if (base::UTF8ToUTF16WithAdjustments(unescaped_url.data(),
 398                                        unescaped_url.length(),
 399                                        &result, adjustments)) {
 400     // Character set looks like it's valid.
 401     if (adjustments) {
 402       base::OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
 403                                                        adjustments);
 404     }
 405     return result;
 406   }
 407   // Character set is not valid.  Return the escaped version.
 408   return base::UTF8ToUTF16WithAdjustments(text, adjustments);
 409 }
 410
 411 base::string16 UnescapeForHTML(const base::string16& input) {
 412   static const struct {
 413     const char* ampersand_code;
 414     const char replacement;
 415   } kEscapeToChars[] = {
 416     { "&lt;", '<' },
 417     { "&gt;", '>' },
 418     { "&amp;", '&' },
 419     { "&quot;", '"' },
 420     { "&#39;", '\''},
 421   };
 422
 423   if (input.find(base::ASCIIToUTF16("&")) == std::string::npos)
 424     return input;
 425
 426   base::string16 ampersand_chars[arraysize(kEscapeToChars)];
 427   base::string16 text(input);
 428   for (base::string16::iterator iter = text.begin();
 429        iter != text.end(); ++iter) {
 430     if (*iter == '&') {
 431       // Potential ampersand encode char.
 432       size_t index = iter - text.begin();
 433       for (size_t i = 0; i < arraysize(kEscapeToChars); i++) {
 434         if (ampersand_chars[i].empty()) {
 435           ampersand_chars[i] =
 436               base::ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
 437         }
 438         if (text.find(ampersand_chars[i], index) == index) {
 439           text.replace(iter, iter + ampersand_chars[i].length(),
 440                        1, kEscapeToChars[i].replacement);
 441           break;
 442         }
 443       }
 444     }
 445   }
 446   return text;
 447 }
 448
 449 }  // namespace net