net/base/escape.cc

   1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "net/base/escape.h"
   6
   7 #include <algorithm>
   8
   9 #include "base/logging.h"
  10 #include "base/memory/scoped_ptr.h"
  11 #include "base/strings/string_piece.h"
  12 #include "base/strings/string_util.h"
  13 #include "base/strings/utf_offset_string_conversions.h"
  14 #include "base/strings/utf_string_conversions.h"
  15
  16 namespace net {
  17
  18 namespace {
  19
  20 const char kHexString[] = "0123456789ABCDEF";
  21 inline char IntToHex(int i) {
  22   DCHECK_GE(i, 0) << i << " not a hex value";
  23   DCHECK_LE(i, 15) << i << " not a hex value";
  24   return kHexString[i];
  25 }
  26
  27 // A fast bit-vector map for ascii characters.
  28 //
  29 // Internally stores 256 bits in an array of 8 ints.
  30 // Does quick bit-flicking to lookup needed characters.
  31 struct Charmap {
  32   bool Contains(unsigned char c) const {
  33     return ((map[c >> 5] & (1 << (c & 31))) != 0);
  34   }
  35
  36   uint32 map[8];
  37 };
  38
  39 // Given text to escape and a Charmap defining which values to escape,
  40 // return an escaped string.  If use_plus is true, spaces are converted
  41 // to +, otherwise, if spaces are in the charmap, they are converted to
  42 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
  43 // '%' is in the charmap, it is converted to %25.
  44 std::string Escape(const std::string& text,
  45                    const Charmap& charmap,
  46                    bool use_plus,
  47                    bool keep_escaped = false) {
  48   std::string escaped;
  49   escaped.reserve(text.length() * 3);
  50   for (unsigned int i = 0; i < text.length(); ++i) {
  51     unsigned char c = static_cast<unsigned char>(text[i]);
  52     if (use_plus && ' ' == c) {
  53       escaped.push_back('+');
  54     } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
  55                IsHexDigit(text[i + 1]) && IsHexDigit(text[i + 2])) {
  56       escaped.push_back('%');
  57     } else if (charmap.Contains(c)) {
  58       escaped.push_back('%');
  59       escaped.push_back(IntToHex(c >> 4));
  60       escaped.push_back(IntToHex(c & 0xf));
  61     } else {
  62       escaped.push_back(c);
  63     }
  64   }
  65   return escaped;
  66 }
  67
  68 // Contains nonzero when the corresponding character is unescapable for normal
  69 // URLs. These characters are the ones that may change the parsing of a URL, so
  70 // we don't want to unescape them sometimes. In many case we won't want to
  71 // unescape spaces, but that is controlled by parameters to Unescape*.
  72 //
  73 // The basic rule is that we can't unescape anything that would changing parsing
  74 // like # or ?. We also can't unescape &, =, or + since that could be part of a
  75 // query and that could change the server's parsing of the query. Nor can we
  76 // unescape \ since src/url/ will convert it to a /.
  77 //
  78 // Lastly, we can't unescape anything that doesn't have a canonical
  79 // representation in a URL. This means that unescaping will change the URL, and
  80 // you could get different behavior if you copy and paste the URL, or press
  81 // enter in the URL bar. The list of characters that fall into this category
  82 // are the ones labeled PASS (allow either escaped or unescaped) in the big
  83 // lookup table at the top of url/url_canon_path.cc.  Also, characters
  84 // that have CHAR_QUERY set in url/url_canon_internal.cc but are not
  85 // allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
  86 // not unescaped, to avoid turning a valid url according to spec into an
  87 // invalid one.
  88 const char kUrlUnescape[128] = {
  89 //   NULL, control chars...
  90      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  91      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  92 //  ' ' !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /
  93      0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
  94 //   0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
  95      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
  96 //   @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O
  97      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  98 //   P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
  99      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
 100 //   `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o
 101      0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 102 //   p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~  <NBSP>
 103      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0
 104 };
 105
 106 // Attempts to unescape the sequence at |index| within |escaped_text|.  If
 107 // successful, sets |value| to the unescaped value.  Returns whether
 108 // unescaping succeeded.
 109 template<typename STR>
 110 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
 111                                  size_t index,
 112                                  unsigned char* value) {
 113   if ((index + 2) >= escaped_text.size())
 114     return false;
 115   if (escaped_text[index] != '%')
 116     return false;
 117   const typename STR::value_type most_sig_digit(
 118       static_cast<typename STR::value_type>(escaped_text[index + 1]));
 119   const typename STR::value_type least_sig_digit(
 120       static_cast<typename STR::value_type>(escaped_text[index + 2]));
 121   if (IsHexDigit(most_sig_digit) && IsHexDigit(least_sig_digit)) {
 122     *value = HexDigitToInt(most_sig_digit) * 16 +
 123         HexDigitToInt(least_sig_digit);
 124     return true;
 125   }
 126   return false;
 127 }
 128
 129 // Returns true if there is an Arabic Language Mark at |index|. |first_byte|
 130 // is the byte at |index|.
 131 template<typename STR>
 132 bool HasArabicLanguageMarkAtIndex(const STR& escaped_text,
 133                                   unsigned char first_byte,
 134                                   size_t index) {
 135   if (first_byte != 0xD8)
 136     return false;
 137   unsigned char second_byte;
 138   if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
 139     return false;
 140   return second_byte == 0x9c;
 141 }
 142
 143 // Returns true if there is a BiDi control char at |index|. |first_byte| is the
 144 // byte at |index|.
 145 template<typename STR>
 146 bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text,
 147                                         unsigned char first_byte,
 148                                         size_t index) {
 149   if (first_byte != 0xE2)
 150     return false;
 151   unsigned char second_byte;
 152   if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
 153     return false;
 154   if (second_byte != 0x80 && second_byte != 0x81)
 155     return false;
 156   unsigned char third_byte;
 157   if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))
 158     return false;
 159   if (second_byte == 0x80) {
 160     return third_byte == 0x8E ||
 161            third_byte == 0x8F ||
 162            (third_byte >= 0xAA && third_byte <= 0xAE);
 163   }
 164   return third_byte >= 0xA6 && third_byte <= 0xA9;
 165 }
 166
 167 // Unescapes |escaped_text| according to |rules|, returning the resulting
 168 // string.  Fills in an |adjustments| parameter, if non-NULL, so it reflects
 169 // the alterations done to the string that are not one-character-to-one-
 170 // character.  The resulting |adjustments| will always be sorted by increasing
 171 // offset.
 172 template<typename STR>
 173 STR UnescapeURLWithAdjustmentsImpl(
 174     const STR& escaped_text,
 175     UnescapeRule::Type rules,
 176     base::OffsetAdjuster::Adjustments* adjustments) {
 177   if (adjustments)
 178     adjustments->clear();
 179   // Do not unescape anything, return the |escaped_text| text.
 180   if (rules == UnescapeRule::NONE)
 181     return escaped_text;
 182
 183   // The output of the unescaping is always smaller than the input, so we can
 184   // reserve the input size to make sure we have enough buffer and don't have
 185   // to allocate in the loop below.
 186   STR result;
 187   result.reserve(escaped_text.length());
 188
 189   // Locations of adjusted text.
 190   for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
 191     if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
 192       // Non ASCII character, append as is.
 193       result.push_back(escaped_text[i]);
 194       continue;
 195     }
 196
 197     unsigned char first_byte;
 198     if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) {
 199       // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi
 200       // control characters are not allowed to appear unescaped in URLs:
 201       //
 202       // U+200E LEFT-TO-RIGHT MARK         (%E2%80%8E)
 203       // U+200F RIGHT-TO-LEFT MARK         (%E2%80%8F)
 204       // U+202A LEFT-TO-RIGHT EMBEDDING    (%E2%80%AA)
 205       // U+202B RIGHT-TO-LEFT EMBEDDING    (%E2%80%AB)
 206       // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)
 207       // U+202D LEFT-TO-RIGHT OVERRIDE     (%E2%80%AD)
 208       // U+202E RIGHT-TO-LEFT OVERRIDE     (%E2%80%AE)
 209       //
 210       // Additionally, the Unicode Technical Report (TR9) as referenced by RFC
 211       // 3987 above has since added some new BiDi control characters.
 212       // http://www.unicode.org/reports/tr9
 213       //
 214       // U+061C ARABIC LETTER MARK         (%D8%9C)
 215       // U+2066 LEFT-TO-RIGHT ISOLATE      (%E2%81%A6)
 216       // U+2067 RIGHT-TO-LEFT ISOLATE      (%E2%81%A7)
 217       // U+2068 FIRST STRONG ISOLATE       (%E2%81%A8)
 218       // U+2069 POP DIRECTIONAL ISOLATE    (%E2%81%A9)
 219       //
 220       // However, some schemes such as data: and file: need to parse the exact
 221       // binary data when loading the URL. For that reason, CONTROL_CHARS allows
 222       // unescaping BiDi control characters.
 223       // DO NOT use CONTROL_CHARS if the parsed URL is going to be displayed
 224       // in the UI.
 225       if (!(rules & UnescapeRule::CONTROL_CHARS)) {
 226         if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {
 227           // Keep Arabic Language Mark escaped.
 228           result.append(escaped_text, i, 6);
 229           i += 5;
 230           continue;
 231         }
 232         if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) {
 233           // Keep BiDi control char escaped.
 234           result.append(escaped_text, i, 9);
 235           i += 8;
 236           continue;
 237         }
 238       }
 239
 240       if (first_byte >= 0x80 ||  // Unescape all high-bit characters.
 241           // For 7-bit characters, the lookup table tells us all valid chars.
 242           (kUrlUnescape[first_byte] ||
 243            // ...and we allow some additional unescaping when flags are set.
 244            (first_byte == ' ' && (rules & UnescapeRule::SPACES)) ||
 245            // Allow any of the prohibited but non-control characters when
 246            // we're doing "special" chars.
 247            (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
 248            // Additionally allow control characters if requested.
 249            (first_byte < ' ' && (rules & UnescapeRule::CONTROL_CHARS)))) {
 250         // Use the unescaped version of the character.
 251         if (adjustments)
 252           adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1));
 253         result.push_back(first_byte);
 254         i += 2;
 255       } else {
 256         // Keep escaped. Append a percent and we'll get the following two
 257         // digits on the next loops through.
 258         result.push_back('%');
 259       }
 260     } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
 261                escaped_text[i] == '+') {
 262       result.push_back(' ');
 263     } else {
 264       // Normal case for unescaped characters.
 265       result.push_back(escaped_text[i]);
 266     }
 267   }
 268
 269   return result;
 270 }
 271
 272 template <class str>
 273 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
 274   static const struct {
 275     char key;
 276     const char* replacement;
 277   } kCharsToEscape[] = {
 278     { '<', "&lt;" },
 279     { '>', "&gt;" },
 280     { '&', "&amp;" },
 281     { '"', "&quot;" },
 282     { '\'', "&#39;" },
 283   };
 284   size_t k;
 285   for (k = 0; k < arraysize(kCharsToEscape); ++k) {
 286     if (c == kCharsToEscape[k].key) {
 287       const char* p = kCharsToEscape[k].replacement;
 288       while (*p)
 289         output->push_back(*p++);
 290       break;
 291     }
 292   }
 293   if (k == arraysize(kCharsToEscape))
 294     output->push_back(c);
 295 }
 296
 297 template <class str>
 298 str EscapeForHTMLImpl(const str& input) {
 299   str result;
 300   result.reserve(input.size());  // Optimize for no escaping.
 301
 302   for (typename str::const_iterator i = input.begin(); i != input.end(); ++i)
 303     AppendEscapedCharForHTMLImpl(*i, &result);
 304
 305   return result;
 306 }
 307
 308 // Everything except alphanumerics and !'()*-._~
 309 // See RFC 2396 for the list of reserved characters.
 310 static const Charmap kQueryCharmap = {{
 311   0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
 312   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 313 }};
 314
 315 // non-printable, non-7bit, and (including space)  "#%:<>?[\]^`{|}
 316 static const Charmap kPathCharmap = {{
 317   0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
 318   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 319 }};
 320
 321 #if defined(OS_MACOSX)
 322 // non-printable, non-7bit, and (including space)  "#%<>[\]^`{|}
 323 static const Charmap kNSURLCharmap = {{
 324   0xffffffffL, 0x5000002dL, 0x78000000L, 0xb8000001L,
 325   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 326 }};
 327 #endif  // defined(OS_MACOSX)
 328
 329 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
 330 static const Charmap kUrlEscape = {{
 331   0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
 332   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 333 }};
 334
 335 // non-7bit
 336 static const Charmap kNonASCIICharmap = {{
 337   0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
 338   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 339 }};
 340
 341 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
 342 // !'()*-._~#[]
 343 static const Charmap kExternalHandlerCharmap = {{
 344   0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L,
 345   0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
 346 }};
 347
 348 }  // namespace
 349
 350 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
 351   return Escape(text, kQueryCharmap, use_plus);
 352 }
 353
 354 std::string EscapePath(const std::string& path) {
 355   return Escape(path, kPathCharmap, false);
 356 }
 357
 358 #if defined(OS_MACOSX)
 359 std::string EscapeNSURLPrecursor(const std::string& precursor) {
 360   return Escape(precursor, kNSURLCharmap, false, true);
 361 }
 362 #endif  // defined(OS_MACOSX)
 363
 364 std::string EscapeUrlEncodedData(const std::string& path, bool use_plus) {
 365   return Escape(path, kUrlEscape, use_plus);
 366 }
 367
 368 std::string EscapeNonASCII(const std::string& input) {
 369   return Escape(input, kNonASCIICharmap, false);
 370 }
 371
 372 std::string EscapeExternalHandlerValue(const std::string& text) {
 373   return Escape(text, kExternalHandlerCharmap, false, true);
 374 }
 375
 376 void AppendEscapedCharForHTML(char c, std::string* output) {
 377   AppendEscapedCharForHTMLImpl(c, output);
 378 }
 379
 380 std::string EscapeForHTML(const std::string& input) {
 381   return EscapeForHTMLImpl(input);
 382 }
 383
 384 base::string16 EscapeForHTML(const base::string16& input) {
 385   return EscapeForHTMLImpl(input);
 386 }
 387
 388 std::string UnescapeURLComponent(const std::string& escaped_text,
 389                                  UnescapeRule::Type rules) {
 390   return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL);
 391 }
 392
 393 base::string16 UnescapeURLComponent(const base::string16& escaped_text,
 394                                     UnescapeRule::Type rules) {
 395   return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL);
 396 }
 397
 398 base::string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
 399                                                  UnescapeRule::Type rules) {
 400   return UnescapeAndDecodeUTF8URLComponentWithAdjustments(text, rules, NULL);
 401 }
 402
 403 base::string16 UnescapeAndDecodeUTF8URLComponentWithAdjustments(
 404     const std::string& text,
 405     UnescapeRule::Type rules,
 406     base::OffsetAdjuster::Adjustments* adjustments) {
 407   base::string16 result;
 408   base::OffsetAdjuster::Adjustments unescape_adjustments;
 409   std::string unescaped_url(UnescapeURLWithAdjustmentsImpl(
 410       text, rules, &unescape_adjustments));
 411   if (base::UTF8ToUTF16WithAdjustments(unescaped_url.data(),
 412                                        unescaped_url.length(),
 413                                        &result, adjustments)) {
 414     // Character set looks like it's valid.
 415     if (adjustments) {
 416       base::OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
 417                                                        adjustments);
 418     }
 419     return result;
 420   }
 421   // Character set is not valid.  Return the escaped version.
 422   return base::UTF8ToUTF16WithAdjustments(text, adjustments);
 423 }
 424
 425 base::string16 UnescapeForHTML(const base::string16& input) {
 426   static const struct {
 427     const char* ampersand_code;
 428     const char replacement;
 429   } kEscapeToChars[] = {
 430     { "&lt;", '<' },
 431     { "&gt;", '>' },
 432     { "&amp;", '&' },
 433     { "&quot;", '"' },
 434     { "&#39;", '\''},
 435   };
 436
 437   if (input.find(base::ASCIIToUTF16("&")) == std::string::npos)
 438     return input;
 439
 440   base::string16 ampersand_chars[arraysize(kEscapeToChars)];
 441   base::string16 text(input);
 442   for (base::string16::iterator iter = text.begin();
 443        iter != text.end(); ++iter) {
 444     if (*iter == '&') {
 445       // Potential ampersand encode char.
 446       size_t index = iter - text.begin();
 447       for (size_t i = 0; i < arraysize(kEscapeToChars); i++) {
 448         if (ampersand_chars[i].empty()) {
 449           ampersand_chars[i] =
 450               base::ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
 451         }
 452         if (text.find(ampersand_chars[i], index) == index) {
 453           text.replace(iter, iter + ampersand_chars[i].length(),
 454                        1, kEscapeToChars[i].replacement);
 455           break;
 456         }
 457       }
 458     }
 459   }
 460   return text;
 461 }
 462
 463 }  // namespace net