Roll src/third_party/WebKit d9c6159:8139f33 (svn 201974:201975)
[chromium-blink-merge.git] / net / base / escape.cc
blob15de5e1dbd37d1720eec25cab678654c95c98777
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "net/base/escape.h"
7 #include <algorithm>
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string_piece.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_offset_string_conversions.h"
14 #include "base/strings/utf_string_conversions.h"
16 namespace net {
18 namespace {
20 const char kHexString[] = "0123456789ABCDEF";
21 inline char IntToHex(int i) {
22 DCHECK_GE(i, 0) << i << " not a hex value";
23 DCHECK_LE(i, 15) << i << " not a hex value";
24 return kHexString[i];
// A fast bit-vector map for 8-bit characters.
//
// Stores 256 bits in an array of eight 32-bit words: character |c| is
// represented by bit (c & 31) of word (c >> 5).
struct Charmap {
  // Returns true if the bit for |c| is set in the map.
  bool Contains(unsigned char c) const {
    // Shift an unsigned 32-bit one so that selecting bit 31 never shifts into
    // the sign bit of a signed int.
    const uint32_t mask = static_cast<uint32_t>(1) << (c & 31);
    return (map[c >> 5] & mask) != 0;
  }

  uint32_t map[8];
};
39 // Given text to escape and a Charmap defining which values to escape,
40 // return an escaped string. If use_plus is true, spaces are converted
41 // to +, otherwise, if spaces are in the charmap, they are converted to
42 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
43 // '%' is in the charmap, it is converted to %25.
44 std::string Escape(const std::string& text,
45 const Charmap& charmap,
46 bool use_plus,
47 bool keep_escaped = false) {
48 std::string escaped;
49 escaped.reserve(text.length() * 3);
50 for (unsigned int i = 0; i < text.length(); ++i) {
51 unsigned char c = static_cast<unsigned char>(text[i]);
52 if (use_plus && ' ' == c) {
53 escaped.push_back('+');
54 } else if (keep_escaped && '%' == c && i + 2 < text.length() &&
55 base::IsHexDigit(text[i + 1]) && base::IsHexDigit(text[i + 2])) {
56 escaped.push_back('%');
57 } else if (charmap.Contains(c)) {
58 escaped.push_back('%');
59 escaped.push_back(IntToHex(c >> 4));
60 escaped.push_back(IntToHex(c & 0xf));
61 } else {
62 escaped.push_back(c);
65 return escaped;
68 // Contains nonzero when the corresponding character is unescapable for normal
69 // URLs. These characters are the ones that may change the parsing of a URL, so
70 // we don't want to unescape them sometimes. In many cases we won't want to
71 // unescape spaces, but that is controlled by parameters to Unescape*.
73 // The basic rule is that we can't unescape anything that would change parsing
74 // like # or ?. We also can't unescape &, =, or + since that could be part of a
75 // query and that could change the server's parsing of the query. Nor can we
76 // unescape \ since src/url/ will convert it to a /.
78 // Lastly, we can't unescape anything that doesn't have a canonical
79 // representation in a URL. This means that unescaping will change the URL, and
80 // you could get different behavior if you copy and paste the URL, or press
81 // enter in the URL bar. The list of characters that fall into this category
82 // are the ones labeled PASS (allow either escaped or unescaped) in the big
83 // lookup table at the top of url/url_canon_path.cc. Also, characters
84 // that have CHAR_QUERY set in url/url_canon_internal.cc but are not
85 // allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
86 // not unescaped, to avoid turning a valid url according to spec into an
87 // invalid one.
88 const char kUrlUnescape[128] = {
89 // NUL and the other control chars (0x00-0x1F)...
90 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
91 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
92 // ' ' ! " # $ % & ' ( ) * + , - . /
93 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
94 // 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
95 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0,
96 // @ A B C D E F G H I J K L M N O
97 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
98 // P Q R S T U V W X Y Z [ \ ] ^ _
99 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
100 // ` a b c d e f g h i j k l m n o
101 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
102 // p q r s t u v w x y z { | } ~ <DEL>
103 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0
106 // Attempts to unescape the sequence at |index| within |escaped_text|. If
107 // successful, sets |value| to the unescaped value. Returns whether
108 // unescaping succeeded.
109 template<typename STR>
110 bool UnescapeUnsignedCharAtIndex(const STR& escaped_text,
111 size_t index,
112 unsigned char* value) {
113 if ((index + 2) >= escaped_text.size())
114 return false;
115 if (escaped_text[index] != '%')
116 return false;
117 const typename STR::value_type most_sig_digit(
118 static_cast<typename STR::value_type>(escaped_text[index + 1]));
119 const typename STR::value_type least_sig_digit(
120 static_cast<typename STR::value_type>(escaped_text[index + 2]));
121 if (base::IsHexDigit(most_sig_digit) && base::IsHexDigit(least_sig_digit)) {
122 *value = base::HexDigitToInt(most_sig_digit) * 16 +
123 base::HexDigitToInt(least_sig_digit);
124 return true;
126 return false;
// Returns true if the escape sequences starting at |index| encode U+061C
// ARABIC LETTER MARK (%D8%9C). |first_byte| is the already-decoded byte of the
// escape at |index|; the following escape (at |index| + 3) is decoded here.
template<typename STR>
bool HasArabicLanguageMarkAtIndex(const STR& escaped_text,
                                  unsigned char first_byte,
                                  size_t index) {
  unsigned char next_byte;
  return first_byte == 0xD8 &&
         UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &next_byte) &&
         next_byte == 0x9c;
}
// Returns true if the three escape sequences starting at |index| encode one of
// the three-byte BiDi control characters (%E2%80%8E, %E2%80%8F,
// %E2%80%AA-%E2%80%AE or %E2%81%A6-%E2%81%A9). |first_byte| is the
// already-decoded byte of the escape at |index|.
template<typename STR>
bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text,
                                        unsigned char first_byte,
                                        size_t index) {
  unsigned char mid_byte = 0;
  unsigned char last_byte = 0;
  const bool decoded =
      first_byte == 0xE2 &&
      UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &mid_byte) &&
      (mid_byte == 0x80 || mid_byte == 0x81) &&
      UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &last_byte);
  if (!decoded)
    return false;

  if (mid_byte == 0x81) {
    // %E2%81%A6 through %E2%81%A9.
    return last_byte >= 0xA6 && last_byte <= 0xA9;
  }

  // %E2%80%8E, %E2%80%8F and %E2%80%AA through %E2%80%AE.
  return last_byte == 0x8E ||
         last_byte == 0x8F ||
         (last_byte >= 0xAA && last_byte <= 0xAE);
}
// Returns true if the four escape sequences starting at |index| encode one of
// the characters that are kept escaped because of spoofability concerns:
//   U+1F50F LOCK WITH INK PEN    (%F0%9F%94%8F)
//   U+1F510 CLOSED LOCK WITH KEY (%F0%9F%94%90)
//   U+1F512 LOCK                 (%F0%9F%94%92)
//   U+1F513 OPEN LOCK            (%F0%9F%94%93)
// |first_byte| is the already-decoded byte of the escape at |index|.
template <typename STR>
bool HasFourByteBannedCharAtIndex(const STR& escaped_text,
                                  unsigned char first_byte,
                                  size_t index) {
  unsigned char second_byte = 0;
  unsigned char third_byte = 0;
  unsigned char fourth_byte = 0;

  // All four characters share the prefix F0 9F 94; bail out as soon as any
  // part of it is missing.
  if (first_byte != 0xF0 ||
      !UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte) ||
      second_byte != 0x9F ||
      !UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte) ||
      third_byte != 0x94 ||
      !UnescapeUnsignedCharAtIndex(escaped_text, index + 9, &fourth_byte)) {
    return false;
  }

  switch (fourth_byte) {
    case 0x8F:
    case 0x90:
    case 0x92:
    case 0x93:
      return true;
    default:
      return false;
  }
}
199 // Unescapes |escaped_text| according to |rules|, returning the resulting
200 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects
201 // the alterations done to the string that are not one-character-to-one-
202 // character. The resulting |adjustments| will always be sorted by increasing
203 // offset.
204 template<typename STR>
205 STR UnescapeURLWithAdjustmentsImpl(
206 const STR& escaped_text,
207 UnescapeRule::Type rules,
208 base::OffsetAdjuster::Adjustments* adjustments) {
209 if (adjustments)
210 adjustments->clear();
211 // Do not unescape anything, return the |escaped_text| text.
212 if (rules == UnescapeRule::NONE)
213 return escaped_text;
215 // The output of the unescaping is always smaller than the input, so we can
216 // reserve the input size to make sure we have enough buffer and don't have
217 // to allocate in the loop below.
218 STR result;
219 result.reserve(escaped_text.length());
221 // Locations of adjusted text.
222 for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
223 if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
224 // Non ASCII character, append as is.
225 result.push_back(escaped_text[i]);
226 continue;
229 unsigned char first_byte;
230 if (UnescapeUnsignedCharAtIndex(escaped_text, i, &first_byte)) {
231 // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi
232 // control characters are not allowed to appear unescaped in URLs:
234 // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E)
235 // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F)
236 // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)
237 // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)
238 // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)
239 // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)
240 // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)
242 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC
243 // 3987 above has since added some new BiDi control characters.
244 // http://www.unicode.org/reports/tr9
246 // U+061C ARABIC LETTER MARK (%D8%9C)
247 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)
248 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)
249 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)
250 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)
252 // The following spoofable characters are also banned, because they could
253 // be used to imitate parts of a web browser's UI.
255 // U+1F50F LOCK WITH INK PEN (%F0%9F%94%8F)
256 // U+1F510 CLOSED LOCK WITH KEY (%F0%9F%94%90)
257 // U+1F512 LOCK (%F0%9F%94%92)
258 // U+1F513 OPEN LOCK (%F0%9F%94%93)
260 // However, some schemes such as data: and file: need to parse the exact
261 // binary data when loading the URL. For that reason,
262 // SPOOFING_AND_CONTROL_CHARS allows unescaping BiDi control characters.
263 // DO NOT use SPOOFING_AND_CONTROL_CHARS if the parsed URL is going to be
264 // displayed in the UI.
265 if (!(rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)) {
266 if (HasArabicLanguageMarkAtIndex(escaped_text, first_byte, i)) {
267 // Keep Arabic Language Mark escaped.
268 result.append(escaped_text, i, 6);
269 i += 5;
270 continue;
272 if (HasThreeByteBidiControlCharAtIndex(escaped_text, first_byte, i)) {
273 // Keep BiDi control char escaped.
274 result.append(escaped_text, i, 9);
275 i += 8;
276 continue;
278 if (HasFourByteBannedCharAtIndex(escaped_text, first_byte, i)) {
279 // Keep banned char escaped.
280 result.append(escaped_text, i, 12);
281 i += 11;
282 continue;
286 if (first_byte >= 0x80 || // Unescape all high-bit characters.
287 // For 7-bit characters, the lookup table tells us all valid chars.
288 (kUrlUnescape[first_byte] ||
289 // ...and we allow some additional unescaping when flags are set.
290 (first_byte == ' ' && (rules & UnescapeRule::SPACES)) ||
291 // Allow any of the prohibited but non-control characters when
292 // we're doing "special" chars.
293 (first_byte > ' ' && (rules & UnescapeRule::URL_SPECIAL_CHARS)) ||
294 // Additionally allow non-display characters if requested.
295 (first_byte < ' ' &&
296 (rules & UnescapeRule::SPOOFING_AND_CONTROL_CHARS)))) {
297 // Use the unescaped version of the character.
298 if (adjustments)
299 adjustments->push_back(base::OffsetAdjuster::Adjustment(i, 3, 1));
300 result.push_back(first_byte);
301 i += 2;
302 } else {
303 // Keep escaped. Append a percent and we'll get the following two
304 // digits on the next loops through.
305 result.push_back('%');
307 } else if ((rules & UnescapeRule::REPLACE_PLUS_WITH_SPACE) &&
308 escaped_text[i] == '+') {
309 result.push_back(' ');
310 } else {
311 // Normal case for unescaped characters.
312 result.push_back(escaped_text[i]);
316 return result;
// Appends |c| to |output|, replacing it with the corresponding HTML entity if
// it is one of < > & " or '.
template <class str>
void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
  static const struct {
    char key;
    const char* replacement;
  } kCharsToEscape[] = {
    { '<', "&lt;" },
    { '>', "&gt;" },
    { '&', "&amp;" },
    { '"', "&quot;" },
    { '\'', "&#39;" },
  };

  for (size_t k = 0; k < arraysize(kCharsToEscape); ++k) {
    if (c != kCharsToEscape[k].key)
      continue;
    // Found an entity for |c|: emit it character by character and stop.
    for (const char* p = kCharsToEscape[k].replacement; *p; ++p)
      output->push_back(*p);
    return;
  }

  // No entity applies; the character is copied through unchanged.
  output->push_back(c);
}
// Returns a copy of |input| in which each character has been passed through
// AppendEscapedCharForHTMLImpl().
template <class str>
str EscapeForHTMLImpl(const str& input) {
  str result;
  result.reserve(input.size());  // Optimize for no escaping.

  const size_t length = input.size();
  for (size_t pos = 0; pos < length; ++pos)
    AppendEscapedCharForHTMLImpl(input[pos], &result);

  return result;
}
// Set of characters escaped by EscapeQueryParamValue().
355 // Everything except alphanumerics and !'()*-._~
356 // See RFC 2396 for the list of reserved characters.
357 static const Charmap kQueryCharmap = {{
358 0xffffffffL, 0xfc00987dL, 0x78000001L, 0xb8000001L,
359 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
// Set of characters escaped by EscapePath():
362 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|}
363 static const Charmap kPathCharmap = {{
364 0xffffffffL, 0xd400002dL, 0x78000000L, 0xb8000001L,
365 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
368 #if defined(OS_MACOSX)
// Set of characters escaped by EscapeNSURLPrecursor():
369 // non-printable, non-7bit, and (including space) "#%<>[\]^`{|}
370 static const Charmap kNSURLCharmap = {{
371 0xffffffffL, 0x5000002dL, 0x78000000L, 0xb8000001L,
372 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
374 #endif // defined(OS_MACOSX)
// Set of characters escaped by EscapeUrlEncodedData(). Note that '!' is not
// in the set (bit 1 of the second word is clear) while '@' is (bit 0 of the
// third word is set):
376 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"@[\]^`{|}
377 static const Charmap kUrlEscape = {{
378 0xffffffffL, 0xf80008fdL, 0x78000001L, 0xb8000001L,
379 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
// Set of characters escaped by EscapeNonASCII(): only bytes 0x80-0xFF, i.e.
382 // non-7bit
383 static const Charmap kNonASCIICharmap = {{
384 0x00000000L, 0x00000000L, 0x00000000L, 0x00000000L,
385 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
// Set of characters escaped by EscapeExternalHandlerValue().
388 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
389 // !'()*-._~#[]
390 static const Charmap kExternalHandlerCharmap = {{
391 0xffffffffL, 0x50000025L, 0x50000000L, 0xb8000001L,
392 0xffffffffL, 0xffffffffL, 0xffffffffL, 0xffffffffL
395 } // namespace
397 std::string EscapeQueryParamValue(const std::string& text, bool use_plus) {
398 return Escape(text, kQueryCharmap, use_plus);
401 std::string EscapePath(const std::string& path) {
402 return Escape(path, kPathCharmap, false);
#if defined(OS_MACOSX)
// Escapes |precursor| with kNSURLCharmap, leaving existing "%XX" sequences
// untouched. Spaces are never turned into '+'.
std::string EscapeNSURLPrecursor(const std::string& precursor) {
  const bool kUsePlus = false;
  const bool kKeepEscaped = true;
  return Escape(precursor, kNSURLCharmap, kUsePlus, kKeepEscaped);
}
#endif  // defined(OS_MACOSX)
411 std::string EscapeUrlEncodedData(const std::string& path, bool use_plus) {
412 return Escape(path, kUrlEscape, use_plus);
415 std::string EscapeNonASCII(const std::string& input) {
416 return Escape(input, kNonASCIICharmap, false);
419 std::string EscapeExternalHandlerValue(const std::string& text) {
420 return Escape(text, kExternalHandlerCharmap, false, true);
423 void AppendEscapedCharForHTML(char c, std::string* output) {
424 AppendEscapedCharForHTMLImpl(c, output);
427 std::string EscapeForHTML(const std::string& input) {
428 return EscapeForHTMLImpl(input);
431 base::string16 EscapeForHTML(const base::string16& input) {
432 return EscapeForHTMLImpl(input);
435 std::string UnescapeURLComponent(const std::string& escaped_text,
436 UnescapeRule::Type rules) {
437 return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL);
440 base::string16 UnescapeURLComponent(const base::string16& escaped_text,
441 UnescapeRule::Type rules) {
442 return UnescapeURLWithAdjustmentsImpl(escaped_text, rules, NULL);
445 base::string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
446 UnescapeRule::Type rules) {
447 return UnescapeAndDecodeUTF8URLComponentWithAdjustments(text, rules, NULL);
450 base::string16 UnescapeAndDecodeUTF8URLComponentWithAdjustments(
451 const std::string& text,
452 UnescapeRule::Type rules,
453 base::OffsetAdjuster::Adjustments* adjustments) {
454 base::string16 result;
455 base::OffsetAdjuster::Adjustments unescape_adjustments;
456 std::string unescaped_url(UnescapeURLWithAdjustmentsImpl(
457 text, rules, &unescape_adjustments));
458 if (base::UTF8ToUTF16WithAdjustments(unescaped_url.data(),
459 unescaped_url.length(),
460 &result, adjustments)) {
461 // Character set looks like it's valid.
462 if (adjustments) {
463 base::OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments,
464 adjustments);
466 return result;
468 // Character set is not valid. Return the escaped version.
469 return base::UTF8ToUTF16WithAdjustments(text, adjustments);
472 base::string16 UnescapeForHTML(const base::string16& input) {
473 static const struct {
474 const char* ampersand_code;
475 const char replacement;
476 } kEscapeToChars[] = {
477 { "&lt;", '<' },
478 { "&gt;", '>' },
479 { "&amp;", '&' },
480 { "&quot;", '"' },
481 { "&#39;", '\''},
484 if (input.find(base::ASCIIToUTF16("&")) == std::string::npos)
485 return input;
487 base::string16 ampersand_chars[arraysize(kEscapeToChars)];
488 base::string16 text(input);
489 for (base::string16::iterator iter = text.begin();
490 iter != text.end(); ++iter) {
491 if (*iter == '&') {
492 // Potential ampersand encode char.
493 size_t index = iter - text.begin();
494 for (size_t i = 0; i < arraysize(kEscapeToChars); i++) {
495 if (ampersand_chars[i].empty()) {
496 ampersand_chars[i] =
497 base::ASCIIToUTF16(kEscapeToChars[i].ampersand_code);
499 if (text.find(ampersand_chars[i], index) == index) {
500 text.replace(iter, iter + ampersand_chars[i].length(),
501 1, kEscapeToChars[i].replacement);
502 break;
507 return text;
510 } // namespace net