1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "net/base/escape.h"
9 #include "base/logging.h"
10 #include "base/memory/scoped_ptr.h"
11 #include "base/strings/string_piece.h"
12 #include "base/strings/string_util.h"
13 #include "base/strings/utf_offset_string_conversions.h"
14 #include "base/strings/utf_string_conversions.h"
20 const char kHexString
[] = "0123456789ABCDEF";
21 inline char IntToHex(int i
) {
22 DCHECK_GE(i
, 0) << i
<< " not a hex value";
23 DCHECK_LE(i
, 15) << i
<< " not a hex value";
27 // A fast bit-vector map for ascii characters.
29 // Internally stores 256 bits in an array of 8 ints.
30 // Does quick bit-flicking to lookup needed characters.
32 bool Contains(unsigned char c
) const {
33 return ((map
[c
>> 5] & (1 << (c
& 31))) != 0);
39 // Given text to escape and a Charmap defining which values to escape,
40 // return an escaped string. If use_plus is true, spaces are converted
41 // to +, otherwise, if spaces are in the charmap, they are converted to
42 // %20. And if keep_escaped is true, %XX will be kept as it is, otherwise, if
43 // '%' is in the charmap, it is converted to %25.
44 std::string
Escape(const std::string
& text
,
45 const Charmap
& charmap
,
47 bool keep_escaped
= false) {
49 escaped
.reserve(text
.length() * 3);
50 for (unsigned int i
= 0; i
< text
.length(); ++i
) {
51 unsigned char c
= static_cast<unsigned char>(text
[i
]);
52 if (use_plus
&& ' ' == c
) {
53 escaped
.push_back('+');
54 } else if (keep_escaped
&& '%' == c
&& i
+ 2 < text
.length() &&
55 base::IsHexDigit(text
[i
+ 1]) && base::IsHexDigit(text
[i
+ 2])) {
56 escaped
.push_back('%');
57 } else if (charmap
.Contains(c
)) {
58 escaped
.push_back('%');
59 escaped
.push_back(IntToHex(c
>> 4));
60 escaped
.push_back(IntToHex(c
& 0xf));
68 // Contains nonzero when the corresponding character is unescapable for normal
69 // URLs. These characters are the ones that may change the parsing of a URL, so
// we don't want to unescape them sometimes.  In many cases we won't want to
// unescape spaces, but that is controlled by parameters to Unescape*.
//
// The basic rule is that we can't unescape anything that would change parsing
74 // like # or ?. We also can't unescape &, =, or + since that could be part of a
75 // query and that could change the server's parsing of the query. Nor can we
76 // unescape \ since src/url/ will convert it to a /.
78 // Lastly, we can't unescape anything that doesn't have a canonical
79 // representation in a URL. This means that unescaping will change the URL, and
80 // you could get different behavior if you copy and paste the URL, or press
81 // enter in the URL bar. The list of characters that fall into this category
82 // are the ones labeled PASS (allow either escaped or unescaped) in the big
83 // lookup table at the top of url/url_canon_path.cc. Also, characters
84 // that have CHAR_QUERY set in url/url_canon_internal.cc but are not
85 // allowed in query strings according to http://www.ietf.org/rfc/rfc3261.txt are
// not unescaped, to avoid turning a valid url according to spec into an
// invalid one.
// Lookup table indexed by 7-bit ASCII code.  A nonzero entry means the
// character may always be unescaped; a zero entry means it is only unescaped
// when the relevant UnescapeRule flag allows it.
const char kUrlUnescape[128] = {
//   NULL, control chars...
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
//  ' '  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /
     0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
//   0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  1,  0,  1,  0,
//   @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O
     0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
//   P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  0,  1,
//   `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o
     0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
//   p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  <DEL>
     1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  0
};
106 // Attempts to unescape the sequence at |index| within |escaped_text|. If
107 // successful, sets |value| to the unescaped value. Returns whether
108 // unescaping succeeded.
109 template<typename STR
>
110 bool UnescapeUnsignedCharAtIndex(const STR
& escaped_text
,
112 unsigned char* value
) {
113 if ((index
+ 2) >= escaped_text
.size())
115 if (escaped_text
[index
] != '%')
117 const typename
STR::value_type
most_sig_digit(
118 static_cast<typename
STR::value_type
>(escaped_text
[index
+ 1]));
119 const typename
STR::value_type
least_sig_digit(
120 static_cast<typename
STR::value_type
>(escaped_text
[index
+ 2]));
121 if (base::IsHexDigit(most_sig_digit
) && base::IsHexDigit(least_sig_digit
)) {
122 *value
= base::HexDigitToInt(most_sig_digit
) * 16 +
123 base::HexDigitToInt(least_sig_digit
);
// Returns true if the escape sequences starting at |index| encode
// U+061C ARABIC LETTER MARK, i.e. "%D8%9C".  |first_byte| is the already
// decoded value of the "%XX" sequence at |index|; the second byte is decoded
// here from the sequence at |index + 3|.
template <typename STR>
bool HasArabicLanguageMarkAtIndex(const STR& escaped_text,
                                  unsigned char first_byte,
                                  size_t index) {
  if (first_byte != 0xD8)
    return false;
  unsigned char second_byte;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
    return false;
  return second_byte == 0x9c;
}
// Returns true if there is a BiDi control char at |index|.  |first_byte| is
// the already decoded value of the "%XX" sequence at |index|.
//
// Matches the three-byte UTF-8 encodings E2 80 {8E, 8F, AA..AE} and
// E2 81 {A6..A9}; the second and third bytes are decoded from the escape
// sequences at |index + 3| and |index + 6|.
template <typename STR>
bool HasThreeByteBidiControlCharAtIndex(const STR& escaped_text,
                                        unsigned char first_byte,
                                        size_t index) {
  if (first_byte != 0xE2)
    return false;
  unsigned char second_byte;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte))
    return false;
  if (second_byte != 0x80 && second_byte != 0x81)
    return false;
  unsigned char third_byte;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte))
    return false;
  if (second_byte == 0x80) {
    return third_byte == 0x8E ||
           third_byte == 0x8F ||
           (third_byte >= 0xAA && third_byte <= 0xAE);
  }
  // |second_byte| is 0x81 here.
  return third_byte >= 0xA6 && third_byte <= 0xA9;
}
// Returns true if there is a four-byte banned char at |index|.  |first_byte|
// is the already decoded value of the "%XX" sequence at |index|; the
// remaining three bytes are decoded from the escape sequences at
// |index + 3|, |index + 6| and |index + 9|.
template <typename STR>
bool HasFourByteBannedCharAtIndex(const STR& escaped_text,
                                  unsigned char first_byte,
                                  size_t index) {
  // The following characters are blacklisted for spoofability concerns.
  // U+1F50F LOCK WITH INK PEN         (%F0%9F%94%8F)
  // U+1F510 CLOSED LOCK WITH KEY      (%F0%9F%94%90)
  // U+1F512 LOCK                      (%F0%9F%94%92)
  // U+1F513 OPEN LOCK                 (%F0%9F%94%93)
  if (first_byte != 0xF0)
    return false;

  unsigned char second_byte;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 3, &second_byte) ||
      second_byte != 0x9F) {
    return false;
  }

  unsigned char third_byte;
  if (!UnescapeUnsignedCharAtIndex(escaped_text, index + 6, &third_byte) ||
      third_byte != 0x94) {
    return false;
  }

  unsigned char fourth_byte;
  return UnescapeUnsignedCharAtIndex(escaped_text, index + 9, &fourth_byte) &&
         (fourth_byte == 0x8F || fourth_byte == 0x90 || fourth_byte == 0x92 ||
          fourth_byte == 0x93);
}
199 // Unescapes |escaped_text| according to |rules|, returning the resulting
200 // string. Fills in an |adjustments| parameter, if non-NULL, so it reflects
201 // the alterations done to the string that are not one-character-to-one-
202 // character. The resulting |adjustments| will always be sorted by increasing
204 template<typename STR
>
205 STR
UnescapeURLWithAdjustmentsImpl(
206 const STR
& escaped_text
,
207 UnescapeRule::Type rules
,
208 base::OffsetAdjuster::Adjustments
* adjustments
) {
210 adjustments
->clear();
211 // Do not unescape anything, return the |escaped_text| text.
212 if (rules
== UnescapeRule::NONE
)
215 // The output of the unescaping is always smaller than the input, so we can
216 // reserve the input size to make sure we have enough buffer and don't have
217 // to allocate in the loop below.
219 result
.reserve(escaped_text
.length());
221 // Locations of adjusted text.
222 for (size_t i
= 0, max
= escaped_text
.size(); i
< max
; ++i
) {
223 if (static_cast<unsigned char>(escaped_text
[i
]) >= 128) {
224 // Non ASCII character, append as is.
225 result
.push_back(escaped_text
[i
]);
229 unsigned char first_byte
;
230 if (UnescapeUnsignedCharAtIndex(escaped_text
, i
, &first_byte
)) {
231 // Per http://tools.ietf.org/html/rfc3987#section-4.1, the following BiDi
232 // control characters are not allowed to appear unescaped in URLs:
234 // U+200E LEFT-TO-RIGHT MARK (%E2%80%8E)
235 // U+200F RIGHT-TO-LEFT MARK (%E2%80%8F)
236 // U+202A LEFT-TO-RIGHT EMBEDDING (%E2%80%AA)
237 // U+202B RIGHT-TO-LEFT EMBEDDING (%E2%80%AB)
238 // U+202C POP DIRECTIONAL FORMATTING (%E2%80%AC)
239 // U+202D LEFT-TO-RIGHT OVERRIDE (%E2%80%AD)
240 // U+202E RIGHT-TO-LEFT OVERRIDE (%E2%80%AE)
242 // Additionally, the Unicode Technical Report (TR9) as referenced by RFC
243 // 3987 above has since added some new BiDi control characters.
244 // http://www.unicode.org/reports/tr9
246 // U+061C ARABIC LETTER MARK (%D8%9C)
247 // U+2066 LEFT-TO-RIGHT ISOLATE (%E2%81%A6)
248 // U+2067 RIGHT-TO-LEFT ISOLATE (%E2%81%A7)
249 // U+2068 FIRST STRONG ISOLATE (%E2%81%A8)
250 // U+2069 POP DIRECTIONAL ISOLATE (%E2%81%A9)
252 // The following spoofable characters are also banned, because they could
253 // be used to imitate parts of a web browser's UI.
255 // U+1F50F LOCK WITH INK PEN (%F0%9F%94%8F)
256 // U+1F510 CLOSED LOCK WITH KEY (%F0%9F%94%90)
257 // U+1F512 LOCK (%F0%9F%94%92)
258 // U+1F513 OPEN LOCK (%F0%9F%94%93)
260 // However, some schemes such as data: and file: need to parse the exact
261 // binary data when loading the URL. For that reason,
262 // SPOOFING_AND_CONTROL_CHARS allows unescaping BiDi control characters.
263 // DO NOT use SPOOFING_AND_CONTROL_CHARS if the parsed URL is going to be
264 // displayed in the UI.
265 if (!(rules
& UnescapeRule::SPOOFING_AND_CONTROL_CHARS
)) {
266 if (HasArabicLanguageMarkAtIndex(escaped_text
, first_byte
, i
)) {
267 // Keep Arabic Language Mark escaped.
268 result
.append(escaped_text
, i
, 6);
272 if (HasThreeByteBidiControlCharAtIndex(escaped_text
, first_byte
, i
)) {
273 // Keep BiDi control char escaped.
274 result
.append(escaped_text
, i
, 9);
278 if (HasFourByteBannedCharAtIndex(escaped_text
, first_byte
, i
)) {
279 // Keep banned char escaped.
280 result
.append(escaped_text
, i
, 12);
286 if (first_byte
>= 0x80 || // Unescape all high-bit characters.
287 // For 7-bit characters, the lookup table tells us all valid chars.
288 (kUrlUnescape
[first_byte
] ||
289 // ...and we allow some additional unescaping when flags are set.
290 (first_byte
== ' ' && (rules
& UnescapeRule::SPACES
)) ||
291 // Allow any of the prohibited but non-control characters when
292 // we're doing "special" chars.
293 (first_byte
> ' ' && (rules
& UnescapeRule::URL_SPECIAL_CHARS
)) ||
294 // Additionally allow non-display characters if requested.
296 (rules
& UnescapeRule::SPOOFING_AND_CONTROL_CHARS
)))) {
297 // Use the unescaped version of the character.
299 adjustments
->push_back(base::OffsetAdjuster::Adjustment(i
, 3, 1));
300 result
.push_back(first_byte
);
303 // Keep escaped. Append a percent and we'll get the following two
304 // digits on the next loops through.
305 result
.push_back('%');
307 } else if ((rules
& UnescapeRule::REPLACE_PLUS_WITH_SPACE
) &&
308 escaped_text
[i
] == '+') {
309 result
.push_back(' ');
311 // Normal case for unescaped characters.
312 result
.push_back(escaped_text
[i
]);
320 void AppendEscapedCharForHTMLImpl(typename
str::value_type c
, str
* output
) {
321 static const struct {
323 const char* replacement
;
324 } kCharsToEscape
[] = {
332 for (k
= 0; k
< arraysize(kCharsToEscape
); ++k
) {
333 if (c
== kCharsToEscape
[k
].key
) {
334 const char* p
= kCharsToEscape
[k
].replacement
;
336 output
->push_back(*p
++);
340 if (k
== arraysize(kCharsToEscape
))
341 output
->push_back(c
);
// Returns a copy of |input| in which every character has been passed through
// AppendEscapedCharForHTMLImpl().  |str| is the string type; callers in this
// file use std::string and base::string16.
template <class str>
str EscapeForHTMLImpl(const str& input) {
  str result;
  result.reserve(input.size());  // Optimize for no escaping.

  for (typename str::const_iterator i = input.begin(); i != input.end(); ++i)
    AppendEscapedCharForHTMLImpl(*i, &result);

  return result;
}
355 // Everything except alphanumerics and !'()*-._~
356 // See RFC 2396 for the list of reserved characters.
357 static const Charmap kQueryCharmap
= {{
358 0xffffffffL
, 0xfc00987dL
, 0x78000001L
, 0xb8000001L
,
359 0xffffffffL
, 0xffffffffL
, 0xffffffffL
, 0xffffffffL
362 // non-printable, non-7bit, and (including space) "#%:<>?[\]^`{|}
363 static const Charmap kPathCharmap
= {{
364 0xffffffffL
, 0xd400002dL
, 0x78000000L
, 0xb8000001L
,
365 0xffffffffL
, 0xffffffffL
, 0xffffffffL
, 0xffffffffL
368 #if defined(OS_MACOSX)
369 // non-printable, non-7bit, and (including space) "#%<>[\]^`{|}
370 static const Charmap kNSURLCharmap
= {{
371 0xffffffffL
, 0x5000002dL
, 0x78000000L
, 0xb8000001L
,
372 0xffffffffL
, 0xffffffffL
, 0xffffffffL
, 0xffffffffL
374 #endif // defined(OS_MACOSX)
376 // non-printable, non-7bit, and (including space) ?>=<;+'&%$#"![\]^`{|}
377 static const Charmap kUrlEscape
= {{
378 0xffffffffL
, 0xf80008fdL
, 0x78000001L
, 0xb8000001L
,
379 0xffffffffL
, 0xffffffffL
, 0xffffffffL
, 0xffffffffL
383 static const Charmap kNonASCIICharmap
= {{
384 0x00000000L
, 0x00000000L
, 0x00000000L
, 0x00000000L
,
385 0xffffffffL
, 0xffffffffL
, 0xffffffffL
, 0xffffffffL
388 // Everything except alphanumerics, the reserved characters(;/?:@&=+$,) and
390 static const Charmap kExternalHandlerCharmap
= {{
391 0xffffffffL
, 0x50000025L
, 0x50000000L
, 0xb8000001L
,
392 0xffffffffL
, 0xffffffffL
, 0xffffffffL
, 0xffffffffL
397 std::string
EscapeQueryParamValue(const std::string
& text
, bool use_plus
) {
398 return Escape(text
, kQueryCharmap
, use_plus
);
401 std::string
EscapePath(const std::string
& path
) {
402 return Escape(path
, kPathCharmap
, false);
#if defined(OS_MACOSX)
// Escapes |precursor| using kNSURLCharmap.  Spaces are not converted to '+',
// and existing %XX sequences are kept as they are (keep_escaped is true).
std::string EscapeNSURLPrecursor(const std::string& precursor) {
  return Escape(precursor, kNSURLCharmap, false, true);
}
#endif  // defined(OS_MACOSX)
411 std::string
EscapeUrlEncodedData(const std::string
& path
, bool use_plus
) {
412 return Escape(path
, kUrlEscape
, use_plus
);
415 std::string
EscapeNonASCII(const std::string
& input
) {
416 return Escape(input
, kNonASCIICharmap
, false);
419 std::string
EscapeExternalHandlerValue(const std::string
& text
) {
420 return Escape(text
, kExternalHandlerCharmap
, false, true);
423 void AppendEscapedCharForHTML(char c
, std::string
* output
) {
424 AppendEscapedCharForHTMLImpl(c
, output
);
427 std::string
EscapeForHTML(const std::string
& input
) {
428 return EscapeForHTMLImpl(input
);
431 base::string16
EscapeForHTML(const base::string16
& input
) {
432 return EscapeForHTMLImpl(input
);
435 std::string
UnescapeURLComponent(const std::string
& escaped_text
,
436 UnescapeRule::Type rules
) {
437 return UnescapeURLWithAdjustmentsImpl(escaped_text
, rules
, NULL
);
440 base::string16
UnescapeURLComponent(const base::string16
& escaped_text
,
441 UnescapeRule::Type rules
) {
442 return UnescapeURLWithAdjustmentsImpl(escaped_text
, rules
, NULL
);
445 base::string16
UnescapeAndDecodeUTF8URLComponent(const std::string
& text
,
446 UnescapeRule::Type rules
) {
447 return UnescapeAndDecodeUTF8URLComponentWithAdjustments(text
, rules
, NULL
);
450 base::string16
UnescapeAndDecodeUTF8URLComponentWithAdjustments(
451 const std::string
& text
,
452 UnescapeRule::Type rules
,
453 base::OffsetAdjuster::Adjustments
* adjustments
) {
454 base::string16 result
;
455 base::OffsetAdjuster::Adjustments unescape_adjustments
;
456 std::string
unescaped_url(UnescapeURLWithAdjustmentsImpl(
457 text
, rules
, &unescape_adjustments
));
458 if (base::UTF8ToUTF16WithAdjustments(unescaped_url
.data(),
459 unescaped_url
.length(),
460 &result
, adjustments
)) {
461 // Character set looks like it's valid.
463 base::OffsetAdjuster::MergeSequentialAdjustments(unescape_adjustments
,
468 // Character set is not valid. Return the escaped version.
469 return base::UTF8ToUTF16WithAdjustments(text
, adjustments
);
472 base::string16
UnescapeForHTML(const base::string16
& input
) {
473 static const struct {
474 const char* ampersand_code
;
475 const char replacement
;
476 } kEscapeToChars
[] = {
484 if (input
.find(base::ASCIIToUTF16("&")) == std::string::npos
)
487 base::string16 ampersand_chars
[arraysize(kEscapeToChars
)];
488 base::string16
text(input
);
489 for (base::string16::iterator iter
= text
.begin();
490 iter
!= text
.end(); ++iter
) {
492 // Potential ampersand encode char.
493 size_t index
= iter
- text
.begin();
494 for (size_t i
= 0; i
< arraysize(kEscapeToChars
); i
++) {
495 if (ampersand_chars
[i
].empty()) {
497 base::ASCIIToUTF16(kEscapeToChars
[i
].ampersand_code
);
499 if (text
.find(ampersand_chars
[i
], index
) == index
) {
500 text
.replace(iter
, iter
+ ampersand_chars
[i
].length(),
501 1, kEscapeToChars
[i
].replacement
);