1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "extensions/browser/api/web_request/form_data_parser.h"
9 #include "base/lazy_instance.h"
10 #include "base/logging.h"
11 #include "base/macros.h"
12 #include "base/strings/string_util.h"
13 #include "base/values.h"
14 #include "net/base/escape.h"
15 #include "net/url_request/url_request.h"
16 #include "third_party/re2/re2/re2.h"
18 using base::DictionaryValue
;
19 using base::ListValue
;
20 using base::StringPiece
;
23 namespace extensions
{
27 const char kContentDisposition
[] = "content-disposition:";
28 const size_t kContentDispositionLength
= arraysize(kContentDisposition
) - 1;
29 // kCharacterPattern is an allowed character in a URL encoding. Definition is
30 // from RFC 1738, end of section 2.2.
31 const char kCharacterPattern
[] =
32 "(?:[a-zA-Z0-9$_.+!*'(),]|-|(?:%[a-fA-F0-9]{2}))";
33 const char kEscapeClosingQuote
[] = "\\\\E";
35 // A wrapper struct for static RE2 objects to be held as LazyInstance.
39 const RE2 transfer_padding_pattern
;
40 const RE2 crlf_pattern
;
41 const RE2 closing_pattern
;
42 const RE2 epilogue_pattern
;
43 const RE2 crlf_free_pattern
;
44 const RE2 preamble_pattern
;
45 const RE2 header_pattern
;
46 const RE2 content_disposition_pattern
;
47 const RE2 name_pattern
;
48 const RE2 value_pattern
;
49 const RE2 unquote_pattern
;
50 const RE2 url_encoded_pattern
;
54 : transfer_padding_pattern("[ \\t]*\\r\\n"),
55 crlf_pattern("\\r\\n"),
56 closing_pattern("--[ \\t]*"),
57 epilogue_pattern("|\\r\\n(?s:.)*"),
58 crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"),
59 preamble_pattern(".+?"),
60 header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
61 content_disposition_pattern(std::string("(?i:") + kContentDisposition
+
63 name_pattern("\\bname=\"([^\"]*)\""),
64 value_pattern("\\bfilename=\"([^\"]*)\""),
65 unquote_pattern(kEscapeClosingQuote
),
66 url_encoded_pattern(std::string("(") + kCharacterPattern
+ "*)=(" +
71 Patterns::~Patterns() {}
73 base::LazyInstance
<Patterns
>::Leaky g_patterns
= LAZY_INSTANCE_INITIALIZER
;
77 // Parses URLencoded forms, see
78 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
79 class FormDataParserUrlEncoded
: public FormDataParser
{
81 FormDataParserUrlEncoded();
82 ~FormDataParserUrlEncoded() override
;
84 // Implementation of FormDataParser.
85 bool AllDataReadOK() override
;
86 bool GetNextNameValue(Result
* result
) override
;
87 bool SetSource(base::StringPiece source
) override
;
90 // Returns the pattern to match a single name-value pair. This could be even
91 // static, but then we would have to spend more code on initializing the
92 // cached pointer to g_patterns.Get().
93 const RE2
& pattern() const {
94 return patterns_
->url_encoded_pattern
;
97 // Auxiliary constant for using RE2. Number of arguments for parsing
98 // name-value pairs (one for name, one for value).
99 static const size_t args_size_
= 2u;
100 static const net::UnescapeRule::Type unescape_rules_
;
102 re2::StringPiece source_
;
104 bool source_malformed_
;
106 // Auxiliary store for using RE2.
109 const RE2::Arg arg_name_
;
110 const RE2::Arg arg_value_
;
111 const RE2::Arg
* args_
[args_size_
];
113 // Caching the pointer to g_patterns.Get().
114 const Patterns
* patterns_
;
116 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded
);
119 // The following class, FormDataParserMultipart, parses forms encoded as
120 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
121 // encoding) and 5322 (MIME-headers).
123 // Implementation details
// The original grammar from RFC 2046 is this, "multipart-body" being the root
// non-terminal:
128 // boundary := 0*69<bchars> bcharsnospace
129 // bchars := bcharsnospace / " "
130 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
131 // / "-" / "." / "/" / ":" / "=" / "?"
132 // dash-boundary := "--" boundary
// multipart-body := [preamble CRLF]
//                   dash-boundary transport-padding CRLF
//                   body-part *encapsulation
//                   close-delimiter transport-padding
//                   [CRLF epilogue]
138 // transport-padding := *LWSP-char
139 // encapsulation := delimiter transport-padding CRLF body-part
140 // delimiter := CRLF dash-boundary
141 // close-delimiter := delimiter "--"
142 // preamble := discard-text
143 // epilogue := discard-text
144 // discard-text := *(*text CRLF) *text
145 // body-part := MIME-part-headers [CRLF *OCTET]
146 // OCTET := <any 0-255 octet value>
148 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,
149 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the
150 // English alphabet, respectively.
151 // The non-terminal "text" is presumably just any text, excluding line breaks.
152 // The non-terminal "LWSP-char" is not directly defined in the original grammar
153 // but it means "linear whitespace", which is a space or a horizontal tab.
154 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
155 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
157 // MIME-part-headers := field-name ":" unstructured CRLF
158 // field-name := 1*ftext
159 // ftext := %d33-57 / ; Printable US-ASCII
160 // %d59-126 ; characters not including ":".
161 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
162 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
163 // "CRLF<horizontal tab>", which serve for "folding".
// The FormDataParserMultipart class reads the input source and tries to parse
// it according to the grammar above, rooted at the "multipart-body"
// non-terminal.
167 // This happens in stages:
169 // 1. The optional preamble and the initial dash-boundary with transport padding
170 // and a CRLF are read and ignored.
172 // 2. Repeatedly each body part is read. The body parts can either serve to
173 // upload a file, or just a string of bytes.
// 2.a. The headers of that part are searched for the "content-disposition"
//      header, which contains the name of the value represented by that body
//      part. If the body-part is for file upload, that header also contains a
//      filename.
178 // 2.b. The "*OCTET" part of the body part is then read and passed as the value
179 // of the name-value pair for body parts representing a string of bytes.
180 // For body parts for uploading a file the "*OCTET" part is just ignored
181 // and the filename is used for value instead.
183 // 3. The final close-delimiter and epilogue are read and ignored.
186 // This parser supports sources split into multiple chunks. Therefore SetSource
187 // can be called multiple times if the source is spread over several chunks.
188 // However, the split may only occur inside a body part, right after the
189 // trailing CRLF of headers.
190 class FormDataParserMultipart
: public FormDataParser
{
192 explicit FormDataParserMultipart(const std::string
& boundary_separator
);
193 ~FormDataParserMultipart() override
;
195 // Implementation of FormDataParser.
196 bool AllDataReadOK() override
;
197 bool GetNextNameValue(Result
* result
) override
;
198 bool SetSource(base::StringPiece source
) override
;
202 STATE_INIT
, // No input read yet.
203 STATE_READY
, // Ready to call GetNextNameValue.
204 STATE_FINISHED
, // Read the input until the end.
205 STATE_SUSPEND
, // Waiting until a new |source_| is set.
209 // Produces a regexp to match the string "--" + |literal|. The idea is to
210 // represent "--" + |literal| as a "quoted pattern", a verbatim copy enclosed
211 // in "\\Q" and "\\E". The only catch is to watch out for occurences of "\\E"
212 // inside |literal|. Those must be excluded from the quote and the backslash
213 // doubly escaped. For example, for literal == "abc\\Edef" the result is
214 // "\\Q--abc\\E\\\\E\\Qdef\\E".
215 static std::string
CreateBoundaryPatternFromLiteral(
216 const std::string
& literal
);
218 // Tests whether |input| has a prefix matching |pattern|.
219 static bool StartsWithPattern(const re2::StringPiece
& input
,
222 // If |source_| starts with a header, seeks |source_| beyond the header. If
223 // the header is Content-Disposition, extracts |name| from "name=" and
224 // possibly |value| from "filename=" fields of that header. Only if the
225 // "name" or "filename" fields are found, then |name| or |value| are touched.
226 // Returns true iff |source_| is seeked forward. Sets |value_assigned|
227 // to true iff |value| has been assigned to.
228 bool TryReadHeader(base::StringPiece
* name
,
229 base::StringPiece
* value
,
230 bool* value_assigned
);
232 // Helper to GetNextNameValue. Expects that the input starts with a data
233 // portion of a body part. An attempt is made to read the input until the end
234 // of that body part. If |data| is not NULL, it is set to contain the data
235 // portion. Returns true iff the reading was successful.
236 bool FinishReadingPart(base::StringPiece
* data
);
238 // These methods could be even static, but then we would have to spend more
239 // code on initializing the cached pointer to g_patterns.Get().
240 const RE2
& transfer_padding_pattern() const {
241 return patterns_
->transfer_padding_pattern
;
243 const RE2
& crlf_pattern() const {
244 return patterns_
->crlf_pattern
;
246 const RE2
& closing_pattern() const {
247 return patterns_
->closing_pattern
;
249 const RE2
& epilogue_pattern() const {
250 return patterns_
->epilogue_pattern
;
252 const RE2
& crlf_free_pattern() const {
253 return patterns_
->crlf_free_pattern
;
255 const RE2
& preamble_pattern() const {
256 return patterns_
->preamble_pattern
;
258 const RE2
& header_pattern() const {
259 return patterns_
->header_pattern
;
261 const RE2
& content_disposition_pattern() const {
262 return patterns_
->content_disposition_pattern
;
264 const RE2
& name_pattern() const {
265 return patterns_
->name_pattern
;
267 const RE2
& value_pattern() const {
268 return patterns_
->value_pattern
;
270 // However, this is used in a static method so it needs to be static.
271 static const RE2
& unquote_pattern() {
272 return g_patterns
.Get().unquote_pattern
; // No caching g_patterns here.
275 const RE2 dash_boundary_pattern_
;
277 // Because of initialisation dependency, |state_| needs to be declared after
278 // |dash_boundary_pattern_|.
281 // The parsed message can be split into multiple sources which we read
283 re2::StringPiece source_
;
285 // Caching the pointer to g_patterns.Get().
286 const Patterns
* patterns_
;
288 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart
);
291 FormDataParser::Result::Result() {}
292 FormDataParser::Result::~Result() {}
294 FormDataParser::~FormDataParser() {}
297 scoped_ptr
<FormDataParser
> FormDataParser::Create(
298 const net::URLRequest
& request
) {
300 const bool found
= request
.extra_request_headers().GetHeader(
301 net::HttpRequestHeaders::kContentType
, &value
);
302 return CreateFromContentTypeHeader(found
? &value
: NULL
);
306 scoped_ptr
<FormDataParser
> FormDataParser::CreateFromContentTypeHeader(
307 const std::string
* content_type_header
) {
308 enum ParserChoice
{URL_ENCODED
, MULTIPART
, ERROR_CHOICE
};
309 ParserChoice choice
= ERROR_CHOICE
;
310 std::string boundary
;
312 if (content_type_header
== NULL
) {
313 choice
= URL_ENCODED
;
315 const std::string
content_type(
316 content_type_header
->substr(0, content_type_header
->find(';')));
318 if (base::EqualsCaseInsensitiveASCII(content_type
,
319 "application/x-www-form-urlencoded")) {
320 choice
= URL_ENCODED
;
321 } else if (base::EqualsCaseInsensitiveASCII(content_type
,
322 "multipart/form-data")) {
323 static const char kBoundaryString
[] = "boundary=";
324 size_t offset
= content_type_header
->find(kBoundaryString
);
325 if (offset
== std::string::npos
) {
327 return scoped_ptr
<FormDataParser
>();
329 offset
+= sizeof(kBoundaryString
) - 1;
330 boundary
= content_type_header
->substr(
331 offset
, content_type_header
->find(';', offset
));
332 if (!boundary
.empty())
336 // Other cases are unparseable, including when |content_type| is "text/plain".
340 return scoped_ptr
<FormDataParser
>(new FormDataParserUrlEncoded());
342 return scoped_ptr
<FormDataParser
>(new FormDataParserMultipart(boundary
));
344 return scoped_ptr
<FormDataParser
>();
346 NOTREACHED(); // Some compilers do not believe this is unreachable.
347 return scoped_ptr
<FormDataParser
>();
350 FormDataParser::FormDataParser() {}
352 const net::UnescapeRule::Type
FormDataParserUrlEncoded::unescape_rules_
=
353 net::UnescapeRule::URL_SPECIAL_CHARS
|
354 net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS
| net::UnescapeRule::SPACES
|
355 net::UnescapeRule::REPLACE_PLUS_WITH_SPACE
;
357 FormDataParserUrlEncoded::FormDataParserUrlEncoded()
360 source_malformed_(false),
363 patterns_(g_patterns
.Pointer()) {
364 args_
[0] = &arg_name_
;
365 args_
[1] = &arg_value_
;
368 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}
370 bool FormDataParserUrlEncoded::AllDataReadOK() {
371 // All OK means we read the whole source.
372 return source_set_
&& source_
.empty() && !source_malformed_
;
375 bool FormDataParserUrlEncoded::GetNextNameValue(Result
* result
) {
376 if (!source_set_
|| source_malformed_
)
379 bool success
= RE2::ConsumeN(&source_
, pattern(), args_
, args_size_
);
381 result
->set_name(net::UnescapeURLComponent(name_
, unescape_rules_
));
382 result
->set_value(net::UnescapeURLComponent(value_
, unescape_rules_
));
384 if (source_
.length() > 0) {
385 if (source_
[0] == '&')
386 source_
.remove_prefix(1); // Remove the leading '&'.
388 source_malformed_
= true; // '&' missing between two name-value pairs.
390 return success
&& !source_malformed_
;
393 bool FormDataParserUrlEncoded::SetSource(base::StringPiece source
) {
395 return false; // We do not allow multiple sources for this parser.
396 source_
.set(source
.data(), source
.size());
398 source_malformed_
= false;
403 std::string
FormDataParserMultipart::CreateBoundaryPatternFromLiteral(
404 const std::string
& literal
) {
405 static const char quote
[] = "\\Q";
406 static const char unquote
[] = "\\E";
408 // The result always starts with opening the qoute and then "--".
409 std::string
result("\\Q--");
411 // This StringPiece is used below to record the next occurrence of "\\E" in
413 re2::StringPiece
seek_unquote(literal
);
414 const char* copy_start
= literal
.data();
415 size_t copy_length
= literal
.size();
417 // Find all "\\E" in |literal| and exclude them from the \Q...\E quote.
418 while (RE2::FindAndConsume(&seek_unquote
, unquote_pattern())) {
419 copy_length
= seek_unquote
.data() - copy_start
;
420 result
.append(copy_start
, copy_length
);
421 result
.append(kEscapeClosingQuote
);
422 result
.append(quote
);
423 copy_start
= seek_unquote
.data();
426 // Finish the last \Q...\E quote.
427 copy_length
= (literal
.data() + literal
.size()) - copy_start
;
428 result
.append(copy_start
, copy_length
);
429 result
.append(unquote
);
434 bool FormDataParserMultipart::StartsWithPattern(const re2::StringPiece
& input
,
435 const RE2
& pattern
) {
436 return pattern
.Match(input
, 0, input
.size(), RE2::ANCHOR_START
, NULL
, 0);
439 FormDataParserMultipart::FormDataParserMultipart(
440 const std::string
& boundary_separator
)
441 : dash_boundary_pattern_(
442 CreateBoundaryPatternFromLiteral(boundary_separator
)),
443 state_(dash_boundary_pattern_
.ok() ? STATE_INIT
: STATE_ERROR
),
444 patterns_(g_patterns
.Pointer()) {}
446 FormDataParserMultipart::~FormDataParserMultipart() {}
448 bool FormDataParserMultipart::AllDataReadOK() {
449 return state_
== STATE_FINISHED
;
452 bool FormDataParserMultipart::FinishReadingPart(base::StringPiece
* data
) {
453 const char* data_start
= source_
.data();
454 while (!StartsWithPattern(source_
, dash_boundary_pattern_
)) {
455 if (!RE2::Consume(&source_
, crlf_free_pattern()) ||
456 !RE2::Consume(&source_
, crlf_pattern())) {
457 state_
= STATE_ERROR
;
462 if (source_
.data() == data_start
) {
463 // No data in this body part.
464 state_
= STATE_ERROR
;
467 // Subtract 2 for the trailing "\r\n".
468 data
->set(data_start
, source_
.data() - data_start
- 2);
471 // Finally, read the dash-boundary and either skip to the next body part, or
472 // finish reading the source.
473 CHECK(RE2::Consume(&source_
, dash_boundary_pattern_
));
474 if (StartsWithPattern(source_
, closing_pattern())) {
475 CHECK(RE2::Consume(&source_
, closing_pattern()));
476 if (RE2::Consume(&source_
, epilogue_pattern()))
477 state_
= STATE_FINISHED
;
479 state_
= STATE_ERROR
;
480 } else { // Next body part ahead.
481 if (!RE2::Consume(&source_
, transfer_padding_pattern()))
482 state_
= STATE_ERROR
;
484 return state_
!= STATE_ERROR
;
487 bool FormDataParserMultipart::GetNextNameValue(Result
* result
) {
488 if (source_
.empty() || state_
!= STATE_READY
)
491 // 1. Read body-part headers.
492 base::StringPiece name
;
493 base::StringPiece value
;
494 bool value_assigned
= false;
495 bool value_assigned_temp
;
496 while (TryReadHeader(&name
, &value
, &value_assigned_temp
))
497 value_assigned
|= value_assigned_temp
;
498 if (name
.empty() || state_
== STATE_ERROR
) {
499 state_
= STATE_ERROR
;
503 // 2. Read the trailing CRLF after headers.
504 if (!RE2::Consume(&source_
, crlf_pattern())) {
505 state_
= STATE_ERROR
;
509 // 3. Read the data of this body part, i.e., everything until the first
512 if (value_assigned
&& source_
.empty()) { // Wait for a new source?
514 state_
= STATE_SUSPEND
;
516 return_value
= FinishReadingPart(value_assigned
? NULL
: &value
);
519 std::string unescaped_name
= net::UnescapeURLComponent(
520 name
.as_string(), net::UnescapeRule::URL_SPECIAL_CHARS
|
521 net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS
);
522 result
->set_name(unescaped_name
);
523 result
->set_value(value
);
528 bool FormDataParserMultipart::SetSource(base::StringPiece source
) {
529 if (source
.data() == NULL
|| !source_
.empty())
531 source_
.set(source
.data(), source
.size());
535 // Seek behind the preamble.
536 while (!StartsWithPattern(source_
, dash_boundary_pattern_
)) {
537 if (!RE2::Consume(&source_
, preamble_pattern())) {
538 state_
= STATE_ERROR
;
542 // Read dash-boundary, transfer padding, and CRLF.
543 if (state_
!= STATE_ERROR
) {
544 if (!RE2::Consume(&source_
, dash_boundary_pattern_
) ||
545 !RE2::Consume(&source_
, transfer_padding_pattern()))
546 state_
= STATE_ERROR
;
548 state_
= STATE_READY
;
551 case STATE_READY
: // Nothing to do.
554 state_
= FinishReadingPart(NULL
) ? STATE_READY
: STATE_ERROR
;
557 state_
= STATE_ERROR
;
559 return state_
!= STATE_ERROR
;
562 bool FormDataParserMultipart::TryReadHeader(base::StringPiece
* name
,
563 base::StringPiece
* value
,
564 bool* value_assigned
) {
565 *value_assigned
= false;
566 const char* header_start
= source_
.data();
567 if (!RE2::Consume(&source_
, header_pattern()))
569 // (*) After this point we must return true, because we consumed one header.
571 // Subtract 2 for the trailing "\r\n".
572 re2::StringPiece
header(header_start
, source_
.data() - header_start
- 2);
574 if (!StartsWithPattern(header
, content_disposition_pattern()))
575 return true; // Skip headers that don't describe the content-disposition.
577 re2::StringPiece groups
[2];
579 if (!name_pattern().Match(header
,
580 kContentDispositionLength
, header
.size(),
581 RE2::UNANCHORED
, groups
, 2)) {
582 state_
= STATE_ERROR
;
583 return true; // See (*) for why true.
585 name
->set(groups
[1].data(), groups
[1].size());
587 if (value_pattern().Match(header
,
588 kContentDispositionLength
, header
.size(),
589 RE2::UNANCHORED
, groups
, 2)) {
590 value
->set(groups
[1].data(), groups
[1].size());
591 *value_assigned
= true;
596 } // namespace extensions