Add testing/scripts/OWNERS
[chromium-blink-merge.git] / extensions / browser / api / web_request / form_data_parser.cc
blob2e962dde1deb7bf6490d0afa5cb77ab3db9c43d4
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "extensions/browser/api/web_request/form_data_parser.h"
7 #include <vector>
9 #include "base/lazy_instance.h"
10 #include "base/logging.h"
11 #include "base/macros.h"
12 #include "base/strings/string_util.h"
13 #include "base/values.h"
14 #include "net/base/escape.h"
15 #include "net/url_request/url_request.h"
16 #include "third_party/re2/re2/re2.h"
18 using base::DictionaryValue;
19 using base::ListValue;
20 using base::StringPiece;
21 using re2::RE2;
23 namespace extensions {
25 namespace {
27 const char kContentDisposition[] = "content-disposition:";
28 const size_t kContentDispositionLength = arraysize(kContentDisposition) - 1;
29 // kCharacterPattern is an allowed character in a URL encoding. Definition is
30 // from RFC 1738, end of section 2.2.
31 const char kCharacterPattern[] =
32 "(?:[a-zA-Z0-9$_.+!*'(),]|-|(?:%[a-fA-F0-9]{2}))";
33 const char kEscapeClosingQuote[] = "\\\\E";
35 // A wrapper struct for static RE2 objects to be held as LazyInstance.
36 struct Patterns {
37 Patterns();
38 ~Patterns();
39 const RE2 transfer_padding_pattern;
40 const RE2 crlf_pattern;
41 const RE2 closing_pattern;
42 const RE2 epilogue_pattern;
43 const RE2 crlf_free_pattern;
44 const RE2 preamble_pattern;
45 const RE2 header_pattern;
46 const RE2 content_disposition_pattern;
47 const RE2 name_pattern;
48 const RE2 value_pattern;
49 const RE2 unquote_pattern;
50 const RE2 url_encoded_pattern;
53 Patterns::Patterns()
54 : transfer_padding_pattern("[ \\t]*\\r\\n"),
55 crlf_pattern("\\r\\n"),
56 closing_pattern("--[ \\t]*"),
57 epilogue_pattern("|\\r\\n(?s:.)*"),
58 crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"),
59 preamble_pattern(".+?"),
60 header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
61 content_disposition_pattern(std::string("(?i:") + kContentDisposition +
62 ")"),
63 name_pattern("\\bname=\"([^\"]*)\""),
64 value_pattern("\\bfilename=\"([^\"]*)\""),
65 unquote_pattern(kEscapeClosingQuote),
66 url_encoded_pattern(std::string("(") + kCharacterPattern + "*)=(" +
67 kCharacterPattern +
68 "*)") {
71 Patterns::~Patterns() {}
73 base::LazyInstance<Patterns>::Leaky g_patterns = LAZY_INSTANCE_INITIALIZER;
75 } // namespace
77 // Parses URLencoded forms, see
78 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
79 class FormDataParserUrlEncoded : public FormDataParser {
80 public:
81 FormDataParserUrlEncoded();
82 ~FormDataParserUrlEncoded() override;
84 // Implementation of FormDataParser.
85 bool AllDataReadOK() override;
86 bool GetNextNameValue(Result* result) override;
87 bool SetSource(base::StringPiece source) override;
89 private:
90 // Returns the pattern to match a single name-value pair. This could be even
91 // static, but then we would have to spend more code on initializing the
92 // cached pointer to g_patterns.Get().
93 const RE2& pattern() const {
94 return patterns_->url_encoded_pattern;
97 // Auxiliary constant for using RE2. Number of arguments for parsing
98 // name-value pairs (one for name, one for value).
99 static const size_t args_size_ = 2u;
100 static const net::UnescapeRule::Type unescape_rules_;
102 re2::StringPiece source_;
103 bool source_set_;
104 bool source_malformed_;
106 // Auxiliary store for using RE2.
107 std::string name_;
108 std::string value_;
109 const RE2::Arg arg_name_;
110 const RE2::Arg arg_value_;
111 const RE2::Arg* args_[args_size_];
113 // Caching the pointer to g_patterns.Get().
114 const Patterns* patterns_;
116 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
119 // The following class, FormDataParserMultipart, parses forms encoded as
120 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
121 // encoding) and 5322 (MIME-headers).
123 // Implementation details
125 // The original grammar from RFC 2046 is this, "multipart-body" being the root
126 // non-terminal:
128 // boundary := 0*69<bchars> bcharsnospace
129 // bchars := bcharsnospace / " "
130 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
131 // / "-" / "." / "/" / ":" / "=" / "?"
132 // dash-boundary := "--" boundary
133 // multipart-body := [preamble CRLF]
134 // dash-boundary transport-padding CRLF
135 // body-part *encapsulation
136 // close-delimiter transport-padding
137 // [CRLF epilogue]
138 // transport-padding := *LWSP-char
139 // encapsulation := delimiter transport-padding CRLF body-part
140 // delimiter := CRLF dash-boundary
141 // close-delimiter := delimiter "--"
142 // preamble := discard-text
143 // epilogue := discard-text
144 // discard-text := *(*text CRLF) *text
145 // body-part := MIME-part-headers [CRLF *OCTET]
146 // OCTET := <any 0-255 octet value>
148 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,
149 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the
150 // English alphabet, respectively.
151 // The non-terminal "text" is presumably just any text, excluding line breaks.
152 // The non-terminal "LWSP-char" is not directly defined in the original grammar
153 // but it means "linear whitespace", which is a space or a horizontal tab.
154 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
155 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
157 // MIME-part-headers := field-name ":" unstructured CRLF
158 // field-name := 1*ftext
159 // ftext := %d33-57 / ; Printable US-ASCII
160 // %d59-126 ; characters not including ":".
161 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
162 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
163 // "CRLF<horizontal tab>", which serve for "folding".
// The FormDataParserMultipart class reads the input source and tries to parse
// it according to the grammar above, rooted at the "multipart-body"
// non-terminal.
167 // This happens in stages:
169 // 1. The optional preamble and the initial dash-boundary with transport padding
170 // and a CRLF are read and ignored.
172 // 2. Repeatedly each body part is read. The body parts can either serve to
173 // upload a file, or just a string of bytes.
174 // 2.a. The headers of that part are searched for the "content-disposition"
175 // header, which contains the name of the value represented by that body
176 // part. If the body-part is for file upload, that header also contains a
177 // filename.
178 // 2.b. The "*OCTET" part of the body part is then read and passed as the value
179 // of the name-value pair for body parts representing a string of bytes.
180 // For body parts for uploading a file the "*OCTET" part is just ignored
181 // and the filename is used for value instead.
183 // 3. The final close-delimiter and epilogue are read and ignored.
185 // IMPORTANT NOTE
186 // This parser supports sources split into multiple chunks. Therefore SetSource
187 // can be called multiple times if the source is spread over several chunks.
188 // However, the split may only occur inside a body part, right after the
189 // trailing CRLF of headers.
190 class FormDataParserMultipart : public FormDataParser {
191 public:
192 explicit FormDataParserMultipart(const std::string& boundary_separator);
193 ~FormDataParserMultipart() override;
195 // Implementation of FormDataParser.
196 bool AllDataReadOK() override;
197 bool GetNextNameValue(Result* result) override;
198 bool SetSource(base::StringPiece source) override;
200 private:
201 enum State {
202 STATE_INIT, // No input read yet.
203 STATE_READY, // Ready to call GetNextNameValue.
204 STATE_FINISHED, // Read the input until the end.
205 STATE_SUSPEND, // Waiting until a new |source_| is set.
206 STATE_ERROR
209 // Produces a regexp to match the string "--" + |literal|. The idea is to
210 // represent "--" + |literal| as a "quoted pattern", a verbatim copy enclosed
211 // in "\\Q" and "\\E". The only catch is to watch out for occurences of "\\E"
212 // inside |literal|. Those must be excluded from the quote and the backslash
213 // doubly escaped. For example, for literal == "abc\\Edef" the result is
214 // "\\Q--abc\\E\\\\E\\Qdef\\E".
215 static std::string CreateBoundaryPatternFromLiteral(
216 const std::string& literal);
218 // Tests whether |input| has a prefix matching |pattern|.
219 static bool StartsWithPattern(const re2::StringPiece& input,
220 const RE2& pattern);
222 // If |source_| starts with a header, seeks |source_| beyond the header. If
223 // the header is Content-Disposition, extracts |name| from "name=" and
224 // possibly |value| from "filename=" fields of that header. Only if the
225 // "name" or "filename" fields are found, then |name| or |value| are touched.
226 // Returns true iff |source_| is seeked forward. Sets |value_assigned|
227 // to true iff |value| has been assigned to.
228 bool TryReadHeader(base::StringPiece* name,
229 base::StringPiece* value,
230 bool* value_assigned);
232 // Helper to GetNextNameValue. Expects that the input starts with a data
233 // portion of a body part. An attempt is made to read the input until the end
234 // of that body part. If |data| is not NULL, it is set to contain the data
235 // portion. Returns true iff the reading was successful.
236 bool FinishReadingPart(base::StringPiece* data);
238 // These methods could be even static, but then we would have to spend more
239 // code on initializing the cached pointer to g_patterns.Get().
240 const RE2& transfer_padding_pattern() const {
241 return patterns_->transfer_padding_pattern;
243 const RE2& crlf_pattern() const {
244 return patterns_->crlf_pattern;
246 const RE2& closing_pattern() const {
247 return patterns_->closing_pattern;
249 const RE2& epilogue_pattern() const {
250 return patterns_->epilogue_pattern;
252 const RE2& crlf_free_pattern() const {
253 return patterns_->crlf_free_pattern;
255 const RE2& preamble_pattern() const {
256 return patterns_->preamble_pattern;
258 const RE2& header_pattern() const {
259 return patterns_->header_pattern;
261 const RE2& content_disposition_pattern() const {
262 return patterns_->content_disposition_pattern;
264 const RE2& name_pattern() const {
265 return patterns_->name_pattern;
267 const RE2& value_pattern() const {
268 return patterns_->value_pattern;
270 // However, this is used in a static method so it needs to be static.
271 static const RE2& unquote_pattern() {
272 return g_patterns.Get().unquote_pattern; // No caching g_patterns here.
275 const RE2 dash_boundary_pattern_;
277 // Because of initialisation dependency, |state_| needs to be declared after
278 // |dash_boundary_pattern_|.
279 State state_;
281 // The parsed message can be split into multiple sources which we read
282 // sequentially.
283 re2::StringPiece source_;
285 // Caching the pointer to g_patterns.Get().
286 const Patterns* patterns_;
288 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
291 FormDataParser::Result::Result() {}
292 FormDataParser::Result::~Result() {}
294 FormDataParser::~FormDataParser() {}
296 // static
297 scoped_ptr<FormDataParser> FormDataParser::Create(
298 const net::URLRequest& request) {
299 std::string value;
300 const bool found = request.extra_request_headers().GetHeader(
301 net::HttpRequestHeaders::kContentType, &value);
302 return CreateFromContentTypeHeader(found ? &value : NULL);
305 // static
306 scoped_ptr<FormDataParser> FormDataParser::CreateFromContentTypeHeader(
307 const std::string* content_type_header) {
308 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};
309 ParserChoice choice = ERROR_CHOICE;
310 std::string boundary;
312 if (content_type_header == NULL) {
313 choice = URL_ENCODED;
314 } else {
315 const std::string content_type(
316 content_type_header->substr(0, content_type_header->find(';')));
318 if (base::strcasecmp(
319 content_type.c_str(), "application/x-www-form-urlencoded") == 0) {
320 choice = URL_ENCODED;
321 } else if (base::strcasecmp(
322 content_type.c_str(), "multipart/form-data") == 0) {
323 static const char kBoundaryString[] = "boundary=";
324 size_t offset = content_type_header->find(kBoundaryString);
325 if (offset == std::string::npos) {
326 // Malformed header.
327 return scoped_ptr<FormDataParser>();
329 offset += sizeof(kBoundaryString) - 1;
330 boundary = content_type_header->substr(
331 offset, content_type_header->find(';', offset));
332 if (!boundary.empty())
333 choice = MULTIPART;
336 // Other cases are unparseable, including when |content_type| is "text/plain".
338 switch (choice) {
339 case URL_ENCODED:
340 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());
341 case MULTIPART:
342 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));
343 case ERROR_CHOICE:
344 return scoped_ptr<FormDataParser>();
346 NOTREACHED(); // Some compilers do not believe this is unreachable.
347 return scoped_ptr<FormDataParser>();
350 FormDataParser::FormDataParser() {}
352 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =
353 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS |
354 net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;
356 FormDataParserUrlEncoded::FormDataParserUrlEncoded()
357 : source_(NULL),
358 source_set_(false),
359 source_malformed_(false),
360 arg_name_(&name_),
361 arg_value_(&value_),
362 patterns_(g_patterns.Pointer()) {
363 args_[0] = &arg_name_;
364 args_[1] = &arg_value_;
367 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}
369 bool FormDataParserUrlEncoded::AllDataReadOK() {
370 // All OK means we read the whole source.
371 return source_set_ && source_.empty() && !source_malformed_;
374 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {
375 if (!source_set_ || source_malformed_)
376 return false;
378 bool success = RE2::ConsumeN(&source_, pattern(), args_, args_size_);
379 if (success) {
380 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));
381 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));
383 if (source_.length() > 0) {
384 if (source_[0] == '&')
385 source_.remove_prefix(1); // Remove the leading '&'.
386 else
387 source_malformed_ = true; // '&' missing between two name-value pairs.
389 return success && !source_malformed_;
392 bool FormDataParserUrlEncoded::SetSource(base::StringPiece source) {
393 if (source_set_)
394 return false; // We do not allow multiple sources for this parser.
395 source_.set(source.data(), source.size());
396 source_set_ = true;
397 source_malformed_ = false;
398 return true;
401 // static
402 std::string FormDataParserMultipart::CreateBoundaryPatternFromLiteral(
403 const std::string& literal) {
404 static const char quote[] = "\\Q";
405 static const char unquote[] = "\\E";
407 // The result always starts with opening the qoute and then "--".
408 std::string result("\\Q--");
410 // This StringPiece is used below to record the next occurrence of "\\E" in
411 // |literal|.
412 re2::StringPiece seek_unquote(literal);
413 const char* copy_start = literal.data();
414 size_t copy_length = literal.size();
416 // Find all "\\E" in |literal| and exclude them from the \Q...\E quote.
417 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern())) {
418 copy_length = seek_unquote.data() - copy_start;
419 result.append(copy_start, copy_length);
420 result.append(kEscapeClosingQuote);
421 result.append(quote);
422 copy_start = seek_unquote.data();
425 // Finish the last \Q...\E quote.
426 copy_length = (literal.data() + literal.size()) - copy_start;
427 result.append(copy_start, copy_length);
428 result.append(unquote);
429 return result;
432 // static
433 bool FormDataParserMultipart::StartsWithPattern(const re2::StringPiece& input,
434 const RE2& pattern) {
435 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);
438 FormDataParserMultipart::FormDataParserMultipart(
439 const std::string& boundary_separator)
440 : dash_boundary_pattern_(
441 CreateBoundaryPatternFromLiteral(boundary_separator)),
442 state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR),
443 patterns_(g_patterns.Pointer()) {}
445 FormDataParserMultipart::~FormDataParserMultipart() {}
447 bool FormDataParserMultipart::AllDataReadOK() {
448 return state_ == STATE_FINISHED;
451 bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) {
452 const char* data_start = source_.data();
453 while (!StartsWithPattern(source_, dash_boundary_pattern_)) {
454 if (!RE2::Consume(&source_, crlf_free_pattern()) ||
455 !RE2::Consume(&source_, crlf_pattern())) {
456 state_ = STATE_ERROR;
457 return false;
460 if (data != NULL) {
461 if (source_.data() == data_start) {
462 // No data in this body part.
463 state_ = STATE_ERROR;
464 return false;
466 // Subtract 2 for the trailing "\r\n".
467 data->set(data_start, source_.data() - data_start - 2);
470 // Finally, read the dash-boundary and either skip to the next body part, or
471 // finish reading the source.
472 CHECK(RE2::Consume(&source_, dash_boundary_pattern_));
473 if (StartsWithPattern(source_, closing_pattern())) {
474 CHECK(RE2::Consume(&source_, closing_pattern()));
475 if (RE2::Consume(&source_, epilogue_pattern()))
476 state_ = STATE_FINISHED;
477 else
478 state_ = STATE_ERROR;
479 } else { // Next body part ahead.
480 if (!RE2::Consume(&source_, transfer_padding_pattern()))
481 state_ = STATE_ERROR;
483 return state_ != STATE_ERROR;
486 bool FormDataParserMultipart::GetNextNameValue(Result* result) {
487 if (source_.empty() || state_ != STATE_READY)
488 return false;
490 // 1. Read body-part headers.
491 base::StringPiece name;
492 base::StringPiece value;
493 bool value_assigned = false;
494 bool value_assigned_temp;
495 while (TryReadHeader(&name, &value, &value_assigned_temp))
496 value_assigned |= value_assigned_temp;
497 if (name.empty() || state_ == STATE_ERROR) {
498 state_ = STATE_ERROR;
499 return false;
502 // 2. Read the trailing CRLF after headers.
503 if (!RE2::Consume(&source_, crlf_pattern())) {
504 state_ = STATE_ERROR;
505 return false;
508 // 3. Read the data of this body part, i.e., everything until the first
509 // dash-boundary.
510 bool return_value;
511 if (value_assigned && source_.empty()) { // Wait for a new source?
512 return_value = true;
513 state_ = STATE_SUSPEND;
514 } else {
515 return_value = FinishReadingPart(value_assigned ? NULL : &value);
518 std::string unescaped_name = net::UnescapeURLComponent(
519 name.as_string(),
520 net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS);
521 result->set_name(unescaped_name);
522 result->set_value(value);
524 return return_value;
527 bool FormDataParserMultipart::SetSource(base::StringPiece source) {
528 if (source.data() == NULL || !source_.empty())
529 return false;
530 source_.set(source.data(), source.size());
532 switch (state_) {
533 case STATE_INIT:
534 // Seek behind the preamble.
535 while (!StartsWithPattern(source_, dash_boundary_pattern_)) {
536 if (!RE2::Consume(&source_, preamble_pattern())) {
537 state_ = STATE_ERROR;
538 break;
541 // Read dash-boundary, transfer padding, and CRLF.
542 if (state_ != STATE_ERROR) {
543 if (!RE2::Consume(&source_, dash_boundary_pattern_) ||
544 !RE2::Consume(&source_, transfer_padding_pattern()))
545 state_ = STATE_ERROR;
546 else
547 state_ = STATE_READY;
549 break;
550 case STATE_READY: // Nothing to do.
551 break;
552 case STATE_SUSPEND:
553 state_ = FinishReadingPart(NULL) ? STATE_READY : STATE_ERROR;
554 break;
555 default:
556 state_ = STATE_ERROR;
558 return state_ != STATE_ERROR;
561 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,
562 base::StringPiece* value,
563 bool* value_assigned) {
564 *value_assigned = false;
565 const char* header_start = source_.data();
566 if (!RE2::Consume(&source_, header_pattern()))
567 return false;
568 // (*) After this point we must return true, because we consumed one header.
570 // Subtract 2 for the trailing "\r\n".
571 re2::StringPiece header(header_start, source_.data() - header_start - 2);
573 if (!StartsWithPattern(header, content_disposition_pattern()))
574 return true; // Skip headers that don't describe the content-disposition.
576 re2::StringPiece groups[2];
578 if (!name_pattern().Match(header,
579 kContentDispositionLength, header.size(),
580 RE2::UNANCHORED, groups, 2)) {
581 state_ = STATE_ERROR;
582 return true; // See (*) for why true.
584 name->set(groups[1].data(), groups[1].size());
586 if (value_pattern().Match(header,
587 kContentDispositionLength, header.size(),
588 RE2::UNANCHORED, groups, 2)) {
589 value->set(groups[1].data(), groups[1].size());
590 *value_assigned = true;
592 return true;
595 } // namespace extensions