Add ICU message format support
[chromium-blink-merge.git] / extensions / browser / api / web_request / form_data_parser.cc
blobf12c34face959309b604eaee88960eab56a1bd12
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "extensions/browser/api/web_request/form_data_parser.h"
7 #include <vector>
9 #include "base/lazy_instance.h"
10 #include "base/logging.h"
11 #include "base/macros.h"
12 #include "base/strings/string_util.h"
13 #include "base/values.h"
14 #include "net/base/escape.h"
15 #include "net/url_request/url_request.h"
16 #include "third_party/re2/re2/re2.h"
18 using base::DictionaryValue;
19 using base::ListValue;
20 using base::StringPiece;
21 using re2::RE2;
23 namespace extensions {
25 namespace {
27 const char kContentDisposition[] = "content-disposition:";
28 const size_t kContentDispositionLength = arraysize(kContentDisposition) - 1;
29 // kCharacterPattern is an allowed character in a URL encoding. Definition is
30 // from RFC 1738, end of section 2.2.
31 const char kCharacterPattern[] =
32 "(?:[a-zA-Z0-9$_.+!*'(),]|-|(?:%[a-fA-F0-9]{2}))";
33 const char kEscapeClosingQuote[] = "\\\\E";
35 // A wrapper struct for static RE2 objects to be held as LazyInstance.
36 struct Patterns {
37 Patterns();
38 ~Patterns();
39 const RE2 transfer_padding_pattern;
40 const RE2 crlf_pattern;
41 const RE2 closing_pattern;
42 const RE2 epilogue_pattern;
43 const RE2 crlf_free_pattern;
44 const RE2 preamble_pattern;
45 const RE2 header_pattern;
46 const RE2 content_disposition_pattern;
47 const RE2 name_pattern;
48 const RE2 value_pattern;
49 const RE2 unquote_pattern;
50 const RE2 url_encoded_pattern;
53 Patterns::Patterns()
54 : transfer_padding_pattern("[ \\t]*\\r\\n"),
55 crlf_pattern("\\r\\n"),
56 closing_pattern("--[ \\t]*"),
57 epilogue_pattern("|\\r\\n(?s:.)*"),
58 crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"),
59 preamble_pattern(".+?"),
60 header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"),
61 content_disposition_pattern(std::string("(?i:") + kContentDisposition +
62 ")"),
63 name_pattern("\\bname=\"([^\"]*)\""),
64 value_pattern("\\bfilename=\"([^\"]*)\""),
65 unquote_pattern(kEscapeClosingQuote),
66 url_encoded_pattern(std::string("(") + kCharacterPattern + "*)=(" +
67 kCharacterPattern +
68 "*)") {
71 Patterns::~Patterns() {}
73 base::LazyInstance<Patterns>::Leaky g_patterns = LAZY_INSTANCE_INITIALIZER;
75 } // namespace
77 // Parses URLencoded forms, see
78 // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 .
79 class FormDataParserUrlEncoded : public FormDataParser {
80 public:
81 FormDataParserUrlEncoded();
82 ~FormDataParserUrlEncoded() override;
84 // Implementation of FormDataParser.
85 bool AllDataReadOK() override;
86 bool GetNextNameValue(Result* result) override;
87 bool SetSource(base::StringPiece source) override;
89 private:
90 // Returns the pattern to match a single name-value pair. This could be even
91 // static, but then we would have to spend more code on initializing the
92 // cached pointer to g_patterns.Get().
93 const RE2& pattern() const {
94 return patterns_->url_encoded_pattern;
97 // Auxiliary constant for using RE2. Number of arguments for parsing
98 // name-value pairs (one for name, one for value).
99 static const size_t args_size_ = 2u;
100 static const net::UnescapeRule::Type unescape_rules_;
102 re2::StringPiece source_;
103 bool source_set_;
104 bool source_malformed_;
106 // Auxiliary store for using RE2.
107 std::string name_;
108 std::string value_;
109 const RE2::Arg arg_name_;
110 const RE2::Arg arg_value_;
111 const RE2::Arg* args_[args_size_];
113 // Caching the pointer to g_patterns.Get().
114 const Patterns* patterns_;
116 DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded);
119 // The following class, FormDataParserMultipart, parses forms encoded as
120 // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart
121 // encoding) and 5322 (MIME-headers).
123 // Implementation details
125 // The original grammar from RFC 2046 is this, "multipart-body" being the root
126 // non-terminal:
128 // boundary := 0*69<bchars> bcharsnospace
129 // bchars := bcharsnospace / " "
130 // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / ","
131 // / "-" / "." / "/" / ":" / "=" / "?"
132 // dash-boundary := "--" boundary
133 // multipart-body := [preamble CRLF]
134 // dash-boundary transport-padding CRLF
135 // body-part *encapsulation
136 // close-delimiter transport-padding
137 // [CRLF epilogue]
138 // transport-padding := *LWSP-char
139 // encapsulation := delimiter transport-padding CRLF body-part
140 // delimiter := CRLF dash-boundary
141 // close-delimiter := delimiter "--"
142 // preamble := discard-text
143 // epilogue := discard-text
144 // discard-text := *(*text CRLF) *text
145 // body-part := MIME-part-headers [CRLF *OCTET]
146 // OCTET := <any 0-255 octet value>
148 // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF,
149 // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the
150 // English alphabet, respectively.
151 // The non-terminal "text" is presumably just any text, excluding line breaks.
152 // The non-terminal "LWSP-char" is not directly defined in the original grammar
153 // but it means "linear whitespace", which is a space or a horizontal tab.
154 // The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
155 // the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
157 // MIME-part-headers := field-name ":" unstructured CRLF
158 // field-name := 1*ftext
159 // ftext := %d33-57 / ; Printable US-ASCII
160 // %d59-126 ; characters not including ":".
161 // Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
162 // does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
163 // "CRLF<horizontal tab>", which serve for "folding".
165 // The FormDataParserMultipart class reads the input source and tries to parse it
166 // according to the grammar above, rooted at the "multipart-body" non-terminal.
167 // This happens in stages:
169 // 1. The optional preamble and the initial dash-boundary with transport padding
170 // and a CRLF are read and ignored.
172 // 2. Repeatedly each body part is read. The body parts can either serve to
173 // upload a file, or just a string of bytes.
174 // 2.a. The headers of that part are searched for the "content-disposition"
175 // header, which contains the name of the value represented by that body
176 // part. If the body-part is for file upload, that header also contains a
177 // filename.
178 // 2.b. The "*OCTET" part of the body part is then read and passed as the value
179 // of the name-value pair for body parts representing a string of bytes.
180 // For body parts for uploading a file the "*OCTET" part is just ignored
181 // and the filename is used for value instead.
183 // 3. The final close-delimiter and epilogue are read and ignored.
185 // IMPORTANT NOTE
186 // This parser supports sources split into multiple chunks. Therefore SetSource
187 // can be called multiple times if the source is spread over several chunks.
188 // However, the split may only occur inside a body part, right after the
189 // trailing CRLF of headers.
190 class FormDataParserMultipart : public FormDataParser {
191 public:
192 explicit FormDataParserMultipart(const std::string& boundary_separator);
193 ~FormDataParserMultipart() override;
195 // Implementation of FormDataParser.
196 bool AllDataReadOK() override;
197 bool GetNextNameValue(Result* result) override;
198 bool SetSource(base::StringPiece source) override;
200 private:
201 enum State {
202 STATE_INIT, // No input read yet.
203 STATE_READY, // Ready to call GetNextNameValue.
204 STATE_FINISHED, // Read the input until the end.
205 STATE_SUSPEND, // Waiting until a new |source_| is set.
206 STATE_ERROR
209 // Produces a regexp to match the string "--" + |literal|. The idea is to
210 // represent "--" + |literal| as a "quoted pattern", a verbatim copy enclosed
211 // in "\\Q" and "\\E". The only catch is to watch out for occurences of "\\E"
212 // inside |literal|. Those must be excluded from the quote and the backslash
213 // doubly escaped. For example, for literal == "abc\\Edef" the result is
214 // "\\Q--abc\\E\\\\E\\Qdef\\E".
215 static std::string CreateBoundaryPatternFromLiteral(
216 const std::string& literal);
218 // Tests whether |input| has a prefix matching |pattern|.
219 static bool StartsWithPattern(const re2::StringPiece& input,
220 const RE2& pattern);
222 // If |source_| starts with a header, seeks |source_| beyond the header. If
223 // the header is Content-Disposition, extracts |name| from "name=" and
224 // possibly |value| from "filename=" fields of that header. Only if the
225 // "name" or "filename" fields are found, then |name| or |value| are touched.
226 // Returns true iff |source_| is seeked forward. Sets |value_assigned|
227 // to true iff |value| has been assigned to.
228 bool TryReadHeader(base::StringPiece* name,
229 base::StringPiece* value,
230 bool* value_assigned);
232 // Helper to GetNextNameValue. Expects that the input starts with a data
233 // portion of a body part. An attempt is made to read the input until the end
234 // of that body part. If |data| is not NULL, it is set to contain the data
235 // portion. Returns true iff the reading was successful.
236 bool FinishReadingPart(base::StringPiece* data);
238 // These methods could be even static, but then we would have to spend more
239 // code on initializing the cached pointer to g_patterns.Get().
240 const RE2& transfer_padding_pattern() const {
241 return patterns_->transfer_padding_pattern;
243 const RE2& crlf_pattern() const {
244 return patterns_->crlf_pattern;
246 const RE2& closing_pattern() const {
247 return patterns_->closing_pattern;
249 const RE2& epilogue_pattern() const {
250 return patterns_->epilogue_pattern;
252 const RE2& crlf_free_pattern() const {
253 return patterns_->crlf_free_pattern;
255 const RE2& preamble_pattern() const {
256 return patterns_->preamble_pattern;
258 const RE2& header_pattern() const {
259 return patterns_->header_pattern;
261 const RE2& content_disposition_pattern() const {
262 return patterns_->content_disposition_pattern;
264 const RE2& name_pattern() const {
265 return patterns_->name_pattern;
267 const RE2& value_pattern() const {
268 return patterns_->value_pattern;
270 // However, this is used in a static method so it needs to be static.
271 static const RE2& unquote_pattern() {
272 return g_patterns.Get().unquote_pattern; // No caching g_patterns here.
275 const RE2 dash_boundary_pattern_;
277 // Because of initialisation dependency, |state_| needs to be declared after
278 // |dash_boundary_pattern_|.
279 State state_;
281 // The parsed message can be split into multiple sources which we read
282 // sequentially.
283 re2::StringPiece source_;
285 // Caching the pointer to g_patterns.Get().
286 const Patterns* patterns_;
288 DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart);
291 FormDataParser::Result::Result() {}
292 FormDataParser::Result::~Result() {}
294 FormDataParser::~FormDataParser() {}
296 // static
297 scoped_ptr<FormDataParser> FormDataParser::Create(
298 const net::URLRequest& request) {
299 std::string value;
300 const bool found = request.extra_request_headers().GetHeader(
301 net::HttpRequestHeaders::kContentType, &value);
302 return CreateFromContentTypeHeader(found ? &value : NULL);
305 // static
306 scoped_ptr<FormDataParser> FormDataParser::CreateFromContentTypeHeader(
307 const std::string* content_type_header) {
308 enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE};
309 ParserChoice choice = ERROR_CHOICE;
310 std::string boundary;
312 if (content_type_header == NULL) {
313 choice = URL_ENCODED;
314 } else {
315 const std::string content_type(
316 content_type_header->substr(0, content_type_header->find(';')));
318 if (base::EqualsCaseInsensitiveASCII(content_type,
319 "application/x-www-form-urlencoded")) {
320 choice = URL_ENCODED;
321 } else if (base::EqualsCaseInsensitiveASCII(content_type,
322 "multipart/form-data")) {
323 static const char kBoundaryString[] = "boundary=";
324 size_t offset = content_type_header->find(kBoundaryString);
325 if (offset == std::string::npos) {
326 // Malformed header.
327 return scoped_ptr<FormDataParser>();
329 offset += sizeof(kBoundaryString) - 1;
330 boundary = content_type_header->substr(
331 offset, content_type_header->find(';', offset));
332 if (!boundary.empty())
333 choice = MULTIPART;
336 // Other cases are unparseable, including when |content_type| is "text/plain".
338 switch (choice) {
339 case URL_ENCODED:
340 return scoped_ptr<FormDataParser>(new FormDataParserUrlEncoded());
341 case MULTIPART:
342 return scoped_ptr<FormDataParser>(new FormDataParserMultipart(boundary));
343 case ERROR_CHOICE:
344 return scoped_ptr<FormDataParser>();
346 NOTREACHED(); // Some compilers do not believe this is unreachable.
347 return scoped_ptr<FormDataParser>();
350 FormDataParser::FormDataParser() {}
352 const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ =
353 net::UnescapeRule::URL_SPECIAL_CHARS |
354 net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS | net::UnescapeRule::SPACES |
355 net::UnescapeRule::REPLACE_PLUS_WITH_SPACE;
357 FormDataParserUrlEncoded::FormDataParserUrlEncoded()
358 : source_(NULL),
359 source_set_(false),
360 source_malformed_(false),
361 arg_name_(&name_),
362 arg_value_(&value_),
363 patterns_(g_patterns.Pointer()) {
364 args_[0] = &arg_name_;
365 args_[1] = &arg_value_;
368 FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {}
370 bool FormDataParserUrlEncoded::AllDataReadOK() {
371 // All OK means we read the whole source.
372 return source_set_ && source_.empty() && !source_malformed_;
375 bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) {
376 if (!source_set_ || source_malformed_)
377 return false;
379 bool success = RE2::ConsumeN(&source_, pattern(), args_, args_size_);
380 if (success) {
381 result->set_name(net::UnescapeURLComponent(name_, unescape_rules_));
382 result->set_value(net::UnescapeURLComponent(value_, unescape_rules_));
384 if (source_.length() > 0) {
385 if (source_[0] == '&')
386 source_.remove_prefix(1); // Remove the leading '&'.
387 else
388 source_malformed_ = true; // '&' missing between two name-value pairs.
390 return success && !source_malformed_;
393 bool FormDataParserUrlEncoded::SetSource(base::StringPiece source) {
394 if (source_set_)
395 return false; // We do not allow multiple sources for this parser.
396 source_.set(source.data(), source.size());
397 source_set_ = true;
398 source_malformed_ = false;
399 return true;
402 // static
403 std::string FormDataParserMultipart::CreateBoundaryPatternFromLiteral(
404 const std::string& literal) {
405 static const char quote[] = "\\Q";
406 static const char unquote[] = "\\E";
408 // The result always starts with opening the qoute and then "--".
409 std::string result("\\Q--");
411 // This StringPiece is used below to record the next occurrence of "\\E" in
412 // |literal|.
413 re2::StringPiece seek_unquote(literal);
414 const char* copy_start = literal.data();
415 size_t copy_length = literal.size();
417 // Find all "\\E" in |literal| and exclude them from the \Q...\E quote.
418 while (RE2::FindAndConsume(&seek_unquote, unquote_pattern())) {
419 copy_length = seek_unquote.data() - copy_start;
420 result.append(copy_start, copy_length);
421 result.append(kEscapeClosingQuote);
422 result.append(quote);
423 copy_start = seek_unquote.data();
426 // Finish the last \Q...\E quote.
427 copy_length = (literal.data() + literal.size()) - copy_start;
428 result.append(copy_start, copy_length);
429 result.append(unquote);
430 return result;
433 // static
434 bool FormDataParserMultipart::StartsWithPattern(const re2::StringPiece& input,
435 const RE2& pattern) {
436 return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0);
439 FormDataParserMultipart::FormDataParserMultipart(
440 const std::string& boundary_separator)
441 : dash_boundary_pattern_(
442 CreateBoundaryPatternFromLiteral(boundary_separator)),
443 state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR),
444 patterns_(g_patterns.Pointer()) {}
446 FormDataParserMultipart::~FormDataParserMultipart() {}
448 bool FormDataParserMultipart::AllDataReadOK() {
449 return state_ == STATE_FINISHED;
452 bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) {
453 const char* data_start = source_.data();
454 while (!StartsWithPattern(source_, dash_boundary_pattern_)) {
455 if (!RE2::Consume(&source_, crlf_free_pattern()) ||
456 !RE2::Consume(&source_, crlf_pattern())) {
457 state_ = STATE_ERROR;
458 return false;
461 if (data != NULL) {
462 if (source_.data() == data_start) {
463 // No data in this body part.
464 state_ = STATE_ERROR;
465 return false;
467 // Subtract 2 for the trailing "\r\n".
468 data->set(data_start, source_.data() - data_start - 2);
471 // Finally, read the dash-boundary and either skip to the next body part, or
472 // finish reading the source.
473 CHECK(RE2::Consume(&source_, dash_boundary_pattern_));
474 if (StartsWithPattern(source_, closing_pattern())) {
475 CHECK(RE2::Consume(&source_, closing_pattern()));
476 if (RE2::Consume(&source_, epilogue_pattern()))
477 state_ = STATE_FINISHED;
478 else
479 state_ = STATE_ERROR;
480 } else { // Next body part ahead.
481 if (!RE2::Consume(&source_, transfer_padding_pattern()))
482 state_ = STATE_ERROR;
484 return state_ != STATE_ERROR;
487 bool FormDataParserMultipart::GetNextNameValue(Result* result) {
488 if (source_.empty() || state_ != STATE_READY)
489 return false;
491 // 1. Read body-part headers.
492 base::StringPiece name;
493 base::StringPiece value;
494 bool value_assigned = false;
495 bool value_assigned_temp;
496 while (TryReadHeader(&name, &value, &value_assigned_temp))
497 value_assigned |= value_assigned_temp;
498 if (name.empty() || state_ == STATE_ERROR) {
499 state_ = STATE_ERROR;
500 return false;
503 // 2. Read the trailing CRLF after headers.
504 if (!RE2::Consume(&source_, crlf_pattern())) {
505 state_ = STATE_ERROR;
506 return false;
509 // 3. Read the data of this body part, i.e., everything until the first
510 // dash-boundary.
511 bool return_value;
512 if (value_assigned && source_.empty()) { // Wait for a new source?
513 return_value = true;
514 state_ = STATE_SUSPEND;
515 } else {
516 return_value = FinishReadingPart(value_assigned ? NULL : &value);
519 std::string unescaped_name = net::UnescapeURLComponent(
520 name.as_string(), net::UnescapeRule::URL_SPECIAL_CHARS |
521 net::UnescapeRule::SPOOFING_AND_CONTROL_CHARS);
522 result->set_name(unescaped_name);
523 result->set_value(value);
525 return return_value;
528 bool FormDataParserMultipart::SetSource(base::StringPiece source) {
529 if (source.data() == NULL || !source_.empty())
530 return false;
531 source_.set(source.data(), source.size());
533 switch (state_) {
534 case STATE_INIT:
535 // Seek behind the preamble.
536 while (!StartsWithPattern(source_, dash_boundary_pattern_)) {
537 if (!RE2::Consume(&source_, preamble_pattern())) {
538 state_ = STATE_ERROR;
539 break;
542 // Read dash-boundary, transfer padding, and CRLF.
543 if (state_ != STATE_ERROR) {
544 if (!RE2::Consume(&source_, dash_boundary_pattern_) ||
545 !RE2::Consume(&source_, transfer_padding_pattern()))
546 state_ = STATE_ERROR;
547 else
548 state_ = STATE_READY;
550 break;
551 case STATE_READY: // Nothing to do.
552 break;
553 case STATE_SUSPEND:
554 state_ = FinishReadingPart(NULL) ? STATE_READY : STATE_ERROR;
555 break;
556 default:
557 state_ = STATE_ERROR;
559 return state_ != STATE_ERROR;
562 bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name,
563 base::StringPiece* value,
564 bool* value_assigned) {
565 *value_assigned = false;
566 const char* header_start = source_.data();
567 if (!RE2::Consume(&source_, header_pattern()))
568 return false;
569 // (*) After this point we must return true, because we consumed one header.
571 // Subtract 2 for the trailing "\r\n".
572 re2::StringPiece header(header_start, source_.data() - header_start - 2);
574 if (!StartsWithPattern(header, content_disposition_pattern()))
575 return true; // Skip headers that don't describe the content-disposition.
577 re2::StringPiece groups[2];
579 if (!name_pattern().Match(header,
580 kContentDispositionLength, header.size(),
581 RE2::UNANCHORED, groups, 2)) {
582 state_ = STATE_ERROR;
583 return true; // See (*) for why true.
585 name->set(groups[1].data(), groups[1].size());
587 if (value_pattern().Match(header,
588 kContentDispositionLength, header.size(),
589 RE2::UNANCHORED, groups, 2)) {
590 value->set(groups[1].data(), groups[1].size());
591 *value_assigned = true;
593 return true;
596 } // namespace extensions