Updating trunk VERSION from 2139.0 to 2140.0
[chromium-blink-merge.git] / net / tools / balsa / balsa_frame.cc
blobd8590bbb66bb530f474ce2fff39f54d0dea7982e
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "net/tools/balsa/balsa_frame.h"
7 // Visual C++ defines _M_IX86_FP as 2 if the /arch:SSE2 compiler option is
8 // specified.
9 #if !defined(__SSE2__) && _M_IX86_FP == 2
10 #define __SSE2__ 1
11 #endif
13 #include <assert.h>
14 #if __SSE2__
15 #include <emmintrin.h>
16 #endif // __SSE2__
18 #include <limits>
19 #include <string>
20 #include <utility>
21 #include <vector>
23 #include "base/logging.h"
24 #include "base/port.h"
25 #include "base/strings/string_piece.h"
26 #include "net/tools/balsa/balsa_enums.h"
27 #include "net/tools/balsa/balsa_headers.h"
28 #include "net/tools/balsa/balsa_visitor_interface.h"
29 #include "net/tools/balsa/buffer_interface.h"
30 #include "net/tools/balsa/simple_buffer.h"
31 #include "net/tools/balsa/split.h"
32 #include "net/tools/balsa/string_piece_utils.h"
34 #if defined(COMPILER_MSVC)
35 #include <intrin.h>
36 #include <string.h>
38 #pragma intrinsic(_BitScanForward)
40 static int ffs(int i) {
41 unsigned long index;
42 return _BitScanForward(&index, i) ? index + 1 : 0;
45 #define strncasecmp _strnicmp
46 #else
47 #include <strings.h>
48 #endif
50 namespace net {
// Lower-case names of the two headers that decide how a message body is
// framed. The framer looks for them explicitly (case-insensitively) while
// walking the parsed header lines. Each *Size constant is the name's length
// without the terminating NUL.
static const char kContentLength[] = "content-length";
static const char kTransferEncoding[] = "transfer-encoding";
static const size_t kContentLengthSize = sizeof(kContentLength) - 1;
static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1;
59 BalsaFrame::BalsaFrame()
60 : last_char_was_slash_r_(false),
61 saw_non_newline_char_(false),
62 start_was_space_(true),
63 chunk_length_character_extracted_(false),
64 is_request_(true),
65 request_was_head_(false),
66 max_header_length_(16 * 1024),
67 max_request_uri_length_(2048),
68 visitor_(&do_nothing_visitor_),
69 chunk_length_remaining_(0),
70 content_length_remaining_(0),
71 last_slash_n_loc_(NULL),
72 last_recorded_slash_n_loc_(NULL),
73 last_slash_n_idx_(0),
74 term_chars_(0),
75 parse_state_(BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE),
76 last_error_(BalsaFrameEnums::NO_ERROR),
77 headers_(NULL) {
80 BalsaFrame::~BalsaFrame() {}
82 void BalsaFrame::Reset() {
83 last_char_was_slash_r_ = false;
84 saw_non_newline_char_ = false;
85 start_was_space_ = true;
86 chunk_length_character_extracted_ = false;
87 // is_request_ = true; // not reset between messages.
88 // request_was_head_ = false; // not reset between messages.
89 // max_header_length_ = 4096; // not reset between messages.
90 // max_request_uri_length_ = 2048; // not reset between messages.
91 // visitor_ = &do_nothing_visitor_; // not reset between messages.
92 chunk_length_remaining_ = 0;
93 content_length_remaining_ = 0;
94 last_slash_n_loc_ = NULL;
95 last_recorded_slash_n_loc_ = NULL;
96 last_slash_n_idx_ = 0;
97 term_chars_ = 0;
98 parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE;
99 last_error_ = BalsaFrameEnums::NO_ERROR;
100 lines_.clear();
101 if (headers_ != NULL) {
102 headers_->Clear();
106 const char* BalsaFrameEnums::ParseStateToString(
107 BalsaFrameEnums::ParseState error_code) {
108 switch (error_code) {
109 case PARSE_ERROR:
110 return "PARSE_ERROR";
111 case READING_HEADER_AND_FIRSTLINE:
112 return "READING_HEADER_AND_FIRSTLINE";
113 case READING_CHUNK_LENGTH:
114 return "READING_CHUNK_LENGTH";
115 case READING_CHUNK_EXTENSION:
116 return "READING_CHUNK_EXTENSION";
117 case READING_CHUNK_DATA:
118 return "READING_CHUNK_DATA";
119 case READING_CHUNK_TERM:
120 return "READING_CHUNK_TERM";
121 case READING_LAST_CHUNK_TERM:
122 return "READING_LAST_CHUNK_TERM";
123 case READING_TRAILER:
124 return "READING_TRAILER";
125 case READING_UNTIL_CLOSE:
126 return "READING_UNTIL_CLOSE";
127 case READING_CONTENT:
128 return "READING_CONTENT";
129 case MESSAGE_FULLY_READ:
130 return "MESSAGE_FULLY_READ";
131 case NUM_STATES:
132 return "UNKNOWN_STATE";
134 return "UNKNOWN_STATE";
137 const char* BalsaFrameEnums::ErrorCodeToString(
138 BalsaFrameEnums::ErrorCode error_code) {
139 switch (error_code) {
140 case NO_ERROR:
141 return "NO_ERROR";
142 case NO_STATUS_LINE_IN_RESPONSE:
143 return "NO_STATUS_LINE_IN_RESPONSE";
144 case NO_REQUEST_LINE_IN_REQUEST:
145 return "NO_REQUEST_LINE_IN_REQUEST";
146 case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION:
147 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION";
148 case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD:
149 return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD";
150 case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE:
151 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE";
152 case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI:
153 return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI";
154 case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE:
155 return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE";
156 case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION:
157 return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION";
158 case FAILED_CONVERTING_STATUS_CODE_TO_INT:
159 return "FAILED_CONVERTING_STATUS_CODE_TO_INT";
160 case REQUEST_URI_TOO_LONG:
161 return "REQUEST_URI_TOO_LONG";
162 case HEADERS_TOO_LONG:
163 return "HEADERS_TOO_LONG";
164 case UNPARSABLE_CONTENT_LENGTH:
165 return "UNPARSABLE_CONTENT_LENGTH";
166 case MAYBE_BODY_BUT_NO_CONTENT_LENGTH:
167 return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH";
168 case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH:
169 return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH";
170 case HEADER_MISSING_COLON:
171 return "HEADER_MISSING_COLON";
172 case INVALID_CHUNK_LENGTH:
173 return "INVALID_CHUNK_LENGTH";
174 case CHUNK_LENGTH_OVERFLOW:
175 return "CHUNK_LENGTH_OVERFLOW";
176 case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO:
177 return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO";
178 case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT:
179 return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT";
180 case MULTIPLE_CONTENT_LENGTH_KEYS:
181 return "MULTIPLE_CONTENT_LENGTH_KEYS";
182 case MULTIPLE_TRANSFER_ENCODING_KEYS:
183 return "MULTIPLE_TRANSFER_ENCODING_KEYS";
184 case UNKNOWN_TRANSFER_ENCODING:
185 return "UNKNOWN_TRANSFER_ENCODING";
186 case INVALID_HEADER_FORMAT:
187 return "INVALID_HEADER_FORMAT";
188 case INTERNAL_LOGIC_ERROR:
189 return "INTERNAL_LOGIC_ERROR";
190 case NUM_ERROR_CODES:
191 return "UNKNOWN_ERROR";
193 return "UNKNOWN_ERROR";
196 // Summary:
197 // Parses the first line of either a request or response.
198 // Note that in the case of a detected warning, error_code will be set
199 // but the function will not return false.
200 // Exactly zero or one warning or error (but not both) may be detected
201 // by this function.
202 // Note that this function will not write the data of the first-line
203 // into the header's buffer (that should already have been done elsewhere).
205 // Pre-conditions:
206 // begin != end
207 // *begin should be a character which is > ' '. This implies that there
208 // is at least one non-whitespace characters between [begin, end).
209 // headers is a valid pointer to a BalsaHeaders class.
210 // error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
211 // Entire first line must exist between [begin, end)
212 // Exactly zero or one newlines -may- exist between [begin, end)
213 // [begin, end) should exist in the header's buffer.
215 // Side-effects:
216 // headers will be modified
217 // error_code may be modified if either a warning or error is detected
219 // Returns:
220 // True if no error (as opposed to warning) is detected.
221 // False if an error (as opposed to warning) is detected.
224 // If there is indeed non-whitespace in the line, then the following
225 // will take care of this for you:
226 // while (*begin <= ' ') ++begin;
227 // ProcessFirstLine(begin, end, is_request, &headers, &error_code);
229 bool ParseHTTPFirstLine(const char* begin,
230 const char* end,
231 bool is_request,
232 size_t max_request_uri_length,
233 BalsaHeaders* headers,
234 BalsaFrameEnums::ErrorCode* error_code) {
235 const char* current = begin;
236 // HTTP firstlines all have the following structure:
237 // LWS NONWS LWS NONWS LWS NONWS NOTCRLF CRLF
238 // [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
239 // ws1 nws1 ws2 nws2 ws3 nws3 ws4
240 // | [-------) [-------) [----------------)
241 // REQ: method request_uri version
242 // RESP: version statuscode reason
244 // The first NONWS->LWS component we'll call firstline_a.
245 // The second firstline_b, and the third firstline_c.
247 // firstline_a goes from nws1 to (but not including) ws2
248 // firstline_b goes from nws2 to (but not including) ws3
249 // firstline_c goes from nws3 to (but not including) ws4
251 // In the code:
252 // ws1 == whitespace_1_idx_
253 // nws1 == non_whitespace_1_idx_
254 // ws2 == whitespace_2_idx_
255 // nws2 == non_whitespace_2_idx_
256 // ws3 == whitespace_3_idx_
257 // nws3 == non_whitespace_3_idx_
258 // ws4 == whitespace_4_idx_
260 // Kill all whitespace (including '\r\n') at the end of the line.
261 --end;
262 if (*end != '\n') {
263 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
264 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
265 << headers->OriginalHeadersForDebugging();
266 return false;
268 while (begin < end && *end <= ' ') {
269 --end;
271 DCHECK(*end != '\n');
272 if (*end == '\n') {
273 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
274 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
275 << headers->OriginalHeadersForDebugging();
276 return false;
278 ++end;
280 // The two following statements should not be possible.
281 if (end == begin) {
282 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
283 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
284 << headers->OriginalHeadersForDebugging();
285 return false;
288 // whitespace_1_idx_
289 headers->whitespace_1_idx_ = current - begin;
290 // This loop is commented out as it is never used in current code. This is
291 // true only because we don't begin parsing the headers at all until we've
292 // encountered a non whitespace character at the beginning of the stream, at
293 // which point we begin our demarcation of header-start. If we did -not- do
294 // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
295 // would be necessary for the proper functioning of this parsing.
296 // This is left here as this function may (in the future) be refactored out
297 // of the BalsaFrame class so that it may be shared between code in
298 // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
299 // set_first_line() function (at which point it would be necessary).
300 #if 0
301 while (*current <= ' ') {
302 ++current;
304 #endif
305 // non_whitespace_1_idx_
306 headers->non_whitespace_1_idx_ = current - begin;
307 do {
308 // The first time through, we're guaranteed that the current character
309 // won't be a whitespace (else the loop above wouldn't have terminated).
310 // That implies that we're guaranteed to get at least one non-whitespace
311 // character if we get into this loop at all.
312 ++current;
313 if (current == end) {
314 headers->whitespace_2_idx_ = current - begin;
315 headers->non_whitespace_2_idx_ = current - begin;
316 headers->whitespace_3_idx_ = current - begin;
317 headers->non_whitespace_3_idx_ = current - begin;
318 headers->whitespace_4_idx_ = current - begin;
319 // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD for request
320 // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
321 *error_code =
322 static_cast<BalsaFrameEnums::ErrorCode>(
323 BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
324 is_request);
325 if (!is_request) { // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
326 return false;
328 goto output_exhausted;
330 } while (*current > ' ');
331 // whitespace_2_idx_
332 headers->whitespace_2_idx_ = current - begin;
333 do {
334 ++current;
335 // Note that due to the loop which consumes all of the whitespace
336 // at the end of the line, current can never == end while in this function.
337 } while (*current <= ' ');
338 // non_whitespace_2_idx_
339 headers->non_whitespace_2_idx_ = current - begin;
340 do {
341 ++current;
342 if (current == end) {
343 headers->whitespace_3_idx_ = current - begin;
344 headers->non_whitespace_3_idx_ = current - begin;
345 headers->whitespace_4_idx_ = current - begin;
346 // FAILED_TO_FIND_START_OF_REQUEST_REQUEST_URI for request
347 // FAILED_TO_FIND_START_OF_RESPONSE_STATUSCODE for response
348 *error_code =
349 static_cast<BalsaFrameEnums::ErrorCode>(
350 BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
351 + is_request);
352 goto output_exhausted;
354 } while (*current > ' ');
355 // whitespace_3_idx_
356 headers->whitespace_3_idx_ = current - begin;
357 do {
358 ++current;
359 // Note that due to the loop which consumes all of the whitespace
360 // at the end of the line, current can never == end while in this function.
361 } while (*current <= ' ');
362 // non_whitespace_3_idx_
363 headers->non_whitespace_3_idx_ = current - begin;
364 headers->whitespace_4_idx_ = end - begin;
366 output_exhausted:
367 // Note that we don't fail the parse immediately when parsing of the
368 // firstline fails. Depending on the protocol type, we may want to accept
369 // a firstline with only one or two elements, e.g., for HTTP/0.9:
370 // GET\r\n
371 // or
372 // GET /\r\n
373 // should be parsed without issue (though the visitor should know that
374 // parsing the entire line was not exactly as it should be).
376 // Eventually, these errors may be removed alltogether, as the visitor can
377 // detect them on its own by examining the size of the various fields.
378 // headers->set_first_line(non_whitespace_1_idx_, current);
380 if (is_request) {
381 if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
382 max_request_uri_length) {
383 // For requests, we need at least the method. We could assume that a
384 // blank URI means "/". If version isn't stated, it should be assumed
385 // to be HTTP/0.9 by the visitor.
386 *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
387 return false;
389 } else {
390 headers->parsed_response_code_ = 0;
392 const char* parsed_response_code_current =
393 begin + headers->non_whitespace_2_idx_;
394 const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
395 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
397 // Convert a string of [0-9]* into an int.
398 // Note that this allows for the conversion of response codes which
399 // are outside the bounds of normal HTTP response codes (no checking
400 // is done to ensure that these are valid-- they're merely parsed)!
401 while (parsed_response_code_current < parsed_response_code_end) {
402 if (*parsed_response_code_current < '0' ||
403 *parsed_response_code_current > '9') {
404 *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
405 return false;
407 size_t status_code_x_10 = headers->parsed_response_code_ * 10;
408 uint8 c = *parsed_response_code_current - '0';
409 if ((headers->parsed_response_code_ > kMaxDiv10) ||
410 (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
411 // overflow.
412 *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
413 return false;
415 headers->parsed_response_code_ = status_code_x_10 + c;
416 ++parsed_response_code_current;
420 return true;
423 // begin - beginning of the firstline
424 // end - end of the firstline
426 // A precondition for this function is that there is non-whitespace between
427 // [begin, end). If this precondition is not met, the function will not perform
428 // as expected (and bad things may happen, and it will eat your first, second,
429 // and third unborn children!).
431 // Another precondition for this function is that [begin, end) includes
432 // at most one newline, which must be at the end of the line.
433 void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) {
434 BalsaFrameEnums::ErrorCode previous_error = last_error_;
435 if (!ParseHTTPFirstLine(begin,
436 end,
437 is_request_,
438 max_request_uri_length_,
439 headers_,
440 &last_error_)) {
441 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
442 visitor_->HandleHeaderError(this);
443 return;
445 if (previous_error != last_error_) {
446 visitor_->HandleHeaderWarning(this);
449 if (is_request_) {
450 size_t version_length =
451 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_;
452 visitor_->ProcessRequestFirstLine(
453 begin + headers_->non_whitespace_1_idx_,
454 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
455 begin + headers_->non_whitespace_1_idx_,
456 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
457 begin + headers_->non_whitespace_2_idx_,
458 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
459 begin + headers_->non_whitespace_3_idx_,
460 version_length);
461 if (version_length == 0)
462 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
463 } else {
464 visitor_->ProcessResponseFirstLine(
465 begin + headers_->non_whitespace_1_idx_,
466 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
467 begin + headers_->non_whitespace_1_idx_,
468 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
469 begin + headers_->non_whitespace_2_idx_,
470 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
471 begin + headers_->non_whitespace_3_idx_,
472 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_);
476 // 'stream_begin' points to the first character of the headers buffer.
477 // 'line_begin' points to the first character of the line.
478 // 'current' points to a char which is ':'.
479 // 'line_end' points to the position of '\n' + 1.
480 // 'line_begin' points to the position of first character of line.
481 void BalsaFrame::CleanUpKeyValueWhitespace(
482 const char* stream_begin,
483 const char* line_begin,
484 const char* current,
485 const char* line_end,
486 HeaderLineDescription* current_header_line) {
487 const char* colon_loc = current;
488 DCHECK_LT(colon_loc, line_end);
489 DCHECK_EQ(':', *colon_loc);
490 DCHECK_EQ(':', *current);
491 DCHECK_GE(' ', *line_end)
492 << "\"" << std::string(line_begin, line_end) << "\"";
494 // TODO(fenix): Investigate whether or not the bounds tests in the
495 // while loops here are redundant, and if so, remove them.
496 --current;
497 while (current > line_begin && *current <= ' ') --current;
498 current += (current != colon_loc);
499 current_header_line->key_end_idx = current - stream_begin;
501 current = colon_loc;
502 DCHECK_EQ(':', *current);
503 ++current;
504 while (current < line_end && *current <= ' ') ++current;
505 current_header_line->value_begin_idx = current - stream_begin;
507 DCHECK_GE(current_header_line->key_end_idx,
508 current_header_line->first_char_idx);
509 DCHECK_GE(current_header_line->value_begin_idx,
510 current_header_line->key_end_idx);
511 DCHECK_GE(current_header_line->last_char_idx,
512 current_header_line->value_begin_idx);
515 inline void BalsaFrame::FindColonsAndParseIntoKeyValue() {
516 DCHECK(!lines_.empty());
517 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
518 // The last line is always just a newline (and is uninteresting).
519 const Lines::size_type lines_size_m1 = lines_.size() - 1;
520 #if __SSE2__
521 const __m128i colons = _mm_set1_epi8(':');
522 const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16;
523 #endif // __SSE2__
524 const char* current = stream_begin + lines_[1].first;
525 // This code is a bit more subtle than it may appear at first glance.
526 // This code looks for a colon in the current line... but it also looks
527 // beyond the current line. If there is no colon in the current line, then
528 // for each subsequent line (until the colon which -has- been found is
529 // associated with a line), no searching for a colon will be performed. In
530 // this way, we minimize the amount of bytes we have scanned for a colon.
531 for (Lines::size_type i = 1; i < lines_size_m1;) {
532 const char* line_begin = stream_begin + lines_[i].first;
534 // Here we handle possible continuations. Note that we do not replace
535 // the '\n' in the line before a continuation (at least, as of now),
536 // which implies that any code which looks for a value must deal with
537 // "\r\n", etc -within- the line (and not just at the end of it).
538 for (++i; i < lines_size_m1; ++i) {
539 const char c = *(stream_begin + lines_[i].first);
540 if (c > ' ') {
541 // Not a continuation, so stop. Note that if the 'original' i = 1,
542 // and the next line is not a continuation, we'll end up with i = 2
543 // when we break. This handles the incrementing of i for the outer
544 // loop.
545 break;
548 const char* line_end = stream_begin + lines_[i - 1].second;
549 DCHECK_LT(line_begin - stream_begin, line_end - stream_begin);
551 // We cleanup the whitespace at the end of the line before doing anything
552 // else of interest as it allows us to do nothing when irregularly formatted
553 // headers are parsed (e.g. those with only keys, only values, or no colon).
555 // We're guaranteed to have *line_end > ' ' while line_end >= line_begin.
556 --line_end;
557 DCHECK_EQ('\n', *line_end)
558 << "\"" << std::string(line_begin, line_end) << "\"";
559 while (*line_end <= ' ' && line_end > line_begin) {
560 --line_end;
562 ++line_end;
563 DCHECK_GE(' ', *line_end);
564 DCHECK_LT(line_begin, line_end);
566 // We use '0' for the block idx, because we're always writing to the first
567 // block from the framer (we do this because the framer requires that the
568 // entire header sequence be in a contiguous buffer).
569 headers_->header_lines_.push_back(
570 HeaderLineDescription(line_begin - stream_begin,
571 line_end - stream_begin,
572 line_end - stream_begin,
573 line_end - stream_begin,
574 0));
575 if (current >= line_end) {
576 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
577 visitor_->HandleHeaderWarning(this);
578 // Then the next colon will not be found within this header line-- time
579 // to try again with another header-line.
580 continue;
581 } else if (current < line_begin) {
582 // When this condition is true, the last detected colon was part of a
583 // previous line. We reset to the beginning of the line as we don't care
584 // about the presence of any colon before the beginning of the current
585 // line.
586 current = line_begin;
588 #if __SSE2__
589 while (current < header_lines_end_m16) {
590 __m128i header_bytes =
591 _mm_loadu_si128(reinterpret_cast<const __m128i *>(current));
592 __m128i colon_cmp = _mm_cmpeq_epi8(header_bytes, colons);
593 int colon_msk = _mm_movemask_epi8(colon_cmp);
594 if (colon_msk == 0) {
595 current += 16;
596 continue;
598 current += (ffs(colon_msk) - 1);
599 if (current > line_end) {
600 break;
602 goto found_colon;
604 #endif // __SSE2__
605 for (; current < line_end; ++current) {
606 if (*current != ':') {
607 continue;
609 goto found_colon;
611 // If we've gotten to here, then there was no colon
612 // in the line. The arguments we passed into the construction
613 // for the HeaderLineDescription object should be OK-- it assumes
614 // that the entire content is 'key' by default (which is true, as
615 // there was no colon, there can be no value). Note that this is a
616 // construct which is technically not allowed by the spec.
617 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
618 visitor_->HandleHeaderWarning(this);
619 continue;
620 found_colon:
621 DCHECK_EQ(*current, ':');
622 DCHECK_LE(current - stream_begin, line_end - stream_begin);
623 DCHECK_LE(stream_begin - stream_begin, current - stream_begin);
625 HeaderLineDescription& current_header_line = headers_->header_lines_.back();
626 current_header_line.key_end_idx = current - stream_begin;
627 current_header_line.value_begin_idx = current_header_line.key_end_idx;
628 if (current < line_end) {
629 ++current_header_line.key_end_idx;
631 CleanUpKeyValueWhitespace(stream_begin,
632 line_begin,
633 current,
634 line_end,
635 &current_header_line);
640 void BalsaFrame::ProcessContentLengthLine(
641 HeaderLines::size_type line_idx,
642 BalsaHeadersEnums::ContentLengthStatus* status,
643 size_t* length) {
644 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
645 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
646 const char* line_end = stream_begin + header_line.last_char_idx;
647 const char* value_begin = (stream_begin + header_line.value_begin_idx);
649 if (value_begin >= line_end) {
650 // There is no non-whitespace value data.
651 #if DEBUGFRAMER
652 LOG(INFO) << "invalid content-length -- no non-whitespace value data";
653 #endif
654 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
655 return;
658 *length = 0;
659 while (value_begin < line_end) {
660 if (*value_begin < '0' || *value_begin > '9') {
661 // bad! content-length found, and couldn't parse all of it!
662 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
663 #if DEBUGFRAMER
664 LOG(INFO) << "invalid content-length - non numeric character detected";
665 #endif // DEBUGFRAMER
666 return;
668 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
669 size_t length_x_10 = *length * 10;
670 const unsigned char c = *value_begin - '0';
671 if (*length > kMaxDiv10 ||
672 (std::numeric_limits<size_t>::max() - length_x_10) < c) {
673 *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW;
674 #if DEBUGFRAMER
675 LOG(INFO) << "content-length overflow";
676 #endif // DEBUGFRAMER
677 return;
679 *length = length_x_10 + c;
680 ++value_begin;
682 #if DEBUGFRAMER
683 LOG(INFO) << "content_length parsed: " << *length;
684 #endif // DEBUGFRAMER
685 *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH;
688 void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) {
689 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
690 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
691 const char* line_end = stream_begin + header_line.last_char_idx;
692 const char* value_begin = stream_begin + header_line.value_begin_idx;
693 size_t value_length = line_end - value_begin;
695 if ((value_length == 7) &&
696 !strncasecmp(value_begin, "chunked", 7)) {
697 headers_->transfer_encoding_is_chunked_ = true;
698 } else if ((value_length == 8) &&
699 !strncasecmp(value_begin, "identity", 8)) {
700 headers_->transfer_encoding_is_chunked_ = false;
701 } else {
702 last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING;
703 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
704 visitor_->HandleHeaderError(this);
705 return;
709 namespace {
710 bool SplitStringPiece(base::StringPiece original, char delim,
711 base::StringPiece* before, base::StringPiece* after) {
712 const char* p = original.data();
713 const char* end = p + original.size();
715 while (p != end) {
716 if (*p == delim) {
717 ++p;
718 } else {
719 const char* start = p;
720 while (++p != end && *p != delim) {
721 // Skip to the next occurence of the delimiter.
723 *before = base::StringPiece(start, p - start);
724 if (p != end)
725 *after = base::StringPiece(p + 1, end - (p + 1));
726 else
727 *after = base::StringPiece("");
728 StringPieceUtils::RemoveWhitespaceContext(before);
729 StringPieceUtils::RemoveWhitespaceContext(after);
730 return true;
734 *before = original;
735 *after = "";
736 return false;
739 // TODO(phython): Fix this function to properly deal with quoted values.
740 // E.g. ";;foo", "\";;\"", or \"aa;
741 // The last example, the semi-colon is a separator between extensions.
742 void ProcessChunkExtensionsManual(base::StringPiece all_extensions,
743 BalsaHeaders* extensions) {
744 base::StringPiece extension;
745 base::StringPiece remaining;
746 StringPieceUtils::RemoveWhitespaceContext(&all_extensions);
747 SplitStringPiece(all_extensions, ';', &extension, &remaining);
748 while (!extension.empty()) {
749 base::StringPiece key;
750 base::StringPiece value;
751 SplitStringPiece(extension, '=', &key, &value);
752 if (!value.empty()) {
753 // Strip quotation marks if they exist.
754 if (!value.empty() && value[0] == '"')
755 value.remove_prefix(1);
756 if (!value.empty() && value[value.length() - 1] == '"')
757 value.remove_suffix(1);
760 extensions->AppendHeader(key, value);
762 StringPieceUtils::RemoveWhitespaceContext(&remaining);
763 SplitStringPiece(remaining, ';', &extension, &remaining);
767 } // anonymous namespace
769 void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size,
770 BalsaHeaders* extensions) {
771 ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions);
774 void BalsaFrame::ProcessHeaderLines() {
775 HeaderLines::size_type content_length_idx = 0;
776 HeaderLines::size_type transfer_encoding_idx = 0;
778 DCHECK(!lines_.empty());
779 #if DEBUGFRAMER
780 LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n";
781 #endif // DEBUGFRAMER
783 // There is no need to attempt to process headers if no header lines exist.
784 // There are at least two lines in the message which are not header lines.
785 // These two non-header lines are the first line of the message, and the
786 // last line of the message (which is an empty line).
787 // Thus, we test to see if we have more than two lines total before attempting
788 // to parse any header lines.
789 if (lines_.size() > 2) {
790 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
792 // Then, for the rest of the header data, we parse these into key-value
793 // pairs.
794 FindColonsAndParseIntoKeyValue();
795 // At this point, we've parsed all of the headers. Time to look for those
796 // headers which we require for framing.
797 const HeaderLines::size_type
798 header_lines_size = headers_->header_lines_.size();
799 for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) {
800 const HeaderLineDescription& current_header_line =
801 headers_->header_lines_[i];
802 const char* key_begin =
803 (stream_begin + current_header_line.first_char_idx);
804 const char* key_end = (stream_begin + current_header_line.key_end_idx);
805 const size_t key_len = key_end - key_begin;
806 const char c = *key_begin;
807 #if DEBUGFRAMER
808 LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len)
809 << " c: '" << c << "' key_len: " << key_len;
810 #endif // DEBUGFRAMER
811 // If a header begins with either lowercase or uppercase 'c' or 't', then
812 // the header may be one of content-length, connection, content-encoding
813 // or transfer-encoding. These headers are special, as they change the way
814 // that the message is framed, and so the framer is required to search
815 // for them.
818 if (c == 'c' || c == 'C') {
819 if ((key_len == kContentLengthSize) &&
820 0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) {
821 BalsaHeadersEnums::ContentLengthStatus content_length_status =
822 BalsaHeadersEnums::NO_CONTENT_LENGTH;
823 size_t length = 0;
824 ProcessContentLengthLine(i, &content_length_status, &length);
825 if (content_length_idx != 0) { // then we've already seen one!
826 if ((headers_->content_length_status_ != content_length_status) ||
827 ((headers_->content_length_status_ ==
828 BalsaHeadersEnums::VALID_CONTENT_LENGTH) &&
829 length != headers_->content_length_)) {
830 last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS;
831 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
832 visitor_->HandleHeaderError(this);
833 return;
835 continue;
836 } else {
837 content_length_idx = i + 1;
838 headers_->content_length_status_ = content_length_status;
839 headers_->content_length_ = length;
840 content_length_remaining_ = length;
844 } else if (c == 't' || c == 'T') {
845 if ((key_len == kTransferEncodingSize) &&
846 0 == strncasecmp(key_begin, kTransferEncoding,
847 kTransferEncodingSize)) {
848 if (transfer_encoding_idx != 0) {
849 last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS;
850 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
851 visitor_->HandleHeaderError(this);
852 return;
854 transfer_encoding_idx = i + 1;
856 } else if (i == 0 && (key_len == 0 || c == ' ')) {
857 last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT;
858 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
859 visitor_->HandleHeaderError(this);
860 return;
863 if (headers_->transfer_encoding_is_chunked_) {
864 headers_->content_length_ = 0;
865 headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH;
866 content_length_remaining_ = 0;
868 if (transfer_encoding_idx != 0) {
869 ProcessTransferEncodingLine(transfer_encoding_idx - 1);
874 void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() {
875 // For responses, can't have a body if the request was a HEAD, or if it is
876 // one of these response-codes. rfc2616 section 4.3
877 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
878 if (is_request_ ||
879 !(request_was_head_ ||
880 (headers_->parsed_response_code_ >= 100 &&
881 headers_->parsed_response_code_ < 200) ||
882 (headers_->parsed_response_code_ == 204) ||
883 (headers_->parsed_response_code_ == 304))) {
884 // Then we can have a body.
885 if (headers_->transfer_encoding_is_chunked_) {
886 // Note that
887 // if ( Transfer-Encoding: chunked && Content-length: )
888 // then Transfer-Encoding: chunked trumps.
889 // This is as specified in the spec.
890 // rfc2616 section 4.4.3
891 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
892 } else {
893 // Errors parsing content-length definitely can cause
894 // protocol errors/warnings
895 switch (headers_->content_length_status_) {
896 // If we have a content-length, and it is parsed
897 // properly, there are two options.
898 // 1) zero content, in which case the message is done, and
899 // 2) nonzero content, in which case we have to
900 // consume the body.
901 case BalsaHeadersEnums::VALID_CONTENT_LENGTH:
902 if (headers_->content_length_ == 0) {
903 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
904 } else {
905 parse_state_ = BalsaFrameEnums::READING_CONTENT;
907 break;
908 case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW:
909 case BalsaHeadersEnums::INVALID_CONTENT_LENGTH:
910 // If there were characters left-over after parsing the
911 // content length, we should flag an error and stop.
912 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
913 last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH;
914 visitor_->HandleHeaderError(this);
915 break;
916 // We can have: no transfer-encoding, no content length, and no
917 // connection: close...
918 // Unfortunately, this case doesn't seem to be covered in the spec.
919 // We'll assume that the safest thing to do here is what the google
920 // binaries before 2008 already do, which is to assume that
921 // everything until the connection is closed is body.
922 case BalsaHeadersEnums::NO_CONTENT_LENGTH:
923 if (is_request_) {
924 base::StringPiece method = headers_->request_method();
925 // POSTs and PUTs should have a detectable body length. If they
926 // do not we consider it an error.
927 if ((method.size() == 4 &&
928 strncmp(method.data(), "POST", 4) == 0) ||
929 (method.size() == 3 &&
930 strncmp(method.data(), "PUT", 3) == 0)) {
931 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
932 last_error_ =
933 BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH;
934 visitor_->HandleHeaderError(this);
935 break;
937 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
938 } else {
939 parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE;
940 last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH;
941 visitor_->HandleHeaderWarning(this);
943 break;
944 // The COV_NF_... statements here provide hints to the apparatus
945 // which computes coverage reports/ratios that this code is never
946 // intended to be executed, and should technically be impossible.
947 // COV_NF_START
948 default:
949 LOG(FATAL) << "Saw a content_length_status: "
950 << headers_->content_length_status_ << " which is unknown.";
951 // COV_NF_END
957 size_t BalsaFrame::ProcessHeaders(const char* message_start,
958 size_t message_length) {
959 const char* const original_message_start = message_start;
960 const char* const message_end = message_start + message_length;
961 const char* message_current = message_start;
962 const char* checkpoint = message_start;
964 if (message_length == 0) {
965 goto bottom;
968 while (message_current < message_end) {
969 size_t base_idx = headers_->GetReadableBytesFromHeaderStream();
971 // Yes, we could use strchr (assuming null termination), or
972 // memchr, but as it turns out that is slower than this tight loop
973 // for the input that we see.
974 if (!saw_non_newline_char_) {
975 do {
976 const char c = *message_current;
977 if (c != '\r' && c != '\n') {
978 if (c <= ' ') {
979 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
980 last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST;
981 visitor_->HandleHeaderError(this);
982 goto bottom;
983 } else {
984 saw_non_newline_char_ = true;
985 checkpoint = message_start = message_current;
986 goto read_real_message;
989 ++message_current;
990 } while (message_current < message_end);
991 goto bottom; // this is necessary to skip 'last_char_was_slash_r' checks
992 } else {
993 read_real_message:
994 // Note that SSE2 can be enabled on certain piii platforms.
995 #if __SSE2__
997 const char* const message_end_m16 = message_end - 16;
998 __m128i newlines = _mm_set1_epi8('\n');
999 while (message_current < message_end_m16) {
1000 // What this does (using compiler intrinsics):
1002 // Load 16 '\n's into an xmm register
1003 // Load 16 bytes of currennt message into an xmm register
1004 // Do byte-wise equals on those two xmm registers
1005 // Take the first bit of each byte, and put that into the first
1006 // 16 bits of a mask
1007 // If the mask is zero, no '\n' found. increment by 16 and try again
1008 // Else scan forward to find the first set bit.
1009 // Increment current by the index of the first set bit
1010 // (ffs returns index of first set bit + 1)
1011 __m128i msg_bytes =
1012 _mm_loadu_si128(const_cast<__m128i *>(
1013 reinterpret_cast<const __m128i *>(message_current)));
1014 __m128i newline_cmp = _mm_cmpeq_epi8(msg_bytes, newlines);
1015 int newline_msk = _mm_movemask_epi8(newline_cmp);
1016 if (newline_msk == 0) {
1017 message_current += 16;
1018 continue;
1020 message_current += (ffs(newline_msk) - 1);
1021 const size_t relative_idx = message_current - message_start;
1022 const size_t message_current_idx = 1 + base_idx + relative_idx;
1023 lines_.push_back(std::make_pair(last_slash_n_idx_,
1024 message_current_idx));
1025 if (lines_.size() == 1) {
1026 headers_->WriteFromFramer(checkpoint,
1027 1 + message_current - checkpoint);
1028 checkpoint = message_current + 1;
1029 const char* begin = headers_->OriginalHeaderStreamBegin();
1030 #if DEBUGFRAMER
1031 LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1032 LOG(INFO) << "is_request_: " << is_request_;
1033 #endif
1034 ProcessFirstLine(begin, begin + lines_[0].second);
1035 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1036 goto process_lines;
1037 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1038 goto bottom;
1040 const size_t chars_since_last_slash_n = (message_current_idx -
1041 last_slash_n_idx_);
1042 last_slash_n_idx_ = message_current_idx;
1043 if (chars_since_last_slash_n > 2) {
1044 // We have a slash-n, but the last slash n was
1045 // more than 2 characters away from this. Thus, we know
1046 // that this cannot be an end-of-header.
1047 ++message_current;
1048 continue;
1050 if ((chars_since_last_slash_n == 1) ||
1051 (((message_current > message_start) &&
1052 (*(message_current - 1) == '\r')) ||
1053 (last_char_was_slash_r_))) {
1054 goto process_lines;
1056 ++message_current;
1059 #endif // __SSE2__
1060 while (message_current < message_end) {
1061 if (*message_current != '\n') {
1062 ++message_current;
1063 continue;
1065 const size_t relative_idx = message_current - message_start;
1066 const size_t message_current_idx = 1 + base_idx + relative_idx;
1067 lines_.push_back(std::make_pair(last_slash_n_idx_,
1068 message_current_idx));
1069 if (lines_.size() == 1) {
1070 headers_->WriteFromFramer(checkpoint,
1071 1 + message_current - checkpoint);
1072 checkpoint = message_current + 1;
1073 const char* begin = headers_->OriginalHeaderStreamBegin();
1074 #if DEBUGFRAMER
1075 LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1076 LOG(INFO) << "is_request_: " << is_request_;
1077 #endif
1078 ProcessFirstLine(begin, begin + lines_[0].second);
1079 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1080 goto process_lines;
1081 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1082 goto bottom;
1084 const size_t chars_since_last_slash_n = (message_current_idx -
1085 last_slash_n_idx_);
1086 last_slash_n_idx_ = message_current_idx;
1087 if (chars_since_last_slash_n > 2) {
1088 // false positive.
1089 ++message_current;
1090 continue;
1092 if ((chars_since_last_slash_n == 1) ||
1093 (((message_current > message_start) &&
1094 (*(message_current - 1) == '\r')) ||
1095 (last_char_was_slash_r_))) {
1096 goto process_lines;
1098 ++message_current;
1101 continue;
1102 process_lines:
1103 ++message_current;
1104 DCHECK(message_current >= message_start);
1105 if (message_current > message_start) {
1106 headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1109 // Check if we have exceeded maximum headers length
1110 // Although we check for this limit before and after we call this function
1111 // we check it here as well to make sure that in case the visitor changed
1112 // the max_header_length_ (for example after processing the first line)
1113 // we handle it gracefully.
1114 if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) {
1115 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1116 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1117 visitor_->HandleHeaderError(this);
1118 goto bottom;
1121 // Since we know that we won't be writing any more bytes of the header,
1122 // we tell that to the headers object. The headers object may make
1123 // more efficient allocation decisions when this is signaled.
1124 headers_->DoneWritingFromFramer();
1126 const char* readable_ptr = NULL;
1127 size_t readable_size = 0;
1128 headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size);
1129 visitor_->ProcessHeaderInput(readable_ptr, readable_size);
1132 // Ok, now that we've written everything into our header buffer, it is
1133 // time to process the header lines (extract proper values for headers
1134 // which are important for framing).
1135 ProcessHeaderLines();
1136 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1137 goto bottom;
1139 AssignParseStateAfterHeadersHaveBeenParsed();
1140 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1141 goto bottom;
1143 visitor_->ProcessHeaders(*headers_);
1144 visitor_->HeaderDone();
1145 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) {
1146 visitor_->MessageDone();
1148 goto bottom;
1150 // If we've gotten to here, it means that we've consumed all of the
1151 // available input. We need to record whether or not the last character we
1152 // saw was a '\r' so that a subsequent call to ProcessInput correctly finds
1153 // a header framing that is split across the two calls.
1154 last_char_was_slash_r_ = (*(message_end - 1) == '\r');
1155 DCHECK(message_current >= message_start);
1156 if (message_current > message_start) {
1157 headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1159 bottom:
1160 return message_current - original_message_start;
1164 size_t BalsaFrame::BytesSafeToSplice() const {
1165 switch (parse_state_) {
1166 case BalsaFrameEnums::READING_CHUNK_DATA:
1167 return chunk_length_remaining_;
1168 case BalsaFrameEnums::READING_UNTIL_CLOSE:
1169 return std::numeric_limits<size_t>::max();
1170 case BalsaFrameEnums::READING_CONTENT:
1171 return content_length_remaining_;
1172 default:
1173 return 0;
1177 void BalsaFrame::BytesSpliced(size_t bytes_spliced) {
1178 switch (parse_state_) {
1179 case BalsaFrameEnums::READING_CHUNK_DATA:
1180 if (chunk_length_remaining_ >= bytes_spliced) {
1181 chunk_length_remaining_ -= bytes_spliced;
1182 if (chunk_length_remaining_ == 0) {
1183 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1185 return;
1186 } else {
1187 last_error_ =
1188 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1189 goto error_exit;
1192 case BalsaFrameEnums::READING_UNTIL_CLOSE:
1193 return;
1195 case BalsaFrameEnums::READING_CONTENT:
1196 if (content_length_remaining_ >= bytes_spliced) {
1197 content_length_remaining_ -= bytes_spliced;
1198 if (content_length_remaining_ == 0) {
1199 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1200 visitor_->MessageDone();
1202 return;
1203 } else {
1204 last_error_ =
1205 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1206 goto error_exit;
1209 default:
1210 last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO;
1211 goto error_exit;
1214 error_exit:
1215 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1216 visitor_->HandleBodyError(this);
1219 // You may note that the state-machine contained within this function has both
1220 // switch and goto labels for nearly the same thing. For instance, the
1221 // following two labels refer to the same code block:
1222 // label_reading_chunk_data:
1223 // case BalsaFrameEnums::READING_CHUNK_DATA:
1224 // The 'case' statement is required for the switch statement which occurs when
1225 // ProcessInput is invoked. The goto label is required as the state-machine
1226 // does not use a computed goto in any subsequent operations.
1228 // Since several states exit the state machine for various reasons, there is
1229 // also one label at the bottom of the function. When it is appropriate to
1230 // return from the function, that part of the state machine instead issues a
1231 // goto bottom; This results in less code duplication, and makes debugging
1232 // easier (as you can add a statement to a section of code which is guaranteed
1233 // to be invoked when the function is exiting.
1234 size_t BalsaFrame::ProcessInput(const char* input, size_t size) {
1235 const char* current = input;
1236 const char* on_entry = current;
1237 const char* end = current + size;
1238 #if DEBUGFRAMER
1239 LOG(INFO) << "\n=============="
1240 << BalsaFrameEnums::ParseStateToString(parse_state_)
1241 << "===============\n";
1242 #endif // DEBUGFRAMER
1244 DCHECK(headers_ != NULL);
1245 if (headers_ == NULL) return 0;
1247 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1248 const size_t header_length = headers_->GetReadableBytesFromHeaderStream();
1249 // Yes, we still have to check this here as the user can change the
1250 // max_header_length amount!
1251 // Also it is possible that we have reached the maximum allowed header size,
1252 // and we have more to consume (remember we are still inside
1253 // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error.
1254 if (header_length > max_header_length_ ||
1255 (header_length == max_header_length_ && size > 0)) {
1256 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1257 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1258 visitor_->HandleHeaderError(this);
1259 goto bottom;
1261 size_t bytes_to_process = max_header_length_ - header_length;
1262 if (bytes_to_process > size) {
1263 bytes_to_process = size;
1265 current += ProcessHeaders(input, bytes_to_process);
1266 // If we are still reading headers check if we have crossed the headers
1267 // limit. Note that we check for >= as opposed to >. This is because if
1268 // header_length_after equals max_header_length_ and we are still in the
1269 // parse_state_ BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for
1270 // sure that the headers limit will be crossed later on
1271 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1272 // Note that headers_ is valid only if we are still reading headers.
1273 const size_t header_length_after =
1274 headers_->GetReadableBytesFromHeaderStream();
1275 if (header_length_after >= max_header_length_) {
1276 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1277 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1278 visitor_->HandleHeaderError(this);
1281 goto bottom;
1282 } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ ||
1283 parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1284 // Can do nothing more 'till we're reset.
1285 goto bottom;
1288 while (current < end) {
1289 switch (parse_state_) {
1290 label_reading_chunk_length:
1291 case BalsaFrameEnums::READING_CHUNK_LENGTH:
1292 // In this state we read the chunk length.
1293 // Note that once we hit a character which is not in:
1294 // [0-9;A-Fa-f\n], we transition to a different state.
1297 // If we used strtol, etc, we'd have to buffer this line.
1298 // This is more annoying than simply doing the conversion
1299 // here. This code accounts for overflow.
1300 static const signed char buf[] = {
1301 // %0 %1 %2 %3 %4 %5 %6 %7 %8 \t \n %b %c \r %e %f
1302 -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1,
1303 // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f
1304 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1305 // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f
1306 -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1307 // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f
1308 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2, -1, -1, -1, -1,
1309 // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f
1310 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1311 // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f
1312 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1313 // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f
1314 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1315 // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f
1316 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1318 // valid cases:
1319 // "09123\n" // -> 09123
1320 // "09123\r\n" // -> 09123
1321 // "09123 \n" // -> 09123
1322 // "09123 \r\n" // -> 09123
1323 // "09123 12312\n" // -> 09123
1324 // "09123 12312\r\n" // -> 09123
1325 // "09123; foo=bar\n" // -> 09123
1326 // "09123; foo=bar\r\n" // -> 09123
1327 // "FFFFFFFFFFFFFFFF\r\n" // -> FFFFFFFFFFFFFFFF
1328 // "FFFFFFFFFFFFFFFF 22\r\n" // -> FFFFFFFFFFFFFFFF
1329 // invalid cases:
1330 // "[ \t]+[^\n]*\n"
1331 // "FFFFFFFFFFFFFFFFF\r\n" (would overflow)
1332 // "\r\n"
1333 // "\n"
1334 while (current < end) {
1335 const char c = *current;
1336 ++current;
1337 const signed char addition = buf[static_cast<int>(c)];
1338 if (addition >= 0) {
1339 chunk_length_character_extracted_ = true;
1340 size_t length_x_16 = chunk_length_remaining_ * 16;
1341 const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16;
1342 if ((chunk_length_remaining_ > kMaxDiv16) ||
1343 ((std::numeric_limits<size_t>::max() - length_x_16) <
1344 static_cast<size_t>(addition))) {
1345 // overflow -- asked for a chunk-length greater than 2^64 - 1!!
1346 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1347 last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW;
1348 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1349 visitor_->HandleChunkingError(this);
1350 goto bottom;
1352 chunk_length_remaining_ = length_x_16 + addition;
1353 continue;
1356 if (!chunk_length_character_extracted_ || addition == -1) {
1357 // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no
1358 // characters were converted, or an unexpected character was
1359 // seen.
1360 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1361 last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH;
1362 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1363 visitor_->HandleChunkingError(this);
1364 goto bottom;
1367 --current;
1368 parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION;
1369 visitor_->ProcessChunkLength(chunk_length_remaining_);
1370 goto label_reading_chunk_extension;
1373 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1374 goto bottom; // case BalsaFrameEnums::READING_CHUNK_LENGTH
1376 label_reading_chunk_extension:
1377 case BalsaFrameEnums::READING_CHUNK_EXTENSION:
1379 // TODO(phython): Convert this scanning to be 16 bytes at a time if
1380 // there is data to be read.
1381 const char* extensions_start = current;
1382 size_t extensions_length = 0;
1383 while (current < end) {
1384 const char c = *current;
1385 if (c == '\r' || c == '\n') {
1386 extensions_length =
1387 (extensions_start == current) ?
1389 current - extensions_start - 1;
1392 ++current;
1393 if (c == '\n') {
1394 chunk_length_character_extracted_ = false;
1395 visitor_->ProcessChunkExtensions(
1396 extensions_start, extensions_length);
1397 if (chunk_length_remaining_ != 0) {
1398 parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA;
1399 goto label_reading_chunk_data;
1401 HeaderFramingFound('\n');
1402 parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM;
1403 goto label_reading_last_chunk_term;
1406 visitor_->ProcessChunkExtensions(
1407 extensions_start, extensions_length);
1410 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1411 goto bottom; // case BalsaFrameEnums::READING_CHUNK_EXTENSION
1413 label_reading_chunk_data:
1414 case BalsaFrameEnums::READING_CHUNK_DATA:
1415 while (current < end) {
1416 if (chunk_length_remaining_ == 0) {
1417 break;
1419 // read in the chunk
1420 size_t bytes_remaining = end - current;
1421 size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ?
1422 chunk_length_remaining_ : bytes_remaining;
1423 const char* tmp_current = current + consumed_bytes;
1424 visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry);
1425 visitor_->ProcessBodyData(current, consumed_bytes);
1426 on_entry = current = tmp_current;
1427 chunk_length_remaining_ -= consumed_bytes;
1429 if (chunk_length_remaining_ == 0) {
1430 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1431 goto label_reading_chunk_term;
1433 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1434 goto bottom; // case BalsaFrameEnums::READING_CHUNK_DATA
1436 label_reading_chunk_term:
1437 case BalsaFrameEnums::READING_CHUNK_TERM:
1438 while (current < end) {
1439 const char c = *current;
1440 ++current;
1442 if (c == '\n') {
1443 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
1444 goto label_reading_chunk_length;
1447 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1448 goto bottom; // case BalsaFrameEnums::READING_CHUNK_TERM
1450 label_reading_last_chunk_term:
1451 case BalsaFrameEnums::READING_LAST_CHUNK_TERM:
1452 while (current < end) {
1453 const char c = *current;
1455 if (!HeaderFramingFound(c)) {
1456 // If not, however, since the spec only suggests that the
1457 // client SHOULD indicate the presence of trailers, we get to
1458 // *test* that they did or didn't.
1459 // If all of the bytes we've seen since:
1460 // OPTIONAL_WS 0 OPTIONAL_STUFF CRLF
1461 // are either '\r', or '\n', then we can assume that we don't yet
1462 // know if we need to parse headers, or if the next byte will make
1463 // the HeaderFramingFound condition (above) true.
1464 if (HeaderFramingMayBeFound()) {
1465 // If true, then we have seen only characters '\r' or '\n'.
1466 ++current;
1468 // Lets try again! There is no state change here.
1469 continue;
1470 } else {
1471 // If (!HeaderFramingMayBeFound()), then we know that we must be
1472 // reading the first non CRLF character of a trailer.
1473 parse_state_ = BalsaFrameEnums::READING_TRAILER;
1474 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1475 on_entry = current;
1476 goto label_reading_trailer;
1478 } else {
1479 // If we've found a "\r\n\r\n", then the message
1480 // is done.
1481 ++current;
1482 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1483 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1484 visitor_->MessageDone();
1485 goto bottom;
1487 break; // from while loop
1489 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1490 goto bottom; // case BalsaFrameEnums::READING_LAST_CHUNK_TERM
1492 label_reading_trailer:
1493 case BalsaFrameEnums::READING_TRAILER:
1494 while (current < end) {
1495 const char c = *current;
1496 ++current;
1497 // TODO(fenix): If we ever care about trailers as part of framing,
1498 // deal with them here (see below for part of the 'solution')
1499 // if (LineFramingFound(c)) {
1500 // trailer_lines_.push_back(make_pair(start_of_line_,
1501 // trailer_length_ - 1));
1502 // start_of_line_ = trailer_length_;
1503 // }
1504 if (HeaderFramingFound(c)) {
1505 // ProcessTrailers(visitor_, &trailers_);
1506 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1507 visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1508 visitor_->MessageDone();
1509 goto bottom;
1512 visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1513 break; // case BalsaFrameEnums::READING_TRAILER
1515 // Note that there is no label:
1516 // 'label_reading_until_close'
1517 // here. This is because the state-machine exists immediately after
1518 // reading the headers instead of transitioning here (as it would
1519 // do if it was consuming all the data it could, all the time).
1520 case BalsaFrameEnums::READING_UNTIL_CLOSE:
1522 const size_t bytes_remaining = end - current;
1523 if (bytes_remaining > 0) {
1524 visitor_->ProcessBodyInput(current, bytes_remaining);
1525 visitor_->ProcessBodyData(current, bytes_remaining);
1526 current += bytes_remaining;
1529 goto bottom; // case BalsaFrameEnums::READING_UNTIL_CLOSE
1531 // label_reading_content:
1532 case BalsaFrameEnums::READING_CONTENT:
1533 #if DEBUGFRAMER
1534 LOG(INFO) << "ReadingContent: " << content_length_remaining_;
1535 #endif // DEBUGFRAMER
1536 while (content_length_remaining_ && current < end) {
1537 // read in the content
1538 const size_t bytes_remaining = end - current;
1539 const size_t consumed_bytes =
1540 (content_length_remaining_ < bytes_remaining) ?
1541 content_length_remaining_ : bytes_remaining;
1542 visitor_->ProcessBodyInput(current, consumed_bytes);
1543 visitor_->ProcessBodyData(current, consumed_bytes);
1544 current += consumed_bytes;
1545 content_length_remaining_ -= consumed_bytes;
1547 if (content_length_remaining_ == 0) {
1548 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1549 visitor_->MessageDone();
1551 goto bottom; // case BalsaFrameEnums::READING_CONTENT
1553 default:
1554 // The state-machine should never be in a state that isn't handled
1555 // above. This is a glaring logic error, and we should do something
1556 // drastic to ensure that this gets looked-at and fixed.
1557 LOG(FATAL) << "Unknown state: " << parse_state_ // COV_NF_LINE
1558 << " memory corruption?!"; // COV_NF_LINE
1561 bottom:
1562 #if DEBUGFRAMER
1563 LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n"
1564 << std::string(input, current)
1565 << "\n$$$$$$$$$$$$$$"
1566 << BalsaFrameEnums::ParseStateToString(parse_state_)
1567 << "$$$$$$$$$$$$$$$"
1568 << " consumed: " << (current - input);
1569 if (Error()) {
1570 LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode());
1572 #endif // DEBUGFRAMER
1573 return current - input;
1576 } // namespace net